Example #1
0
								if(j=='-'):
									j = ' '
							bookclass = cls
							break
						except ValueError as ve:
							bookclass = cls
							break
					except ValueError as ve:
						continue
				except ValueError as ve:
					continue
			buff = []
			xml += '<book class="'+str(bookclass.encode('utf8'))+'">'
			xml += '<title>'+str(title.encode('utf8'))+'</title>'
			xml += '<noticekoha>'+str(noticekoha.encode('utf8'))+'</noticekoha>'
			xml += '<category>'+str(Classification.classToCategory(bookclass).encode('utf8'))+'</category>'
			xml += '</book>'
		else:
			buff.append(ref);
	
	xml += '</Document>'
	session.close()

	# Store the XML string in the database as "bookref.xml", then write a pretty-printed copy to outFile.
	session1.add("bookref.xml", xml)
	xml = xmldom.parseString(xml)
	pretty_xml_as_string = xml.toprettyxml()
	with open(outFile,"w") as f:
		f.write(pretty_xml_as_string.encode('utf8'));
	session1.close()
Example #2
0
							break
						except ValueError as ve:
							bookclass = cls
							break
					except ValueError as ve:
						continue
				except ValueError as ve:
					continue

			#print bookclass
			lang_offset = buff.index("##")
			lang = []
			for i in range(0,lang_offset):
				lang.append(buff[i])

			code = str(Classification.classToCategory(bookclass).encode('utf8'))
			if code not in books :
				books[code] = dict()

			#keywords of each book are stored under books[code][ref]; a set() keeps them free of duplicates
			ref = buff[lang_offset+1]
			if ref not in books[code] :
				books[code][ref] = set()
			print code,ref
			for i in range(lang_offset+2,len(buff)):
				tokens = wpt.tokenize(buff[i])
				#remove stopwords before stemming
				filtered_tokens = [w for w in tokens if not w in stopwords_list_encoded]
				for token in filtered_tokens:
					#use only tokens that contain no punctuation