def do_index(): initVM() indexDir = "/home/william/woyaoo/luceneindex" version = Version.LUCENE_CURRENT standardAnalyzer = StandardAnalyzer(version) # chineseAnalyzer = CJKAnalyzer(version) engine = data.engine_from_config("indexdb.config") # engine = data.engine_from_config() db = data.init_datafactory(engine) docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all() print len(docs) idxDir = SimpleFSDirectory(File(indexDir)) perIndexCount = 5000 writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512)) # add field for doc in docs: # print repr(doc.description) lucenedoc = Document() descriptionValue = doc.description.strip("\r\n").encode("UTF-8") # descriptionValue ='中国 abc' print repr(descriptionValue) lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED)) lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED)) # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED)) lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED)) writer.addDocument(lucenedoc) writer.optimize() writer.close() print "index finished"
def my_learnstore():
    """Train the Bayes model from labelled talks in the local database.

    Rows with is_useless=1 are learned with label True, rows with
    is_useless=0 with label False (presumably useless vs. useful talk
    text -- the boolean's meaning is defined by mybayes.my_learn).
    """
    engine = data.engine_from_config("localdb.config")
    db = data.init_datafactory(engine)
    # Two sequential passes: first the useless talks, then the useful ones.
    for flag, label in ((1, True), (0, False)):
        talks = dbfactory.Session().query(bayeslib.BayeseLib).filter_by(is_useless=flag).all()
        for talk in talks:
            raw = talk.value.encode("utf8")
            tokens = my_chinesesegment.splitchinese(raw)
            mybayes.my_learn(tokens, label)