Exemple #1
0
def do_index():
    """Rebuild the Lucene index from ``Doc`` rows stored in the database.

    Starts the JVM, loads every ``doc_model.Doc`` whose ``dateCreated``
    compares greater than the string ``"20121220"``, and writes one Lucene
    document per row with four stored fields:

    * ``url`` and ``intent`` -- indexed verbatim (not analyzed);
    * ``description`` and ``title`` -- analyzed with ``StandardAnalyzer``.

    The index directory is opened with ``create=True``, so any index that
    already exists at that path is replaced.  Progress is printed to stdout.

    Any exception raised while reading the database or adding a document
    propagates to the caller; the writer is still closed in that case so the
    index write lock is released.
    """
    # The JVM must be running before any Lucene class is touched.
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)

    engine = data.engine_from_config("indexdb.config")
    # The returned object is not used directly below; the call is kept (and
    # its result held) because it presumably wires up the session factory
    # used via ``dbfactory`` -- TODO confirm against ``data``.
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print(len(docs))

    idxDir = SimpleFSDirectory(File(indexDir))
    # create=True: start a fresh index; at most 512 terms are indexed per field.
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))
    try:
        for doc in docs:
            lucenedoc = Document()
            # Trim leading/trailing CR and LF characters, then hand Lucene
            # UTF-8 encoded bytes.
            descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
            print(repr(descriptionValue))
            lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
            lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
            lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
            lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(lucenedoc)
        # Merge segments once, after every document has been added.  Doing
        # this inside the loop re-merges the whole index per document.
        writer.optimize()
    finally:
        # Always release the index write lock, even if indexing failed.
        writer.close()
    print("index finished")
Exemple #2
0
def my_learnstore():
    """Train the Bayes learner from the stored ``BayeseLib`` samples.

    Rows with ``is_useless=1`` are learned first and passed to
    ``mybayes.my_learn`` with ``True``; rows with ``is_useless=0`` follow and
    are passed with ``False``.  Each row's ``value`` is UTF-8 encoded and
    split by ``my_chinesesegment.splitchinese`` before being learned.
    """
    engine = data.engine_from_config("localdb.config")
    db = data.init_datafactory(engine)

    # (value of the is_useless column, label handed to the learner),
    # in the order the passes must run.
    passes = ((1, True), (0, False))
    for uselessFlag, label in passes:
        talks = dbfactory.Session().query(bayeslib.BayeseLib).filter_by(is_useless=uselessFlag).all()
        for talk in talks:
            encoded = talk.value.encode("utf8")
            segments = my_chinesesegment.splitchinese(encoded)
            mybayes.my_learn(segments, label)