Beispiel #1
0
    print "cpu_num=" + str(cpu_num)
    print "doc_limit=" + str(doc_limit)
    if os.path.exists(docpath):
        from ar import filebyfileHandle
        filebyfileHandle(docpath, doc_limit, cpu_num,
                         NUM_DOC)  # 100字符内的文件抛掉不处理,多进程不指定默认 multiprocess=4
    else:
        mkdir(docpath)
    t02 = time.time()

    print "prepare time = ", t02 - t01

    t11 = time.time()
    from dict_stream_train import getDictionary

    dict = getDictionary(lsipath=lsipath, docpath=docpath)
    t12 = time.time()
    print "dict time = ", t12 - t11

    t21 = time.time()
    from corpus_stream_train import getCorpus

    corpus = getCorpus(lsipath=lsipath, docpath=docpath)
    t22 = time.time()
    print "corpus time = ", t22 - t21

    #gc
    del dict, corpus
    gc.collect()

    t31 = time.time()
	NUM_TOPIC = 300		# 主题的数量,默认为 300
	NUM_DOC = -1		# 所选取的语料集中的文件数量

	# if os.path.exists(docpath):
	# 	shutil.rmtree(docpath)  # 删除目录
	# if os.path.exists(lsipath):
	# 	shutil.rmtree(lsipath)  # 删除目录
	t01 = time.time()
	if  os.path.exists(docpath):
		from ar import filebyfileHandle
		filebyfileHandle(docpath,100,4,NUM_DOC)   #100字符内的文件抛掉不处理,多进程默认 multiprocess=4
	t02 = time.time()

	t11 = time.time()
	from dict_stream_train import getDictionary
	dict = getDictionary(lsipath=lsipath, docpath=docpath)
	t12 = time.time()

	t21 = time.time()
	from corpus_stream_train import getCorpus
	corpus = getCorpus(lsipath=lsipath, docpath=docpath)
	t22 = time.time()

	t31 = time.time()
	from lsi_stream_train import getLsiModel
	lsimodel = getLsiModel(lsipath=lsipath, num_topics=NUM_TOPIC)
	t32 = time.time()

	t41 = time.time()
	from index_stream_train import getIndex
	getIndex(lsipath, NUM_TOPIC)	#change by baobao ,add NUM_TOPIC
def sim_update(results):
    """
    Add new posts to the persisted corpus and rebuild the LSI artefacts.

    The function works entirely through files under the module-level paths
    ``lsipath``, ``lsitemp``, ``news_post_add`` and ``docpath``:

    1. ``lsitemp`` is wiped and recreated as a scratch directory.
    2. Every new post is cleaned with ``stripTags`` / ``delstopwords``,
       written to ``news_post_add`` and copied into ``docpath``.
    3. A temporary dictionary and corpus are built for the new posts in
       ``lsitemp`` and merged into ``viva.dict`` / ``viva.mm``.
    4. A TF-IDF model and a fresh LSI model are trained on the merged
       corpus; the LSI model and the similarity index are saved.

    :param results: the new posts. After ``prefix_map(results)`` every item
        must provide ``item['text']`` (raw content) and ``item['name']``
        (file name used inside ``news_post_add``).
    :return: None
    """
    # Always start from an empty scratch directory.
    shutil.rmtree(lsitemp, ignore_errors=True)
    mkdir(lsitemp)

    t_total_begin = time.time()

    print("Prefix mapping ...")
    results = prefix_map(results)
    print("Prefix map complete!")

    print("Building LSI model ...")

    # Existing dictionary and bag-of-words corpus. The previously saved LSI
    # model is intentionally not loaded: the model is retrained from scratch
    # further down, so loading it here was pure overhead.
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    corpus_raw = corpora.MmCorpus(lsipath + 'viva.mm')

    mkdir(news_post_add)

    # Preprocess each post and store it as one file per post.
    for postfile in results:
        deltags = stripTags(postfile['text'])
        # split()/join drops every whitespace character before filtering.
        text_del = delstopwords("".join(deltags.split()))
        with open(news_post_add + postfile['name'], 'w') as fp:
            fp.write(text_del)

    # NOTE(review): every file found in news_post_add is used, so files left
    # over from an earlier call would be added again unless mkdir() or the
    # caller empties the directory - confirm.
    for filename in os.listdir(news_post_add):
        shutil.copy(news_post_add + filename, docpath)

    # Both helpers are called for their on-disk output (presumably
    # lsitemp + 'viva.dict' / 'viva.mm'), which is read back right after.
    from dict_stream_train import getDictionary
    getDictionary(lsipath=lsitemp, docpath=news_post_add)
    dict2 = corpora.Dictionary.load(lsitemp + 'viva.dict')

    from corpus_stream_train import getCorpus
    getCorpus(lsipath=lsitemp, docpath=news_post_add)
    corpus2 = corpora.MmCorpus(lsitemp + 'viva.mm')

    # merge_with() extends `dictionary` in place and returns a transformation
    # that maps ids of dict2 onto ids of the merged dictionary.
    dict2_to_dict1 = dictionary.merge_with(dict2)
    # Persist the extended dictionary before the corpus: viva.mm is rewritten
    # with merged ids below, and a stale viva.dict would no longer match it
    # on the next load.
    dictionary.save(lsipath + 'viva.dict')

    merged_corpus = itertools.chain(corpus_raw, dict2_to_dict1[corpus2])
    # Materialise first: corpus_raw is read from the very viva.mm file that
    # serialize() is about to overwrite.
    corpora.MmCorpus.serialize(lsipath + 'viva.mm', list(merged_corpus))
    merged_corpus = corpora.MmCorpus(lsipath + 'viva.mm')

    # TF-IDF weighting of the merged corpus.
    tfidf = tfidfmodel.TfidfModel(merged_corpus)
    print("Building tfidf model ...")
    corpus_tfidf = tfidf[merged_corpus]
    print("Building corpus_tfidf model ...")

    # The model is rebuilt from scratch (an incremental lsi.add_documents()
    # update is not used); all other parameters keep their defaults.
    lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=NUM_TOPIC,
                            chunksize=chunksize, power_iters=2, onepass=True)
    lsi.save(lsipath + 'viva.lsi')

    # NOTE(review): the model is trained on TF-IDF vectors but the index is
    # built from the raw bag-of-words corpus. Left unchanged because the
    # query side must use the same representation - confirm which is meant.
    index = similarities.docsim.Similarity(lsipath + 'viva.index', lsi[merged_corpus],
                                           num_features=NUM_TOPIC)
    index.save(lsipath + 'viva.index')
    print("LSI model saved!")

    # Same text as the former print statement, valid on Python 2 and 3.
    elapsed = time.time() - t_total_begin
    print("Total elapsed time is:  %s s" % elapsed)
Beispiel #4
0
def sim_update(results):
    """
    Update the stored models with a batch of new posts.

    Everything is exchanged through files below the module-level paths
    ``lsipath``, ``lsitemp``, ``news_post_add`` and ``docpath``:

    * ``lsitemp`` is removed and recreated to serve as scratch space;
    * each post is cleaned (``stripTags``, whitespace removal,
      ``delstopwords``), written to ``news_post_add`` and copied to
      ``docpath``;
    * a dictionary and corpus for the new posts are created in ``lsitemp``
      and merged into ``viva.dict`` and ``viva.mm`` under ``lsipath``;
    * TF-IDF and LSI models are trained on the merged corpus, and the LSI
      model plus the similarity index are saved under ``lsipath``.

    :param results: the new posts. The items produced by
        ``prefix_map(results)`` are indexed with ``['text']`` and
        ``['name']``.
    :return: None
    """
    # Scratch directory must be empty for the temporary dictionary/corpus.
    shutil.rmtree(lsitemp, ignore_errors=True)
    mkdir(lsitemp)

    t_total_begin = time.time()

    print("Prefix mapping ...")
    results = prefix_map(results)
    print("Prefix map complete!")

    print("Building LSI model ...")

    # Load the current dictionary and corpus. The old LSI model is not
    # loaded any more: it was never used, because a new model is trained
    # below and replaces it.
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    corpus_raw = corpora.MmCorpus(lsipath + 'viva.mm')

    mkdir(news_post_add)

    # Write one cleaned text file per post.
    for postfile in results:
        deltags = stripTags(postfile['text'])
        # Joining the split() pieces removes all whitespace.
        text_del = delstopwords("".join(deltags.split()))
        with open(news_post_add + postfile['name'], 'w') as fp:
            fp.write(text_del)

    # NOTE(review): the whole content of news_post_add is copied, which
    # would include files from previous calls if nothing clears the
    # directory in between - verify against mkdir() and the caller.
    for filename in os.listdir(news_post_add):
        shutil.copy(news_post_add + filename, docpath)

    # The helpers are invoked for the files they presumably write into
    # lsitemp; their results are loaded from disk immediately afterwards.
    from dict_stream_train import getDictionary
    getDictionary(lsipath=lsitemp, docpath=news_post_add)
    dict2 = corpora.Dictionary.load(lsitemp + 'viva.dict')

    from corpus_stream_train import getCorpus
    getCorpus(lsipath=lsitemp, docpath=news_post_add)
    corpus2 = corpora.MmCorpus(lsitemp + 'viva.mm')

    # merge_with() mutates `dictionary` and returns the id translation for
    # documents that were encoded with dict2.
    dict2_to_dict1 = dictionary.merge_with(dict2)
    # Save the merged dictionary first so that viva.dict always covers the
    # ids used by the corpus written below; previously it was left stale.
    dictionary.save(lsipath + 'viva.dict')

    merged_corpus = itertools.chain(corpus_raw, dict2_to_dict1[corpus2])
    # The chain reads from viva.mm, which serialize() overwrites, hence the
    # list() to pull everything into memory beforehand.
    corpora.MmCorpus.serialize(lsipath + 'viva.mm', list(merged_corpus))
    merged_corpus = corpora.MmCorpus(lsipath + 'viva.mm')

    # TF-IDF vectors of all documents.
    tfidf = tfidfmodel.TfidfModel(merged_corpus)
    print("Building tfidf model ...")
    corpus_tfidf = tfidf[merged_corpus]
    print("Building corpus_tfidf model ...")

    # Full retraining instead of an incremental lsi.add_documents() update;
    # parameters not listed keep their defaults.
    lsi = lsimodel.LsiModel(corpus_tfidf,
                            id2word=dictionary,
                            num_topics=NUM_TOPIC,
                            chunksize=chunksize,
                            power_iters=2,
                            onepass=True)
    lsi.save(lsipath + 'viva.lsi')

    # NOTE(review): training uses TF-IDF vectors while the index is fed the
    # raw bag-of-words corpus. Kept as is, since queries elsewhere have to
    # use the matching representation - confirm the intended one.
    index = similarities.docsim.Similarity(lsipath + 'viva.index',
                                           lsi[merged_corpus],
                                           num_features=NUM_TOPIC)
    index.save(lsipath + 'viva.index')
    print("LSI model saved!")

    # Prints the same text as the old print statement, on Python 2 and 3.
    elapsed = time.time() - t_total_begin
    print("Total elapsed time is:  %s s" % elapsed)