Exemple #1
0
def get_save_wikitext(wiki_filename, text_filename):
    """Write every article of a Wikipedia archive to a text file, one per line.

    Each article yielded by ``WikiCorpus.get_texts()`` is a sequence of
    tokens; the tokens are joined with single spaces and followed by a
    newline. Progress is logged every 10000 articles.

    Args:
        wiki_filename: Archive path handed to ``corpora.WikiCorpus``.
        text_filename: Output path; opened in ``'w'`` mode, so any existing
            content is overwritten.
    """
    wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
    # The context manager closes the file even if iteration fails midway.
    with open(text_filename, 'w') as output:
        # enumerate supplies the article counter; the previous version
        # incremented a local that was never initialised and crashed with
        # UnboundLocalError on the first article.
        for i, text in enumerate(wiki.get_texts(), start=1):
            # text = delNOTNeedWords(text,"../../stopwords.txt")[1]
            output.write(" ".join(text) + "\n")
            if i % 10000 == 0:
                logging.info("Saved %d articles", i)
Exemple #2
0
import os
import sys
import logging

from gensim import corpora

if __name__ == "__main__":
    # Dump every article of the Chinese Wikipedia archive into a plain-text
    # file, one article per line.
    program = os.path.basename(sys.argv[0])  # get the program name
    logger = logging.getLogger(program)
    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)
    # Join the arguments with spaces so the logged command line is readable.
    logging.info("running %s" % " ".join(sys.argv))

    output = "zhi_wiki_la.txt"
    wiki = corpora.WikiCorpus(
        "zhwiki-20180901-pages-articles-multistream.xml.bz2",
        lemmatize=False,
        dictionary={})
    i = 0
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent Chinese text. The context manager closes the file on error.
    with open(output, "w", encoding="utf-8") as f:
        for text in wiki.get_texts():
            # Tokens may arrive as UTF-8 bytes or as str depending on the
            # gensim release; decode only the former so both work.
            tokens = [
                token.decode("utf-8") if isinstance(token, bytes) else token
                for token in text
            ]
            f.write("".join(tokens) + "\n")
            i += 1
            if i % 1000 == 0:
                logger.info("Saved %d articles", i)
    logger.info("Finished saved %d articles", i)
Exemple #3
0
#_*_coding:utf-8_*_

from gensim import models,corpora
import jieba
import codecs
import logging
from langconv import *
# Route INFO-level (and above) records to the root logger, prefixed with
# the timestamp and severity.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Location of the zhwiki articles archive consumed below.
zhwiki = '/Users/yangyang/Desktop/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
# Corpus over the archive, with lemmatization off and an empty dictionary.
wiki = corpora.WikiCorpus(
    zhwiki,
    lemmatize=False,
    dictionary={},
)

'''
gensim LabeledSentence: trains the (tokenized) text together with its tags to obtain document vectors
'''
class TaggedWikiDocument(object):
    """Iterable adapter that feeds wiki articles to Doc2Vec as tagged documents.

    Each article is converted with ``Converter('zh-hans')``, segmented with
    ``jieba.cut`` and yielded as a ``TaggedDocument`` tagged with the
    article title.

    Args:
        wiki: Corpus object exposing ``get_texts()`` and a writable
            ``metadata`` attribute. Its ``metadata`` flag is set to True as
            a side effect of construction.
    """

    def __init__(self, wiki):
        self.wiki = wiki
        # __iter__ unpacks each item as (tokens, (page_id, title)), which
        # requires the corpus to emit metadata alongside the tokens.
        self.wiki.metadata = True

    def __iter__(self):
        # Build the converter once per pass instead of once per token.
        converter = Converter('zh-hans')
        for content, (page_id, title) in self.wiki.get_texts():
            words = [
                word
                for token in content
                for word in jieba.cut(converter.convert(token))
            ]
            # TaggedDocument replaces the deprecated LabeledSentence alias.
            yield models.doc2vec.TaggedDocument(words=words, tags=[title])


# Wrap the corpus so Doc2Vec receives one tagged document per article.
documents = TaggedWikiDocument(wiki)
# dm=0 with dbow_words=1 trains DBOW document vectors together with
# skip-gram word vectors. The threshold of 19 belongs on min_count (drop
# words seen fewer than 19 times); it was previously passed as min_alpha,
# which set the final learning rate to 19 and made the rate grow during
# training instead of decaying.
model = models.Doc2Vec(
    documents,
    dm=0,
    window=8,
    dbow_words=1,
    size=192,
    min_count=19,
    iter=5,
    workers=6,
)
model.save('./data/zhiwiki_news.doc2vec')