import logging

from gensim import corpora


def get_save_wikitext(wiki_filename, text_filename):
    """Extract plain text from a Wikipedia dump, one article per line."""
    output = open(text_filename, 'w')
    wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
    i = 0  # article counter (was used uninitialized in the original)
    for text in wiki.get_texts():
        # text = delNOTNeedWords(text, "../../stopwords.txt")[1]
        output.write(" ".join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logging.info("Saved " + str(i) + " articles")
    output.close()
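A minimal usage sketch for the function above; the dump path and output filename here are assumptions, so substitute the files you actually have:

# Usage sketch (hypothetical filenames): extract the dump into one
# article per line. Configure logging first so progress messages show up.
import logging

logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s",
                    level=logging.INFO)
get_save_wikitext("zhwiki-latest-pages-articles.xml.bz2", "wiki_text.txt")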
import os
import sys
import logging

from gensim import corpora

if __name__ == "__main__":
    program = os.path.basename(sys.argv[0])  # get the program name
    logger = logging.getLogger(program)
    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    output = "zhi_wiki_la.txt"
    f = open(output, "w")
    wiki = corpora.WikiCorpus(
        "zhwiki-20180901-pages-articles-multistream.xml.bz2",
        lemmatize=False, dictionary={})
    i = 0
    for text in wiki.get_texts():
        # Join tokens with spaces, matching the first script; the original
        # b"".join(...) concatenated tokens with no separator.
        f.write(" ".join(text) + "\n")
        i += 1
        if i % 1000 == 0:
            logger.info("Saved %d articles" % i)
    f.close()
    logger.info("Finished; saved %d articles" % i)
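A quick sanity check once the script finishes; a sketch assuming zhi_wiki_la.txt was written as above, which reads back the first article and confirms it decodes as space-separated UTF-8 tokens:

# Sanity-check sketch: print the start of the first extracted article.
with open("zhi_wiki_la.txt", encoding="utf-8") as f:
    first_article = f.readline()
print(first_article[:200])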
# _*_ coding: utf-8 _*_
import logging

import jieba
from gensim import models, corpora
from langconv import Converter  # traditional -> simplified Chinese converter

# enable logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

zhwiki = '/Users/yangyang/Desktop/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
wiki = corpora.WikiCorpus(zhwiki, lemmatize=False, dictionary={})


class TaggedWikiDocument(object):
    """Yield each article as a tagged document: the text (converted to
    simplified Chinese and segmented with jieba) plus the article title
    as its tag, so Doc2Vec trains a vector per article."""

    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True  # get_texts() also yields (page_id, title)

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            # TaggedDocument replaces the deprecated LabeledSentence.
            yield models.doc2vec.TaggedDocument(
                words=[w for c in content
                       for w in jieba.cut(Converter('zh-hans').convert(c))],
                tags=[title])


documents = TaggedWikiDocument(wiki)
# min_count=19 fixes the original's min_alpha=19, which is not a valid
# learning rate and matches the parameter intended here.
model = models.Doc2Vec(documents, dm=0, window=8, dbow_words=1, size=192,
                       min_count=19, iter=5, workers=6)
model.save('./data/zhiwiki_news.doc2vec')
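Once training finishes, the saved model can be reloaded and queried by article title. A minimal sketch, assuming gensim 3.x (document vectors live under model.docvecs) and that a page titled '数学' appeared in the corpus; substitute any tag seen during training:

from gensim import models

model = models.Doc2Vec.load('./data/zhiwiki_news.doc2vec')
# Articles whose document vectors are closest to the (assumed) title '数学'.
for title, similarity in model.docvecs.most_similar('数学', topn=5):
    print(title, similarity)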