def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
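# Hedged usage sketch (not part of the original script above): one way the LSI model
# and similarity index saved by main() might be queried afterwards. It assumes the same
# module-level constants (DICTFILE, TDIFMODEL, LSIMODEL, SIMMATRIX, ARTICLEDICT) and
# imports; the helper name is made up for illustration.
def find_similar_articles(query, topn=10):
    dictionary = Dictionary.load_from_text(DICTFILE)
    tfidf = TfidfModel.load(TDIFMODEL)
    lsi = LsiModel.load(LSIMODEL)
    index = similarities.MatrixSimilarity.load(SIMMATRIX)
    with open(ARTICLEDICT) as f:
        article_dict = json.load(f)
    # project the query into the same TF-IDF -> LSI space as the corpus
    bow = dictionary.doc2bow(query.lower().split())
    query_lsi = lsi[tfidf[bow]]
    sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])[:topn]
    # json.dump stringifies the integer keys, hence the str() lookup
    return [(article_dict[str(doc_id)], float(score)) for doc_id, score in sims]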
def extract_wiki(thresh, env_path, vec_file):
    program = os.path.basename(env_path[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    space = " "
    i = 0
    print('--- load ck12 word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(vec_file, binary=False)
    print('--- filtering keywords based on sim to ck12 keyword science')
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        topic = [w for w in text[:20] if w not in stopwords.words('english')]
        sim = np.mean([model[w].dot(model['science']) if w in model else 0 for w in topic])
        # sim = model['science'].dot(topic_vec)
        if sim > thresh:
            output.write(space.join(text) + "\n")
            i = i + 1
            if (i % 100 == 0):
                logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def process_enwiki(input_file, output_file):
    space = ' '
    i = 0
    output = open(output_file, 'w')
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
def parse(filename):
    OUTPATH = '../gen_data/wikicorpus'
    fout = open(OUTPATH, 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write(" ".join(text) + "\n")
        count = count + 1
        if (count % 10000 == 0):
            logging.info("Saved " + str(count) + " articles")
    fout.close()
    logging.info("Finished saving " + str(count) + " articles")
def parse_wiki(filename):
    # open() instead of the Python 2-only file() builtin
    fout = open('../../paper/data/wiki/wiki_corpus', 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write('%s\n' % ' '.join(text))
        if count % 10000 == 0:
            logging.info(count)
        count += 1
    fout.close()
    logging.info('Finished %d' % count)
def process_wiki(infile, outfile):
    from gensim.corpora import WikiCorpus
    wiki = WikiCorpus(infile, lemmatize=False, dictionary={})
    i = 0
    with open(outfile, 'w') as fw:
        for text in wiki.get_texts():
            text = ' '.join(text)
            cut_text = cut(text)
            fw.write(re.sub(r' {1,}', ' ', ' '.join(cut_text)) + '\n')
            i += 1
            if i % 1000 == 0:
                logger.info('Saved ' + str(i) + ' texts')
    logger.info('Finished ' + str(i) + ' texts')
def preprocessing(logger, data_path, output_filename):
    i = 0
    output = open(output_filename, 'w', encoding="utf-8")
    wiki = WikiCorpus(data_path, lemmatize=False, dictionary={}, lower=False)
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def extract_corpus(infile, outfile):
    print(' '.join([
        'Extracting Wikipedia corpus file ' + infile + '.',
        'This may take a couple minutes...',
    ]))
    with open(outfile, 'w') as output:
        wiki = WikiCorpus(infile)
        # "text" is actually each individual article
        for i, text in enumerate(wiki.get_texts()):
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            if i > 0 and i % 10000 == 0:
                print('Processed ' + str(i) + ' articles so far.')
    print('Processing complete! Yippee!')
def wiki_to_txt(file_name, output_name):
    logging.info("Starting wiki_to_txt")
    wiki_corpus = WikiCorpus(file_name, dictionary={})
    texts_num = 0
    converter = opencc.OpenCC('s2t.json')
    with open(output_name, 'w', encoding='utf-8') as output:
        for texts in wiki_corpus.get_texts():
            r = converter.convert(' '.join(texts))
            output.write(r + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles so far" % texts_num)
    logging.info("Finished wiki_to_txt")
def build_corpus(infile, outfile):
    """Converts a Wikipedia xml dump to a text corpus"""
    output = open(outfile, 'w')
    wiki = WikiCorpus(infile)
    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i += 1
        if (i % 10 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
def make_corpus(self):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(self.wiki_file, 'w', encoding="utf-8")
    wiki = WikiCorpus(self.dump_file)
    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 10000 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize)
    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
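# Hedged sketch (not from the original source): the `tokenize` passed as tokenizer_func
# above is not defined in this snippet. gensim calls
# tokenizer_func(content, token_min_len, token_max_len, lower), so any custom tokenizer
# has to accept that signature; this plain whitespace version is only a placeholder for
# whatever language-specific tokenizer the original used.
from gensim import utils

def tokenize(content, token_min_len=2, token_max_len=15, lower=True):
    content = utils.to_unicode(content, errors='ignore')
    if lower:
        content = content.lower()
    return [
        token for token in content.split()
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]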
def make_corpus(input_file):
    """Convert Wikipedia xml dump file to text corpus"""
    wiki = WikiCorpus(input_file)
    wiki.metadata = True
    output_folder = '../corpus'
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    for article in wiki.get_texts():
        text = article[0]
        page_id, title = article[1]
        filename = f'{output_folder}/{page_id}-{slugify(title)}.txt'
        with open(filename, 'a') as file:
            file.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        print(f'{page_id} {title}')
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f)
    i = 0
    for text in wiki.get_texts():
        # bytes() needs an explicit encoding in Python 3; decode back to str before writing
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 1 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
def wiki_to_text(wiki_data_path):
    logging.info("Starting to convert the Wikipedia corpus to plain text:")
    if os.path.exists("wiki_texts.txt"):
        return
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0
    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            # older gensim versions yield bytes tokens, hence the decode
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles so far" % texts_num)
def create_corpus(input_file_name, output_file_name):
    output = open(output_file_name, 'w', encoding='utf-8')
    wiki = WikiCorpus(input_file_name, lemmatize=False, dictionary={}, lower=False)
    i = 0
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info(f"Saved {i} articles")
    output.close()
    logger.info(f"Finished Saved {i} articles")
def wikiToTxt(self):
    # This function takes about 25 minutes
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
    texts_num = 0
    with open('./build/wiki_texts.txt', 'w') as output:
        # get_texts() returns one article at a time, as a list of items
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Converting the dump to space-separated text; %d articles processed so far" % texts_num)
def make_corpus(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, "w")
    wiki = WikiCorpus(in_f)
    i = 0
    for text in wiki.get_texts():
        output.write(bytes(" ".join(text), "utf-8").decode("utf-8") + "\n")
        i = i + 1
        if i % 10000 == 0:
            print("Processed " + str(i) + " articles")
    output.close()
    print("Processing complete!")
def wikiToTxt(self):
    if os.path.exists(self.wiki_texts):
        return
    # This function takes about 25 minutes
    from gensim.corpora import WikiCorpus
    wiki_corpus = WikiCorpus(self.wiki_dump, dictionary={})
    texts_num = 0
    with open(self.wiki_texts, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("already processed %d articles" % texts_num)
def process_wiki(in_f, out_f):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w', encoding='utf-8')
    wiki = WikiCorpus(in_f)
    i = 0
    print('start')
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 10 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
def wikicorpus2text(source, target):
    '''
    Convert the Wikipedia corpus into plain text.
    :param source: path to the raw Wikipedia dump (bz2); download from https://dumps.wikimedia.org/zhwiki/
    :param target: destination path of the output file
    :return:
    '''
    wiki = WikiCorpus(source, lemmatize=False, dictionary=[])
    with open(target, 'w') as t:
        i = 1
        for text in tqdm(wiki.get_texts()):
            t.write(' '.join(text) + "\n")
            if (i % 10000 == 0):
                print(f'{i} articles are done')
            i += 1
def main():
    # Load wikipedia data
    print("... Load wikipedia data")
    wiki = WikiCorpus(WIKI_FILE_PATH, lemmatize=False, tokenizer_func=tokenize_and_stem)

    # Save the wikipedia data before word2vec training, in case of any errors in the training phase
    print("... Save tokenized data")
    with open(TOKENIZED_WIKI_FILE_PATH, "w", encoding="utf-8") as output_file:
        for text in wiki.get_texts():
            output_file.write(" ".join(text) + "\n")

    # Train word2vec model and save it to disk
    print("... Train word2vec model")
    model = Word2Vec(LineSentence(TOKENIZED_WIKI_FILE_PATH), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(WORD2VEC_MODEL_FILE_PATH)
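# Hedged usage sketch (not part of the original script above): loading the model trained
# by main() and asking for nearest neighbours. It reuses the assumed
# WORD2VEC_MODEL_FILE_PATH constant; the helper name is made up for illustration.
def nearest_neighbours(word, topn=10):
    model = Word2Vec.load(WORD2VEC_MODEL_FILE_PATH)
    if word in model.wv:
        return model.wv.most_similar(word, topn=topn)
    return []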
def make_corpus(wiki_in_file, wiki_out_file):
    """Convert Wikipedia xml dump file to text corpus"""
    path_to_wiki_dump = datapath(wiki_in_file)
    with open(wiki_out_file, 'w') as output:
        wiki = WikiCorpus(path_to_wiki_dump)  # create word->word_id mapping
        i = 0
        for text in wiki.get_texts():
            output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
            i += 1
            if i % 10000 == 0:
                print('Processed ' + str(i) + ' articles')
    # the with-statement closes the file, so no explicit close() is needed
    print('Processing complete!')
def handle_wiki_data():
    print(os.getcwd())
    wiki_source_path = sys.path[0] + '/zhwiki-latest-pages-articles.xml.bz2'
    print('wiki baike data path:', wiki_source_path)
    # path where the parsed result is finally saved
    wiki_source_path_out = sys.path[0] + '/wiki-zh-1.3g.txt'
    print('wiki baike parsed output path:', wiki_source_path_out)
    wiki = WikiCorpus(wiki_source_path, lemmatize=False, dictionary={})
    file_out = open(wiki_source_path_out, 'w')
    for text in wiki.get_texts():
        str_line = ' '.join(text) + "\n"
        # convert traditional Chinese to simplified Chinese
        simple_line = tradition2simple(str_line.encode('utf-8'))
        file_out.write(simple_line)
    file_out.close()
def get_wiki_text():
    outp = "../../data/wiki/wiki.zh.txt"
    inp = "../../data/wiki/zhwiki-20190720-pages-articles-multistream.xml.bz2"
    space = " "
    output = open(outp, 'w', encoding='utf-8')
    # gensim's WikiCorpus class for processing Wikipedia dumps
    wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])
    # get_texts() turns each Wikipedia article into one line of text and strips punctuation etc.
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
    output.close()
def make_corpus(in_f, out_f, num_articles):
    """Convert Wikipedia xml dump file to text corpus"""
    output = open(out_f, 'w+')
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize)
    i = 0
    for text in wiki.get_texts():
        output.write((bytes(' ', 'utf-8').join(text)).decode('utf-8') + '\n')
        i += 1
        if (i % 100 == 0):
            print('Processed ' + str(i) + ' articles')
        if (i >= num_articles):
            break
    output.close()
    print('Processing complete!')
def enwiki(srcPath, tarPath):
    index = 0
    space = " "
    output = open(tarPath, 'w')
    wiki = WikiCorpus(srcPath, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        index += 1
        if (index % 10000 == 0):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\tSaved " + str(index) + " articles.")
    output.close()
    print("Finished saved " + str(index) + " articles.")
def generate_wiki():
    i = 0
    # Use the WikiCorpus API to read text contents from the raw dump file
    wiki = WikiCorpus(path_to_yue_wiki, lemmatize=False)
    file = codecs.open('./text/yue_wiki2.txt', 'w', 'utf-8')
    # Write texts into the new file article by article
    for text in wiki.get_texts():
        str_lines = " ".join(text) + "\n"
        file.write(str_lines)
        i += 1
        if (i % 100 == 0):
            print("Save " + str(i) + " articles")
    file.close()
    print("Finished saved " + str(i) + " articles")
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    if not doc_set:  # is empty
        return
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request to extract page_id and title
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]
        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title

            # get tokens tf in the text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1

            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]

            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0

    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
def process_wiki_to_text(input_filename, output_text_filename, output_sentences_filename):
    if os.path.isfile(output_text_filename) and os.path.isfile(output_sentences_filename):
        logging.info('Skipping process_wiki_to_text(). Files already exist: {} {}'.format(
            output_text_filename, output_sentences_filename))
        return

    start = time.time()
    intermediary_time = None
    sentences_count = 0
    with open(output_text_filename, 'w') as out:
        with open(output_sentences_filename, 'w') as out_sentences:
            # Open the Wiki Dump with gensim
            wiki = WikiCorpus(input_filename, lemmatize=False, dictionary={}, processes=cpu_count())
            wiki.metadata = True
            texts = wiki.get_texts()
            for i, article in enumerate(texts):
                # article[1] refers to the name of the article.
                text_list = article[0]
                sentences = text_list
                sentences_count += len(sentences)

                # Write sentences per line
                for sentence in sentences:
                    out_sentences.write((sentence + '\n'))

                # Write each page in one line
                text = ' '.join(sentences) + '\n'
                out.write(text)

                # This is just for the logging
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info('Saved {0} articles containing {1} sentences ({2} sentences/sec).'.format(
                        i + 1, sentences_count, sentences_per_sec))
    logging.info('Finished process_wiki_to_text(). It took {0:.2f} s to execute.'.format(round(time.time() - start, 2)))
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with open("wiki_texts_en.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles so far" % texts_num)
def my_function():
    zhwiki_name = './zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, dictionary={})
    documents = TaggedWikiDocument(wiki)
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, vector_size=docvec_size, window=8, min_count=19, workers=8)
    model.save('./wiki.doc2vec.model')
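# Hedged sketch (not from the original source): TaggedWikiDocument is used above but not
# defined in this snippet. A common implementation wraps every article as a gensim
# TaggedDocument tagged with its title; the original class may differ in detail.
from gensim.models.doc2vec import TaggedDocument

class TaggedWikiDocument(object):
    def __init__(self, wiki):
        self.wiki = wiki
        self.wiki.metadata = True  # make get_texts() also yield (page_id, title)

    def __iter__(self):
        for content, (page_id, title) in self.wiki.get_texts():
            yield TaggedDocument(words=list(content), tags=[title])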
class LineSentences(object):
    def __init__(self, dirname, wikipath=None, lower=True):
        self.dirname = dirname
        self.wiki = None
        if wikipath:
            self.wiki = WikiCorpus(wikipath, lemmatize=False, dictionary={}, lower=lower)
            self.wiki.metadata = False

    def __iter__(self):
        # yield Wikipedia articles first, if a dump was given
        if self.wiki:
            for content in self.wiki.get_texts():
                # print(content)
                yield content
        for fname in os.listdir(self.dirname):
            _, ext = splitext(fname)
            if ".txt" in ext:
                for line in open(os.path.join(self.dirname, fname)):
                    line = line.rstrip('\n')
                    words = word_tokenize(line)
                    if words:
                        # print(words)
                        yield words
def data_process(self):
    """extract txt content from xml file"""
    space = " "
    i = 0
    output = open(self.txt_path, 'w', encoding='utf-8')
    wiki = WikiCorpus(self.origin_path, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished Saved ' + str(i) + ' articles')
def my_function():
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(wiki)
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size, window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles so far" % texts_num)
def dataprocess(_config):
    i = 0
    output = None
    if six.PY3:
        output = open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    else:
        output = codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    wiki = WikiCorpus(os.path.join(_config.data_path, _config.zhwiki_bz2), lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8', 'ignore') + '\n')
        else:
            output.write(' '.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished Saved ' + str(i) + ' articles')
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    i = 0
    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(b' '.join(text).decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
    logger.info('Finished ' + str(i) + ' articles')
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if (i % 200 == 0):
            print('Saved ' + str(i) + ' articles')
    f.close()
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def make_wiki_corpus(inp, outp, logger):
    '''
    Preprocess the Wikipedia dump.
    :param inp: path to the dump file, e.g. enwiki-20150304-pages-articles.xml.bz2
    :param outp: output text file with the preprocessed text base, e.g. wiki.en.text
    :param logger: logger used to report preprocessing progress
    '''
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    i = 0
    space = " "
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # divided by character
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("process %d articles" % i)
    out.close()
def main():
    gensim.corpora.wikicorpus.tokenize = replacement_tokenize
    infn, outfn = sys.argv[1:3]
    wiki = WikiCorpus(infn, lemmatize=False, dictionary={})
    with open(outfn, 'w') as outfile:
        for i, article in enumerate(wiki.get_texts()):
            article = [entry.decode("utf-8") for entry in article]
            text = " ".join(article)
            mostly_sentences = nltk.sent_tokenize(text)
            sentences = []
            for sent in mostly_sentences:
                for line in sent.splitlines():
                    sentences.append(line.strip())
            for sentence in sentences:
                sentence = cleanup(sentence)
                if sentence:
                    print(sentence, file=outfile)
            if (i % 10000 == 0):
                print("Saved ", i, "articles")
def convert(input_path, output_path):
    logger.info("Converting Wiki Corpus...")
    corpus_path = check_path(input_path)
    wiki_text_output_path = output_path
    start_time = time.time()
    space = " "
    i = 0
    wiki = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
    output = open(wiki_text_output_path, 'w')
    # Convert WikiCorpus into Text output (1 article per line)
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles. Time needed: " + str(time.time() - start_time))
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print("Usage: extractwiki.py infile_name outfile_name")
        sys.exit(1)
    infilename, outfilename = sys.argv[1:3]

    if os.path.isfile(outfilename):
        logger.error("Output file %s exists. Change the file name and try again." % outfilename)
        sys.exit(1)

    i = 0
    output = open(outfilename, 'w')
    wiki = WikiCorpus(infilename, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(" ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
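# Hedged follow-up sketch (not part of the original script above): loading the artifacts
# serialized above and deriving a TF-IDF corpus from them, using the same `outp` prefix.
# File names mirror the ones written above; the '_tfidf.mm' output name is an assumption.
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

id2word = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
bow_corpus = MmCorpus(outp + '_bow.mm')
tfidf = TfidfModel(bow_corpus, id2word=id2word, normalize=True)
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[bow_corpus], progress_cnt=10000)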
program = os.path.basename(sys.argv[0])  # get the script file name
# program = os.path.basename()  # get the script file name
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
space = " "
i = 0

output = open(outp, 'w', encoding='utf-8')
wiki = WikiCorpus(inp, lemmatize=False, dictionary=[])  # gensim's WikiCorpus class for processing Wikipedia dumps
# get_texts() turns each Wikipedia article into one line of text and strips punctuation etc.
for text in wiki.get_texts():
    output.write(space.join(text) + "\n")
    i = i + 1
    if (i % 10000 == 0):
        logger.info("Saved " + str(i) + " articles.")
output.close()
logger.info("Finished Saved " + str(i) + " articles.")
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt')
    wiki.save(outp + '_corpus.pkl')
    dictionary.allow_update = False
else:
    wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt')

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
if __name__ == '__main__':
    # set up logging
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running: %s" % ' '.join(sys.argv))

    # check and process input arguments
    args = parse_args(sys.argv[1:])
    if not 'input' in args:
        logger.error("No input given!")
        sys.exit(1)

    # get args
    inp, outp, limit = args['input'], args['output'], args['limit']

    # prepare corpus
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    texts = slice(wiki.get_texts(), limit)

    # save this for efficiency
    space = " "
    output = open(outp, 'w')
    iterate_with_logging(logger, 10000, texts,
                         lambda text: output.write(space.join(text) + "\n"))
    output.close()
        gamma, _ = self.inference([bow])
        theta = numpy.exp(dirichlet_expectation(gamma[0]))
        topicDist = theta / theta.sum()  # normalize to proper distribution
        # ignore document's topics that have prob < eps
        return [(topicId, topicValue) for topicId, topicValue in enumerate(topicDist)
                if topicValue >= eps]
# endclass LdaModel


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logger.setLevel(level=logging.DEBUG)
    logger.info("running %s" % ' '.join(sys.argv))

    import os.path
    program = os.path.basename(sys.argv[0])

    from gensim.corpora import WikiCorpus, MmCorpus, LowCorpus

    numpy.random.seed(100000001)
    vocab = WikiCorpus.loadDictionary('/Users/kofola/gensim/results/wiki10_en_wordids.txt')
    corpus = MmCorpus('/Users/kofola/gensim/results/wiki10_en_bow.mm')

    K = 50
    olda = LdaModel(numTopics=K, id2word=vocab, alpha=1. / K, eta=1. / K, decay=0.5)
    olda.update(corpus)
    olda.save('olda2.pkl')

    logging.info("finished running %s" % program)
import logging
import os.path
import sys

from gensim.corpora import WikiCorpus
from gensim.models import TfidfModel, Word2Vec

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    wiki = WikiCorpus(inp, dictionary={})
    model = Word2Vec(size=300, window=5, min_count=5, workers=8)

    sentences = wiki.get_texts()
    model.build_vocab(sentences)
    sentences = wiki.get_texts()
    model.train(sentences)

    model.save(outp)
    model.init_sims(replace=True)
    model.save('trimmed-model')
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec
import logging, os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
corpus = WikiCorpus('../fawiki-latest-pages-articles.xml.bz2', dictionary=False)

max_sentence = -1


def generate_lines():
    for index, text in enumerate(corpus.get_texts()):
        if index < max_sentence or max_sentence == -1:
            yield text
        else:
            break


# Load the model if it already exists
model = Word2Vec()
if ((os.path.exists('../model_farsi')) and (os.path.isfile('../model_farsi'))):
    model = Word2Vec.load('../model_farsi')

result_1 = model.most_similar('روز')
result_2 = model.most_similar(positive=['زن', 'پادشاه'], negative=['مرد'], topn=10)

print("result is:")
for (re, v) in result_1:
    print(re + ' ' + str(v))
print("=======================")
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
    wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True)
    dictionary.allow_update = False
else:
    wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
# SETTINGS
# model parameters and output
lsa_dim = 100
w2v_dim = 50
f_bow = "{0}.bow".format(prefix)
f_tfidf = "{0}_voc{1}.tfidf".format(prefix, voc_size)
f_lsa = "{0}_voc{1}_dim{2}.lsa".format(prefix, voc_size, lsa_dim)
f_dict = "{0}_voc{1}.dict".format(prefix, voc_size)
f_w2v = "{0}_voc{1}_dim{2}_win5.bin".format(prefix, voc_size, w2v_dim)

# CORPUS PREPROCESSING
if wiki:
    # models will be trained on the Dutch Wikipedia corpus
    if os.path.exists(f_bow):
        corpus = WikiCorpus.load(f_bow)
    else:
        # download wikipedia training corpus (2015/10/14 18:45, 132MB)
        if not os.path.exists(f_corpus):
            wiki_lang, wiki_size, wiki_url = wikis[lang]
            if raw_input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
                util.download_file(wiki_url, f_corpus, progress=True)
            else:
                sys.exit()
        corpus = WikiCorpus(f_corpus)
        # corpus.save(f_bow)
else:
    # models will be trained on your own corpus
    if os.path.exists(f_bow):
        corpus = TextCorpus.load(f_bow)
    else:
        corpus = TextCorpus(f_corpus)
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--articles", help="path to enwiki-latest-pages-articles.xml.bz2")
parser.add_argument("-m", "--model", help="path to model dir")
parser.add_argument("-d", "--demo", help="path to question-words.txt analogies")
parser.add_argument("-l", "--lines", help="path to wiki-lines.txt")
args = parser.parse_args()

# Load or create wiki-lines.txt
if not (os.path.isfile(args.lines)):
    wiki_corpus = WikiCorpus(args.articles, lemmatize=False)
    wiki_lines = wiki_corpus.get_texts()

    # Write wiki_lines out for future use
    lines_file = open(args.lines, 'w')
    for text in wiki_lines:
        lines_file.write(" ".join(text) + "\n")
    lines_file.close()
else:
    wiki_lines = open(args.lines)

model = Word2Vec(
    sentences=LineSentence(wiki_lines),
    size=400,
    hs=1,
    window=5,
#!/usr/bin/python

from gensim.corpora import WikiCorpus
from gensim.models.word2vec import Word2Vec

corpus = WikiCorpus('dewiki-latest-pages-articles.xml.bz2', dictionary=False, lemmatize=False)

model = Word2Vec(size=300, window=7, min_count=7, workers=4, negative=10, hs=0)
model.build_vocab(corpus.get_texts())
model.train(corpus.get_texts())
model.init_sims(replace=True)
model.save('dewiki.w2v')
parser.add_argument('--online', default=False, action='store_true')
parser.add_argument('--debug', default=False, action='store_true')
parser.add_argument('--keep-words', default=DEFAULT_DICT_SIZE, type=int, help='number of words to keep')
args = parser.parse_args()

logger = logging.getLogger('gensim.scripts.read_stream_items')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %r" % args.__dict__)

if args.online:
    dictionary = HashDictionary(id_range=args.keep_words, debug=args.debug)
    dictionary.allow_update = True  # start collecting document frequencies
    ## cannot use --max-articles or --expect-streamitems
    wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(args.output + '_wordids.txt.bz2')
    wiki.save(args.output + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    ## not online
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(
        args.input,
        lemmatize=args.lemmatize,
        max_articles=args.max_articles,
        expect_streamitems=args.expect_streamitems,
        file_name_pattern=args.file_name_pattern,
    )