import logging
import os
import time
from multiprocessing import cpu_count

from gensim.corpora import WikiCorpus


def process_wiki_to_text(input_filename, output_text_filename,
                         output_sentences_filename):
    if os.path.isfile(output_text_filename) and os.path.isfile(
            output_sentences_filename):
        return
    start = time.time()
    intermediary_time = None
    sentences_count = 0
    # Open in binary mode, since the writes below encode to UTF-8 bytes.
    with open(output_text_filename, 'wb') as out:
        with open(output_sentences_filename, 'wb') as out_sentences:
            wiki = WikiCorpus(input_filename, lemmatize=False, dictionary={},
                              processes=cpu_count())
            wiki.metadata = True
            texts = wiki.get_texts()
            for i, article in enumerate(texts):
                # article[1] refers to the name of the article.
                text_list = article[0]
                sentences = [elt.decode('utf-8') for elt in text_list]
                # A filter such as re.search('[a-zA-Z]+', sentence) could be
                # added here to drop sentences without Latin characters.
                sentences_count += len(sentences)
                # Write one sentence per line.
                for sentence in sentences:
                    out_sentences.write((sentence + u'\n').encode('utf-8'))
                # Write each page on a single line.
                text = ' '.join(sentences) + u'\n'
                out.write(text.encode('utf-8'))
                # Log progress every 99 articles.
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info(
                        'Saved {0} articles containing {1} sentences '
                        '({2} sentences/sec).'.format(
                            i + 1, sentences_count, sentences_per_sec))
    print('Finished process_wiki_to_text(). '
          'It took {0:.2f} s to execute.'.format(time.time() - start))
def process_wiki_to_text(input_filename, output_text_filename,
                         output_sentences_filename):
    if os.path.isfile(output_text_filename) and os.path.isfile(
            output_sentences_filename):
        logging.info(
            'Skipping process_wiki_to_text(). Files already exist: {} {}'.format(
                output_text_filename, output_sentences_filename))
        return
    start = time.time()
    intermediary_time = None
    sentences_count = 0
    with open(output_text_filename, 'w', encoding='utf-8') as out:
        with open(output_sentences_filename, 'w',
                  encoding='utf-8') as out_sentences:
            # Open the Wiki dump with gensim.
            wiki = WikiCorpus(input_filename, lemmatize=False, dictionary={},
                              processes=cpu_count(),
                              tokenizer_func=tokenizer_func)
            wiki.metadata = True
            texts = wiki.get_texts()
            for i, article in enumerate(texts):
                # article[1] refers to the name of the article.
                text_list = article[0]
                sentences = text_list
                sentences_count += len(sentences)
                # Write sentences per line.
                for sentence in sentences:
                    out_sentences.write(sentence + '\n')
                # Write each page in one line.
                text = ' '.join(sentences) + '\n'
                out.write(text)
                # This is just for the logging.
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info(
                        'Saved {0} articles containing {1} sentences '
                        '({2} sentences/sec).'.format(
                            i + 1, sentences_count, sentences_per_sec))
    logging.info(
        'Finished process_wiki_to_text(). '
        'It took {0:.2f} s to execute.'.format(time.time() - start))
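Both variants above can be driven the same way; a minimal sketch with placeholder file names. The second variant additionally assumes a tokenizer_func defined in the enclosing scope, which it passes through to gensim's tokenizer_func parameter; note also that the lemmatize argument exists only in gensim releases before 4.0.

# Placeholder paths -- substitute a real dump and output locations.
process_wiki_to_text('enwiki-latest-pages-articles.xml.bz2',
                     'wiki.text.txt', 'wiki.sentences.txt')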
from pathlib import Path

from gensim.corpora import WikiCorpus
from slugify import slugify


def make_corpus(input_file):
    """Convert a Wikipedia XML dump file to a text corpus."""
    wiki = WikiCorpus(input_file)
    wiki.metadata = True
    output_folder = '../corpus'
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    for article in wiki.get_texts():
        text = article[0]
        page_id, title = article[1]
        filename = f'{output_folder}/{page_id}-{slugify(title)}.txt'
        with open(filename, 'a', encoding='utf-8') as file:
            file.write(' '.join(text) + '\n')
        print(f'{page_id} {title}')
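A minimal invocation sketch, assuming the placeholder path below is replaced with a real pages-articles dump and that slugify comes from the python-slugify package:

# Writes ../corpus/<page_id>-<slug>.txt, one file per article.
make_corpus('enwiki-latest-pages-articles.xml.bz2')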
from collections import Counter

# Legacy (pre-1.0) BigARTM Python interface.
import artm.library
import artm.messages_pb2
from gensim.corpora import WikiCorpus


def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000,
                    lang='@body'):
    if not doc_set:  # Nothing requested, nothing to do.
        return
    # Passing a dummy value as the dictionary skips the vocabulary-building pass.
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # Request page_id and title along with the text.
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for text, (page_id, title) in wiki.get_texts():
        if page_id in doc_set:
            num_docs_found += 1
            print(num_docs_found, page_id, title)
            # Get the term frequencies of the text.
            text_tf = Counter(text)
            for token in text:
                # Update the batch dictionary.
                if token not in batch_dict:
                    # Tokens from older gensim versions arrive as bytes.
                    batch.token.append(token.decode('utf-8')
                                       if isinstance(token, bytes) else token)
                    batch_dict[token] = len(batch.token) - 1
            # Add the item to the batch.
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print('Batch done, |W| = {}, NNZ = {}'.format(
                    len(batch.token), NNZ))
                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0
    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print('Last batch done, |W| = {}, NNZ = {}'.format(
            len(batch.token), NNZ))
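A sketch of a possible call, with a placeholder dump path and hypothetical page ids. Since the function compares page_id against doc_set before converting it with int(), the set should hold the string ids exactly as gensim yields them:

# Hypothetical page ids -- replace with the pages you actually need.
save_to_batches('enwiki-latest-pages-articles.xml.bz2',
                doc_set={'12', '25', '39'},
                batch_path='./batches')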
import logging

import jieba
from gensim.corpora import WikiCorpus
from gensim.models.doc2vec import TaggedDocument
from opencc import OpenCC
from pandas import Series as series  # assumed source of the lowercase alias used below


def load_wiki_corpus(path_data_in=None, path_data_out=None, word2vec=True):
    # path_nlp and flag_test are module-level globals defined elsewhere.
    if path_data_in is None:
        corpus_path = path_nlp + r'zhwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        if word2vec:
            corpus_processed_path = path_nlp + 'corpus_word2vec.txt'
        else:
            corpus_processed_path = path_nlp + 'corpus_doc2vec.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')  # Convert Traditional Chinese to Simplified.
    count = 0
    with open(corpus_processed_path, 'w', encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        if word2vec:
            for doc in corpus.get_texts():
                doc_new = series(doc).apply(lambda x: ' '.join(
                    jieba.cut(cc.convert(x), cut_all=False)))
                corpus_processed.write(' '.join(doc_new) + '\n')
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
        else:
            corpus.metadata = True
            for doc, (page_id, title) in corpus.get_texts():
                doc_new = TaggedDocument(
                    words=[word for sentence in doc
                           for word in jieba.cut(cc.convert(sentence))],
                    tags=[cc.convert(title)])
                corpus_processed.write(' '.join(doc_new[0]) + '\t' +
                                       '\t'.join(doc_new[1]) + '\n')
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
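A possible call with placeholder paths; note that path_nlp and flag_test must already be defined at module level, since the function reads both as globals:

load_wiki_corpus(path_data_in='zhwiki-latest-pages-articles.xml.bz2',
                 path_data_out='corpus_word2vec.txt',
                 word2vec=True)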
# TODO -- This text format lets you peruse it, but you could
# compress it better as binary...
wiki.dictionary.save_as_text('./data/dictionary.txt.bz2')

# ======== STEP 2: Convert Articles To Bag-of-words ========
# Now that we have our finalized dictionary, we can create bag-of-words
# representations for the Wikipedia articles. This means taking another
# pass over the Wikipedia dump!
if True:
    # Load the dictionary if you're just running this section.
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    wiki = WikiCorpus(dump_file, dictionary=dictionary)
    # Turn on metadata so that wiki.get_texts() returns the article titles.
    wiki.metadata = True

    print('\nConverting to bag of words...')
    sys.stdout.flush()
    t0 = time.time()

    # Generate bag-of-words vectors (term-document frequency matrix) and
    # write these directly to disk.
    # On my machine, this took 3.53 hrs.
    # By setting metadata = True, this will also record all of the article
    # titles into a separate pickle file, 'bow.mm.metadata.cpickle'.
    MmCorpus.serialize('./data/bow.mm', wiki, metadata=True, progress_cnt=10000)

    print('    Conversion to bag-of-words took %s' % formatTime(time.time() - t0))
    sys.stdout.flush()
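Once serialization finishes, the matrix and the recorded titles can be read back without another pass over the dump. A small sketch using gensim's standard loaders; the metadata file name follows the convention mentioned in the comments above:

import pickle
from gensim.corpora import MmCorpus

# Stream the term-document matrix from disk (it is not loaded into RAM).
bow_corpus = MmCorpus('./data/bow.mm')

# metadata=True wrote a {doc_index: (page_id, title)} mapping alongside it.
with open('./data/bow.mm.metadata.cpickle', 'rb') as f:
    doc_metadata = pickle.load(f)

print(bow_corpus.num_docs, 'documents')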
# Print usage and exit if the input and output paths are missing.
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)

inp, outp = sys.argv[1:3]
if not os.path.isdir(os.path.dirname(outp)):
    raise SystemExit(
        'Error: The output directory does not exist. '
        'Create the directory and try again.')
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

wiki = WikiCorpus(inp, lemmatize=True)
wiki.metadata = True  # Ensure the doc id is captured.

# Only keep the most frequent words.
wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                keep_n=DEFAULT_DICT_SIZE)

# Save the document-id-to-title mapping as a dictionary -- this will take a
# long time, and may be unnecessary if metadata works correctly.
docmap = {}
for index, doc in enumerate(wiki.get_texts()):
    docmap[index] = doc[1][1]
with bz2.BZ2File('doc_index.pickle.bz2', 'wb') as f:
    pickle.dump(docmap, f)

# Save dictionary and bag-of-words (term-document frequency matrix).
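To sanity-check the pickled index afterwards, it can be read back the same way it was written; a minimal sketch:

import bz2
import pickle

# Keys are positions in wiki.get_texts(); values are article titles.
with bz2.BZ2File('doc_index.pickle.bz2', 'rb') as f:
    docmap = pickle.load(f)
print(len(docmap), 'titles indexed')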