Code example #1
import logging
import os
import time
from multiprocessing import cpu_count

from gensim.corpora.wikicorpus import WikiCorpus


def process_wiki_to_text(input_filename, output_text_filename, output_sentences_filename):
    if os.path.isfile(output_text_filename) and os.path.isfile(output_sentences_filename):
        return
    start = time.time()
    intermediary_time = None
    sentences_count = 0
    with open(output_text_filename, 'w') as out:
        with open(output_sentences_filename, 'w') as out_sentences:
            wiki = WikiCorpus(input_filename, lemmatize=False, dictionary={}, processes=cpu_count())
            wiki.metadata = True
            texts = wiki.get_texts()
            for i, article in enumerate(texts):
                text_list = article[0]  # article[1] refers to the name of the article.
                sentences = [elt.decode('utf-8') for elt in text_list]
                # TODO: possibly filter sentences with re.search('[a-zA-Z]+', sentence)
                sentences_count += len(sentences)
                for sentence in sentences:
                    out_sentences.write((sentence + u'\n').encode('utf-8'))
                text = ' '.join(sentences) + u'\n'
                out.write(text.encode('utf-8'))
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info('Saved {0} articles containing {1} sentences ({2} sentences/sec).'.format(i + 1,
                                                                                                           sentences_count,
                                                                                                           sentences_per_sec))
    print('Finished process_wiki_to_text(). It took {0:.2f} s to execute.'.format(round(time.time() - start, 2)))
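A minimal invocation sketch for the function above; the dump and output paths are hypothetical placeholders, not part of the original project:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    process_wiki_to_text('enwiki-latest-pages-articles.xml.bz2',  # hypothetical dump path
                         'wiki_text.txt',
                         'wiki_sentences.txt')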
Code example #2
def process_wiki_to_text(input_filename, output_text_filename,
                         output_sentences_filename):

    if os.path.isfile(output_text_filename) and os.path.isfile(
            output_sentences_filename):
        logging.info(
            'Skipping process_wiki_to_text(). Files already exist: {} {}'.
            format(output_text_filename, output_sentences_filename))
        return

    start = time.time()
    intermediary_time = None
    sentences_count = 0

    with open(output_text_filename, 'w', encoding='utf-8') as out:
        with open(output_sentences_filename, 'w',
                  encoding='utf-8') as out_sentences:

            # Open the Wiki Dump with gensim
            wiki = WikiCorpus(input_filename,
                              lemmatize=False,
                              dictionary={},
                              processes=cpu_count(),
                              tokenizer_func=tokenizer_func)
            wiki.metadata = True
            texts = wiki.get_texts()

            for i, article in enumerate(texts):
                # article[1] refers to the name of the article.
                text_list = article[0]
                sentences = text_list
                sentences_count += len(sentences)

                # Write sentences per line
                for sentence in sentences:
                    out_sentences.write((sentence + '\n'))

                # Write each page in one line
                text = ' '.join(sentences) + '\n'
                out.write(text)

                # This is just for the logging
                if i % (100 - 1) == 0 and i != 0:
                    if intermediary_time is None:
                        intermediary_time = time.time()
                        elapsed = intermediary_time - start
                    else:
                        new_time = time.time()
                        elapsed = new_time - intermediary_time
                        intermediary_time = new_time
                    sentences_per_sec = int(len(sentences) / elapsed)
                    logging.info(
                        'Saved {0} articles containing {1} sentences ({2} sentences/sec).'
                        .format(i + 1, sentences_count, sentences_per_sec))
        logging.info(
            'Finished process_wiki_to_text(). It took {0:.2f} s to execute.'.
            format(round(time.time() - start, 2)))
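The call above passes tokenizer_func=tokenizer_func, but the callable itself is defined elsewhere in the project. A minimal sketch of what it could look like, assuming gensim's documented interface tokenizer_func(text, token_min_len, token_max_len, lower) -> list of str:

import re

def tokenizer_func(text, token_min_len, token_max_len, lower):
    # Plain regex word tokenizer; the original project may use a different one.
    if lower:
        text = text.lower()
    return [token for token in re.findall(r'\w+', text)
            if token_min_len <= len(token) <= token_max_len]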
Code example #3
def make_corpus(input_file):
    """Convert Wikipedia xml dump file to text corpus"""

    wiki = WikiCorpus(input_file)
    wiki.metadata = True
    output_folder = '../corpus'
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    for article in wiki.get_texts():
        text = article[0]
        page_id, title = article[1]
        filename = f'{output_folder}/{page_id}-{slugify(title)}.txt'
        with open(filename, 'a') as file:
            file.write(' '.join(text) + '\n')
            print(f'{page_id} {title}')
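make_corpus() relies on Path, WikiCorpus and slugify being imported at module level. A hedged usage sketch, assuming the python-slugify package and a locally downloaded dump:

from pathlib import Path

from gensim.corpora.wikicorpus import WikiCorpus
from slugify import slugify  # assumption: python-slugify

make_corpus('jawiki-latest-pages-articles.xml.bz2')  # hypothetical dump file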
Code example #4
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    if not doc_set: # is empty
        return
    # A dummy dictionary value is passed so WikiCorpus skips building a vocabulary here.
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request to extract page_id and title
    
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]

        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title

            # get tokens tf in the text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1

            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
       
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)

                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0

    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
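A hedged usage sketch for save_to_batches(); it assumes the legacy protobuf-based BigARTM bindings (artm.messages_pb2, artm.library) imported elsewhere in the project, and the page ids below are placeholders:

doc_ids = set(['12', '25', '39'])  # placeholder page ids (strings, as yielded by gensim metadata)
save_to_batches('enwiki-latest-pages-articles.xml.bz2',
                doc_set=doc_ids,
                batch_path='./batches',
                batch_size=1000)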
Code example #5
File: demo_text2vec.py  Project: archfool/nlp
def load_wiki_corpus(path_data_in=None, path_data_out=None, word2vec=True):
    if path_data_in is None:
        corpus_path = path_nlp + r'zhwiki-latest-pages-articles.xml.bz2'
    else:
        corpus_path = path_data_in
    if path_data_out is None:
        if word2vec:
            corpus_processed_path = path_nlp + 'corpus_word2vec.txt'
        else:
            corpus_processed_path = path_nlp + 'corpus_doc2vec.txt'
    else:
        corpus_processed_path = path_data_out
    cc = OpenCC('t2s')
    count = 0
    with open(corpus_processed_path, 'w',
              encoding='utf-8') as corpus_processed:
        corpus = WikiCorpus(corpus_path, lemmatize=False, dictionary={})
        if word2vec:
            for doc in corpus.get_texts():
                doc_new = series(doc).apply(lambda x: ' '.join(
                    jieba.cut(cc.convert(x), cut_all=False)))
                corpus_processed.write(' '.join(doc_new) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
        else:
            corpus.metadata = True
            for doc, (page_id, title) in corpus.get_texts():
                doc_new = TaggedDocument(words=[
                    word for sentence in doc
                    for word in jieba.cut(cc.convert(sentence))
                ],
                                         tags=[cc.convert(title)])
                corpus_processed.write(' '.join(doc_new[0]) + '\t' +
                                       '\t'.join(doc_new[1]) + "\n")
                count += 1
                if count % 100 == 0:
                    logging.warning('Saved ' + str(count) + ' articles')
                if flag_test and count == 1000:
                    return
    return
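load_wiki_corpus() depends on several module-level names (path_nlp, flag_test, series, jieba, OpenCC, TaggedDocument). A sketch of the setup it appears to expect; the pandas alias for series is an assumption based on the lowercase name:

import logging

import jieba
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import TaggedDocument
from opencc import OpenCC
from pandas import Series as series  # assumption

path_nlp = './data/'   # hypothetical data directory
flag_test = False      # set True to stop after 1000 articles

load_wiki_corpus(word2vec=True)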
Code example #6
    # TODO -- This text format lets you peruse it, but you can
    # compress it better as binary...
    wiki.dictionary.save_as_text('./data/dictionary.txt.bz2')

# ======== STEP 2: Convert Articles To Bag-of-words ========
# Now that we have our finalized dictionary, we can create bag-of-words
# representations for the Wikipedia articles. This means taking another
# pass over the Wikipedia dump!
if True:

    # Load the dictionary if you're just running this section.
    dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
    wiki = WikiCorpus(dump_file, dictionary=dictionary)

    # Turn on metadata so that wiki.get_texts() returns the article titles.
    wiki.metadata = True

    print '\nConverting to bag of words...'
    sys.stdout.flush()

    t0 = time.time()

    # Generate bag-of-words vectors (term-document frequency matrix) and
    # write these directly to disk.
    # On my machine, this took 3.53 hrs.
    # By setting metadata = True, this will also record all of the article
    # titles into a separate pickle file, 'bow.mm.metadata.cpickle'
    MmCorpus.serialize('./data/bow.mm', wiki, metadata=True, progress_cnt=10000)

    print '    Conversion to bag-of-words took %s' % formatTime(time.time() - t0)
    sys.stdout.flush()
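To reuse the output of this step, the serialized matrix and the metadata pickle mentioned in the comments can be loaded back. A small sketch, assuming the same './data/' paths:

import pickle

from gensim.corpora import Dictionary, MmCorpus

dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
bow_corpus = MmCorpus('./data/bow.mm')

with open('./data/bow.mm.metadata.cpickle', 'rb') as f:
    docno_to_metadata = pickle.load(f)  # docno -> (page_id, title), written because metadata=True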
Code example #7
File: make_data.py  Project: carl-ellis/topic_service
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    if not os.path.isdir(os.path.dirname(outp)):
        raise SystemExit(
            "Error: The output directory does not exist. Create the directory and try again."
        )

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    wiki = WikiCorpus(inp, lemmatize=True)
    wiki.metadata = True  # Ensure doc id is captured

    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)

    # Save the document ids to titles as a dictionary -- this will take a long time.
    # This may also be unnecessary if metadata works correctly.
    docmap = {}
    for index, doc in enumerate(wiki.get_texts()):
        docmap[index] = doc[1][1]
    with bz2.BZ2File('doc_index.pickle.bz2', 'w') as f:
        pickle.dump(docmap, f)

    # save dictionary and bag-of-words (term-document frequency matrix)
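    # The snippet ends just before the actual save calls. A hypothetical completion
    # (not the project's own code), modeled on the pattern in code example #6 and
    # keyed off the outp prefix:
    from gensim.corpora import MmCorpus

    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    MmCorpus.serialize(outp + '_bow.mm', wiki, metadata=True, progress_cnt=10000)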