Example 1
def train_w2v(data_directory, corpus_path, wiki_text_output_path, word2vec_output_path, w2v_dim, multiwords=True, druid_cutoff_score=0.4):
    start_time = time.time()
    
    # Convert Wikipedia XML dump into .txt format
    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path) 
        wikidump2text.convert(corpus_path, wiki_text_output_path)

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')

    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff ' + str(druid_cutoff_score))
        # Train the word2vec model, also use DRUID multiwords
        sentences = MySentences(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        sentences = MySentences(wiki_text_output_path, None, multiwords=False)

    # bigram_transformer = Phrases(sentences)
    # logger.info("Finished transforming bigrams. Time needed: " + str(time.time() - start_time))
    
    logger.info("Starting model training, will save to: " + word2vec_output_path)
    model = Word2Vec(sentences, size=w2v_dim, window=5, min_count=5, workers=multiprocessing.cpu_count())

    # trim unneeded model memory = use(much) less RAM
    model.init_sims(replace=True)
    
    logger.info("Saving to the following path: " + word2vec_output_path)
    model.save(word2vec_output_path, ignore=[])

    logger.info("Finished building Word2Vec model. Time needed: " + str(time.time() - start_time))
Example 2
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path, multiwords=True, druid_cutoff_score=0.3):

    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        # Convert Wikipedia XML dump into .txt format
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found ' + wiki_text_output_path + ', not converting from the raw bz2 file.')

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    
    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff ' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)
    
    tokenid_dictionary.add_documents(articles)


    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords), id2word=tokenid_dictionary)
    model.save(model_output_path)

    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
program = basename(sys.argv[0])
logger = logging.getLogger(program)


def data_directory():
    return join(dirname(dirname(abspath(__file__))), 'data')

corpus_path = join(data_directory(), 'enwiki-latest-pages-articles12.xml-p001825001p002425000.bz2')
wiki_text_output_path = join(data_directory(), 'enwiki-latest-pages-articles12.txt')
model_output_path = join(data_directory(), 'wiki.tfidf')

stemmer = nltk.stem.PorterStemmer()
dictionary = corpora.Dictionary()

# Convert Wikipedia XML dump into .txt format
wikidump2text.convert(corpus_path, wiki_text_output_path)

# Load Multiword Expressions as Dictionary
stopwords_path = join(data_directory(), 'stopwords_en.txt')
druid_path = join(data_directory(), 'druid_en.bz2')
druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=0.0)


logger.info("Building tfidf model...")
start_time = time.time()


class TextCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')