Example No. 1
print("Création d'index ...")
#sauvegarder dans un dossier
os.mkdir(documentsDict)
for f in documents:
    out_file=open(join(documentsDict,f), 'w')
    out_file.write(str(documents[f]))
    out_file.close()

out_file=open(doc_freqTerm, 'w')
out_file.write(str(doc_freq))
out_file.close()
print("corpus traité avec succès \n")

#2 apprentissage
print("Apprentissage ...")
sentences = LineSentence(corpusAsSentences)
model = Word2Vec(sentences, size=dimConcept, window=win, min_count=minc, workers=4)  # build the vocabulary and train the model
model.save_word2vec_format(index+'/word2vec'+str(dimConcept)+'_win'+str(win)+'_min'+str(minc)+'.txt', fvocab=None, binary=False)
print("vocabulaire ok \n")

#3 representation des documents en vecteurs  
print("Documents to vectors ...")
os.mkdir(matDoc)
for f in listdir(collection):  # read the whole collection here
    doc=documents[f]
    tdoc = sum(doc.values())  # document length (total token count)
    vec_doc=numpy.zeros(dimConcept)
    mat_doc={}
    for word in doc:
        if(word in model.vocab):
Example No. 2
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import codecs

filename = 'html.txt'
sentences = LineSentence(filename)
model = Word2Vec(sentences, size=128, window=5, min_count=5, workers=4)
model.save('word_embedding_128')

items = model.most_similar('中国')
for item in items:
    print(item[0], item[1])

print(model.similarity('男人', '女人'))

filename = 'wikizhword.text'
f = codecs.open(filename, 'r', encoding='utf-8')
line = 20
for _ in range(line):
    print(f.readline())
# sentences = LineSentence(f)
# model = Word2Vec(sentences,size=128,window=5,min_count=5,workers=4)
# model.save('word_embedding_128')
#
# #model=Word2Vec.load('word_embedding_128')
# items = model.most_similar('中国')
# for item in items:
# 	print(item[0],item[1])
#
#
Example No. 3
import os
import sys
root_path = "/home/ubuntu/answerbot-tool/src"
sys.path.append(root_path)
from gensim.models.word2vec import Word2Vec, LineSentence
from utils.time_utils import get_current_time

corpus_fpath = '../_1_preprocessing/corpus.txt'

print('start time : ', get_current_time())
sentences = LineSentence(corpus_fpath)
print "begin training..."

# size is the dimensionality of the feature vectors.
# window is the maximum distance between the current and predicted word within a sentence.
# min_count = ignore all words with total frequency lower than this.
# workers = use this many worker threads to train the model (=faster training with multicore machines).

model = Word2Vec(sentences,
                 size=200,
                 window=5,
                 min_count=0,
                 workers=4,
                 iter=100)

model.save('model')
print('end time : ', get_current_time())
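A short hedged follow-up, not part of the original script: the saved model can be reloaded and queried with the same old-style gensim API.

loaded = Word2Vec.load('model')
# 'exception' is an arbitrary query token for illustration, not taken from the original corpus
print(loaded.most_similar('exception', topn=5))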
Example No. 4
import logging
import multiprocessing
import os
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                        level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # load word2vec model
    inp = r'..\dataset\ChnSentiCorp_htl_ba_6000\6000_all_cut.txt'
    output1 = 'word2vec.model'
    output2 = 'word2vec.vector'

    # size: dimensionality of the generated word vectors.
    # min_count: prunes the internal dictionary; words with total frequency below min_count are discarded (default 5).
    # window: maximum distance between the current and predicted word; skip-gram and CBOW predict within this sliding window (default 5; [5, 10] is recommended for typical corpora).
    sentences = LineSentence(inp)
    model = Word2Vec(size=300,
                     min_count=5,
                     window=5,
                     workers=multiprocessing.cpu_count())
    # build the word2vec vocabulary
    model.build_vocab(sentences)
    # train the word2vec model
    model.train(sentences, total_examples=model.corpus_count, epochs=50)
    model.save('Word2vec_model.pkl')
    model.wv.save_word2vec_format('Word2vec_model.vector', binary=False)
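    # Hedged follow-up, not part of the original snippet: the plain-text vectors saved above can be
    # reloaded without any training state via KeyedVectors.
    from gensim.models import KeyedVectors

    wv = KeyedVectors.load_word2vec_format('Word2vec_model.vector', binary=False)
    print(wv.most_similar(u'酒店', topn=5))  # hypothetical query word; use any token that is in the vocabulary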
Example No. 5
config_pattern = "size{}window{}sg{}min_count{}negative{}iter{}"
config_str = config_pattern.format(args.size, args.window, args.sg,
                                   args.min_count, args.negative, args.iter)
outputfile1 = outputpath + config_str + ".model"
outputfile2 = outputpath + config_str + ".vector"
############### end of config #################

logging.basicConfig(filename=config_str + '.log',
                    filemode='w',
                    format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)
logger = logging.getLogger()
logger.info("running train process in custom: %s" % args.train)

model = Word2Vec(
    LineSentence(inpputfile),
    size=args.size,
    window=args.window,
    min_count=args.min_count,  # with 0.35 billion corpus, #3000 can retain 9228 unique words
    workers=args.workers,  # multiprocessing.cpu_count()
    #sample=args.sample,
    sg=args.sg,
    #hs=args.hs,
    negative=args.negative,  # follow tensorflow's word2vec_optimized.py num_neg_samples 25
    iter=args.iter)

# trim unneeded model memory = use(much) less RAM
# model.init_sims(replace=True)
model.save(outputfile1)
Example No. 6
def word2vec_training(text_file):
    sentences = LineSentence(text_file)
    model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=16)
    model.wv.save("merge_with_unk.kv")
    # model.wv.save_word2vec_format("merge_with_unk_vector.txt", binary=False)
    return model
def stool_simulator(total_epoch, special_epoch_count, restricted_vocab_name):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-stool-iter' + str(total_epoch) + '-first' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0
    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)

    # special starting epochs (final notIn)
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/' +
                                         str(restricted_vocab_name) + '.txt')
    restricted_type = 1
    params = {
        'alpha': lr,
        'min_alpha': alphas[special_epoch_count],
        'size': 200,
        'window': 5,
        'iter': special_epoch_count,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('special epochs half', special_epoch_count)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-half')

    # special starting epochs (final in)
    print('special epochs entire', special_epoch_count)
    gs_model.restricted_type = 2
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                   start_alpha=lr, end_alpha=alphas[special_epoch_count])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch_count) + '-entire')

    # original ending epochs
    print('roof epochs')
    gs_model.restricted_type = 0
    gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=total_epoch-special_epoch_count,
                   start_alpha=alphas[special_epoch_count], end_alpha=alphas[-1])
    line_number_in_xlsx += 1
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(total_epoch))

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
Example No. 8
def train_d2v_model(infile, embedding_file):
    model = gensim.models.Word2Vec(LineSentence(infile),
                                   size=200,
                                   window=5,
                                   min_count=5)
    model.save(embedding_file)
    parser.add_argument(
        "-accuracy",
        help="Use questions from file ACCURACY to evaluate the model")

    args = parser.parse_args()

    if args.cbow == 0:
        skipgram = 1
        if not args.alpha:
            args.alpha = 0.025
    else:
        skipgram = 0
        if not args.alpha:
            args.alpha = 0.05

    corpus = LineSentence(args.train)

    model = Word2Vec(corpus,
                     size=args.size,
                     min_count=args.min_count,
                     workers=args.threads,
                     window=args.window,
                     sample=args.sample,
                     alpha=args.alpha,
                     sg=skipgram,
                     hs=args.hs,
                     negative=args.negative,
                     cbow_mean=1,
                     iter=args.iter)

    if args.output:
Example No. 10
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == "__main__":
    print("Loading data...")
    data = pd.concat([
        pd.read_csv(
            "/data/SO_data/downvoter/wv_train_processed_data.csv").body,
        pd.read_csv("/data/SO_data/downvoter/wv_val_processed_data.csv").body
    ])
    print(data.shape)

    # save data to one line per doc file
    np.savetxt("data/wdocfile.txt", data.values, fmt="%s")
    tagged_data = LineSentence("data/wdocfile.txt")

    max_epochs = 50
    alpha = 0.025

    model_file = "./final/word_model.w2v"

    model = Word2Vec(size=50,
                     alpha=alpha,
                     min_alpha=0.01,
                     min_count=25,
                     window=30,
                     workers=16)

    print("Building the vocabulary...")
    model.build_vocab(tagged_data)
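    # The original snippet stops after build_vocab; the lines below are a hedged sketch
    # (not from the original) of the usual next steps with the standard gensim API.
    print("Training the model...")
    model.train(tagged_data, total_examples=model.corpus_count, epochs=max_epochs)
    model.save(model_file)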
def bow(filepath):
    for rev in LineSentence(filepath):
        yield tri_dictionary.doc2bow(rev)
    def transform(self, sentencesPath, savePath):
        """
        use trained phrases to transform sentences
        :param sentencesPath: the path of text file, the text file should be the format: one line one sentence
        :param savePath: the path of transformed text file, the text file are the format: one line one sentence
        """
        with codecs.open(savePath, mode="w", encoding="utf-8") as fr:
            sentences = TxtIter(sentences=codecs.open(sentencesPath,
                                                      mode="r",
                                                      encoding="utf-8"),
                                ngrams=self.phrasers)
            lines = []
            for line in sentences:
                lines.append(" ".join(line) + "\n")
                if len(lines) > 500000:
                    fr.writelines(lines)
                    lines = []
            fr.writelines(lines)
        logger.info("delete all phraser to save memory")
        for i in self.phrasers:
            del i
        del self.phrasers
        self.phrasers = None
        gc.collect()


if __name__ == "__main__":
    ls = LineSentence("E:/a.txt")
    for i in ls:
        print(i)
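For reference, a small self-contained sketch, not from the original snippet, of what LineSentence yields: each line of the file, split on whitespace, comes back as a list of tokens.

import os
import tempfile
from gensim.models.word2vec import LineSentence

# write a tiny whitespace-tokenized corpus, one sentence per line (illustrative data only)
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as tmp:
    tmp.write('the quick brown fox\njumps over the lazy dog\n')

for tokens in LineSentence(tmp.name):
    print(tokens)  # ['the', 'quick', 'brown', 'fox'], then ['jumps', 'over', 'the', 'lazy', 'dog']

os.remove(tmp.name)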
Example No. 13
    #     print("Please use python wiki_preprocess.py output_path")
    #     exit()
    #    output_path = sys.argv[1]
    logging.info("start")
    begin = time()

    dir_path = sys.argv[1]
    output_path = sys.argv[2]

    for root, dirs, files in os.walk(dir_path):
        for filename in files:
            file_path = root + '/' + filename
            logging.info(filename)
            ls_pageid = find_category_page(file_path)
            if len(ls_pageid) == 0:
                continue
            # ls_pg_text_clean = extract_pages(ls_pageid)
            extract_pages(ls_pageid)
            model = gensim.models.Word2Vec(LineSentence('/tmp/test.txt'),
                                           size=200,
                                           window=5,
                                           min_count=2,
                                           workers=multiprocessing.cpu_count())
            model.wv.save_word2vec_format(
                complete_dir_path(output_path) + filename[:-4] + ".w2v_org",
                complete_dir_path(output_path) + filename[:-4] + ".vocab",
                binary=False)
    end = time()
    load_duration = end - begin
    logging.info("Total procesing time: %d seconds" % (end - begin))
Example No. 14
    "negative": [5],
    "sample": [0],
    "sg": [1],
    "size": [25],
    "window": [5],
    "workers": [cpu_count() - 1]
}

SENTENCES = "/home/ikram/workplace/projects/Islam-360/embedding/w2v/translation_sentences.txt"

for index, param in enumerate(ParameterGrid(PARAMS)):
    file_name = ""
    for key, value in param.items():
        file_name += f"{key}={value}|"
    print(f"Training: {file_name}")
    file = LineSentence(SENTENCES)
    model = gensim.models.Word2Vec(file, **param)

    predication = []
    with open('../urdu_similar_words.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            try:
                if model.wv.similarity(row[0], row[1]) > 0.7:
                    predication.append(1)
                else:
                    predication.append(0)
            except KeyError:
                continue

    data = [1] * len(predication)
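    # Hedged sketch, not in the original fragment: score this parameter combination against the
    # all-ones reference labels built above, reusing scikit-learn, which already supplies ParameterGrid.
    from sklearn.metrics import accuracy_score

    print(f"{file_name} accuracy: {accuracy_score(data, predication)}")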
Example No. 15
    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
          corpus (str): path to the corpus.
          corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json
          formats are supported. For json format, the file should contain an iterable of
          sentences. Each sentence is a list of terms (unicode strings) that will be used for
          training.
          mark_char (char): special character that marks NP's suffix.
          word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and
          fasttext are supported.
          np2vec_model_file (str): path to the file where the trained np2vec model has to be
          stored.
          binary (bool): boolean indicating whether the model is stored in binary format; if
          word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
          sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
          algorithm. If 1, skip-gram is used; otherwise, CBOW is employed.
          size (int): model training hyperparameter, size of the feature vectors.
          window (int): model training hyperparameter, maximum distance between the current and
          predicted word within a sentence.
          alpha (float): model training hyperparameter. The initial learning rate.
          min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
          `min_alpha` as training progresses.
          min_count (int): model training hyperparameter, ignore all words with total frequency
          lower than this.
          sample (float): model training hyperparameter, threshold for configuring which
          higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
          workers (int): model training hyperparameter, number of worker threads.
          hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
          hierarchical softmax will be used for model training. If set to 0, and `negative` is non-
                        zero, negative sampling will be used.
          negative (int): model training hyperparameter, negative sampling. If > 0, negative
          sampling will be used, the int for negative specifies how many "noise words" should be
          drawn (usually between 5-20). If set to 0, no negative sampling is used.
          cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
          word vectors. If 1, use the mean, only applies when cbow is used.
          iter (int): model training hyperparameter, number of iterations.
          min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
          for training word representations.
          max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for
          training word representations. Set `max_n` to be lesser than `min_n` to avoid char
          ngrams being used.
          word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, enriches word
          vectors with subword (ngram) information. If 0, this is equivalent to word2vec training.

        """

        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            s = ''
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                    self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            for i, sentence in enumerate(self._sentences):
                self._sentences[i] = [
                    w[:-1] if self.is_marked(w) else w for w in sentence]

        logger.info('training np2vec model')
        self._train()
Example No. 16
from gensim.models.word2vec import LineSentence
from ms2vec.ms2vec import MultiSense2Vec

corpus = LineSentence("../enwiki_cleaning.txt")
model = MultiSense2Vec(corpus,
                       sg=1,
                       negative=5,
                       workers=8,
                       iter=5,
                       window=5,
                       min_count=10,
                       min_sense_count=1000,
                       max_sense_num=3,
                       size=300,
                       np_value=-0.5,
                       cv2zero=True,
                       use_all_window=True,
                       seed=0)

#print(model.wv.index2word)
print(model.most_similar("mouse"))
model_name = "npmssg_m0.5_enwiki_sense_10_neg_5_min_1000"
model.save(model_name)
model.wv.save_word2vec_format(model_name + ".bin", binary=True)
Example No. 17
    def _train_phrase_detection_model(self, input_filepath, output_filepath):
        sentences = LineSentence(input_filepath)
        model = Phraser(Phrases(sentences))
        self._save_sentences(sentences, model, output_filepath)
        return model
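For context, a small hedged sketch, independent of the class above, of what a trained Phrases/Phraser pair does: frequent collocations in the training sentences are merged into single underscore-joined tokens when the model is applied to a token list.

from gensim.models.phrases import Phrases, Phraser

# toy corpus, already whitespace-tokenized (illustrative data only)
sentences = [['new', 'york', 'is', 'big'],
             ['new', 'york', 'never', 'sleeps'],
             ['i', 'love', 'new', 'york']]

phrases = Phrases(sentences, min_count=1, threshold=1)
phraser = Phraser(phrases)  # lighter, frozen version of the Phrases model
print(phraser[['i', 'love', 'new', 'york']])  # e.g. ['i', 'love', 'new_york']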
Example No. 18
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2018/4/17 14:25
@Author  : Junya Lu
@Site    : 
"""
import warnings
from gensim.models.word2vec import LineSentence
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models import word2vec
from string import punctuation

# LineSentence('G:\project3\\Data\\train\\genes\\genes_one_line_space.txt')
# model = word2vec.Word2Vec(LineSentence('G:\project3\\Data\\train\\genes\\genes_one_line_space.txt'))
# model = word2vec.Word2Vec(LineSentence('G:\project3\\Data\\train\\terms\\terms.txt'))
model = word2vec.Word2Vec(LineSentence('genes0504.txt'), min_count=0)
print(model['C1Q'])
print('the number of vocabulary', len(model.wv.vocab))
vocab = list(model.wv.vocab.keys())
print(vocab[:10])
# model.save('G:\project3\\Data\\train\\w2v.model')
# model.wv.save_word2vec_format('G:\project3\\Data\\train\\vector_genes0504.txt')

# print (model.similarity('dogs', 'you'))
# print (model.similar_by_vector('dogs'))
# print (model['you'])
    # MTURK-771
    'MTURK771_Pearson correlation', 'Pearson pvalue',
    'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
    # questions-words
    'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
])

for i in range(5):
    params = {
        'alpha': 0.025,
        'min_alpha': 0.0001,
        'size': 200,
        'window': 5,
        'iter': 5,
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': None,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': 0  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[i] = evaluate(gs_model.wv, str(i))

writer = pd.ExcelWriter(xlsx_path)
df.to_excel(writer, 'Sheet1')
writer.save()
Example No. 20
    with open("res.txt", "w", encoding="utf8") as resultFile:
        lines = file.readlines()
        for line in lines:

            intermediate = word_tokenize(line)
            words_literals = [word for word in intermediate if word.isalpha()]

            stop_words = set(stopwords.words('english'))
            clear_tokens = [w for w in words_literals if w not in stop_words]

            if not clear_tokens:
                continue
            resultFile.write(' '.join(clear_tokens))
            resultFile.write('\n')

sentences = LineSentence("res.txt")
model = gensim.models.Word2Vec(sentences,
                               min_count=5,
                               size=300,
                               workers=4,
                               window=10,
                               sg=1,
                               negative=5)
print('Similar for "Marfa"')
print(model.wv.most_similar(positive=['Marfa']))

print('\nSimilar for "Petersburg"')
print(model.wv.most_similar(positive=['Petersburg']))

model.wv.save_word2vec_format("IDIOT_preproc.model")
def iteration_simulator(total_epoch, special_epoch_count, restricted_vocab_name, jumps):
    # corpus_file = '/Users/zzcoolj/Code/GoW/data/training data/Wikipedia-Dumps_en_20170420_prep/AA/wiki_01.txt'
    corpus_file = 'input/enwiki-1G.txt'
    xlsx_path = 'output/test1G-vocab50000-original-iter' + str(total_epoch) + '-last' + str(special_epoch_count) \
                + 'EpochInitial-' + str(restricted_vocab_name) + '-jump'+''.join(str(x) for x in jumps)+'.xlsx'
    df = pd.DataFrame(columns=[
        # word embeddings file name
        'file name',
        # wordsim353
        'wordsim353_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # simlex999
        'simlex999_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # MTURK-771
        'MTURK771_Pearson correlation', 'Pearson pvalue',
        'Spearman correlation', 'Spearman pvalue', 'Ration of pairs with OOV',
        # questions-words
        'sem_acc', '#sem', 'syn_acc', '#syn', 'total_acc', '#total'
    ])
    line_number_in_xlsx = 0

    # epoch 0
    lr = 0.025
    alphas = alpha_splitter(start=lr, epochs=total_epoch)
    print('alphas', alphas)
    min_alpha = alphas[1]
    restricted_vocab = read_file_to_dict('../word_embeddings_evaluator/data/distinct-tokens/' +
                                         str(restricted_vocab_name) + '.txt')
    restricted_type = 0
    params = {
        'alpha': lr,
        'min_alpha': min_alpha,
        'size': 200,
        'window': 5,
        'iter': 0,  # TODO NOW
        'max_vocab_size': 50000,
        'sample': 1e-4,
        'sg': 1,  # 1 for skip-gram
        'hs': 0,  # If 0, and negative is non-zero, negative sampling will be used.
        'negative': 5,
        'workers': 3,

        'restricted_vocab': restricted_vocab,  # [modified] ATTENTION: It must be a dictionary not a list!
        'restricted_type': restricted_type  # [modified] 0: train_batch_sg_original; 1: train_batch_sg_in; 2: train_batch_sg_notIn
    }
    print('cur_epoch', 0)
    gs_model = Word2Vec(LineSentence(corpus_file), **params)
    df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch0')
    gs_model.epochs = 1  # TODO NOW

    # # epoch 0.5
    # gs_model.restricted_type = 2
    # gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
    #                start_alpha=lr, end_alpha=min_alpha)
    # df.loc[1] = evaluate(gs_model.wv, 'X-iter0.5')

    # epoch 1+
    # gs_model.restricted_type = 0
    for cur_epoch in range(1, total_epoch-special_epoch_count):
        print('cur_epoch', cur_epoch)
        start_alpha = alphas[cur_epoch]
        end_alpha = alphas[cur_epoch+1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch'+str(cur_epoch))

    # # save common base model
    # write_to_pickle(gs_model, xlsx_path.split('.xlsx')[0]+'-base')

    for special_epoch in range(total_epoch-special_epoch_count, total_epoch):
        print('special epoch', special_epoch)
        start_alpha = alphas[special_epoch]
        end_alpha = alphas[special_epoch+1]
        print('start_alpha', start_alpha)
        print('end_alpha', end_alpha)
        # final special epochs 0.5
        gs_model.restricted_type = 1
        gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                       start_alpha=start_alpha, end_alpha=end_alpha)
        line_number_in_xlsx += 1
        df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch'+str(special_epoch)+'-half')

        # final special epochs final
        if special_epoch not in jumps:
            gs_model.restricted_type = 2
            gs_model.train(LineSentence(corpus_file), total_examples=gs_model.corpus_count, epochs=gs_model.iter,
                           start_alpha=start_alpha, end_alpha=end_alpha)
            line_number_in_xlsx += 1
            df.loc[line_number_in_xlsx] = evaluate(gs_model.wv, 'epoch' + str(special_epoch)+'-entire')

    # # baseline (final original word2vec epochs)
    # gs_model_base = read_pickle(xlsx_path.split('.xlsx')[0] + '-base')
    # gs_model_base.restricted_type = 0
    # for baseline_epoch in range(total_epoch - special_epoch_count, total_epoch):
    #     print('baseline epoch', baseline_epoch)
    #     start_alpha = alphas[baseline_epoch]
    #     end_alpha = alphas[baseline_epoch + 1]
    #     print('start_alpha', start_alpha)
    #     print('end_alpha', end_alpha)
    #     gs_model_base.train(LineSentence(corpus_file), total_examples=gs_model_base.corpus_count, epochs=gs_model_base.iter,
    #                         start_alpha=start_alpha, end_alpha=end_alpha)
    #     line_number_in_xlsx += 1
    #     df.loc[line_number_in_xlsx] = evaluate(gs_model_base.wv, 'epoch' + str(baseline_epoch)+'-baseline')

    writer = pd.ExcelWriter(xlsx_path)
    df.to_excel(writer, 'Sheet1')
    writer.save()
Example No. 22
from gensim.test.utils import get_tmpfile
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('./word2vec_train.txt')
# LineSentence: treats each line of the file as one sentence

# model = Word2Vec(sentences, size = 100, window = 3, min_count = 1, iter = 1000)
#
# model.save('Basic_word2vec.model')


model = Word2Vec.load('Basic_word2vec.model')

print(model.wv.most_similar('Korea', topn = 10))

print(len(model.wv.vocab))

score, predictions = model.wv.evaluate_word_analogies('./')
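evaluate_word_analogies expects the path of a questions-words file; a hedged sketch, assuming a gensim version that ships the bundled analogy test set:

from gensim.test.utils import datapath

# returns an overall accuracy plus per-section results for the bundled analogy set
score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
print(score)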
Example No. 23
def train_w2v():
    with open('./data/reduced_zhwiki.txt', 'r', encoding='utf8') as f:
        # use gensim's Word2Vec class to generate the word vectors
        model = Word2Vec(LineSentence(f), sg=0, size=192, window=5,
                         min_count=5, workers=4)
        model.save('./data/zhwiki_news.word2vec')
Example No. 24
def train_w():
    sentences = LineSentence('../file/after_fenci.txt')
    model = Word2Vec(sentences, size=128)
    model.save('../file/tarining')
Example No. 25
    executor = Parallel(n_jobs=n_jobs,
                        backend="multiprocessing",
                        prefer="processes")
    do = delayed(partial(tokenize_sentence_corpus, corpus_out_path))
    tasks = (do(i, batch) for i, batch in enumerate(partitions))

    executor(tasks)


# process_texts(documents_path, year='2020', court='01', corpus_out_path=unigram_sentences_path, batch_size=8, n_jobs=2,
#               debug=True)

stop_words = get_custom_stop_words()

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

counters = sorted(counters.items(),
                  key=lambda key_value: key_value[1],
                  reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(LineSentence(unigram_sentences_path),
Example No. 26
# model generation and save

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing as mp

en_fname = './data/prepro_en_wiki.txt'
model_fname = './model/taekeun/en.bin'

model = Word2Vec(LineSentence(en_fname),
                 size=300,
                 workers=mp.cpu_count(),
                 sg=1)

model.save(model_fname)
Example No. 27
import os
import codecs
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pandas as pd

import settings

trigram_sentences = LineSentence(
    os.path.join(settings.DATA_PATH, 'trigram_sentences.txt'))
word2vec_filepath = os.path.join(settings.DATA_PATH, 'word2vec_model')

if 0 == 1:
    text2vec = Word2Vec(trigram_sentences,
                        size=100,
                        window=5,
                        min_count=20,
                        sg=1,
                        workers=4)
    text2vec.save(word2vec_filepath)
    for i in range(1, 12):
        text2vec.train(trigram_sentences)
        text2vec.save(word2vec_filepath)

text2vec = Word2Vec.load(word2vec_filepath)
text2vec.init_sims()

print('{} training epochs so far.'.format(text2vec.train_count))
print('{:,} terms in the text2vec vocabulary.'.format(len(text2vec.vocab)))
Example No. 28
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
model = Word2Vec(
    LineSentence('data/simple.reg.txt'),
    size=400,
    window=5,
    min_count=5,
    workers=multiprocessing.cpu_count() - 2,
)

outp1 = 'data/simple.zh.text.model'
outp2 = 'data/simple.zh.text.vector'
model.save(outp1)
model.wv.save_word2vec_format(outp2)
Example No. 29
#coding:utf-8
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
import logging

inFile = 'corpus.txt'
outFile = 'output_demoModel.out'
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

model = word2vec.Word2Vec(LineSentence(inFile),
                          size=100,
                          window=3,
                          min_count=1)
print(model.wv[u'理'])
Example No. 30
def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''
        generator function to read in docs from the file,
        and substitute and remove substrings
        '''
        for doc in corpus:
            yield au_tu.remove_substrings(au_tu.clean_tokens(
                doc,
                tokens=to_replace_dict,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case,
            ),
                                          to_remove_list=to_remove_list,
                                          whole_words_only=whole_words_only,
                                          ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(
                ' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''
        generator function to use spaCy to parse docs, clean docs,
        tokenize named entities, and yield documents
        '''
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''
        helper function to eliminate tokens that are
        pure punctuation or whitespace or digits or only 1 character
        '''
        return (
            token.is_punct or token.is_space or token.is_digit
            or token.text == "'s" or token.lemma_ == '-PRON-' or
            # token.lemma_ == 'say' or
            # token.lemma_ == 'tell' or
            # token.lemma_ == 'be' or
            len(token.text) <= 1)

    def line_doc(filename):
        '''
        generator function to read in docs from the file,
        un-escape the original line breaks in the text,
        and do additional cleaning
        '''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (w), spaces (s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace(
                    '\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''
        generator function to use spaCy to parse docs,
        lemmatize the text, and yield sentences
        '''
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            for sent in parsed_doc.sents:
                yield ' '.join([
                    token.lemma_ for token in sent
                    if not punct_space_more(token)
                ])

    if verbose:
        logger.info(f'Working on text from: {col}')

    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()

    df_phrased = df.loc[df[col].notnull(),
                        ['tfa_master_uid', 'app_year', col]].copy()

    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w',
                     encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x),
                                           axis=1).tolist():
            # write the doc as a line in the new file
            # escape newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(
            f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}"
        )

    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences

    if overwrite_interim:
        if verbose:
            logger.info(
                f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}"
            )
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(
                    filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
            if verbose:
                logger.info('Done.')
        unigram_sentences = LineSentence(
            filepath_dict['unigram_sentences_filepath'])

        if verbose:
            logger.info('Unigram examples:')
            for unigram_sentence in it.islice(unigram_sentences, 10, 20):
                logger.info(u' '.join(unigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding bigram phrases')
        # create the bigram model
        bigram = Phrases(unigram_sentences,
                         min_count=phrase_min_count,
                         threshold=phrase_threshold,
                         max_vocab_size=phrase_max_vocab_size,
                         progress_per=phrase_progress_per,
                         scoring=phrase_scoring,
                         common_terms=phrase_common_terms)
        bigram_model = Phraser(bigram)
        bigram_model.save(filepath_dict['bigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}"
            )
        # save bigram sentences
        with codecs.open(filepath_dict['bigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for unigram_sentence in unigram_sentences:
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')

        bigram_sentences = LineSentence(
            filepath_dict['bigram_sentences_filepath'])
        if verbose:
            logger.info('Bigram examples:')
            for bigram_sentence in it.islice(bigram_sentences, 10, 20):
                logger.info(u' '.join(bigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding trigram phrases')
        # create the trigram model
        trigram = Phrases(bigram_sentences,
                          min_count=phrase_min_count,
                          threshold=phrase_threshold,
                          max_vocab_size=phrase_max_vocab_size,
                          progress_per=phrase_progress_per,
                          scoring=phrase_scoring,
                          common_terms=phrase_common_terms)
        trigram_model = Phraser(trigram)
        trigram_model.save(filepath_dict['trigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}"
            )
        # create trigram sentences
        with codecs.open(filepath_dict['trigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')

        trigram_sentences = LineSentence(
            filepath_dict['trigram_sentences_filepath'])
        if verbose:
            logger.info('Trigram examples:')
            for trigram_sentence in it.islice(trigram_sentences, 10, 20):
                logger.info(u' '.join(trigram_sentence))
                logger.info('=' * 30)

    if verbose:
        logger.info(
            f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}"
        )
    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            # removing punctuation and whitespace
            unigram_doc = [
                token.lemma_ for token in parsed_doc
                if not punct_space_more(token)
            ]

            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]

            # remove any remaining stopwords
            trigram_doc = [
                term for term in trigram_doc
                if term not in nlp.Defaults.stop_words
            ]

            # extend the stop words
            stop_words_extended = [
                'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
                'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
                'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want',
                'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also',
                'may', 'take', 'come'
            ]
            trigram_doc = [
                term for term in trigram_doc if term not in stop_words_extended
            ]

            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])

    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError(
            'Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(
            filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
Example No. 31
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models.word2vec import LineSentence

# EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
csv_file = EXP_HOME + "/stackoverflow/eclipse/eclipse-qa.csv"

CUSTOM_FILTERS = [
    lambda x: x.lower(), strip_multiple_whitespaces, strip_punctuation,
    remove_stopwords, strip_non_alphanum
]
sentences = LineSentence(open(csv_file, 'r'),
                         max_sentence_length=100000,
                         limit=None)
pre_processed = list()
for sentence in sentences:
    # print(' '.join(sentence))
    temp = ' '.join(sentence)
    pp_sentence = preprocess_string(temp, CUSTOM_FILTERS)
    # print(pp_sentence)
    pre_processed.append(' '.join(pp_sentence))

# saving the pre-processed to the file
myFile = open(pp_raw_code, 'w')
for line in pre_processed:
    myFile.write("%s\n" % line)

print("Corpus preprocessed successfully!")
Example No. 32
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked.

        Args:
            s (str): string to check
        """
        return len(s) > 0 and s[-1] == self.mark_char

    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
          corpus (str): path to the corpus.
          corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json
          formats are supported. For json format, the file should contain an iterable of
          sentences. Each sentence is a list of terms (unicode strings) that will be used for
          training.
          mark_char (char): special character that marks NP's suffix.
          word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and
          fasttext are supported.
          np2vec_model_file (str): path to the file where the trained np2vec model has to be
          stored.
          binary (bool): boolean indicating whether the model is stored in binary format; if
          word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
          sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
          algorithm. If 1, skip-gram is used; otherwise, CBOW is employed.
          size (int): model training hyperparameter, size of the feature vectors.
          window (int): model training hyperparameter, maximum distance between the current and
          predicted word within a sentence.
          alpha (float): model training hyperparameter. The initial learning rate.
          min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
          `min_alpha` as training progresses.
          min_count (int): model training hyperparameter, ignore all words with total frequency
          lower than this.
          sample (float): model training hyperparameter, threshold for configuring which
          higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
          workers (int): model training hyperparameter, number of worker threads.
          hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
          hierarchical softmax will be used for model training. If set to 0, and `negative` is non-
                        zero, negative sampling will be used.
          negative (int): model training hyperparameter, negative sampling. If > 0, negative
          sampling will be used, the int for negative specifies how many "noise words" should be
          drawn (usually between 5-20). If set to 0, no negative sampling is used.
          cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
          word vectors. If 1, use the mean, only applies when cbow is used.
          iter (int): model training hyperparameter, number of iterations.
          min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
          for training word representations.
          max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for
          training word representations. Set `max_n` to be lesser than `min_n` to avoid char
          ngrams being used.
          word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, enriches word
          vectors with subword (ngram) information. If 0, this is equivalent to word2vec training.

        """

        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            s = ''
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                    self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            for i, sentence in enumerate(self._sentences):
                self._sentences[i] = [
                    w[:-1] if self.is_marked(w) else w for w in sentence]

        logger.info('training np2vec model')
        self._train()

    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter)

        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams)
        else:
            logger.error(
                'invalid word embedding type: ' +
                self.word_embedding_type)
            sys.exit(0)

    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(0)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non NP terms
            logger.info('pruning np2vec model')
            total_vec = 0
            vector_size = self.model.vector_size
            for word in self.model.wv.vocab.keys():
                if self.is_marked(word):
                    total_vec += 1
            logger.info(
                "storing %sx%s projection weights for NP's into %s" %
                (total_vec, vector_size, np2vec_model_file))
            with utils.smart_open(np2vec_model_file, 'wb') as fout:
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                # store NP vectors in sorted order: most frequent NP's at the top
                for word, vocab in sorted(
                        iteritems(
                            self.model.wv.vocab), key=lambda item: -item[1].count):
                    if self.is_marked(word):
                        embedding_vec = self.model.wv.syn0[vocab.index]
                        if binary:
                            fout.write(
                                utils.to_utf8(word) + b" " + embedding_vec.tostring())
                        else:
                            fout.write(
                                utils.to_utf8(
                                    "%s %s\n" %
                                    (word, ' '.join(
                                        "%f" %
                                        val for val in embedding_vec))))

    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
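A brief hedged usage sketch for the class above; the corpus path, file names and query token are made up for illustration.

# train word2vec-based NP embeddings on a whitespace-tokenized corpus in which
# noun phrases are suffixed with the mark character '_'
np2vec = NP2vec(corpus='marked_corpus.txt', corpus_format='txt',
                word_embedding_type='word2vec', size=100, iter=5)
np2vec.save('np2vec.model', binary=False)

# reload only the pruned NP vectors as gensim KeyedVectors
np_vectors = NP2vec.load('np2vec.model', binary=False, word_ngrams=0)
print(np_vectors.most_similar('machine_learning_', topn=5))  # hypothetical marked NP token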