Example No. 1
 def loadDevblogModel(self,
                      embedding_dim,
                      epochs,
                      window,
                      min_count):
     """
     Devblog 데이터를 기반으로 FastText 단어 임베딩 모델 학습
     
     - input
     : embedding_dim / int / 단어 벡터화시 차원 수
     : epochs / int / 학습 횟수
     : window / int / 학습에 사용될 n-gram
     : min_count / int / 학습에 사용될 단어의 최소 등장횟수
     
     - return
     : we_model
     """
     model = None
     if not os.path.isfile(CONST.devblog_model_path):
         print('🐈  No trained word-embedding model was found.')
         dc = Document()
         docs = dc.getDocs(labeled_only=False) # fetch the entire dataset
         print('🐈  Starting word-embedding model training.')
         sentences = docs.text.apply(lambda x: [han2Jamo(s) for s in x.split(' ')])
         model = FastText(size=embedding_dim, window=window, min_count=min_count)
         model.build_vocab(sentences=sentences)
         model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
         
         print('🐈  Saving the word-embedding model.')
         model.save(CONST.devblog_model_path)
     else:
         model = FastText.load(CONST.devblog_model_path)
     return model
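For reference, a minimal usage sketch of loadDevblogModel follows; the enclosing class name (Embedder), the hyperparameter values, and the query word are assumptions, not part of the original snippet. Note that lookups must apply the same han2Jamo preprocessing used on the training corpus.

# Hypothetical usage sketch (assumes the enclosing class is called Embedder).
embedder = Embedder()
we_model = embedder.loadDevblogModel(embedding_dim=100,
                                     epochs=10,
                                     window=5,
                                     min_count=3)
# Query with the same jamo decomposition that was applied during training.
vector = we_model.wv[han2Jamo('검색어')]
print(vector.shape)  # (100,)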
Example No. 2
def FastText_Save(files_in, models_out, min_count, size, iters):
    '''
    FastText model: each line of the input file is one sentence whose words are separated by ' '.
    The sentences are collected into a list of token lists [list1, list2, list3, ...] and the trained model is saved.
    Args:
     files_in: path to the tokenized txt data
     models_out: path where the model is saved
     min_count: frequency threshold; words occurring fewer times than this are dropped
     size: dimensionality of the output word vectors
     iters: number of training iterations
    // vocabulary: vocab = (model.wv.vocab).keys()
    '''
    sentence = []
    with open(files_in, 'r') as txt_file:
        for line in txt_file:
            line = line.strip()
            line = line.split(' ')
            sentence.append(line)
    # Train the networks
    model = FastText(sentence,
                     min_count=min_count,
                     size=size,
                     iter=iters,
                     window=5)
    model.save(models_out)
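A hedged usage sketch for FastText_Save: the file names are placeholders, and the vocabulary lookup mirrors the model.wv.vocab hint from the docstring (gensim 3.x API).

# Hypothetical call: 'tokens.txt' holds one space-separated sentence per line.
FastText_Save('tokens.txt', 'ft.model', min_count=5, size=100, iters=10)

# Reload the saved model and inspect the vocabulary, as noted in the docstring.
from gensim.models import FastText
model = FastText.load('ft.model')
vocab = list(model.wv.vocab.keys())  # gensim 3.x vocabulary access
print(len(vocab), vocab[:10])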
Example No. 3
    def classType_fasttext_train(self, classType):

        train_sentences = []

        for word in self.train:
            sentence = []
            mappings = self.word_mapping[word]
            for mapping in mappings:
                if mapping == classType:
                    sentence.append(word)
            if len(sentence) > 0:
                train_sentences.append(sentence)

        feature_encoder = FastText(size=50,
                                   window=2,
                                   min_count=1,
                                   min_n=2,
                                   max_n=6)
        feature_encoder.build_vocab(sentences=train_sentences)
        feature_encoder.train(sentences=train_sentences,
                              total_examples=feature_encoder.corpus_count,
                              epochs=1000)
        feature_encoder.save('./models/' + classType + '_fasttext.model')
        if classType == 'company':
            self.company_feature_encoder = feature_encoder
        elif classType == 'location':
            self.location_feature_encoder = feature_encoder
        elif classType == 'goods':
            self.goods_feature_encoder = feature_encoder
        else:
            raise Exception(
                'Allowed arguments are company, location and goods')
Example No. 4
    def main(self):
        print('Training with fastText, start train...')

        model = FastText(sentences=self.data, sg=1, size=150, window=2, min_count=1)
        model.save(self.fasttext_model)

        print('Model training finished...')
Example No. 5
def fasttext_model(model_name, dir_model):
    print('Creating fasttext model')
    trainings = []
    cwd = os.getcwd()
    file_pathes = Path(cwd + '\\wakati').glob('**/*.txt')
    for file_path in file_pathes:
        with open(file_path, 'r', encoding='utf-8') as wafile:
            # clean up the wakati (tokenized) text
            text = (wafile.read()).replace('\n', '')
            text = re.sub(r"\s+", " ", text)
            # attach the label
            tag_name = os.path.basename(file_path)
            tag_name = '__label__' + tag_name[20:26] + ' , '
            text = tag_name + text
            text = text.replace('\u3000', '')
            text = text.replace('\xa0', '')
            textls = text.split(' ')
            textls2 = [x for x in textls if x]  # drop empty items
            trainings.append(textls2)
    #print(list(trainings[0]))
    print('fasttext modeling')
    model_ft = FastText(trainings,
                        size=300,
                        window=15,
                        min_count=5,
                        iter=10,
                        workers=10,
                        sg=1)
    model_ft.save(dir_model + model_name)
Example No. 6
def main():
    """
    script to training fastText word embedding model
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i',
                        '--input-file',
                        required=False,
                        default=config.DATA_FILE,
                        help='input data file for training')
    parser.add_argument('-m',
                        '--model-file',
                        required=False,
                        default=config.MODEL_FILE,
                        help='model output name')
    parser.add_argument('-s',
                        '--embedding-size',
                        required=False,
                        type=int,
                        default=100,
                        help='embedding vector size')

    args = parser.parse_args()

    model = FastText(size=args.embedding_size, sg=1)
    model.build_vocab(corpus_file=args.input_file)

    total_words = model.corpus_total_words
    model.train(corpus_file=args.input_file, total_words=total_words, epochs=5)

    model.save(args.model_file)
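A small follow-up sketch showing how a model trained by this script could be reloaded and queried; the script filename in the comment and the query word are assumptions.

# Hypothetical invocation of the script above, e.g.:
#   python train_fasttext.py -i corpus.txt -m ft.model -s 100
# The saved model can then be reloaded and queried:
from gensim.models import FastText

model = FastText.load('ft.model')                # path given via --model-file
print(model.wv.most_similar('example', topn=5))  # nearest neighbours by cosine similarity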
Example No. 7
def train_vector_model(train_data_list, mode):
    if mode == 'train':
        mecab = Okt()
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n'
                     if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size,
                         window=3,
                         workers=8,
                         min_count=1,
                         sg=1,
                         iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)
        if not os.path.exists('./fasttext'):
            os.makedirs('./fasttext')

        model.save('./fasttext/model')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load('./fasttext/model')
Example No. 8
    def load_ft_model(self, fname):
        """
        class FastText(sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
        word_ngrams=1, sample=0.001, seed=1, workers=3, min_alpha=0.0001, negative=5, cbow_mean=1, hashfxn=hash, iter=5,
        null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH)
        min_n : int
            Min length of char ngrams to be used for training word representations.
        max_n : int
            Max length of char ngrams to be used for training word representations.
            Set max_n to be lesser than min_n to avoid char ngrams being used.
        word_ngrams : int {1,0}
            If 1, enriches word vectors with subword (ngrams) information. If 0, this is equivalent to word2vec.
        bucket : int
            Character ngrams are hashed into a fixed number of buckets, in order to limit the memory usage of the model.
            This option specifies the number of buckets used by the model.
        """
        print(
            'Loading Fasttext Model... in {0:.2f} seconds'.format(time.time() -
                                                                  start_time))
        try:
            fasttext_model = FastText.load(fname)
            print(fasttext_model)
        except IOError:
            print('No existed model. Training Ft Model... in {0:.2f} seconds'.
                  format(time.time() - start_time))
            texts = config.WikiCorpus()
            fasttext_model = FastText(texts, **DEFAULT_ARGUMENTS_FT)
            fasttext_model.save(fname)

        print('Success to load Fasttext Model... in {0:.2f} seconds'.format(
            time.time() - start_time))
        return fasttext_model
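The docstring above reproduces the gensim FastText signature; the following is a minimal, self-contained sketch (toy corpus and parameter values are assumptions) of the min_n/max_n subword behaviour it documents: out-of-vocabulary words still receive vectors composed from character n-grams.

# Minimal sketch of the subword behaviour documented above (gensim 3.x keyword names).
from gensim.models import FastText

toy_corpus = [['machine', 'learning', 'is', 'fun'],
              ['deep', 'learning', 'needs', 'data']]
ft = FastText(toy_corpus, size=50, window=3, min_count=1, min_n=3, max_n=6, iter=10)

print('learning' in ft.wv.vocab)   # True: in-vocabulary word
print('learnings' in ft.wv.vocab)  # False: never seen during training...
print(ft.wv['learnings'][:5])      # ...but a vector is still built from its char n-grams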
Example No. 9
def getWordVec(corpus: list, type=1) -> object:
    '''
        Args:
            corpus: list[list[str]], each sublist indicates a sentence
            type: 1 = word2vec, 2 = fasttext
    '''
    coherenceMetric = Callback(CoherenceMetric())
    convergenceMetric = Callback(ConvergenceMetric())
    diffMetric = Callback(DiffMetric())
    if type == 1:
        model = Word2Vec(corpus)
        model.save('word2vec.model')
        return model
    else:
        model = FastText(min_count=1)
        logging.info('Starting building vocabulary table')
        model.build_vocab(corpus)
        logging.info('Starting training')

        model.train(corpus,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    callbacks=[epochLogger])
        model.save('FastText.model')
        return model
Example No. 10
def train_vector_model(train_data_list, train):
    if train:
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = kiwi_f.k_pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n'
                     if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: kiwi_f.k_morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(
            size=300,
            window=3,
            workers=8,
            min_count=1,
            sg=1,  # skip-gram is generally reported to perform better
            iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)
        if not os.path.exists(path.FASTTEXT_DIR):
            os.makedirs(path.FASTTEXT_DIR)

        model.save(path.model_path + 'model_test')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load(path.model_path + 'model_test')
Example No. 11
class EmbeddingModel():
    def __init__(self, name="default", phraser=None):
        self.name = "embedding_" + name + ".model"
        self.phraser = phraser
        if self.name in os.listdir(dir_embedding):
            self.get_embedding = FastText.load(dir_embedding + self.name)
            print("Embedding {} loaded".format(name))
        else:
            print("embedding not exists")
            print("start building...")
            self.build_embedding()
            self.save()

    def build_embedding(self):
        tickers = [
            i for i in os.listdir(dir_cleaned_news) if i.endswith(".csv")
        ]
        tokenized_docs = []
        start = time.time()
        for ticker in tickers:
            df = pd.read_csv(dir_cleaned_news + ticker, index_col=0)
            tokenized_docs += tokenizer(df['content'], self.phraser)

        self.get_embedding = FastText(tokenized_docs, sg=1, hs=1)
        end = time.time()

        print("train finished! ", end - start, " seconds")

    # save
    def save(self):
        self.get_embedding.save(dir_embedding + self.name)
        print("saved!")
Example No. 12
 def make_title_model(self, title_list_detach):
     try:
         print("make_title_model 실행")
         if not (os.path.isfile(
                 "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
         )):
             print("make_title_model 모델 학습 시작")
             FT_title_model = FT_gensim(title_list_detach,
                                        size=300,
                                        window=100,
                                        min_count=1,
                                        sg=1,
                                        iter=2000)
             print("make_title_model2 모델 학습 완료")
             self.FT_title_model = FT_title_model
             FT_title_model.save(
                 "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
             )
         self.FT_title_model = FT_gensim.load(
             "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
         )
         print("make_title_model 모델 로드됨")
     except OSError as e:
         print("failed to create directory!")
         raise
Example No. 13
def train(sentences):
    print("starting to train!")
    # train model
    if args.train_pairs and args.relevant_selects:
        min_count = args.min_count * 5
    elif args.train_pairs:
        min_count = args.min_count * 10
    else:
        min_count = args.min_count

    if "word2vec" in args.gensim_model_name:
        model = Word2Vec(sentences,
                         size=args.embedding_size,
                         window=20,
                         sg=args.skipgram,
                         workers=16,
                         min_count=min_count)
    elif "fast" in args.gensim_model_name:
        model = FastText(sentences,
                         size=args.embedding_size,
                         window=20,
                         sg=args.skipgram,
                         workers=16,
                         min_count=min_count)

    # summarize the loaded model
    print(model)
    # access vector for one word
    # save model
    # trim unneeded model memory = use (much) less RAM
    model.init_sims(replace=True)
    model.save(args.data_dir + args.model_name)
Example No. 14
def train_fasttext(infile, outfile, skipgram, loss, size, epochs):
    """
	train_fasttext(args**) -> Takes the input file, the output file and the model hyperparameters as arguments and trains the model accordingly.
	The model is saved at the output location.
	Arguments
	---------
	infile : Input pre-processed wiki dump
	outfile : Output directory to save the model.
	skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
	loss : Loss Function (0 - Negative Sampling, 1 - Heirarichal Loss)
	size : Embedding size (100 ~ 300)
	epochs : Number of epochs
	"""
    sentence = LineSentence(infile)

    model = FastText(sentence,
                     sg=skipgram,
                     hs=loss,
                     size=size,
                     alpha=0.05,
                     window=5,
                     min_count=5,
                     min_n=2,
                     max_n=5,
                     workers=3,
                     iter=epochs)

    model.save(outfile)
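A hedged example call for train_fasttext; the file names are placeholders and the values simply follow the ranges suggested in the docstring.

# Hypothetical call with a preprocessed wiki dump (one sentence per line).
train_fasttext(infile='wiki_preprocessed.txt',
               outfile='wiki_fasttext.model',
               skipgram=1,   # 1 - Skipgram
               loss=0,       # 0 - Negative Sampling
               size=300,
               epochs=5)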
Example No. 15
def trainmodel(paragraphset, fs, fw, fc):
    embedding_model = FastText(
        paragraphset, size=fs, window=fw, min_count=fc, workers=4, sg=1)
    mname = str(fs)+"_"+str(fw)+"_"+str(fc)+".model"
    embedding_model.save(mname)
    print(mname + " save done")
    return embedding_model
Example No. 16
def text_setup_for_feature_representation(dataset,embedding_case):
    dataset = get_equal_for_each_cat(dataset,1000000)
    #dataset = clean_text_language(dataset)
    x = [k for k in dataset['text']]
    #x_1 = [i.split() for i in x]
    if embedding_case == 1:
        #x_1 = [i.split() for i in x]
        print("text to word sequence...")
        x_list = [text_to_word_sequence(k) for k in dataset['text']]
        print('text prepared for word2vec...')
        print(len(x_list))
        model = Word2Vec(x_list, size = 800, window = 5, min_count=3, workers=3)
        print("saving model...")
        model.save("word2vec_yelp_800")
        del model
    elif embedding_case == 2:
        x_1 = [i.split() for i in x]
        print('text prepared for FastText...')
        model = FastText(x_1, size=300, window=5, min_count=3, workers=3)
        print("saving model...")
        model.save("FastText_yelp_all")
        del model
    elif embedding_case == 3:
        taggedDocs = nt('taggedDocs','words tags')
        docs = []
        for i in range(len(x)):
            words = x[i].split()
            tag = [i]
            docs.append(taggedDocs(words,tag))

        print('Text prepared for doc2vec...')
        model = Doc2Vec(docs, size=300, window=8, min_count=3, workers=3)
        print("saving model...")
        model.save('doc2vec_yelp_all_latest')
        del model
Example No. 17
def train_vector_model(datas, train):
    path = configs.fasttext_path
    if train:
        mecab = Okt()
        str_buf = datas['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size,
                         window=3,
                         workers=8,
                         min_count=2,
                         sg=1,
                         iter=1500)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs, total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)

        if not os.path.exists(path):
            os.makedirs(path)

        model.save(path + 'model_v2')
        print("TRAIN COMPLETE")
        return model
    else:
        print("LOAD SAVED MODEL")
        return FastText.load(path + 'model_v2')
Example No. 18
    def generate_outer_feature(self):
        train_df, _ = read_data()
        test_df = read_data(test=True)
        all_df = pd.concat([train_df, test_df], ignore_index=True)

        users = all_df['user_id'].unique()

        docs = []
        for u in users:
            docs.append(all_df[all_df['user_id'] == u]['kiji_id'].values)

        vc = all_df['kiji_id'].value_counts()
        to_none_ids = vc[vc < 5].index

        def to_word(d):
            if d in to_none_ids:
                return 'None'
            return d

        if os.path.exists(self.fast_model_path):
            model = FastText.load(self.fast_model_path)
        else:
            docs = [[to_word(w) for w in doc] for doc in docs]
            with timer(logger,
                       format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
                model = FastText(docs, workers=6, size=64)
            model.save(self.fast_model_path)

        z = self.df_outer['kiji_id_raw'].map(to_word).map(
            lambda x: model.wv[x])
        df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
        df[self.merge_key] = self.df_outer['kiji_id_raw']
        return df
Example No. 19
def constructModelFromCiteseer():
    fname = open("./intelligentciteseerfasttext.model", "wb")
    cwd = os.path.dirname(os.path.realpath(__file__))
    dataDirPath = os.path.join(cwd, os.path.pardir, "citeseerdata")
    sentences = []
    termparser = TermParser()
    termparser.labautopedia()
    termparser.webopedia()
    termparser.constructTermCountDict()
    compsciterms = termparser.allterms
    for entry in scantree(dataDirPath):
        if not entry.name.startswith('.') and entry.is_file():
            filepath = entry.path
            with open(filepath, "r", encoding="utf-8") as f:
                textcontent = f.read()
                for term in compsciterms:
                    if term in textcontent:
                        textcontent = textcontent.replace(term, ''.join(term.split(" ")))
                        termparser.termdict[term] += 1
                content = textcontent.split(". ")
                for line in content:
                    sentence = cleanuptext(line)
                    if len(sentence) != 0:
                        sentences.append(sentence)
    termparser.wordOccurenceGraph()
    modelF = FastText(sentences, size=4, window=4, min_count=1, iter=10)
    modelF.save(fname)
    fname.close()
Example No. 20
def train_wordvectors(in_file, out_file):
    df = pd.read_csv(in_file, encoding='gb18030', usecols=['content'])
    df = df.fillna('')
    df['tokens'] = df['content'].apply(lambda x: list(jieba.cut(x)))
    sentences = df['tokens'].tolist()
    model = FastText(sentences, window=5, size=35, iter=10, min_count=1)
    model.save(out_file)
Example No. 21
def train_model(sentences: Collection[str], save_path=MODEL_PATH):
    model = FastText(size=VECTOR_SIZE)
    model.build_vocab(sentences=sentences)
    model.train(sentences=sentences,
                total_examples=model.corpus_count,
                epochs=50)
    model.save(save_path)
    return model
Example No. 22
def trainModel(fileName):
    print("training ")
    corpus = createCorpus(fileName)
    model = FastText(corpus, size=300, window=5, min_count=5, sg=0, iter=4)
    model.save(
        'C:/Users/Lenovo/Desktop/Bitirme/Word2Vec.v3/models/fasttext.model.bin'
    )
    print("done")
Example No. 23
def create_model(skip_gram, tokenized_sentences, model_path):
    model = FastText(min_count=1, window=5, sg=skip_gram)
    model.build_vocab(sentences=tokenized_sentences)
    model.train(sentences=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=100)

    model.save(model_path)

    return model
Example No. 24
 def model(self, minCnt, size, window):
     # size = N dim. vector
     model = FastText(self.tokensListSet,
                      min_count=minCnt,
                      size=size,
                      window=window)
     model.save('model/fastText.bin')
     print(model)
Example No. 25
class Word2Vector(object):
    def __init__(self,
                 src_file,
                 dst_file,
                 size=300,
                 window=5,
                 min_count=10,
                 hs=0,
                 sg=0,
                 learning_rate=0.025):
        self.src_file = src_file
        self.model_file = dst_file
        self.size = size
        self.window = window
        self.min_count = min_count
        self.hs = hs  # 1: hierarchical softmax, 0: no hierarchical softmax
        self.sg = sg  # 1: skip-gram,  0: CBOW
        self.alpha = learning_rate
        self.workers = multiprocessing.cpu_count()

    def train(self, sentences):
        self.model = FastText(sentences,
                              size=self.size,
                              window=self.window,
                              min_count=self.min_count,
                              sg=self.sg,
                              workers=self.workers)
        self.model.save(self.model_file)
        self.model.wv.save_word2vec_format(self.model_file + '.bin', binary=True)

    def train_model(self):
        sentences = LineSentence(self.src_file)
        self.train(sentences)

    def online_train(self, sentences):  # online (incremental) training
        self.model.build_vocab(sentences, update=True)
        self.model.train(sentences,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)

    def online_train_model(self, file_name, isdir=True):  # online training from a file or directory
        if isdir:
            sentences = PathLineSentences(file_name)
        else:
            sentences = LineSentence(file_name)
        self.online_train(sentences)

    def train_dir_model(self):
        sentences = PathLineSentences(self.src_file)
        self.train(sentences)

    def load_model(self, model_name):
        self.model = FastText.load(model_name)

    def show_similarity(self, word1, word2):
        return self.model.wv.similarity(word1, word2)

    def show_word_vector(self, word):
        return self.model.wv[word]
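A usage sketch for the Word2Vector wrapper above; the corpus path, model path, and query words are placeholders.

# Hypothetical usage of the Word2Vector wrapper (paths and words are placeholders).
w2v = Word2Vector(src_file='corpus.txt',
                  dst_file='fasttext_wrapper.model',
                  size=300, window=5, min_count=10, sg=1)
w2v.train_model()                              # expects one sentence per line in src_file
print(w2v.show_similarity('apple', 'banana'))  # cosine similarity between two words
print(w2v.show_word_vector('apple')[:5])       # first components of the word vector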
Example No. 26
 def train(self):
     model = FastText(self.sentences,
                      size=200,
                      window=3,
                      min_count=1,
                      iter=70)
     currdir = os.getcwd()
     model.save(currdir + '/mymodel.bin')
     return model
Example No. 27
def bible_embeddings(processed_bible):
    #Parameters: processed bible file
    #Returns: writes bible representation to file
    model = FastText()
    model.build_vocab(sentences=processed_bible)
    model.train(sentences=processed_bible,
                total_examples=len(processed_bible),
                epochs=10)
    model.save("bible_ft.bin")
Example No. 28
def _train_and_save_model_ft(sents, model_path):
    ft_model = FastText(sents,
                        size=128,
                        window=32,
                        min_count=5,
                        sample=1e-2,
                        sg=1,
                        iter=50)
    ft_model.save(model_path)
    return ft_model
Example No. 29
def fasttext(model_path, sentences):
    '''
    https://radimrehurek.com/gensim/models/fasttext.html
    model_path should have a .model extension
    sentences: list of list of strings(tokens)
    '''
    model = FastText(sentences, min_count=1)
    word_vectors = model.wv
    model.save(model_path)
    return model_path
Example No. 30
def fasttext_train(corpus_path, save_path):
    """输入分词完成的txt文件,一行为一个文本。"""
    model = FastText(window=5, size=200, min_count=1, workers=2)
    model.build_vocab(
        corpus_file=corpus_path)  # scan over corpus to build the vocabulary

    total_words = model.corpus_total_words  # number of words in the corpus
    model.train(corpus_file=corpus_path, total_words=total_words, epochs=5)

    model.save(save_path)
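A hedged usage sketch for fasttext_train; the corpus path and the query token are placeholders. The corpus_file route expects whitespace-tokenized text, one document per line, exactly as the docstring states.

# Hypothetical usage: corpus.txt contains pre-tokenized text, one document per line.
fasttext_train('corpus.txt', 'ft_corpusfile.model')

from gensim.models import FastText
model = FastText.load('ft_corpusfile.model')
print(model.wv.most_similar('token', topn=5))  # 'token' is a placeholder query word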
Example No. 31
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked.

        Args:
            s (str): string to check
        """
        return len(s) > 0 and s[-1] == self.mark_char

    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
          corpus (str): path to the corpus.
          corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json
          formats are supported. For json format, the file should contain an iterable of
          sentences. Each sentence is a list of terms (unicode strings) that will be used for
          training.
          mark_char (char): special character that marks NP's suffix.
          word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and
          fasttext are supported.
          np2vec_model_file (str): path to the file where the trained np2vec model has to be
          stored.
          binary (bool): boolean indicating whether the model is stored in binary format; if
          word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
          sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
          algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
          size (int): model training hyperparameter, size of the feature vectors.
          window (int): model training hyperparameter, maximum distance between the current and
          predicted word within a sentence.
          alpha (float): model training hyperparameter. The initial learning rate.
          min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
          `min_alpha` as training progresses.
          min_count (int): model training hyperparameter, ignore all words with total frequency
          lower than this.
          sample (float): model training hyperparameter, threshold for configuring which
          higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
          workers (int): model training hyperparameter, number of worker threads.
          hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
          hierarchical softmax will be used for model training. If set to 0, and `negative` is
          non-zero, negative sampling will be used.
          negative (int): model training hyperparameter, negative sampling. If > 0, negative
          sampling will be used, the int for negative specifies how many "noise words" should be
          drawn (usually between 5-20). If set to 0, no negative sampling is used.
          cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
          word vectors. If 1, use the mean, only applies when cbow is used.
          iter (int): model training hyperparameter, number of iterations.
          min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
          for training word representations.
          max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for
          training word representations. Set `max_n` to be lesser than `min_n` to avoid char
          ngrams being used.
          word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, enriches word
          vectors with subword (ngrams) information. If 0, this is equivalent to word2vec training.

        """

        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            s = ''
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                        self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            for i, sentence in enumerate(self._sentences):
                self._sentences[i] = [
                    w[:-1] if self.is_marked(w) else w for w in sentence]

        logger.info('training np2vec model')
        self._train()

    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter)

        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams)
        else:
            logger.error(
                'invalid word embedding type: ' +
                self.word_embedding_type)
            sys.exit(0)

    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(0)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non NP terms
            logger.info('pruning np2vec model')
            total_vec = 0
            vector_size = self.model.vector_size
            for word in self.model.wv.vocab.keys():
                if self.is_marked(word):
                    total_vec += 1
            logger.info(
                "storing %sx%s projection weights for NP's into %s" %
                (total_vec, vector_size, np2vec_model_file))
            with utils.smart_open(np2vec_model_file, 'wb') as fout:
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                # store NP vectors in sorted order: most frequent NP's at the top
                for word, vocab in sorted(
                        iteritems(
                            self.model.wv.vocab), key=lambda item: -item[1].count):
                    if self.is_marked(word):
                        embedding_vec = self.model.wv.syn0[vocab.index]
                        if binary:
                            fout.write(
                                utils.to_utf8(word) + b" " + embedding_vec.tostring())
                        else:
                            fout.write(
                                utils.to_utf8(
                                    "%s %s\n" %
                                    (word, ' '.join(
                                        "%f" %
                                        val for val in embedding_vec))))

    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
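A minimal end-to-end sketch of the NP2vec class above, under the constraint enforced in save(): with word_embedding_type='fasttext' and word_ngrams=1 the model must be saved with binary=True and is reloaded through FastText.load. Paths and hyperparameter values are placeholders.

# Hypothetical usage of NP2vec with a fasttext subword model (paths are placeholders).
np2vec = NP2vec(corpus='marked_corpus.txt',
                corpus_format='txt',
                word_embedding_type='fasttext',
                word_ngrams=1,
                size=100,
                iter=5)
np2vec.save('np2vec.model', binary=True)   # binary=True is required for subword fasttext

# word_ngrams=1 routes loading to FastText.load rather than KeyedVectors.
model = NP2vec.load('np2vec.model', binary=True, word_ngrams=1)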