Example #1
    def classType_fasttext_train(self, classType):

        train_sentences = []

        for word in self.train:
            sentence = []
            mappings = self.word_mapping[word]
            for mapping in mappings:
                if mapping == classType:
                    sentence.append(word)
            if len(sentence) > 0:
                train_sentences.append(sentence)

        feature_encoder = FastText(size=50,
                                   window=2,
                                   min_count=1,
                                   min_n=2,
                                   max_n=6)
        feature_encoder.build_vocab(sentences=train_sentences)
        feature_encoder.train(sentences=train_sentences,
                              total_examples=feature_encoder.corpus_count,
                              epochs=1000)
        feature_encoder.save('./models/' + classType + '_fasttext.model')
        if classType == 'company':
            self.company_feature_encoder = feature_encoder
        elif classType == 'location':
            self.location_feature_encoder = feature_encoder
        elif classType == 'goods':
            self.goods_feature_encoder = feature_encoder
        else:
            raise Exception(
                'Allowed arguments are company, location and goods')
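For context, the method above appears to expect `self.train` to be an iterable of words and `self.word_mapping` to map each word to its class labels; a hypothetical minimal layout (not taken from the original repository) could look like this:

train = ["acme", "berlin", "steel", "iron"]
word_mapping = {
    "acme": ["company"],
    "berlin": ["location"],
    "steel": ["goods"],
    "iron": ["goods"],
}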
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """

    :param sentences:
    :param size:
    :param min_count:
    :param window:
    :param negative:
    :param cbow: boolean to determine the training type; True is for CBOW;
    False is for Skip-gram
    :param iterations:
    :param seed:
    :param workers:
    :return:
    """
    if cbow is True:
        skip = 0
    else:
        skip = 1
    model = FastText(size=size, window=window,
                     min_count=min_count, workers=workers, sg=skip,
                     negative=negative, seed=seed)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=iterations)
    return model
Example #3
def main():
    start = time.time()
    #CLI arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--corpus_file", type=str, required=True, help="Path to the corpus .txt file.")
    parser.add_argument("--window", type=int, default=5, help="Window size. default=5")
    parser.add_argument("--vector_size", type=int, default=200, help="Vector dimensions. default=200")
    parser.add_argument("--min_count", type=int, default=1, help="The model ignores all words with total frequency lower than this. default=1")
    parser.add_argument("--method", type=str, required=True, help="[sg|cbow]")
    parser.add_argument("--output_file", type=str, default=r'../Models/FastText_model/FastText_combined_corp_model.wordvectors', help="Path to the output .wordvectors file.")
    args = parser.parse_args()

    #set method
    method = 1
    if args.method == "sg":
        method = 1
    elif args.method == "cbow":
        method = 0
    else:
        print("method not supported!")
        return

    #setup model
    model = FastText(vector_size=args.vector_size, window=args.window, min_count=args.min_count, sg=method)
    model.build_vocab(corpus_file=args.corpus_file)
    print("Built vocab. Starting train...")

    #train
    model.train(corpus_file=args.corpus_file, total_words=model.corpus_total_words, epochs=5)
    model.wv.save(args.output_file)
    print("Done training fasttext in %.2f min" % ((time.time() - start) / 60))
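A hypothetical way to exercise the script above without a shell is to fake the CLI arguments before calling main(); the corpus path below is a placeholder:

import sys

# equivalent to: python this_script.py --corpus_file data/corpus.txt --method sg
sys.argv = ["this_script.py", "--corpus_file", "data/corpus.txt", "--method", "sg"]
main()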
Example #4
def train_fasttext(sentences,
                   embedding_size=100,
                   window=5,
                   sg=1,
                   hs=0,
                   min_ct=2,
                   min_n=1,
                   max_n=4,
                   ns_exponent=0.75,
                   negative=15,
                   epoch=50,
                   sample_t=1e-5):
    start_time = time.time()
    ftmodel = FastText(size=embedding_size,
                       window=window,
                       sg=sg,
                       hs=hs,
                       negative=negative,
                       sample=sample_t,
                       ns_exponent=ns_exponent,
                       min_n=min_n,
                       max_n=max_n,
                       min_count=min_ct,
                       workers=12,
                       seed=7)
    ftmodel.build_vocab(sentences=sentences)
    ftmodel.train(sentences=sentences,
                  epochs=epoch,
                  total_examples=ftmodel.corpus_count)
    training_time = time.time() - start_time
    print("%d seconds used to train this model" % (training_time))
    return ftmodel
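A minimal call sketch for train_fasttext above, assuming gensim 3.x (where FastText still accepts size=) and a toy tokenized corpus; the values here are illustrative only:

toy_corpus = [["subword", "aware", "embeddings"],
              ["embeddings", "help", "with", "rare", "words"]]
ft = train_fasttext(toy_corpus, embedding_size=16, min_ct=1, epoch=5)
# character n-grams let FastText produce a vector even for an unseen word
print(ft.wv["embeddingz"][:4])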
def fasttext_model(sentences,
                   size=100,
                   min_count=5,
                   negative=5,
                   window=5,
                   cbow=True,
                   iterations=5,
                   seed=0,
                   workers=1):
    """ Returns: the trained model """
    if cbow is True:
        sg = 0
    else:
        sg = 1
    model = FastText(size=size,
                     window=window,
                     min_count=min_count,
                     workers=workers,
                     sg=sg,
                     negative=negative,
                     seed=seed)
    model.build_vocab(sentences)
    model.train(sentences,
                total_examples=model.corpus_count,
                epochs=iterations)
    return model
def train_vector_model(train_data_list, train):
    if train:
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = kiwi_f.k_pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n'
                     if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: kiwi_f.k_morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(
            size=300,
            window=3,
            workers=8,
            min_count=1,
            sg=1,  # the skip-gram model is known to perform better
            iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)
        if not os.path.exists(path.FASTTEXT_DIR):
            os.makedirs(path.FASTTEXT_DIR)

        model.save(path.model_path + 'model_test')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load(path.model_path + 'model_test')
Example #7
def train_vector_model(train_data_list, mode):
    if mode == 'train':
        mecab = Okt()
        str_buf = train_data_list['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(
            list(map(lambda x: '\n'
                     if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size,
                         window=3,
                         workers=8,
                         min_count=1,
                         sg=1,
                         iter=1000)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)
        if not os.path.exists('./fasttext'):
            os.makedirs('./fasttext')

        model.save('./fasttext/model')
        print("TRAIN COMPLETE")
        return model
    else:
        return FastText.load('./fasttext/model')
Example #8
def make_fasttext(target_dataset):

    corpus_path = os.path.join(CONFIG.DATASET_PATH, target_dataset,
                               "corpus.txt")
    sentences = word2vec.LineSentence(corpus_path)
    dimension_size = 300
    print("embedding started")
    # instantiate once (untrained), then build the vocabulary and train explicitly
    embedding_model = FastText(size=dimension_size,
                               window=6,
                               min_count=5,
                               workers=4,
                               sg=1)  # skip-gram
    embedding_model.build_vocab(sentences=sentences)
    embedding_model.train(sentences=sentences,
                          total_examples=embedding_model.corpus_count,
                          epochs=10)
    model_name = "FASTTEXT_" + target_dataset + ".model"
    #pad_value = np.finfo(np.float32).eps
    pad_value = 1.
    embedding_model.wv.add("<PAD>",
                           np.full(embedding_model.vector_size, pad_value),
                           replace=True)
    embedding_model.wv.init_sims(replace=True)
    embedding_model.wv.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("embedding completed")
Example #9
def train_vector_model(datas, train):
    path = configs.fasttext_path
    if train:
        mecab = Okt()
        str_buf = datas['encode']
        joinString = ' '.join(str_buf)
        pos1 = mecab.pos(joinString)
        pos2 = ' '.join(list(map(lambda x: '\n' if x[1] in ['Punctuation'] else x[0], pos1))).split('\n')
        morphs = list(map(lambda x: mecab.morphs(x), pos2))
        print("BUILD MODEL")
        model = FastText(size=vector_size,
                         window=3,
                         workers=8,
                         min_count=2,
                         sg=1,
                         iter=1500)
        model.build_vocab(morphs)
        print("BUILD COMPLETE")

        print("TRAIN START")
        model.train(morphs, total_examples=model.corpus_count,
                    epochs=model.epochs,
                    compute_loss=True)

        if not os.path.exists(path):
            os.makedirs(path)

        model.save(path + 'model_v2')
        print("TRAIN COMPLETE")
        return model
    else:
        print("LOAD SAVED MODEL")
        return FastText.load(path + 'model_v2')
    def run(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transform dataframe with missing values to DF of aggregated embeddings for each sample
        the final embeddings for sample is mean embedding vector of all embeddings vector of non missing values

        :param df: DF with missing data
        :return: embeddings DF
        """
        # encode features to string
        df_intrv = utilities.intervals(df, inplace=False)
        # concatenate values to sequence
        seq_df = utilities.concat_columns(df_intrv)
        # min_n=3 > max_n=2 disables character n-grams (word-level vectors only)
        size = df.shape[1]
        emb_size = 50
        window = 30
        if size < 60:
            emb_size = int(size - (1 + ((size // 10) ** 1.5 + (size // 10))))
            window = int(size - ((size // 10) ** 2 + (size // 10)))

        model = FastText(size=emb_size, window=window, min_count=1, workers=-1, min_n=3, max_n=2)  # instantiate
        model.build_vocab(sentences=seq_df)
        model.train(sentences=seq_df, total_examples=len(seq_df), epochs=50)
        # at high missingness levels some rows contain no values; impute the most frequent value in that case
        idxmax = df_intrv.apply(lambda col: col.value_counts()[0], axis=0).idxmax()
        idxmax = df_intrv[idxmax].value_counts().idxmax()
        seq_df = pd.Series([x if len(x) > 0 else [idxmax] for x in seq_df])
        # get embeddings sequences
        emb = seq_df.apply(lambda row: model.wv[row].mean(axis=0).tolist())
        # create DF with emb dims as columns
        emb_df = pd.DataFrame(np.row_stack(emb))
        emb_df.columns = ['x' + str(x) for x in emb_df.columns]
        return emb_df
Example #11
def fetch_emb(lenWindow, minOccur, emb_path, vocab_path, RESET):
    if not os.path.exists(emb_path) or RESET:
        with open('../data/cub/text_trainvalclasses.txt', 'r') as file:
            text = file.read()
            sentences = sent_tokenize(text)

        texts = []
        for i, line in enumerate(sentences):
            words = word_tokenize(line)
            texts.append(words)

        model = FastText(size=300, window=lenWindow, min_count=minOccur)
        model.build_vocab(sentences=texts)
        model.train(sentences=texts, total_examples=len(texts), epochs=10)

        with open(vocab_path, 'rb') as file:
            vocab = json.load(file)

        i2w = vocab['i2w']
        base = np.ones((300,), dtype=np.float32)
        emb = [base * (i - 1) for i in range(3)]
        for word in list(i2w.values())[3:]:
            emb.append(model[word])

        emb = np.array(emb)
        with open(emb_path, 'wb') as file:
            pickle.dump(emb, file)

    else:
        with open(emb_path, 'rb') as file:
            emb = pickle.load(file)

    return emb
def train_fasttext(sentences,
                   embedding_size=100,
                   window=5,
                   sg=1,
                   hs=0,
                   min_ct=2,
                   min_n=2,
                   max_n=4,
                   ns_exponent=0.75,
                   negative=15,
                   epoch=50,
                   sample=1e-5):
    ftmodel = FastText(size=embedding_size,
                       window=window,
                       sg=sg,
                       hs=hs,
                       negative=negative,
                       sample=sample,
                       ns_exponent=ns_exponent,
                       min_n=min_n,
                       max_n=max_n,
                       min_count=min_ct,
                       workers=8,
                       seed=7)
    ftmodel.build_vocab(sentences=sentences)
    ftmodel.train(sentences=sentences,
                  epochs=epoch,
                  total_examples=ftmodel.corpus_count)
    return ftmodel
Example #13
def main():
    """
    script to training fastText word embedding model
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i',
                        '--input-file',
                        required=False,
                        default=config.DATA_FILE,
                        help='input data file for training')
    parser.add_argument('-m',
                        '--model-file',
                        required=False,
                        default=config.MODEL_FILE,
                        help='model output name')
    parser.add_argument('-s',
                        '--embedding-size',
                        required=False,
                        type=int,
                        default=100,  # an integer default is assumed here; the original default was not an int
                        help='embedding vector dimensionality')

    args = parser.parse_args()

    model = FastText(size=args.embedding_size, sg=1)
    model.build_vocab(corpus_file=args.input_file)

    total_words = model.corpus_total_words
    model.train(corpus_file=args.input_file, total_words=total_words, epochs=5)

    model.save(args.model_file)
Example #14
 def loadDevblogModel(self,
                      embedding_dim,
                      epochs,
                      window,
                      min_count):
     """
     Train a FastText word-embedding model on the Devblog data.

     - input
     : embedding_dim / int / dimensionality of the word vectors
     : epochs / int / number of training epochs
     : window / int / context window used during training
     : min_count / int / minimum number of occurrences for a word to be included in training
     
     - return
     : we_model
     """
     model = None
     if not os.path.isfile(CONST.devblog_model_path):
         print('🐈  No trained word-embedding model found.')
         dc = Document()
         docs = dc.getDocs(labeled_only=False)  # fetch the full dataset
         print('🐈  Starting word-embedding model training.')
         sentences = docs.text.apply(lambda x: [han2Jamo(s) for s in x.split(' ')])
         model = FastText(size=embedding_dim, window=window, min_count=min_count)
         model.build_vocab(sentences=sentences)
         model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
         
         print('🐈  Saving the word-embedding model.')
         model.save(CONST.devblog_model_path)
     else:
         model = FastText.load(CONST.devblog_model_path)
     return model
Example #15
def getWordVec(corpus: list, type=1) -> object:
    '''
        Args:
            corpus: list[list[str]], each sublist indicates a sentence
            type: 1 = word2vec, 2 = fasttext
    '''
    coherenceMetric = Callback(CoherenceMetric())
    convergenceMetric = Callback(ConvergenceMetric())
    diffMetric = Callback(DiffMetric())
    if type == 1:
        model = Word2Vec(corpus)
        model.save('word2vec.model')
        return model
    else:
        model = FastText(min_count=1)
        logging.info('Starting building vocabulary table')
        model.build_vocab(corpus)
        logging.info('Starting training')

        model.train(corpus,
                    total_examples=model.corpus_count,
                    epochs=model.epochs,
                    callbacks=[epochLogger])
        model.save('FastText.model')
        return model
def recipe_to_ingre2vec(filepath='../recipe_data/'):
    recipe_sentences = []  # documents read in from the csv files
    recipe_folder = os.listdir(filepath)  # list of files (folders) inside the recipe_data directory
    for i, folder in enumerate(recipe_folder):
        csv_filepath = os.listdir(filepath +
                                  folder)  # list of csv files in that folder
        for j, csv_file in enumerate(csv_filepath):
            fi = open(filepath + folder + '/' + csv_file,
                      'rt',
                      encoding='UTF8')
            rdr = csv.reader(fi)
            for k, row in enumerate(rdr):
                if k == 0:
                    continue
                elif k % 2 == 0:
                    recipe_sentences.append(row)
            fi.close()

    model_ingredient = FastText(
        sg=1, window=10 * 1000000, vector_size=100,
        min_count=3)  # use a very large window so the model behaves like item2vec
    model_ingredient.build_vocab(recipe_sentences)
    model_ingredient.train(recipe_sentences,
                           epochs=10,
                           total_examples=model_ingredient.corpus_count)

    print("length of ingex2vec: %i" % (len(model_ingredient.wv.index_to_key)))
    print("Ingre2vec embedding finished!")
    return model_ingredient
Example #17
def create_embedding(caption_file: str,
                     vocab_file: str,
                     embed_size: int,
                     output: str,
                     **fasttext_kwargs):
    caption_df = pd.read_json(caption_file)
    caption_df["tokens"] = caption_df["tokens"].apply(lambda x: ["<start>"] + [token for token in x] + ["<end>"])

    sentences = list(caption_df["tokens"].values)
    vocabulary = torch.load(vocab_file, map_location="cpu")

    epochs = fasttext_kwargs.get("epochs", 10)
    model = FastText(size=embed_size, min_count=1, **fasttext_kwargs)
    model.build_vocab(sentences=sentences)
    model.train(sentences=sentences, total_examples=len(sentences), epochs=epochs)
    
    word_embeddings = np.zeros((len(vocabulary), embed_size))
    
    with tqdm(total=len(vocabulary), ascii=True) as pbar:
        for word, idx in vocabulary.word2idx.items():
            if word == "<pad>" or word == "<unk>":
                continue
            word_embeddings[idx] = model.wv[word]
            pbar.update()

    np.save(output, word_embeddings)

    print("Finish writing fasttext embeddings to " + output)
Example #18
def fast_text_word_model(data, col_name):
    '''
    Creating word vectors using fastText library.
    
    Args:
        data (DataFrame): The first parameter.
        col_name (string): The second parameter.
        
    Returns:
        model (word embedding model)
    '''
    sentences_list = list()
    delimiter = '|'

    for index, row in data.iterrows():
        if row[col_name] is not None:
            # Creating a list of text
            sentences_list.extend(row[col_name].split(delimiter))
    strip_list = [item.strip() for item in sentences_list]
    # Removing duplicates entry from strip_list
    sentences = list(set(strip_list))
    # Creating FastText object
    model = FastText(size=100, window=10, min_count=1, workers=4, sg=1)
    # adding list of sentences to model
    model.build_vocab(sentences)
    total_words = model.corpus_total_words
    # Training the model to generate the vectors
    model.train(sentences, total_words=total_words, epochs=100)
    return model
Example #19
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    creates and trains a gensim fastText model:
    - sentences: list of sentences to be trained on
    - size: dimensionality of the embedding layer
    - min_count: minimum number of occurrences of a word
    for use in training
    - window: maximum distance between the current and predicted
    word within a sentence
    - negative: size of negative sampling
    - cbow: boolean to determine the training type; True is for CBOW;
    False is for Skip-gram
    - iterations: number of iterations to train over
    - seed: seed for the random number generator
    - workers: number of worker threads to train the model
    Returns: the trained model
    """
    skip = 1
    if cbow:
        skip = 0
    model = FastText(size=size, window=window, min_count=min_count,
                     workers=workers, sg=skip, negative=negative, seed=seed)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count,
                epochs=iterations)
    return model
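A minimal usage sketch for fasttext_model above, again assuming gensim 3.x and a toy tokenized corpus:

toy_sentences = [["machine", "learning", "is", "fun"],
                 ["deep", "learning", "builds", "on", "machine", "learning"]]
# cbow=True maps to sg=0, so this trains a CBOW model
model = fasttext_model(toy_sentences, size=10, min_count=1, iterations=20)
print(model.wv.most_similar("learning", topn=2))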
    def test_check_pre_train_dtypes(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        se = BaseSentence2VecModel(ft)

        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32)

        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32)

        se.word_weights = np.ones(len(se.wv.vocab), dtype=bool)
        with self.assertRaises(TypeError):
            se._check_pre_training_sanity(1, 1, 1)
        se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32)
Example #21
class FastTextMeanEmbeddingVectorizer(object):
    def __init__(self, size=100, window=5, min_count=2, workers=4, epochs=100):
        super().__init__()
        self.model = FastText(size=size, window=window, min_count=min_count, workers=workers, sg=1)
        self.size = size
        self.epochs = epochs

    def fit(self, X, y=None):
        self.model.build_vocab(sentences=X)
        self.model.train(sentences=X, total_examples=len(X), epochs=self.epochs)  # train
        return self

    def transform(self, X):
        result = []
        for words in X:
            if len(words) > 0:
                mean = np.mean([self.model.wv[w] for w in words], axis=0)
            else:
                mean = np.zeros(self.size)
            result.append(mean)
        return np.array(result)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
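A short usage sketch for the vectorizer above, assuming gensim 3.x and toy tokenized documents:

docs = [["good", "movie"], ["bad", "plot"], ["great", "acting", "good", "plot"]]
vectorizer = FastTextMeanEmbeddingVectorizer(size=20, min_count=1, epochs=10)
X = vectorizer.fit_transform(docs)
print(X.shape)  # (3, 20): one mean embedding vector per document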
 def test_check_pre_train_san_no_sv_vecs(self):
     ft = FastText(min_count=1, size=5)
     ft.build_vocab(SENTENCES)
     se = BaseSentence2VecModel(ft)
     se.sv.vectors = None
     with self.assertRaises(RuntimeError):
         se._check_pre_training_sanity(1, 1, 1)
 def test_check_pre_train_san_incos_len(self):
     ft = FastText(min_count=1, size=5)
     ft.build_vocab(SENTENCES)
     se = BaseSentence2VecModel(ft)
     se.word_weights = np.ones(20)
     with self.assertRaises(RuntimeError):
         se._check_pre_training_sanity(1, 1, 1)
    def test_cy_equal_np_ft_random(self):
        ft = FastText(size=20, min_count=1)
        ft.build_vocab(SENTENCES)

        m1 = Average(ft)
        m1.prep.prepare_vectors(sv=m1.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m1._pre_train_calls()

        from fse.models.average_inner import MAX_NGRAMS_IN_BATCH
        m1.batch_ngrams = MAX_NGRAMS_IN_BATCH
        mem1 = m1._get_thread_working_mem()
        o1 = train_average_np(m1, self.sentences[:2], m1.sv.vectors, mem1)

        m2 = Average(ft)
        m2.prep.prepare_vectors(sv=m2.sv,
                                total_sentences=len(self.sentences),
                                update=False)
        m2._pre_train_calls()
        mem2 = m2._get_thread_working_mem()

        from fse.models.average_inner import train_average_cy
        o2 = train_average_cy(m2, self.sentences[:2], m2.sv.vectors, mem2)

        self.assertEqual(o1, o2)
        self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
    def test_save_load_with_memmap(self):
        ft = FastText(min_count=1, size=5)
        ft.build_vocab(SENTENCES)
        shape = (1000, 1000)
        ft.wv.vectors = np.zeros(shape, np.float32)

        p = Path("fse/test/test_data/test_emb")
        p_vecs = Path("fse/test/test_data/test_emb_wv.vectors")
        p_ngrams = Path("fse/test/test_data/test_emb_ngrams.vectors")
        p_vocab = Path("fse/test/test_data/test_emb_vocab.vectors")

        p_not_exists = Path("fse/test/test_data/test_emb.wv.vectors.npy")

        se = BaseSentence2VecModel(ft, wv_mapfile_path=str(p))
        self.assertTrue(p_vecs.exists())
        self.assertTrue(p_ngrams.exists())
        self.assertTrue(p_vocab.exists())

        se.save(str(p.absolute()))
        self.assertTrue(p.exists())
        self.assertFalse(p_not_exists.exists())

        se = BaseSentence2VecModel.load(str(p.absolute()))
        self.assertFalse(se.wv.vectors_vocab.flags.writeable)
        self.assertEqual(shape, se.wv.vectors.shape)
        self.assertEqual((2000000, 5), se.wv.vectors_ngrams.shape)

        for p in [p, p_vecs, p_ngrams, p_vocab]:
            p.unlink()
Example #26
def train_fasttext(vec_op=utils.average):
    model = FastText()

    # build the vocabulary
    model.build_vocab(corpus_file='data/corpus/sec_corpus.txt')

    # train the model
    model.train(corpus_file='data/corpus/sec_corpus.txt',
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)

    documents_tokens = io_manager.read_documents_for_word2vec()

    if not os.path.exists('./data/fasttext'):
        os.makedirs('./data/fasttext')
    with open('./data/fasttext/document_vectors.txt', 'w',
              encoding="utf8") as f:
        for doc in documents_tokens:
            doc_vec = vec_op([model.wv[word] for word in doc[1]])
            f.write(
                doc[0].strip().replace('data_by_sect', 'data_orig_by_sect') +
                ' ' + ' '.join(map(str, doc_vec)) + '\n')

    return model
def create_new_model(model_name):
    word_vec_num_dim = 4
    model = FastText(size=word_vec_num_dim, window=3, min_count=1)
    model.build_vocab(sentences=common_texts)
    model.train(sentences=common_texts,
                total_examples=len(common_texts),
                epochs=10)
    return model
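A hypothetical call for the helper above; it trains on gensim's bundled common_texts (so the module is assumed to do `from gensim.test.utils import common_texts`), and the model_name argument is not used by the helper itself:

demo_model = create_new_model("demo")
print(demo_model.wv["computer"])  # a 4-dimensional vector, per word_vec_num_dim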
Example #28
def create_model(skip_gram, tokenized_sentences, model_path):
    # vector_size belongs in the constructor, not in train()
    model = FastText(vector_size=5, min_count=1, window=5, sg=skip_gram)
    model.build_vocab(sentences=tokenized_sentences)
    model.train(sentences=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=100)

    model.save(model_path)

    return model
Example #29
def embedding(corpus, model_type):
    if model_type == "fasttext":
        model = FastText(workers=10)
    if model_type == "word2vec":
        model = word2vec.Word2Vec(workers=10)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=len(corpus), epochs=10)
    return model
Example #30
def train_model(sentences: Collection[str], save_path=MODEL_PATH):
    model = FastText(size=VECTOR_SIZE)
    model.build_vocab(sentences=sentences)
    model.train(sentences=sentences,
                total_examples=model.corpus_count,
                epochs=50)
    model.save(save_path)
    return model