Example #1
def main(ft_src, ft_tgt, tm_model, corpus, hyp_num=1000):
    ft_src_model = FastText.load_fasttext_format(ft_src)
    ft_tgt_model = FastText.load_fasttext_format(ft_tgt)
    tm = pickle.load(open(tm_model, 'rb'))

    for i, line in enumerate(io.open(corpus, 'r', encoding='utf-8')):
        sent_pairs = line.strip().split(' ||| ')
        source = sent_pairs[0]
        targets = merge_subseq([tok.split() for tok in sent_pairs[1:]])

        G = nx.DiGraph()
        for tokens in targets:
            G = add_edges(G, handle_tokens(tokens))

        source_vec = get_vec(ft_src_model, source)
        source_vec_proj = tm.predict(source_vec.reshape(1, -1))[0]
        source_vec_proj = unitvec(source_vec_proj)
        candidates = Counter()
        for p in sorted(nx.all_simple_paths(G, ('<s>', 0), ('</s>', 0)),
                        key=lambda path: path_cost(G, path, weight='weight')):
            target_tokens = [t[0] for t in p[1:-1]]
            target_tokens = ' '.join(target_tokens).split()  # hack
            text = ' '.join(target_tokens)
            target_vec = unitvec(get_vec(ft_tgt_model, target_tokens))
            candidates[text] = np.dot(source_vec_proj, target_vec)

        for text, score in candidates.most_common(hyp_num):
            print('{0} ||| {1} ||| Cosine={2}'.format(i, text, score))
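Note: Example #1 depends on helpers that are not shown in the snippet (get_vec, unitvec, path_cost, merge_subseq, add_edges, handle_tokens). A minimal sketch of what the first two might look like here, assuming get_vec averages the fastText vectors of the given tokens and unitvec L2-normalizes (similar to gensim.matutils.unitvec):

import numpy as np

def get_vec(ft_model, tokens):
    # hypothetical helper: average the fastText vectors of all tokens
    if isinstance(tokens, str):
        tokens = tokens.split()
    return np.mean([ft_model.wv[t] for t in tokens], axis=0)

def unitvec(vec):
    # scale a dense vector to unit L2 norm
    norm = np.linalg.norm(vec)
    return vec / norm if norm > 0 else vec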
 def _generate_word_embeddings(self, algo=EmbeddingsAlgorithm.WORD2VEC, use_morphs=False, min_count=2, dim=100):
     """Generates the word embeddings for the current language
     
     :param use_morphs: If true, will use the morphed corpus to generate embeddings. If false, will use the raw 
     corpus
     :param min_count: The minimum number of times a word must occur in order for it to be processed
     :param dim: The number of dimensions of the output vectors
     :return: The embeddings for the current languagego
     """
     _log.info('Learning word vectors...')
     if algo == EmbeddingsAlgorithm.WORD2VEC:
         if use_morphs:
             # split the corpus into morphs first, mirroring the fastText branch below
             self._split_corpus_into_morphs()
         return Word2Vec(sentences=self._language_data, size=dim, min_count=min_count)
     elif algo == EmbeddingsAlgorithm.FASTTEXT:
         if use_morphs:
             self._split_corpus_into_morphs()
             self._save_language_data('fasttext_input.txt')
             return FastText.train('fastText/fasttext', self._language_dir + 'fasttext_input.txt',
                                   output_file=self._language_dir + 'ft_model', size=dim, min_count=min_count)
         else:
             self._save_language_data('fasttext_input.txt')
             return FastText.train('fastText/fasttext', self._language_dir + 'fasttext_input.txt',
                                   output_file=self._language_dir + 'ft_model', size=dim, min_count=min_count)
     else:
         _log.error('Unknown algorithm %s' % algo)
Example #3
def fasttext_model_from_file2(file_path):
    save_file_name = os.path.join(const.GENERATED_DATA_DIR, const.FASTTEXT_PREFIX + file_path.split('/')[-1])
    try:
        model = gensimFastText.load_fasttext_format(save_file_name + '.bin', encoding='utf-8')
        logging.info('model loaded:' + save_file_name)
    except FileNotFoundError:
        fastext_bin_path = os.path.join(const.ROOT_DIR, 'fasttext/fastText')
        model = gensimFastText.train(fastext_bin_path, file_path, min_count=1)
    return model.wv
Example #4
def computeCorpusSims(name, lg):
	# pick the fastText model matching the requested language
	if lg == "fr":
		model = FastText.load_fasttext_format(fr)
	else:
		model = FastText.load_fasttext_format(en)
	data = pd.read_csv(name, sep='\t')
	texts = data["text"]
	titles = data["title"]
	sims = []
	for i in range(len(texts)):
		sims.append(meanSim(ensText(texts[i], model), MWV(titles[i], model)))
	sims.sort()
	return sims
Example #5
def main(ft_src, ft_tgt, corpus_src, corpus_tgt, out_fname):
    ft_src_model = FastText.load_fasttext_format(ft_src)
    ft_tgt_model = FastText.load_fasttext_format(ft_tgt)

    X = get_vec(ft_src_model, corpus_src)
    y = get_vec(ft_tgt_model, corpus_tgt)
    assert X.shape == y.shape, 'mismatched shapes'

    lr = LinearRegression()
    lr.fit(X, y)

    with io.open(out_fname, 'wb') as out:
        pickle.dump(lr, out, pickle.HIGHEST_PROTOCOL)
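Example #5 fits a linear mapping (a translation matrix) from the source embedding space to the target one and pickles it; Example #1 later loads that file as tm and projects source vectors with tm.predict. A short round-trip sketch with a hypothetical filename:

import pickle
import numpy as np

with open('translation_matrix.pkl', 'rb') as f:  # hypothetical out_fname
    tm = pickle.load(f)

src_vec = np.random.rand(1, 300)    # stand-in for a source-language sentence vector
projected = tm.predict(src_vec)[0]  # now lives in the target embedding space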
Example #6
def prepare_word_emb_matrices(experiment):
    """
    Initializes word embeddings for each word in training vocabulary
    from pretrained or custom-trained embedding files
    :param experiment: the ID of the word embedding file
    :return: the training embedding matrix
    """

    with open("public_data/stats/stats_train.pkl", 'rb') as stats:
        stats = pickle.load(stats)
    vocab = stats["VOCAB"]
    stops = [word.lower() for word in set(stopwords.words('english'))]
    vocab = vocab + stops

    if experiment == "RANDOM":
        word_embs = np.random.uniform(low=-1.0,
                                      high=1.0,
                                      size=(len(vocab),
                                            PARAMS["SIZE"])).astype("float32")

    else:
        word_embs = []
        count_unk = 0
        count_kn = 0

        if experiment == "5":
            emb_model = KeyedVectors.load_word2vec_format(
                "public_data/models/experiment_5/embeddings_5.bin",
                binary=True)
        elif experiment == "6":
            emb_model = Word2Vec.load(
                "public_data/models/experiment_6/embeddings_6")

        elif experiment in ["7", "8"]:
            emb_model = FastText.load_fasttext_format(
                "public_data/models/experiment_%s/embeddings_%s.bin" %
                (experiment, experiment))
        for word in vocab:
            if word in emb_model:
                word_embs.append(emb_model[word])
                count_kn += 1
            else:
                word_embs.append(
                    np.random.uniform(low=-1.0, high=1.0, size=PARAMS["SIZE"]))
                count_unk += 1

        word_embs = np.array(word_embs).astype("float32")
        print(count_unk / (count_kn + count_unk))

    pad = np.zeros(shape=PARAMS["SIZE"]).astype("float32")
    unk = np.random.uniform(low=-1.0, high=1.0,
                            size=PARAMS["SIZE"]).astype("float32")
    word_embs = np.insert(word_embs, 0, unk, axis=0)  #id 1
    word_embs = np.insert(word_embs, 0, pad, axis=0)  #id 0

    with open("public_data/embeddings/word_embeddings_%s.pkl" % experiment,
              'wb') as out:
        pickle.dump(word_embs, out, protocol=4)

    return word_embs
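Layout note for Example #6: after the two np.insert calls, row 0 of the returned matrix is the padding vector and row 1 the unknown-word vector, so the embedding of the i-th vocabulary entry sits at row i + 2. A small indexing sketch (assuming the stats pickle and embedding files are in place):

word_embs = prepare_word_emb_matrices("RANDOM")
pad_vec = word_embs[0]         # all zeros, reserved for padding (id 0)
unk_vec = word_embs[1]         # random vector for unknown words (id 1)
first_word_vec = word_embs[2]  # embedding of the first vocabulary entry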
Example #7
 def __init__(self, model_path, model_type='fasttext', **kwarg):
     if model_type == "fasttext":
         self._model = FastText.load_fasttext_format(model_path)
     elif model_type == "word2vec":
         self._model = Word2Vec.load_word2vec_format(model_path)
     else:
         raise NotImplementedError("other model is not supported")
Example #8
 def __init__(self):
     # add data imports
     self.data_df = pd.read_csv("sarcasm/train-balanced-sarcasm.csv")
     self.data_split = data_split
     # self.stop_words = ['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
     #                    'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or',  'who', 'as', 'from', 'him', 'each', 'the', 'themselves',
     #                    'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their',
     #                    'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
     #                    'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'now', 'under', 'he', 'you', 'herself', 'has', 'just',
     #                    'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
     #                    'further', 'was', 'here', ]
     self.stop_words = []
     print("Loading Vectors")
     self.vec_model = FastText.load_fasttext_format(
         '../../Science_Fair_Project/vectors/cc.en.300.bin/cc.en.300.bin')
     # self.vec_model = {}
     print("Completed Loading Vectors")
     # ipdb.set_trace()
     self.data_df = self.data_df[["comment", "parent_comment", "label"]]
     self.data_df = shuffle(self.data_df)
     self.data_df["label"] = self.data_df["label"].astype(int)
     self.data_df["comment"] = self.data_df["comment"].astype(str)
     self.data_df["comment"] = self.data_df["comment"].str.lower()
     self.data_df["comment"] = self.data_df["comment"].str.strip(
         to_strip=".!?,")
     self.data_df["comment"] = self.data_df["comment"].str.split()
     self.data_df["parent_comment"] = self.data_df["parent_comment"].astype(
         str)
     self.data_df["parent_comment"] = self.data_df[
         "parent_comment"].str.lower()
     self.data_df["parent_comment"] = self.data_df[
         "parent_comment"].str.strip(to_strip=".!?,")
     self.data_df["parent_comment"] = self.data_df[
         "parent_comment"].str.split()
Example #9
def get_model(my_corpus, f_embeddings):
    """Get the appropriate model for the corpus"""

    global current_model

    f_output = ""
    if my_corpus == "Corola.300.20":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings,
                                                          binary=False)
        f_output = "Results/Corpus_Similarities/CoRoLa_300_20-cosine_similarity.txt"

    elif my_corpus == "Corola.400.5":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings,
                                                          binary=False)
        f_output = "Results/Corpus_Similarities/CoRoLa_400_5-cosine_similarity.txt"

    elif my_corpus == "Facebook":
        current_model = FastText.load_fasttext_format(f_embeddings)
        f_output = "Results/Corpus_Similarities/fastText-cosine_similarity.txt"

    elif my_corpus == "CONLL2017-Word2vec":
        current_model = KeyedVectors.load_word2vec_format(f_embeddings,
                                                          binary=True)
        f_output = "Results/Corpus_Similarities/CoNLL-2017-cosine_similarity.txt"
    return f_output
Example #10
    def load_embeddings_file(file_name, embedding_type, lower=True):
        if not os.path.isfile(file_name):
            print(file_name, "does not exist")
            return {}, 0

        if embedding_type == "word2vec":
            model = KeyedVectors.load_word2vec_format(file_name,
                                                      binary=True,
                                                      unicode_errors="ignore")
            words = model.index2entity
        elif embedding_type == "fasttext":
            model = FastText.load_fasttext_format(file_name)
            words = [w for w in model.wv.vocab]
        else:
            print("Unknown Type")
            return {}, 0

        if lower:
            vectors = {word.lower(): model[word] for word in words}
        else:
            vectors = {word: model[word] for word in words}

        if "UNK" not in vectors:
            unk = np.mean([vectors[word] for word in vectors.keys()], axis=0)
            vectors["UNK"] = unk

        return vectors, len(vectors["UNK"])
    def setup(self, log=True):
        hp = HP(base_dir=self.dir)
        if self.conf.trainer_type == "DST":
            for attr, val in self.conf.hyperparameter.items():
                if val is not None:
                    setattr(hp, attr, val)
            self.trainer = DST(hp=hp)
        elif self.conf.trainer_type == "CDST":
            for attr, val in self.conf.hyperparameter.items():
                if val is not None:
                    setattr(hp, attr, val)
            self.trainer = CDST(hp=hp)
        elif self.conf.trainer_type == "FT_DST":
            for attr, val in self.conf.hyperparameter.items():
                if attr != "embedding_dim":
                    if val is not None:
                        setattr(hp, attr, val)
            if not self.fast_text_model:
                print("Setting fast_text_model...", end="")
                self.fast_text_model = FastText.load_fasttext_format(
                    hp.fast_text_model_file)
                print("Ok.")
            self.trainer = FT_DST(fast_text_model=self.fast_text_model, hp=hp)
        else:
            raise ValueError("Unknown Trainer")

        self.log = log
        if log:
            self.tee = Tee(str(self.trainer.hp.dir / "log.txt"))
            sys.stdout = self.tee
 def get_embeddings_authors(self, authors_pondered_tokens: Dict[str, List[Tuple[str, float]]]):
     self.model_embeddings = FastTextWrapper.load_fasttext_format(TextProcessings.FAST_TEXT_PATH)
     authors_embeddings = []
     print('computing embeddings')
     with open('300_authors_embeddings.txt', 'w', encoding='utf-8') as f:
         for name, tokens_weights in authors_pondered_tokens.items():
             weighted_avg_author = np.float32([0] * TextProcessings.WORD_DIM)
             print(name, file=f)
             weights, tokens = [], []
             unique_tokens = set()
             for (token, weight) in tokens_weights:
                 if token in self.model_embeddings.wv.vocab and token not in unique_tokens:
                     weights.append(weight)
                     tokens.append(token)
                     unique_tokens.add(token)
                     
             # normalize the weights so they form a probability distribution
             weights = normalize(np.float32([weights]), norm='l1')[0]
             # print(weights, file=f)
             # print(weights[0])
             for i, token in enumerate(tokens):
                 weighted_avg_author += weights[i] * self.model_embeddings.wv[token]
                 print(token, weights[i] , file=f)
             authors_embeddings.append((name, weighted_avg_author))
         pickle.dump(authors_embeddings, open(TextProcessings.AUTHORS_EMBEDDINGS_FILE, "wb"))
def load_language_model(language_models_path, language_model_name, texts, self_train: bool, save_self_model: bool,
                        self_model_name):
    """
    загрузка/обучение языковой модели
    :param language_models_path: директория с яз. моделями
    :param language_model_name: имя яз. модели
    :param texts: корпус текстов (если обучаем свою модель)
    :param self_train: обучаем ли свою
    :param save_self_model: сохранять ли свою обученную
    :param self_model_name: имя своей модели
    :return:
    """
    if self_train:
        language_model = Word2Vec(texts, min_count=0, size=300)
        if save_self_model:
            language_model.save(os.path.join(language_models_path, self_model_name + '.w2v'))
    else:
        if language_model_name[-3:] == 'bin' and 'fasttext' in language_model_name.lower():
            # fastText binaries need the dedicated loader
            language_model = FastText.load_fasttext_format(os.path.join(language_models_path, language_model_name))
        elif language_model_name[-3:] == 'bin':
            language_model = gensim.models.Word2Vec.load(os.path.join(language_models_path, language_model_name))
        else:
            language_model = Word2Vec.load(os.path.join(language_models_path, language_model_name))

    return language_model
Example #14
    def __init__(self,
                 embedding_file,
                 sequences,
                 seq_type,
                 k_mer,
                 restricted_kmer=False,
                 use_idf=False,
                 norm=None):
        '''
            Class constructor
        '''
        SequenceKmerRep.__init__(self,
                                 sequences,
                                 seq_type,
                                 k_mer,
                                 restricted_kmer=restricted_kmer,
                                 use_idf=use_idf,
                                 norm=norm,
                                 delete_empty_col=True)
        print('loading embedding..')

        if embedding_file.split('.')[-1] == 'txt':
            self.model = KeyedVectors.load_word2vec_format(embedding_file,
                                                           binary=False)
        else:
            self.model = FastText.load_fasttext_format(embedding_file)

        self.emb_trans = [self.model[x.lower()] for x in self.vocab]
        # summation vector
        self.embeddingX = self.X.dot(self.emb_trans)
        self.emb_kmer_concat = np.concatenate(
            (self.embeddingX, self.X.toarray()), axis=1)
Example #15
  def read_model(self, tipe):

    start = time.time()

    if tipe == 'own-model1':
      print('Loading {} '.format(str(tipe)))
      embeddings_path = 'word_embedding_cbow.bin'
      word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False, unicode_errors="ignore")
    elif tipe == 'own-model2':
      print('Loading {} '.format(str(tipe)))
      embeddings_path = '/home/pras/Embeddings/model_arif'
      word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False, unicode_errors="ignore")
    elif tipe == 'own-model3':
      print('Loading {} '.format(str(tipe)))
      embeddings_path = '/home/adrian/new_cnn/modelapik_cbows.bin'
      word2vec_model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False, unicode_errors="ignore")
    elif tipe == 'bojanowski':
      print('Loading {} model'.format(str(tipe)))
      embeddings_path = '/home/pras/Embeddings/wiki.bin'
      word2vec_model = FastText.load_fasttext_format(embeddings_path)
    else:
      print('Error Embeddings')

    end = time.time()
    print('Loading {} done in {} Seconds'.format(str(tipe), (end-start)))
    print('')

    self.word2vec = word2vec_model
    self.embed_dim = 300

    return word2vec_model
 def load_embeddings(self, file_path):
     # Embeddings must be in fastText format, either binary (.bin) or text (.vec)
     print('Loading embeddings...')
     if file_path.endswith('.bin'):
         from gensim.models.wrappers import FastText
         embeddings = FastText.load_fasttext_format(file_path)
     else:
         pre_trained_embeddins_dict = dict()
         with open(file_path) as f:
             _ = f.readline()
             for line in f:
                 token, *embedding = line.split()
                 embedding = np.array(
                     [float(val_str) for val_str in embedding])
                 if token in self.token_dict:
                     pre_trained_embeddins_dict[token] = embedding
         print('Done reading')
         pre_trained_std = np.std(list(pre_trained_embeddins_dict.values()))
         embeddings = pre_trained_std * np.random.randn(
             len(self.token_dict), len(embedding))
         for idx in range(len(self.token_dict)):
             token = self.token_dict.idx2tok(idx)
             if token in pre_trained_embeddins_dict:
                 embeddings[idx] = pre_trained_embeddins_dict[token]
     return embeddings
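The text branch of this loader expects the usual word2vec/fastText .vec layout: a header line with vocabulary size and dimensionality, followed by one line per token containing the token and its vector components. A tiny writer sketch that produces such a file (hypothetical values):

rows = {'hello': [0.1, 0.2, 0.3], 'world': [0.0, -0.1, 0.4]}  # hypothetical vectors
with open('tiny.vec', 'w') as f:
    f.write('{} {}\n'.format(len(rows), 3))  # header: vocab size and dimension
    for token, values in rows.items():
        f.write(token + ' ' + ' '.join(str(v) for v in values) + '\n')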
Example #17
 def __init__(self, lang='eng', dim=200):
    
     self.dim = dim
     # self.file_path = 'ko.bin'
     self.file_path = 'wiki.ko.bin'
     
     self.model = FastText.load_fasttext_format(self.file_path)
Example #18
    def load_word_vectors(wordchar2vector_path, word2vector_path):
        """
        Фабрика для получения удобного доступа к обеим моделям встраивания слов - посимвольной
        морфологической и пословной синтактико-семантической.
        
         :param wordchar2vector путь к файлу с векторами слов в модели посимвольного встраивания.
         :param word2vector_path путь к файлу с векторами слов в word2vec, fasttext или glove моделях
         
         :return экземпляр класса, предоставляющий метод-индексатор и возвращающий объединенный
          вектор встраивания для слова. 
        """

        print('Loading the wordchar2vector model {} '.format(wordchar2vector_path), end='')
        # Load the pre-built word vectors for the wordchar2vector
        # embedding model (see wordchar2vector.py)
        wc2v = gensim.models.KeyedVectors.load_word2vec_format(wordchar2vector_path, binary=False)
        wc2v_dims = len(wc2v.syn0[0])
        print('wc2v_dims={0}'.format(wc2v_dims))

        if os.path.basename(word2vector_path).startswith('fasttext'):
            print('Loading FastText model {} '.format(word2vector_path), end='')
            WordEmbeddings._flush_print()
            w2v = FastText.load_fasttext_format(word2vector_path)
            w2v_dims = w2v.vector_size
            print('w2v_dims={0}'.format(w2v_dims))
            return WordEmbeddings_FastText(wc2v, wc2v_dims, w2v, w2v_dims)
        else:
            print('Loading w2v model {} '.format(word2vector_path), end='')
            WordEmbeddings._flush_print()
            w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vector_path, binary=not word2vector_path.endswith('.txt'))
            w2v_dims = len(w2v.syn0[0])
            print('w2v_dims={0}'.format(w2v_dims))
            return WordEmbeddings_W2V(wc2v, wc2v_dims, w2v, w2v_dims)
Example #19
def evaluatePool(corpus_words, pool, POS_weights):
    print(str(datetime.now()) + ' start load_fasttext_format')
    embedding = FastText.load_fasttext_format('embedding.bin', encoding='utf-8')
    print(str(datetime.now()) + ' finish load_fasttext_format')

    for sentence in pool:
        # lexic
        n = sentence._words
        if n == 0:
            continue
        oovw = (sentence._oovw / n)
        rep = 1 / (1 + sentence._repetitions / n)
        lexic = (oovw + rep) / 2

        # gramatic
        gramatic = 0
        for pos, pos_weight in POS_weights.items():
            gramatic += pos_weight * (getattr(sentence,'_POS_' + pos)/sentence._words)
        gramatic = gramatic / len(POS_weights)
        # print('lexic: ' + str(lexic) + ', gramatic: ' + str(gramatic))

        # semantics
        synonyms = 0
        for pool_word in sentence.text.split():
            for corpus_word in corpus_words:
                try:
                    synonyms += bool(embedding.similarity(pool_word, corpus_word) >= similarity_threshold)
                except:
                    pass
        semantic = 1 - (n / (n + synonyms))

        sentence.score = w * lexic + w * gramatic + w * semantic
        session.commit()
Example #20
def get_fasttext_model(dataset="tweet", model_type="bin"):
    w2v_rootdir = os.path.join(res_basedir, "word2vecs")
    tweets_rootdir = os.path.join(resources_rootdir, "tweet_w2v",
                                  "tweet_fasttext")
    ds_rootdir = os.path.join(resources_rootdir, "ds_aa", "fasttext_embs")
    amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs")
    # amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs_50_eps")
    if dataset == "tweet":
        model_path = os.path.join(tweets_rootdir,
                                  "tweet_fasttext.{}".format(model_type))
    elif dataset == "ds":
        model_path = os.path.join(ds_rootdir,
                                  "ds_fasttext.{}".format(model_type))
    elif dataset == "amazon":
        model_path = os.path.join(amazon_rootdir,
                                  "amazon_fasttext.{}".format(model_type))
    elif dataset == "wiki":
        model_path = os.path.join(w2v_rootdir,
                                  "wiki.en/wiki.en.{}".format(model_type))
    elif dataset == "simple":
        model_path = os.path.join(
            w2v_rootdir, "wiki.simple/wiki.simple.{}".format(model_type))
    print "fasttext model: ", model_path

    if model_type == "bin":
        model = FastText.load_fasttext_format(model_path)
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    return model
Example #21
def load_ft():
    w2v_model = FastText.load_fasttext_format('../embedding/cc.zh.300.bin')
    print("Finish Load")
    dim = len(w2v_model['好'])
    fw1 = codecs.open("../embedding/embedding_all_ftoov_%d.txt" % (dim), 'w', encoding='utf-8')
    vocab_dict = pickle.load(open('../data/vocabulary.pkl', 'rb'))
    word_list = ['unk' for i in range(len(vocab_dict))]
    for k, v in vocab_dict.items():
        word_list[v] = k
    # print(word_list)
    embedding_matrix = np.zeros((len(vocab_dict), dim))
    miss = 0
    for index, w in enumerate(word_list):
        if index % 1000 == 0:
            print(index)
        try:
            # in_set.add(w)
            embeds = np.asarray(w2v_model[w])
        except:
            w2v_model.most_similar(w)
            miss += 1
            print(w)
            embeds = np.random.uniform(-0.25, 0.25, dim)
        embedding_matrix[index] = embeds

    fw1.write(str(len(word_list)) + ' ' + str(dim)+'\n')
    for index, w in enumerate(word_list):
        fw1.write(w)
        for i in embedding_matrix[index]:
            fw1.write(' ' + str(i))
        fw1.write('\n')
    pickle.dump(vocab_dict, open('../data/vocabulary2.pkl', 'wb'))
    print(len(word_list))
    print("miss:%d" % miss)
Example #22
def train_fasttext(data_dir='./data',
                   dim=300,
                   epoch=5,
                   ft_model='skipgram',
                   ft_lr=0.05,
                   ft_window=5):

    data_dir = Path(data_dir)

    import fasttext

    model = fasttext.train_unsupervised(
        str(data_dir / 'ocb_and_wikisource.w2v_tokens.txt'),
        model=ft_model,
        lr=ft_lr,  # learning rate [0.05]
        dim=dim,  # size of word vectors [100]
        ws=ft_window,  # size of the context window [5]
        epoch=epoch  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    from gensim.models.wrappers import FastText

    ft_model = FastText.load_fasttext_format(
        str(data_dir / 'ocb_and_wikisource.fasttext.bin'))

    ft_model.wv.save_word2vec_format(data_dir /
                                     'ocb_and_wikisource.fasttext.w2v.txt')

    logger.info('done')
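The .w2v.txt file written at the end of Example #22 is plain word2vec text format, so it can later be reloaded without the fastText binary (subword information is lost in this export). A short sketch:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('./data/ocb_and_wikisource.fasttext.w2v.txt', binary=False)
print(wv.most_similar('word', topn=5))  # hypothetical query token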
Example #23
def train_fasttext(hf_dataset, output_dir):
    """

    Run with: $ ./data_cli.py train_fasttext paperswithcode_aspects ./output

    :return:
    """

    tokens_fp = os.path.join(output_dir, 'tokens.txt')
    fasttext_bin_fp = os.path.join(output_dir, 'fasttext.bin')
    fasttext_w2v_fp = os.path.join(output_dir, 'fasttext.w2v.txt')

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')

    logger.info(f'Documents loaded: {len(docs_ds):,}')

    # Tokenized text
    doc_delimiter = '\n'
    token_delimiter = ' '
    tokens_count = 0

    with open(tokens_fp, 'w') as f:
        for doc in docs_ds:
            # Extract plain text
            text = doc['title'] + ': ' + doc['abstract']

            for token in gensim.utils.simple_preprocess(text,
                                                        min_len=2,
                                                        max_len=15):
                f.write(token + token_delimiter)
                tokens_count += 1
            f.write(doc_delimiter)

    logger.info(f'Total tokens: {tokens_count:,}')

    # Train actual fasttext model
    logger.info('Training fastText model...')

    model = fasttext.train_unsupervised(
        tokens_fp,
        model='skipgram',
        lr=0.05,  # learning rate [0.05]
        dim=300,  # size of word vectors [100]
        ws=5,  # size of the context window [5]
        epoch=5  # number of epochs [5]
        # thread            # number of threads [number of cpus]
    )
    model.save_model(fasttext_bin_fp)

    del model

    ft_model = FastText.load_fasttext_format(fasttext_bin_fp)
    ft_model.wv.save_word2vec_format(fasttext_w2v_fp)

    logger.info(f'Output saved to: {fasttext_w2v_fp}')

    logger.info('Done')
Example #24
def load_embeddings_file(file_name, lower=False, type=None):
    if type is None:
        file_type = file_name.rsplit(".",1)[1] if '.' in file_name else None
        if file_type == "p":
            type = "pickle"
        elif file_type == "bin":
            type = "word2vec"
        elif file_type == "vec":
            type = "fasttext"
        else:
            type = "word2vec"

    if type == "word2vec":
        model = KeyedVectors.load_word2vec_format(file_name, binary=True, unicode_errors="ignore")
        words = model.index2entity
    elif type == "fasttext":
        model = FastText.load_fasttext_format(file_name)
        words = [w for w in model.wv.vocab]
    elif type == "pickle":
        with open(file_name,'rb') as fp:
            model = pickle.load(fp)
        words = model.keys()

    if lower:
        vectors = {word.lower(): model[word] for word in words}
    else:
        vectors = {word: model[word] for word in words}

    if "UNK" not in vectors:
        unk = np.mean([vectors[word] for word in vectors.keys()], axis=0)
        vectors["UNK"] = unk

    return vectors, len(vectors["UNK"])
Example #25
 def __init__(self, path, model, max_length, word_dim):
     filePath = open(path)
     self.model = FastText.load_fasttext_format(model)
     self.data = list(csv.reader(filePath))
     self.encoder = preprocessing.LabelEncoder()
     self.encoder.fit(list(map(lambda x: x[1], self.data)))
     self.max_length = max_length
     self.word_dim = word_dim
Example #26
def load_test():
    model = fasttext.load_model("/home/zhoutong/nlp/data/cc.en.300.bin")
    vec1 = model.get_word_vector("china")
    vec2 = model.get_word_vector("america")
    similarity(vec1,vec2)

    sen_vec1 = model.get_sentence_vector("I come from china")
    sen_vec2 = model.get_sentence_vector("I am chinese")
    np.mean([model.get_word_vector(i) for i in ["I", "am", "chinese"]], axis=0)  # manual average of the word vectors
    similarity(sen_vec1,sen_vec2)

    gensim_model = FastText.load_fasttext_format('/home/zhoutong/nlp/data/cc.en.300.bin') # 10min
    gensim_model.most_similar('teacher')
    gensim_model.similarity('teacher', 'teaches')
    gensim_model.init_sims(replace=True)
    gensim_model.save('/home/zhoutong/nlp/data/cc.en.300.bin.gensim')
    gensim_model_new = FastText.load('/home/zhoutong/nlp/data/cc.en.300.bin.gensim',mmap='r')
Example #27
 def __init__(self, *args, hp=None, fast_text_model=None, **kwargs):
     hp = hp or HP()
     hp.fast_text = True
     hp.embedding_dim = 300
     super().__init__(*args, hp=hp, **kwargs)
     self.decode = np.vectorize(lambda x: x.decode("utf-8"))
     assert fast_text_model or self.hp.fast_text_model_file
     self.fast_text_model = fast_text_model or FastText.load_fasttext_format(
         self.hp.fast_text_model_file)
Example #28
def provide_fasttext_model():
    assure_fasttext_model_exists()

    print('providing fasttext model ...')
    model = FastText.load_fasttext_format(
        os.path.join(FASTTEXT_MODEL_BASE_DIR, FASTTEXT_MODEL_BIN_NAME))
    print('succesfully provided fasttext model')

    return model.wv
Example #29
def load_fasttext_model(path):
    """ Load a pre-trained FastText model.

    :param path: path of the file of the pre-trained FastText model
    :return: a pre-trained FastText model
    :type path: str
    :rtype: gensim.models.fasttext.FastText
    """
    return FastText.load_fasttext_format(path)
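A usage sketch for the helper in Example #29 (hypothetical model path): the loaded model answers vector and similarity queries through its .wv attribute, including vectors for out-of-vocabulary words assembled from character n-grams:

model = load_fasttext_model('cc.en.300.bin')     # hypothetical path
vec = model.wv['teacher']                        # in-vocabulary lookup
oov_vec = model.wv['teacherly']                  # composed from character n-grams
print(model.wv.similarity('teacher', 'teaches'))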
def get_fasttext_embed(list_of_words):
    words_embed_dict = dict()
    ff_model = FastText.load_fasttext_format(fast_text_file)
    for w in list_of_words:
        try:
            words_embed_dict[w] = ff_model[w]
        except:
            pass
    return words_embed_dict
Example #31
 def load_embeddings(self, file_path):
     # Embeddings must be in fastText format, either binary (.bin) or text (.vec)
     print('Loading embeddings...')
     if file_path.endswith('.bin'):
         from gensim.models.wrappers import FastText
         embeddings = FastText.load_fasttext_format(file_path)
     else:
         from gensim.models import KeyedVectors
         embeddings = KeyedVectors.load_word2vec_format(file_path)
     return embeddings
Example #32
def printvec(train_path, vec_path):
	# 1. Create a preprocessed (lowercased and cleaned) copy of the training file
	print('\nPreprocessing training data...')
	tmp_path = train_path[:-4] + '_cleaned.txt'
	with open(train_path) as f_in:
		with open(tmp_path, 'w') as f_out:
			for line in f_in:
				text = line.lower()
				text = re.sub(r"[^a-z ]", "", text)
				text = re.sub(r"[ ]+", " ", text)
				f_out.write(text)
	train_path = tmp_path

	# 2. Build the vocabulary
	print('\nMake dic...')
	s = set()
	with open(train_path) as f:
		for line in f:
			text = line.lower()
			text = text.replace("\n", " ").replace('\r', '')
			text = re.sub(r"[ ]+", " ", text)
			text_list = text.split(" ")
			s.update(set(text_list))

	words = sorted(list(s))
	len_words = len(words)
	word_indices = dict((c, i + 1) for i, c in enumerate(words))
	indices_word = dict((i + 1, c) for i, c in enumerate(words))
	# note: index 0 is reserved for padding and is never used

	# 3. Train fastText
	myft_path = '/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
	ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
	ft_model.save(today_str + 'ft.model')

	# 4. Write the word vectors to a file
	with open(vec_path, 'w') as file:
		for i in range(len_words):
			if i != 0:
				word = indices_word[i]
				if word in ft_model.wv.vocab:
					vec = ft_model[word]
				else:
					vec = np.zeros((vec_size), dtype=np.float32)
				output = word + ' > ' + str(vec) + '\n'
				file.write(output)

	# 5. Reset the model weights
	ft_model.reset_weights()
Example #33
# Return the dictionary ID for a word
def search_word_indices(word):
    if word in word_indices:
        return word_indices[word]
    else:
        return word_indices["#OTHER"]



# Train fastText
vec_size=100

print('Learning fasttext...')

myft_path='/home/tamaki/M1/Keras/mine2017_8to11/fastText/fasttext'
ft_model = FastText.train(ft_path=myft_path, corpus_file=train_path, size=vec_size, window=5, min_count=0)
ft_model.save(today_str+'ft.model')
# FastText supports two training modes, cbow and skipgram; the default is cbow

print_time('FastText end')


# Get the word2vec vector
# For unknown words, just use a zero vector like [0, 0, 0, ..., 0] for now
# Unknown words are collected in a set and written to a file later
# Needs improvement
KeyError_set=set()
def get_ft_vec(word):
    if word in ft_model.wv.vocab:
        return ft_model[word]
    else: