# Example #1
def save_fasttext(vocab):
    """Save a trimmed fastText KeyedVectors file restricted to *vocab*.

    Loads the full pretrained English fastText vectors, copies the vector of
    each word in *vocab* into a fresh 300-dim KeyedVectors, and saves the
    result.  Words missing from the pretrained model are counted and skipped.

    Args:
        vocab: iterable of words to keep.
    """
    model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
    # Fresh KeyedVectors holding only the vectors we actually need.
    kmodel = KeyedVectors(300)
    missing = 0
    for word in vocab:
        try:
            vec = model[word]
        except KeyError:
            # Was a bare `except:`; only an out-of-vocabulary lookup
            # should be skipped — anything else ought to propagate.
            missing += 1
            continue
        kmodel.add(word, vec, replace=True)
    print('loss word: ', missing)
    kmodel.save('../../corpora/fasttext.wv')
# Example #2
def build_vocab(filenames, vocabfile):
    """Write the unique lowercase tokens found in *filenames* to *vocabfile*.

    ``.vec`` inputs are treated as word2vec-format embedding files and
    contribute their model vocabulary; every other file is read as UTF-8
    text, lowercased, and whitespace-tokenized.  Does nothing if
    *vocabfile* already exists.  Tokens are written sorted, one per line,
    so the output file is deterministic across runs (plain set iteration
    order was not).

    Args:
        filenames: iterable of input paths (text files or ``.vec`` files).
        vocabfile: output path for the one-token-per-line vocabulary.
    """
    if os.path.isfile(vocabfile):
        print('Loading existing vocabulary from', vocabfile)
        return
    vocab = set()
    for filename in filenames:
        if filename.endswith('.vec'):
            # Embedding file: take its vocabulary directly.
            model = FastText.load_word2vec_format(filename)
            vocab.update(model.vocab.keys())
        else:
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    # update() avoids building a throwaway set per line.
                    vocab.update(line.rstrip('\n').lower().split())
    with open(vocabfile, 'w', encoding='utf-8') as f:
        f.writelines(token + '\n' for token in sorted(vocab))
# Example #3
def load_word_vectors(embeddings_path):
    """Load ``(vocab, vectors)`` for *embeddings_path*, caching as .pth/.vocab.

    Lookup order:
      1. ``<path>.pth`` + ``<path>.vocab`` — cached tensor + vocab (fast path);
      2. ``<path>.model`` — gensim-native KeyedVectors;
      3. ``<path>.vec``   — word2vec text format (wins over ``.model`` when
         both exist, matching the original statement order).

    Returns:
        (Vocab, torch.FloatTensor) of shape (num_tokens, vector_size).

    Raises:
        FileNotFoundError: when none of the candidate files exist (the
            original fell through to an UnboundLocalError on ``model``).
    """
    if os.path.isfile(embeddings_path + '.pth') and \
            os.path.isfile(embeddings_path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(embeddings_path + '.pth')
        vocab = Vocab(filename=embeddings_path + '.vocab')
        return vocab, vectors
    model = None
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + ".model")
    if os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    if model is None:
        raise FileNotFoundError(
            'no embeddings found at ' + embeddings_path +
            ' (.pth/.vocab, .model or .vec expected)')
    # Materialize once so the order written to the vocab file is guaranteed
    # to match the row order of the vector matrix below.
    list_of_tokens = list(model.vocab.keys())
    vectors = torch.zeros(len(list_of_tokens), model.vector_size)
    with open(embeddings_path + '.vocab', 'w', encoding='utf-8') as f:
        for token in list_of_tokens:
            f.write(token + '\n')
    vocab = Vocab(filename=embeddings_path + '.vocab')
    for index, word in enumerate(list_of_tokens):
        vectors[index, :] = torch.from_numpy(model[word])
    return vocab, vectors
# Hoist the repeated NaN clean-up: the original recomputed
# np.nan_to_num(question1_vectors) / (question2_vectors) for every feature
# column — 11 full passes over each array.  Result values are identical.
_q1 = np.nan_to_num(question1_vectors)
_q2 = np.nan_to_num(question2_vectors)

# Pairwise distance features between the two question embeddings.
df_feat['glove_cosine_distance'] = [cosine(x, y) for (x, y) in zip(_q1, _q2)]
df_feat['glove_cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(_q1, _q2)]
df_feat['glove_jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(_q1, _q2)]
df_feat['glove_canberra_distance'] = [canberra(x, y) for (x, y) in zip(_q1, _q2)]
df_feat['glove_euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(_q1, _q2)]
df_feat['glove_minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(_q1, _q2)]
df_feat['glove_braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(_q1, _q2)]

# Per-vector distribution-shape features.
df_feat['glove_skew_q1vec'] = [skew(x) for x in _q1]
df_feat['glove_skew_q2vec'] = [skew(x) for x in _q2]
df_feat['glove_kur_q1vec'] = [kurtosis(x) for x in _q1]
df_feat['glove_kur_q2vec'] = [kurtosis(x) for x in _q2]


# fasttext w2v distance
# Load the full pretrained English fastText vectors once at module level;
# sent2vec() below reads this global.
# NOTE(review): load_word2vec_format on the FastText wrapper is deprecated /
# removed in newer gensim (KeyedVectors.load_word2vec_format replaces it) —
# confirm against the pinned gensim version.
model = FastText.load_word2vec_format('../../corpora/wiki.en.vec')
def sent2vec(s):
    """Return the L2-normalized sum of fastText vectors for sentence *s*.

    Lowercases and tokenizes *s*, drops stopwords and non-alphabetic tokens,
    sums the fastText vectors of the remaining words, and normalizes the sum
    to unit length.  Reads the module-level ``model``, ``nltk`` and
    ``stop_words`` globals.

    Returns a zero vector of ``model.vector_size`` when no token has an
    embedding (the original divided 0 by 0 and returned NaN there).
    """
    tokens = nltk.word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]
    vecs = []
    for w in tokens:
        try:
            vecs.append(model[w])
        except KeyError:
            # Out-of-vocabulary word — skip it (was a bare `except:`).
            continue
    if not vecs:
        # No embeddable tokens: avoid the original's 0/0 -> NaN result.
        return np.zeros(model.vector_size)
    v = np.sum(vecs, axis=0)
    return v / np.sqrt((v ** 2).sum())
# Example #5
from gensim.models.wrappers import FastText

# load_word2vec_format is a classmethod that RETURNS the loaded model; the
# original called it on an empty FastText() instance and discarded the return
# value, leaving `model` untrained.  Bind the returned model instead.
model = FastText.load_word2vec_format('/home/ltp/WorkShop/fastText/model/wiki.zh.vec')
# Example #6
def load_embeddings(embeddings_path):
    """Load an embedding model from ``<path>.model`` or ``<path>.vec``.

    ``.model`` is loaded as gensim-native KeyedVectors; ``.vec`` as a
    word2vec-format text file.  When both exist, ``.vec`` wins (matching
    the original statement order).

    Raises:
        FileNotFoundError: when neither file is present (the original fell
            through and raised UnboundLocalError on ``model``).
    """
    model = None
    if os.path.isfile(embeddings_path + '.model'):
        model = KeyedVectors.load(embeddings_path + ".model")
    if os.path.isfile(embeddings_path + '.vec'):
        model = FastText.load_word2vec_format(embeddings_path + '.vec')
    if model is None:
        raise FileNotFoundError(
            'no embeddings found at ' + embeddings_path +
            ' (.model or .vec expected)')
    return model