Example #1
 def load_embeddings(self, file_path):
     # Embeddings must be in fastText format, either .bin or .vec
     print('Loading embeddings...')
     if file_path.endswith('.bin'):
         from gensim.models.wrappers import FastText
         embeddings = FastText.load_fasttext_format(file_path)
     else:
         from gensim.models import KeyedVectors
         embeddings = KeyedVectors.load_word2vec_format(file_path)
     return embeddings
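Note that gensim.models.wrappers was removed in gensim 4.x, so the wrapper import above only works on gensim 3.x. A minimal sketch of an equivalent loader for the current API (assuming gensim >= 4.0 is installed; the function name is illustrative, not part of the original example):

# Sketch only: the same logic on gensim >= 4.0, where the wrappers module no longer exists.
def load_embeddings_gensim4(file_path):
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        # native fastText binary: keeps subword information, so OOV words still get vectors
        from gensim.models.fasttext import load_facebook_model
        embeddings = load_facebook_model(file_path)
    else:
        # plain text .vec file in word2vec format
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(file_path)
    return embeddings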
    def from_gensim(self, fname, normalizeVectors=True, fvocab=None):
        import re
        import sys

        import gensim
        from gensim.models.wrappers import FastText
        print("Loading embeddings file", fname, file=sys.stderr)
        if re.search(r'bin$', fname):
            model = FastText.load_fasttext_format(fname)
        elif re.search(r'bin$', fname):
            # NOTE: this branch tests the same pattern as the one above,
            # so it is unreachable as written; kept for reference.
            model = gensim.models.KeyedVectors.load_word2vec_format(
                fname,
                binary=True,
                encoding='utf-8',
                unicode_errors='ignore',
                fvocab=fvocab
            )
        elif re.search(r'vec$', fname):
            model = gensim.models.KeyedVectors.load_word2vec_format(
                fname, binary=False, fvocab=fvocab)
        else:
            model = gensim.models.KeyedVectors.load(fname)

        if hasattr(model, "wv"):
            model = model.wv

        try:
            self.vecs = model.vectors
        except AttributeError:
            self.vecs = model.syn0

        self.id2label = [] + model.index2word
        for w in model.vocab:
            self.labels[w] = {
                "id": model.vocab[w].index,
                "freq": model.vocab[w].count
            }

        if normalizeVectors:
            self.vecs = normalize(self.vecs)
        self.build()
        return self
Example #3
import numpy as np
import pandas as pd
from gensim.models.wrappers import FastText


def Import(lang, path):
    Target_lang = lang
    # print(len(model.words))
    word_list = pd.read_csv('wordlists/religion_Translations/' + lang + '.csv')
    # model = FastText.load_fasttext_format("D:\Data\Word Embedding\Vectors/" + path)

    ##########  Import Word Lists  ##################

    word_list = np.array(word_list)
    occupations = []
    Islam = []  # Islam-related word list (column 0 of the CSV)
    Christianity = []  # Christianity-related word list (column 1)
    Terrorism = []  # terrorism-related word list (column 2)
    LenOfIslam = 18  # number of words in the Islam list
    LenOfChristianity = 15  # number of words in the Christianity list
    LenOfNatrual = 48  # number of words in the terrorism list
    print(word_list[:, :])
    print(len(word_list[:, 1]))

    for i in range(0, LenOfIslam):
        Islam.append(word_list[i][0])
    print(Islam)
    for i in range(0, LenOfChristianity):
        Christianity.append(word_list[i][1])
    for i in range(0, LenOfNatrual):
        Terrorism.append(word_list[i][2])
    print(Terrorism)

    print(
        '''This is the Translated result of Terrorism word list for this language'''
    )
    print("Import Loading")
    model = FastText.load_fasttext_format(r"D:\Data\Word Embedding\Vectors/" + path,
                                          encoding='utf-8')
    # model = KeyedVectors.load_word2vec_format("D:\Data\Word Embedding\Vectors/" + path, encoding='utf-8')
    return model, Islam, Christianity, Terrorism
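A minimal usage sketch; the language name and vector file below are hypothetical placeholders and assume a matching CSV exists under wordlists/religion_Translations/:

# Hypothetical call: loads the vectors plus the three translated word lists.
model, islam_words, christian_words, terrorism_words = Import('English', 'cc.en.300.bin')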
Example #4
import codecs

import tqdm
from gensim.models.wrappers import FastText


def export_to_file(path_to_model, output_file):
    output = codecs.open(output_file, 'w+', 'utf-8')

    print("Converting to text")
    model = FastText.load_fasttext_format(path_to_model)
    vocab = model.wv.vocab
    # output.write(str(len(vocab)) + " " + str(len(model[vocab[0]])))
    header = False
    for mid in tqdm.tqdm(vocab):
        if not header:
            output.write(str(len(vocab)) + " " + str(len(model[mid])) + "\n")
            header = True

        vector = list()
        for dimension in model[mid]:
            vector.append(str(dimension))
        # line = { "mid": mid, "vector": vector  }
        vector_str = " ".join(vector)
        line = mid + " " + vector_str
        # line = json.dumps(line)
        output.write(line + "\n")
    output.close()
    print("Done!")
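A minimal usage sketch; both file paths are hypothetical placeholders:

# Converts a native fastText .bin model into word2vec text format.
export_to_file('wiki.simple.bin', 'wiki.simple.vec')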
def get_fasttext_model(dataset="tweet", model_type="bin"):
    w2v_rootdir = os.path.join(res_basedir, "word2vecs")
    tweets_rootdir = os.path.join(resources_rootdir, "tweet_w2v", "tweet_fasttext")
    ds_rootdir = os.path.join(resources_rootdir, "ds_aa", "fasttext_embs")
    amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs")
    # amazon_rootdir = os.path.join(resources_rootdir, "amazon", "fasttext_embs_50_eps")
    if dataset == "tweet":
        model_path = os.path.join(tweets_rootdir, "tweet_fasttext.{}".format(model_type))
    elif dataset == "ds":
        model_path = os.path.join(ds_rootdir, "ds_fasttext.{}".format(model_type))
    elif dataset == "amazon":
        model_path = os.path.join(amazon_rootdir, "amazon_fasttext.{}".format(model_type))
    elif dataset == "wiki":
        model_path = os.path.join(w2v_rootdir, "wiki.en/wiki.en.{}".format(model_type))
    elif dataset == "simple":
        model_path = os.path.join(w2v_rootdir, "wiki.simple/wiki.simple.{}".format(model_type))
    print("fasttext model: ", model_path)

    if model_type == "bin":
        model = FastText.load_fasttext_format(model_path)
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
    return model
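A minimal usage sketch, using dataset and format names handled by the branches above (it assumes the wiki.en binary model exists at the resolved path):

# Load the English Wikipedia fastText model in native binary format.
model = get_fasttext_model(dataset="wiki", model_type="bin")
vector = model.wv["example"]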
 def __init__(self, Model, params, predict=False):
     self.Model = Model
     self.params = params
     if predict:
         print("Loading Vectors")
         self.vec_model = FastText.load_fasttext_format(
             'vectors/cc.en.300.bin/cc.en.300.bin').wv
         print("Finished Loading Vectors")
         # self.vec_model = []
         print("loading model")
         self.test_val = tf.placeholder(tf.float32,
                                        shape=(None, max_word_count, 300))
         self.test_output = self.Model.build_model(self.test_val,
                                                   self.params,
                                                   predict=True)
         self.sess = tf.InteractiveSession()
         self.saver = tf.train.Saver()
         # self.sess.run(tf.global_variables_initializer())
         # self.sess.run(tf.local_variables_initializer())
         self.saver.restore(
             self.sess,
             tf.train.latest_checkpoint(f"tensorboard_{self.Model.type}"))
         print("finished Loading model")
Example #7
import codecs
import pickle

import numpy as np
from gensim.models.wrappers import FastText


def load_ft():
    w2v_model = FastText.load_fasttext_format('../embedding/cc.zh.300.bin')
    print("Finish Load")
    dim = len(w2v_model['好'])
    fw1 = codecs.open("../embedding/embedding_all_ftoov_%d.txt" % (dim),
                      'w',
                      encoding='utf-8')
    vocab_dict = pickle.load(open('../data/vocabulary.pkl', 'rb'))
    word_list = ['unk' for i in range(len(vocab_dict))]
    for k, v in vocab_dict.items():
        word_list[v] = k
    # print(word_list)
    embedding_matrix = np.zeros((len(vocab_dict), dim))
    miss = 0
    for index, w in enumerate(word_list):
        if index % 1000 == 0:
            print(index)
        try:
            # in_set.add(w)
            embeds = np.asarray(w2v_model[w])
        except KeyError:
            # word (and its character n-grams) not in the model: fall back to a random vector
            miss += 1
            print(w)
            embeds = np.random.uniform(-0.25, 0.25, dim)
        embedding_matrix[index] = embeds

    fw1.write(str(len(word_list)) + ' ' + str(dim) + '\n')
    for index, w in enumerate(word_list):
        fw1.write(w)
        for i in embedding_matrix[index]:
            fw1.write(' ' + str(i))
        fw1.write('\n')
    pickle.dump(vocab_dict, open('../data/vocabulary2.pkl', 'wb'))
    print(len(word_list))
    print("miss:%d" % miss)
def main():
    print("read word embeddings")
    word2vec = FastText.load_fasttext_format("data/embeddings/wiki.vi.bin").wv

    print("read relations")
    vectors = []
    metadata = []
    with open("data/RelationNormalize.txt", "r", encoding="utf8") as f:
        for line in f:
            w = line.strip().lower()
            if w in word2vec and w not in metadata:
                metadata.append(w)
                vec = word2vec.word_vec(w)
                vec = list(map(str, vec))
                vectors.append("\t".join(vec))
    del word2vec

    print("saving data")
    with open("data/vectors.txt", "w", encoding="utf8") as f:
        f.write("\n".join(vectors))
    del vectors
    with open("data/metadata.txt", "w", encoding="utf8") as f:
        f.write("\n".join(metadata))
    del metadata
#     loss = criterion(output, true_scores)
#     loss.backward()
#     torch.nn.utils.clip_grad_norm_(model.parameters(), 1000)
#     optimizer.step()
#     return output, loss.item()
#
#
# def calculate_true_score(sentence_and_tags, true_tags):
#     tags = sentence_and_tags[1]
#     length = len(tags)
#     assert length == len(true_tags)
#     count = 0
#     for i in range(length):
#         if tags[i] == true_tags[i]:
#             count += 1
#     return count/length

if __name__ == '__main__':
    language_model = FastText.load_fasttext_format(PATH_TO_MODEL)
    x, y, tag_set = pr.load_train_test_validation_sets(PATH_TO_DATA)
    train_set = pr.WordsDataset(x, tag_set)
    test_set = pr.WordsDataset(y, tag_set)

    item = train_set[16][0]["words"]
    print(item)
    my_little_tensor = create_tensor(item, None, language_model)
    print(my_little_tensor.shape)
    print("-----------------")

    model = train_model(train_set, language_model)
Example #10
from nltk.tag.stanford import StanfordNERTagger
from elasticsearch import Elasticsearch
import itertools
import nltk
from pymongo import MongoClient
from nltk.corpus import wordnet
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.corpus import stopwords
import re
import csv
import os
import pickle
from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format(
    '/Users/sepidehmesbah/Downloads/fastText/modelFT')
from app.modules import filter_entities
client = MongoClient('localhost:4321')
db = client.pub
import random
from nltk.tag.stanford import StanfordPOSTagger
# english_postagger = StanfordPOSTagger('/Users/sepidehmesbah/Downloads/stanford-postagger-full-2016-10-31/models/english-bidirectional-distsim.tagger', '/Users/sepidehmesbah/Downloads/stanford-postagger-full-2016-10-31/stanford-postagger.jar')
# print(english_postagger.tag('Figures 3 (a) and (b) show the precision-recall curves for the three datasets: MovieLens, NewsSmall and NewsBig.3'.split()))

dsnames = []

# corpuspath = "/Users/sepidehmesbah/SmartPub/DataProfiling/dataset-names.txt"
# with open(corpuspath,"r") as file:
#     for row in file.readlines():
#         dsnames.append(row.strip())
#
# ###############################
#
class_weighted = False

OUTPUT_DIR = '../Output/'

category2index = pickle.load(
    open(os.path.join(OUTPUT_DIR + 'category2index_%s.dict' % DATASET), 'rb'))
categories = [''] * len(category2index)
for cate, i in category2index.items():
    categories[i] = cate
print(categories)

# Creating the model
print("Loading the FastText Model")
# en_model = {"test":np.array([0]*300)}
en_model = FastText.load_fasttext_format('../FastText/wiki.en/wiki.en')


class PretrainFastTextClassifier:
    def __init__(self):
        ''' load data '''
        self.domains_train = pickle.load(
            open(OUTPUT_DIR + 'training_domains_%s.list' % DATASET, 'rb'))
        self.domains_train = [
            d for cat_domains in self.domains_train for d in cat_domains
        ]
        self.domains_val = pickle.load(
            open(OUTPUT_DIR + 'validation_domains_%s.list' % DATASET, 'rb'))
        self.domains_val = [
            d for cat_domains in self.domains_val for d in cat_domains
        ]
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('../data/raw/BIN/wiki.simple')

# Give the embedding of a given word
print(model.wv['brain'])

# Test if a word is in the model
print('brain' in model.wv.vocab)

# Give the most similar words
print(model.most_similar('brain'))

# Compute similarity between two words
print(model.similarity('brain', 'synapse'))

# Compute cosine distance between two groups of words
print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

# Make arithmetic with words
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man']))
from gensim.models.fasttext import FastText, load_facebook_vectors
from gensim.models.keyedvectors import KeyedVectors

import numpy as np
from numpy.random import seed
seed(1)

#%% fast load
wxTrain1, wxTest1, wyTrain1, wyTest1, sentences_corpus, keywords_dictionary, labels = dsurdu.Generate_Urdu_Ngrams(
    _ngram_range=(1, 1), _max_features=3000, words=True)

w2v_file_fast_text = r"C:\FasText\cc.ur.300.bin.gz"
w2vmodel = FastText.load_fasttext_format(w2v_file_fast_text)
print("Word 2 Vector File Loaded!")

vector = w2vmodel.wv['easy']
print("Shape of Vector:" + str(vector.shape))

#%% fast vector generation
vectors_per_document = 5
X_train_Vector = []
for kl in keywords_dictionary:
    vector_list = []
    for word in kl[0:vectors_per_document]:
        if word in w2vmodel.wv.vocab:
            vector_list.append(w2vmodel[word])
        else:
            vector_list.append(np.random.uniform(-0.1, 0.1, 300))
 def __init__(self):
     # add data imports
     self.data_df = pd.read_csv("data.csv")
     self.data_path = "./complete.json.gz"
     self.data_split = data_split
     self.stop_words = [
         'ourselves',
         'hers',
         'between',
         'yourself',
         'but',
         'again',
         'there',
         'about',
         'once',
         'during',
         'out',
         'very',
         'having',
         'with',
         'they',
         'own',
         'an',
         'be',
         'some',
         'for',
         'do',
         'its',
         'yours',
         'such',
         'into',
         'of',
         'most',
         'itself',
         'other',
         'off',
         'is',
         's',
         'am',
         'or',
         'who',
         'as',
         'from',
         'him',
         'each',
         'the',
         'themselves',
         'until',
         'below',
         'are',
         'we',
         'these',
         'your',
         'his',
         'through',
         'me',
         'were',
         'her',
         'more',
         'himself',
         'this',
         'down',
         'should',
         'our',
         'their',
         'while',
         'above',
         'both',
         'up',
         'to',
         'ours',
         'had',
         'she',
         'all',
         'when',
         'at',
         'any',
         'before',
         'them',
         'same',
         'and',
         'been',
         'have',
         'in',
         'will',
         'on',
         'does',
         'yourselves',
         'then',
         'that',
         'because',
         'what',
         'over',
         'why',
         'so',
         'can',
         'did',
         'now',
         'under',
         'he',
         'you',
         'herself',
         'has',
         'just',
         'where',
         'too',
         'only',
         'myself',
         'which',
         'those',
         'i',
         'after',
         'few',
         'whom',
         'being',
         'if',
         'theirs',
         'my',
         'against',
         'a',
         'by',
         'doing',
         'it',
         'further',
         'was',
         'here',
     ]
     print("Loading Vectors")
     self.vec_model = FastText.load_fasttext_format(
         'vectors/cc.en.300.bin').wv
     # self.vec_model = {}
     print("Completed Loading Vectors")
     self.data_df["label"] = self.data_df["label"].astype(int)
     self.data_df["text"] = self.data_df["text"].astype(str)
     self.data_df["text"] = self.data_df["text"].str.lower()
     self.data_df["text"] = self.data_df["text"].str.strip(to_strip=".!?,")
     self.data_df["text"] = self.data_df["text"].str.split()
     self.data_df["text"] = self.data_df["text"].apply(
         lambda x: [w for w in x if not w in self.stop_words])
Example #15
def calculate_similarity(source_doc,
                         target_doc,
                         embedding="Glove",
                         threshold=0):
    """Calculate and return the similarity score between the given source and target documents."""
    def w2v_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = [i.lower().split() for i in doc]
        word_list = []
        for w in doc:
            w = [word for word in w if word not in stopwords.words('english')]
            word_list.append(w)
        vec_list = []
        for words in word_list:
            word_vecs = []
            for word in words:
                try:
                    vec = w2v_model[word]
                    word_vecs.append(vec)
                except KeyError:
                    pass
            vector = np.mean(word_vecs, axis=0)
            vec_list.append(vector)
        vectors = np.mean(vec_list, axis=0)
        return vectors

    def glove_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = [i.lower().split() for i in doc]
        word_list = []
        for w in doc:
            w = [word for word in w if word not in stopwords.words('english')]
            word_list.append(w)
        vec_list = []
        for words in word_list:
            word_vecs = []
            for word in words:
                try:
                    vec = glove_model[word]
                    word_vecs.append(vec)
                except KeyError:
                    pass
            vector = np.mean(word_vecs, axis=0)
            vec_list.append(vector)
        vectors = np.mean(vec_list, axis=0)
        return vectors

    def fasttext_vectorize(doc):
        """Identify the vector values for each word in the given document"""
        doc = " ".join(doc)
        doc = doc.lower()
        words = [w for w in doc.split(" ")]
        word_vecs = []
        for word in words:
            try:
                vec = fasttext_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass
        vector = np.mean(word_vecs, axis=0)
        return vector

    def cosine_sim(vecA, vecB):
        """Find the cosine similarity distance between two vectors."""
        csim = np.dot(vecA,
                      vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
        if np.isnan(np.sum(csim)):
            return 0
        return csim

    if embedding == "Word2Vec":
        w2v_model = KeyedVectors.load_word2vec_format(
            'Semantic_Similarity/Word_Embedding/data/GoogleNews-vectors-negative300.bin',
            binary=True,
        )
        source_vec = w2v_vectorize(source_doc)
        target_vec = w2v_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)

        if sim_score > threshold:

            return sim_score

    elif embedding == "Glove":
        # glove_model is assumed to be loaded at module level (not shown in this excerpt)
        source_vec = glove_vectorize(source_doc)
        target_vec = glove_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)

        if sim_score > threshold:

            return sim_score

    elif embedding == "FastText":
        fasttext_model = FastText.load_fasttext_format(
            'Semantic_Similarity/Word_Embedding/data/cc.en.300.bin')
        source_vec = fasttext_vectorize(source_doc)
        target_vec = fasttext_vectorize(target_doc)
        sim_score = cosine_sim(source_vec, target_vec)

        if sim_score > threshold:

            return sim_score
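A minimal usage sketch; the two documents are hypothetical placeholders and are passed as lists of strings, as expected by the vectorizers above:

# Returns a cosine similarity score if it exceeds the threshold, otherwise None.
source = ["the model loads pretrained word vectors"]
target = ["pretrained embeddings are loaded by the model"]
score = calculate_similarity(source, target, embedding="FastText", threshold=0)
print(score)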
import sys
from gensim.models.wrappers import FastText
from gensim.scripts.glove2word2vec import glove2word2vec
model = FastText.load_fasttext_format(sys.argv[1])

with open(sys.argv[2], 'r', encoding='utf-8') as fin:
    with open(sys.argv[3], 'w', encoding='utf-8') as fout:
        for line in fin:
            word = line.strip()
            if word in model:
                embs = list(model[word])
            else:
                embs = [1e-8 for _ in range(model.vector_size)]
            embs = [str(item) for item in embs]
            fout.write(' '.join([word]+embs)+'\n')

if len(sys.argv) > 4:
    glove2word2vec(sys.argv[3], sys.argv[4])
Example #17
#
# for item in iterator:
#     title = item.find('TITLE')
#     Title_text = title.text
#     print(Title_text)
#     body = item.find('BODY')
#     body_text = body.text
#     print(body_text)

f_model_file = 'C:/Users/ffayaza/Documents/MscProEmb/WordEmberding/Model/wiki/wiki.ta.vec'
f_model_bin = 'C:/Users/ffayaza/Documents/MscProEmb/WordEmberding/Model/wiki/wiki.ta.bin'
ftext = KeyedVectors.load_word2vec_format(f_model_file)
# print(ftext.wv.vocab)
# print(ftext.wv['தலைமையில்'])

model = FastText.load_fasttext_format(f_model_bin, encoding='utf8')
# print(model.wv.vocab)
# print(model.wv['தலைமையில்'])


def avg_sentence_vector(words, model, num_features, index2word_set):
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features, ), dtype="float32")
    nwords = 0

    for word in words:
        print(word)
        if word in index2word_set:
            print(word)
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])
Example #18
from gensim.models.wrappers import FastText
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from gensim.models.phrases import Phraser
import re
from stop_words import get_stop_words
from sqlalchemy import create_engine

# Word score and stem mapping
db_connect = create_engine('sqlite:///stem_map_20170802.db')

# Word embedding model
model = FastText.load_fasttext_format('language_model_20170802')

# Bigrams phraser
bigram = Phraser.load('bigram_phraser_20170801.model')


def get_stem_from_word(word):
    """ Get stem from word """

    p_stemmer = PorterStemmer()
    return '_'.join([p_stemmer.stem(w) for w in word.split('_')])


def get_most_similar(word, topn=10, gender=None, pos=None):
    """ Get most similar words by gender or part of speech

        Returns a list of dictionaries with:

        - 'word': list tuples containing of word expansions from the recommended stem,
Example #19
			except:pass
	return tmp/len(text)
def ensText(text,model):
	text=text.split()
	tmp=[]
	for i in text:
			try:tmp+=[model.wv[i]]
			except:pass
	return tmp
def sim(a,b):return 1 - spatial.distance.cosine(a, b)
def meanSim(text,title):
	s=0
	for v in text:s+=sim(v,title)
	return s/(1+len(text))
from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format('/home/celvaigh/these/divers/wiki.fr/wiki.fr.bin')
fr="wiki.fr.bin"
en="wiki.en.bin"
#model = word_vectors = KeyedVectors.load_word2vec_format('/home/celvaigh/these/divers/wiki.fr/wiki.fr.bin', binary=True)
def computeCorpusSims(name,lg):
	if lg=="fr":model = FastText.load_fasttext_format(fr)
	else:model = FastText.load_fasttext_format(en)
	data=pd.read_csv(name, sep='\t')
	texts=data["text"]
	titles=data["title"]
	size=len(texts)
	sims=[]
	for i in range(size):sims+=[meanSim(ensText(texts[i],model),MWV(titles[i],model))]
	sims.sort()
	return sims
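A minimal usage sketch; the corpus file name is a hypothetical placeholder for a tab-separated file with 'text' and 'title' columns, and it assumes the MWV helper (truncated at the top of this example) is defined:

# Returns the sorted list of mean text/title similarities for the French corpus.
sims = computeCorpusSims("corpus.tsv", "fr")
print(sims[:10])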
Example #20
    clean_text = re.sub(r'[^\w\s]', '', clean_text)
    return clean_text


## Loading data

train = pd.read_csv(DATA_PATH + 'train-expanded_NoLing.csv')
validation = pd.read_csv(DATA_PATH + 'dev_expanded_NoLing.csv')
answer_texts_train = pd.read_csv(DATA_PATH + 'answer_texts_train_NoLing.csv')
answer_texts_validation = pd.read_csv(DATA_PATH +
                                      'answer_texts_dev_NoLing.csv')
test = pd.read_csv(DATA_PATH + 'test_NoLabels_NoLing.csv')
answer_texts_test = pd.read_csv(DATA_PATH + 'answer_texts_test_NoLing.csv')
translations = pickle.load(
    open(EXTRA_DATA_PATH + 'arabic_english_translations.p', 'rb'))
embeddings_index = FastText.load_fasttext_format(EMBEDDING_PATH +
                                                 'cc.en.300.bin')

## Preparing data

train['pool'] = train['pool'].apply(literal_eval)
train['answer_ids'] = train['answer_ids'].apply(literal_eval)
validation['pool'] = validation['pool'].apply(literal_eval)
validation['answer_ids'] = validation['answer_ids'].apply(literal_eval)
answer_texts_test.set_index('answer_id', drop=False, inplace=True)
test['candidates'] = test['candidates'].apply(literal_eval)
test.set_index('Unnamed: 0', drop=True, inplace=True)
for idx, row in test.iterrows():
    test.set_value(idx, 'candidates',
                   sorted([int(x) for x in set(row['candidates'])]))
answer_texts_test['clean_answer'] = answer_texts_test['answer'].apply(
    clean_text)
from vacancies import vacancy_dict

from gensim.models.wrappers import FastText
import gensim
import pandas as pd
from scipy.spatial.distance import euclidean

from openpyxl import load_workbook
import string

model = FastText.load_fasttext_format('data.bin')
wb = load_workbook('candidates.xlsx')


def clean_str(s):
    for c in string.punctuation:
        s = s.replace(c, "")
    return s


def get_similarity_euql(model, first_sentence, second_sentence):
    similarity = 0
    first_sentence = [
        i for i in clean_str(first_sentence).split() if i in model
    ]
    second_sentence = [
        i for i in clean_str(second_sentence).split() if i in model
    ]
    for i in first_sentence:
        first_vector = model[i]
        sim_i = 0
            new_question_tokens = []
            for token in question_doc:
                if isPunctuation(token) or isStopWord(token) or isInContext(token, context_token_lemmas):
                    new_question_tokens.append(token.text)
            new_question = TreebankWordDetokenizer().detokenize(new_question_tokens)
            if new_question == "":
                paragraph['qas'].pop(i)
            else:
                paragraph['qas'][i]['question'] = new_question
                i += 1

    num_topics += 1
    print("{}/{} topics done".format(num_topics, datalen))

from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format('wiki.simple')
print("Model Loaded...")
nlp = spacy.load("en_core_web_sm")

def getMostSimContextWord(token, context_tokens):
    highest_sim = float('-Inf')
    most_sim_word = None
    for ct in context_tokens:
        # if isPunctuation(ct) or isStopWord(ct):
        #     continue
        # else:
        if ct.lower() in model.wv.vocab:
            curr_sim = model.similarity(token.text.lower(), ct.lower())
            if curr_sim >= highest_sim:
                highest_sim = curr_sim
                most_sim_word = ct
Example #23
from gensim.models.wrappers import FastText

model = FastText.load_fasttext_format('wiki.ko.bin')

print("Finish creating model")

print(model.most_similar('전자'))

print(model.similarity('전자', '전기'))
Example #24
from pymongo import MongoClient
from nltk.corpus import wordnet
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.corpus import stopwords
from app.modules import normalized_pub_distance
from bson.objectid import ObjectId
from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format('/data/modelFT')

dsnames = []
mtnames = []

datasetspath = '/data/dataset_names.txt'
with open(datasetspath, "r") as file:
    for row in file.readlines():
        dsnames.append(row.strip())

methodpath = '/data/method_names.txt'
with open(methodpath, "r") as file:
    for row in file.readlines():
        mtnames.append(row.strip())


def is_int_or_float(s):
    ''' return 1 for int, 2 for float, -1 for not a number'''
    try:
        float(s)

        return 1 if s.count('.') == 0 else 2
    except ValueError:
        return -1
import fasttext
import fasttext.util

from gensim.models.keyedvectors import KeyedVectors
from gensim.models.wrappers import FastText

ft = fasttext.load_model('fasttext_cc/cc.tr.300.bin')
print("300 bin loaded")

fasttext.util.reduce_model(ft, 100)
print("300 bin reduced")

ft.save_model('fasttext_cc/cc.tr.100.bin')
print("100 bin saved")

model = FastText.load_fasttext_format('fasttext_cc/cc.tr.100.bin')
print("100 bin read")

model.wv.save_word2vec_format('fasttext_cc/cc.tr.100.vec', binary=False)
print("100 text saved")
def load_fasttext(path=None):
    word2vecmodel = FastText.load_fasttext_format(path)
    return word2vecmodel
from numpy import nanmean
from scipy.io import loadmat
from scipy.stats import spearmanr
from gensim.models.wrappers import FastText


def original_ratings():
    "Function returning the automatic specificity scores along with the human ratings."
    data = loadmat('./original_data/specificity_automated.mat')
    automatic = data['specificity_automated'][0]
    data = loadmat('./original_data/specificity_scores_MEM5S.mat')
    ratings = data['scores']
    ratings = [nanmean([nanmean(row) for row in image]) for image in ratings]
    return automatic, ratings

def load_images():
    image_data = loadmat('./original_data/memorability_888_img_5_sent.mat')
    images = [[s[0] for s in group] for group in image_data['memorability_sentences']]
    return images

print("Loading vectors.")
model = FastText.load_fasttext_format(VECTOR_LOCATION)
print("Loaded.")

automatic, ratings = original_ratings()
images = load_images()

vectorizer, analyzer = analyze_corpus(images)
scores = []
for i, image in enumerate(images):
    if i % 10 == 0:
        print(i)
    score = image_specificity(image, vectorizer, analyzer, model)
    scores.append(score)


result = spearmanr(automatic, ratings)
Example #28

"""Convert sentences using FastText."""

import pickle
import numpy as np
from gensim.models.wrappers import FastText

with open('../raw_data/remap.pkl', 'rb') as f:
    # skip the first five objects stored in the pickle; the sixth holds the texts
    pickle.load(f)
    pickle.load(f)
    pickle.load(f)
    pickle.load(f)
    pickle.load(f)
    texts = pickle.load(f)

fasttext = FastText.load_fasttext_format('../raw_data/cc.en.300.bin')


def sec2vec(sentence):
    global fasttext
    # split the sentence into words
    words = sentence.split()
    # keep only words that exist in the model
    words = [fasttext[word] for word in words if word in fasttext]
    # compute the sentence vector as the mean of its word vectors
    if len(words) == 0:
        return np.zeros((300, ), dtype=np.float32)
    return np.mean(words, axis=0)


# vectorize the review sentences
r = np.ndarray((len(texts), 300), dtype=np.float32)
img_embedding = 300
nr_input_lines = 5000  # used to determine steps_per_epoch (nr batches)
photos_per_batch = 5  # a batch will consists of photos_per_batch photos (each photo has 20-30 text samples)
lstm_cell = 128
examples_train = 0

# In[3]:

filepath = "dataset_txt/Flickr8k.lemma.token.txt"
fast_text = "fastText_eng/wiki.simple"
dir_imgs = "dataset_img"
token_start = 'sstart'
token_end = 'eend'
special_token = 'xx'
# load fast text - takes a lot of time
# assumes: from gensim.models.wrappers import FastText as FastTextWrapper
model_embeddings = FastTextWrapper.load_fasttext_format(fast_text)


# some initial info about whole dataset
def get_initial_info_data(filepath):

    global examples_train
    token_set = set()
    with open(filepath, 'r') as f:
        for line in f:
            #print(line.split(' ')[0].split('#')[0])
            txt = " ".join(line.split(' ')[1:])
            tokens = nltk.word_tokenize(txt)
            examples_train += len(tokens) + 1
            for token in tokens:
                token_set.add(token.lower())
Example #30
print('x_train shape:', X_tr.shape)
print('x_test shape:', X_te.shape)

print("================")

print(X_tr)
print("================")
print(X_te)

print("================")

from gensim.models.wrappers import FastText

print("start...loading...wiki....en")

model = FastText.load_fasttext_format('wiki.en')

nb_words = min(max_features, len(tokenizer.word_index))

embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in tokenizer.word_index.items():
    if i >= nb_words:
        continue
    if word in model.wv:
        embedding_matrix[i] = model[word]
print('Null word embeddings: %d' %
      np.sum(np.sum(embedding_matrix, axis=1) == 0))

import tensorflow as tf

gpu_options = tf.GPUOptions(allow_growth=True)
Example #31
from preprocessing import *
from gensim.models.wrappers import FastText
import pandas as pd
import numpy as np

NUMBER_DOC = 4124

print("starting loading fasttext")
model = FastText.load_fasttext_format('cc.en.300.bin')
print("fasttext loaded")


def building_idf(path, number_doc):
    data = pd.read_csv(path, sep=" ")
    data.columns = ['frequency', 'word', 'pos', 'num_doc_occ']

    data['idf'] = data.apply(
        lambda element: np.log(number_doc / element['num_doc_occ']), axis=1)

    return data


def sim(element_1, element_2):
    return model.similarity(element_1, element_2)


def simple_similarity_calcul(sentence_1, sentence_2, data):  # No filter

    cleared_sentence_1_2 = formatting_sentences([sentence_1, sentence_2])
    cleared_sentence_1, cleared_sentence_2 = cleared_sentence_1_2[
        0], cleared_sentence_1_2[1]