Example #1
def read_vector_models(path_src_vw_model_bin, path_tgt_vw_model_bin):
    if not all([os.path.isfile(fname) for fname in [path_src_vw_model_bin, path_tgt_vw_model_bin]]):
        print('Some of the vector model files given do not exist, perhaps check defaults!')
        sys.exit()

    print('+ preparing src vector model')
    if "ft" in path_src_vw_model_bin:
        vw_src_model = FastTextKeyedVectors.load(path_src_vw_model_bin)
        vw_src_model.add(UNK_token, np.random.normal(0, 0.01, vw_src_model.vector_size))
    else:
        vw_src_model = KeyedVectors.load_word2vec_format(path_src_vw_model_bin, binary=True)
    print('++ src vector model read')
    vw_src_model = extendPretrainedModel(vw_src_model)
    print('++ src vector model extended')

    print('+ preparing tgt vector model')
    if "ft" in path_tgt_vw_model_bin:
        vw_tgt_model = FastTextKeyedVectors.load(path_tgt_vw_model_bin)
        vw_tgt_model.add(UNK_token, np.random.normal(0, 0.01, vw_tgt_model.vector_size))
    else:
        vw_tgt_model = KeyedVectors.load_word2vec_format(path_tgt_vw_model_bin, binary=True)
    print('++ tgt vector model read')
    vw_tgt_model = extendPretrainedModel(vw_tgt_model)
    print('++ tgt vector model extended')

    return vw_src_model, vw_tgt_model
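A possible call site for the helper above, as a minimal sketch: the file names are placeholders, and any path containing "ft" takes the FastText branch while other paths are read as binary word2vec files.

# hypothetical paths -- substitute the real vector model files on disk
src_vectors, tgt_vectors = read_vector_models(
    "models/src_ft_vectors.model",   # contains "ft", so loaded via FastTextKeyedVectors.load
    "models/tgt_w2v_vectors.bin",    # loaded via KeyedVectors.load_word2vec_format
)
print(src_vectors.vector_size, tgt_vectors.vector_size)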
Example #2
def embedding_text(target_dataset):
    print("Loading embedding model...")
    model_name = 'FASTTEXT_' + target_dataset + '.model'
    embedding_model = FastTextKeyedVectors.load(
        os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("Loading embedding model completed")
    dataset_path = os.path.join(CONFIG.DATASET_PATH, target_dataset)
    for loc_id in tqdm(os.listdir(dataset_path)):
        path_dir = os.path.join(dataset_path, loc_id)
        for post in tqdm(os.listdir(path_dir), leave=False):
            pickle_path = os.path.join(path_dir, post, "text.p")
            with open(os.path.join(path_dir, post, "text.txt"),
                      'r',
                      encoding='utf-8',
                      newline='\n') as f:
                text_data = f.read()
                word_list = text_data.split()
                vector_list = []
                if len(word_list) > CONFIG.MAX_SENTENCE_LEN:
                    # truncate sentence if sentence length is longer than `max_sentence_len`
                    word_list = word_list[:CONFIG.MAX_SENTENCE_LEN]
                    word_list[-1] = '<EOS>'
                else:
                    word_list = word_list + ['<PAD>'] * (
                        CONFIG.MAX_SENTENCE_LEN - len(word_list))
                for word in word_list:
                    vector = embedding_model.get_vector(word)
                    vector_list.append(vector)
                vector_array = np.array(vector_list, dtype=np.float32)
            # the "with" blocks already close the files, so explicit f.close() calls are unnecessary
            with open(pickle_path, 'wb') as f:
                cPickle.dump(vector_array, f, protocol=-1)
            del text_data, word_list, vector_array
Example #3
    def train_model(self, corpus):
        if self.model is None:
            logging.info(f"Start loading model {self.pretrained_model_path}")
            if self.pretrained_model_path.endswith(".bin"):
                self.model = load_facebook_vectors(self.pretrained_model_path)
            else:
                self.model = FastTextKeyedVectors.load(self.pretrained_model_path)
            self.model.init_sims(True)
            logging.info(f"Finished loading model {self.pretrained_model_path}")
        return self.model
Example #4
    def test_ft_kv_backward_compat_w_360(self):
        kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
        ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

        expected = ['trees', 'survey', 'system', 'graph', 'interface']
        actual = [word for (word, similarity) in kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)

        actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)]

        self.assertEqual(actual, expected)
def test_fasttext(target_model):
	model_name = 'FASTTEXT_' + target_model + '.model'
	model = FastTextKeyedVectors.load(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
	pad_vector = np.full(300, np.finfo(np.float32).eps)
	# pad_vector = np.random.randn(300)
	# pad_vector = np.ones(300)
	# pad_vector = np.full(300, 100)
	# print(pad_vector)
	print(model.similar_by_word("<EOS>"))
	print(model.similar_by_vector(vector=pad_vector, topn=5))
	model.add("<PAD>", pad_vector)
	model.init_sims(replace=True)
	print(model.similar_by_vector(vector=pad_vector, topn=5))
	print(model.get_vector("<EOS>"))
	print(model.get_vector("<PAD>"))
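The snippet above relies on the gensim 3.x calls model.add and model.init_sims. As a rough sketch only, assuming gensim >= 4.0 and a placeholder model path, the same padding-vector trick would use the renamed methods:

import numpy as np
from gensim.models.fasttext import FastTextKeyedVectors

model = FastTextKeyedVectors.load("FASTTEXT_example.model")  # hypothetical path
pad_vector = np.full(model.vector_size, np.finfo(np.float32).eps)
model.add_vector("<PAD>", pad_vector)   # gensim >= 4.0 replacement for .add()
model.fill_norms()                      # replaces init_sims(); caches unit-length norms
print(model.similar_by_vector(pad_vector, topn=5))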
def data_reshape(df,
                 lookup_table_relations,
                 lookup_table_ent_types,
                 lookup_table_deptags,
                 lookup_table_postags,
                 len_max_seq=30):
    """Take a dataframe of features and reformat to fit into a vectors of (1D features, sequential features, target) as the input of the model"""
    wordvectors = FastTextKeyedVectors.load(
        ".data/pretrained_word_vectors.bin")

    # collect flat features, sequential features and labels
    features_flat = []
    labels = []
    seq_flat = []

    for row in df.itertuples():
        labels.append(int(row.relation != ''))
        #'flat' features
        ent1t = lookup_table_ent_types[row.ent1type]
        ent2t = lookup_table_ent_types[row.ent2type]
        vec = np.concatenate((ent1t, ent2t))
        features_flat.append(vec)

        len_seq = 0
        for i in range(0, len(row.shortest_dependency_path_p.split("/"))):
            #'sequence' features
            current_word = row.shortest_dependency_path_w.split("/")[i]
            current_dependency = row.shortest_dependency_path_p.split("/")[i]
            current_pos_tag = row.shortest_dependency_path_t.split("/")[i]
            seq_sdp_w = wordvectors[current_word] if type(
                current_word) == str else np.zeros(100)
            seq_sdp_p = lookup_table_deptags[
                current_dependency.split(':')
                [0]] if "conj" in current_dependency else lookup_table_deptags[
                    current_dependency]
            seq_sdp_t = lookup_table_postags[current_pos_tag]
            vec_seq = np.concatenate((seq_sdp_w, seq_sdp_p, seq_sdp_t))
            seq_flat.append(vec_seq)
            len_seq += 1
        while len_seq < len_max_seq:
            seq_flat.append(np.zeros(len(vec_seq)))
            len_seq += 1
    # convert labels and reshape the sequence features into a 3-dimensional tensor
    labels = np.array(labels)
    features_words = np.array(features_flat)
    seq = np.reshape(seq_flat, (df.shape[0], len_max_seq, len(vec_seq)))
    return (features_words, seq, labels)
def visualize_language():
    model_name = get_model_name()
    path = "data/vector_models/" + model_name

    if "ft" in model_name:
        wv = FastTextKeyedVectors.load(path)
    else:
        wv = KeyedVectors.load_word2vec_format(path, binary=True)

    print("Vocab size:", len(wv.vocab))

    words = [""]
    if "en" in model_name:
        words = en_words

    if "nl" in model_name:
        words = nl_words

    visualize_words(wv, words)
def get_subplot_for_data(lang="en"):
    lang_full, lang_short = language_map(lang)
    fig = plt.figure()

    plot_labels = {
        "w2v": "Word2Vec",
        "ft": "FastText",
        "cbow": "CBOW",
        "sg": "Skip-Gram"
    }

    for i, type in enumerate(["w2v", "ft"]):
        for j, hp in enumerate(["cbow", "sg"]):
            print(type, hp)

            # First word2vec
            model_name = type + "_" + lang + "_d100_" + hp + "_st.bin"
            path = "data/vector_models/" + model_name

            if type == "ft":
                wv = FastTextKeyedVectors.load(path)
            else:
                wv = KeyedVectors.load_word2vec_format(path, binary=True)

            words = all_words[lang]

            total_words = []
            for topic in words:
                total_words.extend(topic)

            pca = PCA(n_components=2)

            X = wv[wv.vocab]
            mean = np.mean(X, axis=0)
            var = np.var(X, axis=0)

            X -= mean
            X /= var
            pca.fit(X)

            # Start subplot
            subplot_num = i * 2 + (j + 1)
            axis = fig.add_subplot(2, 2, subplot_num)

            for topic in words:
                X = wv[topic]
                X -= mean
                X /= var
                result = pca.transform(X)

                axis.scatter(result[:, 0], result[:, 1], s=5.0)
                for k, word in enumerate(topic):
                    axis.annotate(word,
                                  xy=(result[k, 0], result[k, 1]),
                                  size=7)

                plt.setp(axis.get_xticklabels(), visible=False)
                plt.setp(axis.get_yticklabels(), visible=False)

            axis.set_title(lang_full.capitalize() + " - " + plot_labels[type] +
                           " - " + plot_labels[hp],
                           fontdict={"fontsize": 12})
    # plt.savefig("Figures/embedding_" + lang_short + ".png")

    plt.show()
Example #10
def get_latent(args):
    device = torch.device(args.gpu)
    print("Loading embedding model...")
    image_embedding_model = models.__dict__[args.arch](pretrained=True)
    image_embedding_dim = image_embedding_model.fc.in_features
    args.image_embedding_dim = image_embedding_dim
    model_name = 'FASTTEXT_' + args.target_dataset + '.model'
    text_embedding_model = FastTextKeyedVectors.load(
        os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    text_embedding_dim = text_embedding_model.vector_size
    args.text_embedding_dim = text_embedding_dim
    print("Building index...")
    indexer = AnnoyIndexer(text_embedding_model, 10)
    print("Loading embedding model completed")
    print("Loading dataset...")
    full_dataset = load_full_data(args,
                                  CONFIG,
                                  text_embedding_model,
                                  total=True)
    print("Loading dataset completed")
    full_loader = DataLoader(full_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)

    # t1 = max_sentence_len + 2 * (args.filter_shape - 1)
    t1 = CONFIG.MAX_SENTENCE_LEN
    t2 = int(math.floor(
        (t1 - args.filter_shape) / 2) + 1)  # "2" means stride size
    t3 = int(math.floor((t2 - args.filter_shape) / 2) + 1)
    args.t3 = t3

    text_encoder = text_model.ConvolutionEncoder(text_embedding_dim, t3,
                                                 args.filter_size,
                                                 args.filter_shape,
                                                 args.latent_size)
    text_decoder = text_model.DeconvolutionDecoder(text_embedding_dim, t3,
                                                   args.filter_size,
                                                   args.filter_shape,
                                                   args.latent_size)
    imgseq_encoder = imgseq_model.RNNEncoder(image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    imgseq_decoder = imgseq_model.RNNDecoder(image_embedding_dim,
                                             args.num_layer,
                                             args.latent_size,
                                             bidirectional=True)
    checkpoint = torch.load(os.path.join(CONFIG.CHECKPOINT_PATH,
                                         args.checkpoint),
                            map_location=lambda storage, loc: storage)
    multimodal_encoder = multimodal_model.MultimodalEncoder(
        text_encoder, imgseq_encoder, args.latent_size)
    multimodal_encoder.load_state_dict(checkpoint['multimodal_encoder'])
    multimodal_encoder.to(device)
    multimodal_encoder.eval()

    f_csv = open(os.path.join(CONFIG.CSV_PATH, 'latent_features.csv'),
                 'w',
                 encoding='utf-8')
    wr = csv.writer(f_csv)
    for steps, (text_batch, imgseq_batch,
                short_code) in enumerate(full_loader):
        torch.cuda.empty_cache()
        with torch.no_grad():
            text_feature = Variable(text_batch).to(device)
            imgseq_feature = Variable(imgseq_batch).to(device)
        h = multimodal_encoder(text_feature, imgseq_feature)
        row = [short_code] + h.detach().cpu().numpy().tolist()
        wr.writerow(row)
        del text_feature, imgseq_feature
    f_csv.close()
    print("Finish!!!")
Example #11
# encoder layer weights
vectors = torch.zeros(len(corpus.dictionary), args.emsize)

if not args.evaluate:
    ###############################################################################
    # Load word embeddings and corpus
    ###############################################################################
    w2v_model = None
    if args.emmodel != 'no':
        print("using pretrained word embeddings", args.emmodel)
        try:
            w2v_model = KeyedVectors.load_word2vec_format(args.emmodel, binary=True)
            # w2v_model = Word2Vec.load(args.emmodel)
        except UnicodeDecodeError:
            w2v_model = FastTextKeyedVectors.load(args.emmodel)

        assert w2v_model.vector_size == args.emsize

    # initialise uniformly
    initrange = 0.1
    nn.init.uniform_(vectors, -initrange, initrange)
    print("encoder layer shape", vectors.shape)
    # use pretrained vectors if available
    oov_count = 0
    if w2v_model:
        for i, word in enumerate(corpus.dictionary.idx2word):
            try:
                # print(w2v_model.wv[word].shape, vectors[i].shape)
                vectors[i] = torch.tensor(w2v_model.wv[word])
            except KeyError:
                # keep the uniform initialisation for out-of-vocabulary words
                oov_count += 1
        print("out-of-vocabulary words:", oov_count)
"""
File: app.py
Created Date: Monday, 2nd November 2020 9:18:53 am
Author: Tianyu Gu ([email protected])
"""

from pathlib import Path

from flask import Flask, abort, jsonify, request
from gensim.models.keyedvectors import FastTextKeyedVectors

_data_folder = Path(__file__).parent.parent.joinpath("data")
_fasttext_wv = FastTextKeyedVectors.load(
    str(_data_folder.joinpath("fasttext.wv")))

app = Flask("gensim_fasttext_service")


@app.route("/most-similar", methods=["POST"])
def similar_by_word():
    word: str = request.form.get("word")
    topn_arg = request.form.get("topn", "10")
    if not word or not topn_arg.isdigit():
        abort(400)

    topn: int = int(topn_arg)
    res = [
        candidate
        for candidate, _ in _fasttext_wv.most_similar(word, topn=topn)
    ]
    return jsonify(res)
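A small client sketch for the service above, assuming it runs on Flask's default http://127.0.0.1:5000 and that the requests package is installed:

import requests

# "word" and "topn" are the form fields read by the /most-similar route above
resp = requests.post(
    "http://127.0.0.1:5000/most-similar",
    data={"word": "computer", "topn": "5"},
)
resp.raise_for_status()
print(resp.json())  # the topn nearest neighbours according to the FastText vectors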
Example #13
    # model_filename = "20190509_yle-wikipedia_word2vec_cbow_fi_lr=0.05,dim=100,ws=5,epoch=5,neg=5,mincount=5"
    # model_filename = "fin-word2vec-lemma"
    # model_filename = "wikipedia2008_fi_lemmatized_size=200,alpha=0.025,window=5,min_count=2,sg=1,negative=15,iter=5"
    model_filename = "Word2Vec_iltalehti-wikipedia_new_size=300,alpha=0.025,window=5,min_count=2,sg=1,negative=5,iter=15"
    

    # remember to switch the right model_file_type
    model_file = os.path.join(config.EMBEDDINGS_DIR, model_filename + model_file_type)

    print("Using the "+model_type+" model:", model_filename)
    print("Loading "+model_type+" model...")
    if model_type == 'Word2Vec':
        try:
            model = KeyedVectors.load_word2vec_format(model_file, binary=True)
        except UnicodeDecodeError:
            model = KeyedVectors.load_word2vec_format(model_file, binary=False)
    else:
        model = FastTextKeyedVectors.load(model_file)
    print(model_type+" model loaded.")

    # evaluate
    result_string = intrusion(model)
    result_string += analogy(model)
    result_string += nearest_neighbours(model)

    result_file = os.path.join("results", model_type + model_filename + "_results.txt")

    with open(result_file, 'w', encoding='utf-8') as f:
        f.write(result_string)

Example #14
models, results = {}, {}
word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")


models[f"CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models[f"SIF-W2V"] = SIF(word2vec, components=10)
models[f"uSIF-W2V"] = uSIF(word2vec, length=11)

from gensim.scripts.glove2word2vec import glove2word2vec  
glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models[f"CBOW-Glove"] = Average(glove,  lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models[f"SIF-Glove"] = SIF(glove, components=15)
models[f"uSIF-Glove"] = uSIF(glove,length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models[f"CBOW-FT"] = Average(ft, lang_freq="pl")
models[f"SIF-FT"] = SIF(ft, components=10)
models[f"uSIF-FT"] = uSIF(ft, length=11)


s=models[f"uSIF-W2V"]
s.sv[0]

cs, md, ed = [],[],[]
for i, j in zip(range(task_length), range(task_length, 2*task_length)):
    temp1 = s.sv[i].reshape(1, -1)
    temp2 = s.sv[j].reshape(1, -1)
    cs.append((1 - (paired_cosine_distances(temp1, temp2)))[0])
    md.append(-paired_manhattan_distances(temp1, temp2)[0])
    ed.append(-paired_euclidean_distances(temp1, temp2)[0])