Example No. 1
    def ensemble_embedding(self, word_embedding, context_embedding):
        """Replace current syn0 with the sum of context and word embeddings.

        Parameters
        ----------
        word_embedding : str
            Path to word embeddings in GloVe format.
        context_embedding : str
            Path to context embeddings in word2vec_format.

        Returns
        -------
        numpy.ndarray
            Matrix with new embeddings.

        """
        glove2word2vec(context_embedding, context_embedding + '.w2vformat')
        w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
        c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
        # compare vocab words using keys of dict vocab
        assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabularies are not the same for both embeddings'

        # sort context embedding to have words in same order as word embedding
        prev_c_emb = copy.deepcopy(c_emb.syn0)
        for word_id, word in enumerate(w_emb.index2word):
            c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
        # add vectors of the two embeddings
        new_emb = w_emb.syn0 + c_emb.syn0
        self.syn0 = new_emb
        return new_emb
Example No. 2
 def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1):
     glove2word2vec(model_file, model_file + '.w2vformat')
     model = cls.load_word2vec_format('%s.w2vformat' % model_file)
     if ensemble and context_file:
         model.ensemble_embedding(model_file, context_file)
     if sorted_vocab and vocab_file:
         model.sort_embeddings(vocab_file)
     return model
Example No. 3
    def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1):
        """Load model from `model_file`.

        Parameters
        ----------
        model_file : str
            Path to model in GloVe format.
        vocab_file : str, optional
            Path to file with vocabulary.
        context_file : str, optional
            Path to file with context-embedding in word2vec_format.
        sorted_vocab : {0, 1}, optional
            If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
        ensemble : {0, 1}, optional
            If 1 - use ensemble of word and context vectors.

        """
        glove2word2vec(model_file, model_file + '.w2vformat')
        model = cls.load_word2vec_format('%s.w2vformat' % model_file)
        if ensemble and context_file:
            model.ensemble_embedding(model_file, context_file)
        if sorted_vocab and vocab_file:
            model.sort_embeddings(vocab_file)
        return model
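A minimal usage sketch for the loader above, assuming gensim 3.x (which ships the Wordrank wrapper in gensim.models.wrappers); the file names are placeholders:

from gensim.models.wrappers import Wordrank

model = Wordrank.load_wordrank_model(
    'wordrank.words',                   # word embeddings written by WordRank, in GloVe text format
    vocab_file='vocab.txt',             # optional vocabulary file
    context_file='wordrank.contexts',   # optional context embeddings; enables the ensemble
    sorted_vocab=1,
    ensemble=1,
)
print(model.most_similar('king', topn=3))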
Example No. 4
import pickle

import pandas as pd
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
# get_train_entities, word_2_id, build_embedding_dict, get_rel_dict and
# build_test_dev_vec are project-specific helpers (not shown here)


def main(*args, **kwargs):
    # store_location = "/home/rpatel12/ferraro_user/NAM_Modified_data/data_sets/WN_11/"
    # import pandas as pd

    # DATA_LOCATION = "/home/rpatel12/ferraro_user/WN11_1/"
    print(args)
    print(type(args))
    DATA_LOCATION = args[0]
    # model_location = "/home/rpatel12/ferraro_user/glove_data/"
    model_location = args[1]
    store_location = DATA_LOCATION
    data_set_type = args[2]
    data_list = pd.read_csv(DATA_LOCATION + "data_list_train.csv")
    print(data_list.head())
    token_list = get_train_entities(data_list, dataset=data_set_type)
    print(len(token_list))
    data_dev = pd.read_csv(DATA_LOCATION + "data_list_dev.csv")
    data_test = pd.read_csv(DATA_LOCATION + "data_list_test.csv")

    glove_file = datapath(model_location + "glove.840B.300d.txt")
    tmp_file = get_tmpfile(model_location + "test_word2vec.txt")
    #
    _ = glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)

    # build the word-to-id and id-to-word mappings
    word_2_id_dict, id_2_word_dict = word_2_id(token_list)

    # building the embedding matrix
    embedding_matrix, OOV_word = build_embedding_dict(model, 300,
                                                      id_2_word_dict)
    print(OOV_word)
    # build relation to head and relation to tail dictionaries
    head = data_list["Head"].tolist()
    tail = data_list['tail'].tolist()
    rel = data_list['relation'].tolist()

    relation_2_head, relation_2_tail = get_rel_dict(head, tail, rel)
    train_vec_dict = build_test_dev_vec(data_list,
                                        word_2_id_dict,
                                        train=True,
                                        dataset=data_set_type)
    dev_vec_dict = build_test_dev_vec(data_dev,
                                      word_2_id_dict,
                                      dataset=data_set_type)
    test_vec_dict = build_test_dev_vec(data_test,
                                       word_2_id_dict,
                                       dataset=data_set_type)

    # saving embedding matrix
    output = open(store_location + "embedding_matrix.pkl", 'wb')
    pickle.dump(embedding_matrix, output, protocol=2)
    output.close()

    # saving train data
    output = open(store_location + "train_vec_dict.pkl", 'wb')
    pickle.dump(train_vec_dict, output, protocol=2)
    output.close()
    # saving dev data
    output = open(store_location + "dev_vec_dict.pkl", 'wb')
    pickle.dump(dev_vec_dict, output, protocol=2)
    output.close()

    # saving test data
    output = open(store_location + "test_vec_dict.pkl", 'wb')
    pickle.dump(test_vec_dict, output, protocol=2)
    output.close()

    # saving word to id dict
    output = open(store_location + "word_2_id.pkl", 'wb')
    pickle.dump(word_2_id_dict, output, protocol=2)
    output.close()
    # saving id to word dict
    output = open(store_location + "id_2_word.pkl", 'wb')
    pickle.dump(id_2_word_dict, output, protocol=2)
    output.close()
    # saving rel to tail
    output = open(store_location + "relation_2_tail.pkl", 'wb')
    pickle.dump(relation_2_tail, output, protocol=2)
    output.close()
    # saving rel to head
    output = open(store_location + "relation_2_head.pkl", 'wb')
    pickle.dump(relation_2_head, output, protocol=2)
    output.close()
Example No. 5
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import six
import gensim
import random
import torch
import numpy as np
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.42B.300d.txt', 'word2vec.txt')
embed_lookup = KeyedVectors.load_word2vec_format("word2vec.txt")


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, list):
            return text  # a list of Python 3 str objects is already unicode, return it unchanged
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):  # Python 2's unicode type
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")
Example No. 6
import numpy as np
import gensim
from gensim import utils
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import nltk
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import os

stop_words = stopwords.words('english')

print("--- loading pre-trained model ---")
glove2word2vec(glove_input_file="glove.6B.300d.txt",
               word2vec_output_file="gensim_glove_vectors.txt")
model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt",
                                          binary=False)
print("--- DONE loading model ---")


def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc
           if word.isalpha()]  #restricts string to alphabetic characters only
    return doc


def filter_docs(corpus, labels, condition_on_doc):
    # keep only the documents (and their matching labels) for which condition_on_doc(doc) is true;
    # this body is a minimal completion of the truncated snippet
    labels = [label for doc, label in zip(corpus, labels) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]
    return corpus, labels
Example No. 7
from gensim.scripts.glove2word2vec import glove2word2vec
import os

CWD = os.getcwd()
glove2word2vec(CWD + "\\glove\\vectors.txt", CWD + "\\glove\\word2vec.txt")
Example No. 8
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 22 19:21:32 2018

@author: RAJDEEP PAL
"""

from gensim.scripts.glove2word2vec import glove2word2vec

glove_path = 'F:/year 2/hpg/project/attribute_embedding/GloVe/glove.6B.300d.txt'
w2v_path = 'F:/year 2/hpg/project/attribute_embedding/GloVe/glove.6B.300d.txt.word2vec'
glove2word2vec(glove_path, w2v_path)

#%%

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(w2v_path, binary=False)

results = model.most_similar(positive=['woman', 'king'],
                             negative=['man'],
                             topn=3)
print(results)

#%%

words = list(model.vocab)
print(len(words))

print(model['walk'])
print('brush_teeth' in words)
Example No. 9
# taken from https://stackoverflow.com/a/47465278

from gensim.scripts.glove2word2vec import glove2word2vec

from embeddings.glove import original_path, gensim_path

glove2word2vec(glove_input_file=original_path, word2vec_output_file=gensim_path)
Example No. 10
from gensim.scripts.glove2word2vec import glove2word2vec

path_old = '../data/glove.42B.300d.txt'
path_new = '../data/glove.42B.300d_gensim.txt'
glove2word2vec(path_old, path_new)
Example No. 11
def glove2word():
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec("/users5/yjtian/Downloads/glove.840B.300d.txt",
                   "/users5/yjtian/Downloads/glove.840B.300d.w2v.txt")
Example No. 12
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.6B.300d.txt', 'test.txt')
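For reference, gensim 4.0+ can load a GloVe text file directly, so the conversion above becomes optional; a minimal sketch, assuming gensim >= 4.0:

from gensim.models import KeyedVectors

# GloVe text files lack the "<vocab_size> <dim>" header of the word2vec text format;
# no_header=True (added in gensim 4.0) makes the loader infer both, so glove2word2vec is not needed.
vectors = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)
print(vectors.most_similar('king', topn=3))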
Example No. 13
import pickle
from pathlib import Path

import nltk
import predict_emotion
from predict_emotion import take_image_classify_emotion
from bio_summarization import summarize_doc
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# Database and save_camera_config are provided by the project's own modules (not shown)

nltk.download('averaged_perceptron_tagger')

with open('fairytale.pkl', 'rb') as file:
    lm = pickle.load(file)

unzipped_folder = "glove.twitter.27B/"  # ENTER THE PATH TO THE UNZIPPED `glove.twitter.27B` HERE

# use glove2word2vec to convert GloVe vectors in text format into the word2vec text format:
if not Path('gensim_glove_vectors_200.txt').exists():
    # assumes you've downloaded and extracted the glove stuff
    glove2word2vec(glove_input_file=unzipped_folder +
                   "glove.twitter.27B.200d.txt",
                   word2vec_output_file="gensim_glove_vectors_200.txt")

# read the word2vec txt to a gensim model using KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors_200.txt",
                                                binary=False)


class UI:
    def __init__(self):
        self.db = Database()
        self.db.load('celebrities.pkl')
        self.logged_in = False
        save_camera_config(0, exposure=0.2)

    def forgetful_new_person(self, db, new_name):
Example No. 14
import codecs

import numpy as np
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile


def build_vocab_glove(vocab, glove_file, embedding_dim, binary,
                      pre_trained_vocab_embedding_file):

    vocab_size = vocab.get_vocab_size()

    # init
    vocab_embedded = np.random.uniform(-0.25, 0.25,
                                       (vocab_size, embedding_dim))

    # load any vectors from the word2vec
    print("Load glove file {}\n".format(glove_file))

    pad_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    unk_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    sos_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))
    eos_embedded = np.random.uniform(-0.25, 0.25, (embedding_dim, ))

    # load any vectors from the word2vec
    print("Load glove file: {} to gensim model. \n".format(glove_file))

    # fname, fvocab=None, binary=False, encoding='utf8'

    glove_file = datapath(glove_file)
    tmp_file = get_tmpfile("tmp_word2vec.txt")

    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_file, tmp_file)

    glove_model = KeyedVectors.load_word2vec_format(fname=tmp_file)

    out_of_vocab_count = 0
    out_of_vocab_words = []

    if binary:
        save_f = codecs.open(pre_trained_vocab_embedding_file,
                             'w',
                             encoding='utf-8')
    else:
        save_f = codecs.open(pre_trained_vocab_embedding_file,
                             'wb',
                             encoding='utf-8')

    header = "%d %d\n" % (vocab_size, embedding_dim)
    # write header
    save_f.write(header)

    for id, word in vocab.idx2word.items():
        if id == vocab.padid:
            word_embedded = pad_embedded
        elif id == vocab.sosid:
            word_embedded = sos_embedded
        elif id == vocab.eosid:
            word_embedded = eos_embedded
        elif id == vocab.unkid:
            word_embedded = unk_embedded
        else:
            try:
                word_embedded = glove_model[word]  # KeyedVectors is indexed directly; no .wv needed
            except KeyError:
                out_of_vocab_words.append(word)
                out_of_vocab_count += 1
                word_embedded = unk_embedded

        vector_str = ' '.join([str(s) for s in word_embedded])
        save_f.write('%s %s\n' % (word, vector_str))

        vocab_embedded[id] = word_embedded

    save_f.close()
    del glove_model

    return vocab_embedded, out_of_vocab_count, out_of_vocab_words
Example No. 15
 def __init__(self):
     glove_file = 'glove.840B.300d.txt'
     tmp_file = "glove.840B.300d.tmp"
     glove2word2vec(glove_file, tmp_file)
     self.model = KeyedVectors.load_word2vec_format(tmp_file)
Example No. 16
def glove_word2vec():
	glove_input_file = 'glove.6B.100d.txt'
	word2vec_output_file = 'glove2word2vec.txt'
	glove2word2vec(glove_input_file, word2vec_output_file)
Example No. 17
    def generate_words(self, string):
        """Generate similar words by vector proximity, based on the n-dimensional GloVe embeddings."""

        # declare the n-dimensional GloVe dataset
        glove_file = 'data/sampled_glove.6B.50d.txt'
        word2vec_file = 'data/sampled_word2vec-glove.6B.50d.txt'
        file = pathlib.Path(word2vec_file)

        # check whether it already exists on disk
        if file.exists():
            # if it exists, there is no need to generate it again
            print('word2vec_file {} already exists, loading it instead of regenerating'.format(word2vec_file))
        else:
            # otherwise, generate it
            print('word2vec_file {} doesn\'t exist, generating a new one'.format(word2vec_file))
            glove2word2vec(glove_file, word2vec_file)

        # load the model
        model = KeyedVectors.load_word2vec_format(word2vec_file)

        # stop-word list to filter out
        spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
        word = ''
        string_splitted = string.split()

        # go word by word; needed when the input string contains several words
        for i in range(len(string_splitted)):
            if(string_splitted[i].lower() not in spacy_stopwords):
                word = string_splitted[i]

        # get the 100 most similar words by vector proximity
        list_words = []
        if word != '':
            try:
                list_words = model.most_similar(positive=[word], topn=100)
            except KeyError:
                # the word is not in the embedding vocabulary
                list_words = []

        # initialize both lists with the word itself
        list_close_words = [word]
        list_far_words = [word]

        # pick the 4 multiple-answer options (one or more correct answers)
        count = 0
        for i in range(len(list_words)):
            if word not in list_words[i][0]:
                list_close_words.append(list_words[i][0])
                count = count + 1
            if count == 3:
                break

        list_words.reverse()

        # pick the 4 single-answer options (exactly one correct answer)
        count = 0
        i = 0
        for i in range(len(list_words)):
            if word not in list_words[i][0]:
                list_far_words.append(list_words[i][0])
                count = count + 1
            if count == 3:
                break

        dict_words = {}
        dict_words['list_far_words'] = list_far_words
        dict_words['list_close_words'] = list_close_words

        return dict_words
Example No. 18
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from nltk import word_tokenize

# use the local GloVe file directly: datapath() only resolves files bundled in gensim's test-data folder
glove_file = './glove.6B.50d.txt'
tmp_file = get_tmpfile("glove_word2vec.txt")
glove2word2vec(glove_file, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)

def sent_embedding(sentence,model=model):
    #  add check for 1) empty sentence 2) sentence containing all words which are out of vocab.
    tokens = [w for w in word_tokenize(sentence.lower()) if w.isalpha()]
    sent_emb = np.mean([model[t] if t in model else model['unk'] for t in tokens ],axis=0)
    return sent_emb
    # print(sent_emb)
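A possible sketch of the checks the comment above asks for (empty input, or every token out of vocabulary); the helper name and the zero-vector fallback are illustrative additions, not part of the original:

def sent_embedding_safe(sentence, model=model, dim=50):
    # same tokenization as above: lowercase, alphabetic tokens only
    tokens = [w for w in word_tokenize(sentence.lower()) if w.isalpha()]
    # keep only in-vocabulary tokens
    vecs = [model[t] for t in tokens if t in model]
    if not vecs:
        # empty sentence or all tokens OOV: fall back to 'unk' if present, else a zero vector
        return model['unk'] if 'unk' in model else np.zeros(dim)
    return np.mean(vecs, axis=0)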
Example No. 19
from pathlib import Path

import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

from configuration.config import data_dir

glove_file = Path(data_dir)/'Tencent_AILab_ChineseEmbedding_for_el.txt'
glove_file = datapath(glove_file)

w2v_file = get_tmpfile(Path(data_dir)/'tmpfile')
glove2word2vec(glove_file, w2v_file)

m = gensim.models.KeyedVectors.load_word2vec_format(w2v_file)
m.save(str(Path(data_dir)/'tencent_embed_for_el2019'))

Example No. 20
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

import pyemblib
import gensim
import os

parent = os.path.abspath("../../embeddings/") 
path = os.path.abspath("../../embeddings/GoogleNews-vectors-negative300.bin")
glove = os.path.abspath("../../embeddings/glove.840B.300d.txt") 
wikitext_path = os.path.abspath("../../embeddings/wiki-news-300d-1M-subword.vec") 

# gensim working. 
# google_news = KeyedVectors.load_word2vec_format(path, binary=True)
glove2word2vec(glove_input_file=glove, word2vec_output_file=os.path.join(parent, "glove.840B.300d_Word2Vec_format.txt"))


# pyemblib not working. 
# embedding = pyemblib.read(path, format='Word2Vec', mode=pyemblib.Mode.Binary)



wikitext = pyemblib.read(wikitext_path, format='Word2Vec', mode=pyemblib.Mode.Text)
print("Successful read.")



Example No. 21
import os

import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

PATH_TO_WORD2VEC = os.path.expanduser(
    "~/data/word2vec/GoogleNews-vectors-negative300.bin")
PATH_TO_GLOVE = os.path.expanduser("~/data/glove/glove.840B.300d.txt")

word2vec = gensim.models.KeyedVectors.load_word2vec_format(PATH_TO_WORD2VEC,
                                                           binary=True)

# In[7]:

tmp_file = "/tmp/glove.840B.300d.w2v.txt"
glove2word2vec(PATH_TO_GLOVE, tmp_file)
glove = gensim.models.KeyedVectors.load_word2vec_format(tmp_file)

# In[1]:

import csv

PATH_TO_FREQUENCIES_FILE = "data/sentence_similarity/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "data/sentence_similarity/doc_frequencies.tsv"


def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader:
            # each row is expected to be "<token>\t<frequency>"
            frequencies[row[0]] = int(row[1])
    return frequencies
Example No. 22
import numpy as np

print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/content/glove.6B.100d.txt')
word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

model.most_similar('post')

result = model.most_similar(positive=['post', 'tweet'])
print("{}: {:.4f}".format(*result[0]))

Example No. 23
def loadEmbedding(filename):
    # load the GloVe model, convert it to word2vec format, then load the word2vec model
    word2vec_temp_file = 'word2vec_temp.txt'
    glove2word2vec(filename, word2vec_temp_file)
    model = KeyedVectors.load_word2vec_format(word2vec_temp_file)
    return model
Example No. 24
import configparser

from keras.models import Sequential
from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from train_am_pm_helper import *

labeled_examples, unlabeled_examples = parse_labeled_unlabeled_examples()
am_pm_set = parse_am_pm_set(labeled_examples)
train_am_pm_set, test_am_pm_set = train_test_split_am_pm_set(am_pm_set)
merged_am_pm_set = construct_merged_am_pm_set(train_am_pm_set, 1)

config = configparser.ConfigParser()
config.read("../config.ini")
glove_input_file = config["Paths"]["glove_input_file"]
word2vec_output_file = glove_input_file + '.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

glove_model = KeyedVectors.load_word2vec_format(
    word2vec_output_file, binary=False)

max_features = 20000
sequence_length = 300
tokenizer = Tokenizer(num_words=max_features, split=' ',
                      oov_token='<unw>', filters=' ')
texts = []
for hour in range(12):
    for sent, lab in am_pm_set[hour]:
        texts.append(clean_str(sent))
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
Example No. 25
def get_vocab(vocab_path="data/vocab.txt"):
    # reconstructed wrapper: the name matches the get_vocab() call referenced below,
    # and the path argument is a guess; only the body below appeared in the original snippet
    with open(vocab_path) as fin:
        lines = fin.read().split("\n")
        lines = [l.split()[0] for l in lines if len(l) > 0]

    return set(lines)




if __name__ == "__main__":
    import time

    from gensim.scripts.glove2word2vec import glove2word2vec
    #voc = get_vocab()
    my_glove_file = "data/my-glove.txt"
    #get_embeddings("/home/diesel/Projects/Datasets/Datasets/glove_data/glove.6B/glove.6B.300d.txt", my_glove_file, voc)
    t0 = time.time()
    glove_w2v_file = "data/glove-w2v.txt"
    glove2word2vec(my_glove_file, glove_w2v_file)

    W2vecextractor = Word2vecExtractor(glove_w2v_file)


    t1 = time.time()
    print("done loading word vectors: ", (t1 - t0) / 60.0)
    doc = "A fisherman was catching fish by the sea. A monkey saw him, and wanted to imitate what he was doing. The man went away into a little cave to take a rest, leaving his net on the beach. The monkey came and grabbed the net, thinking that he too would go fishing. But since he didn't know anything about it and had not had any training, the monkey got tangled up in the net, fell into the sea, and was drowned. The fisherman seized the monkey when he was already done for and said, 'You wretched creature! Your lack of judgment and stupid behaviour has cost you your life!'"

    feature_dict = W2vecextractor.get_doc2vec_feature_dict(doc)
    print(feature_dict)

    t2 = time.time()
    print("execution time:", (t1 - t0) / 60.0)

Example No. 26
# Usage: python3 generate_pretrained.py <pretrained_file>
import os
import sys

import gensim
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec

vocab = {}
embedding_size = 300
path_to_bin = sys.argv[1]

with open("../dataset/CoNLL-2003/vocab_dict") as f:
    for line in f:
        pairs = line.strip().split()
        vocab[pairs[0]] = int(pairs[1])  # word: index

if "glove" in path_to_bin:
    word2vec_output_file = path_to_bin + '.word2vec'
    if not os.path.exists(word2vec_output_file):
        glove2word2vec(path_to_bin, word2vec_output_file)
    model = gensim.models.KeyedVectors.load_word2vec_format(
        word2vec_output_file, binary=False)
else:
    model = gensim.models.KeyedVectors.load_word2vec_format(path_to_bin,
                                                            binary=True)

word_embedding = np.zeros((len(vocab), embedding_size))

for word, index in vocab.items():
    try:
        word_embed = np.asarray(model[word])
    except KeyError:
        word_embed = 2 * np.random.rand(embedding_size, ) - 1.0  # [-1, 1]

    word_embedding[index] = word_embed
Example No. 27
from gensim.scripts.glove2word2vec import glove2word2vec
import sys

if __name__ == "__main__":
    input_vec = sys.argv[1]
    gensim_vec = sys.argv[2]

    glove2word2vec(glove_input_file=input_vec, word2vec_output_file=gensim_vec)
Example No. 28
def setupfiles():
    glove2word2vec(glove_path, word2vec_output_file)
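glove_path and word2vec_output_file above are module-level globals that the snippet does not show; a self-contained variant might look like this (the default paths are only examples):

from gensim.scripts.glove2word2vec import glove2word2vec

def setupfiles(glove_path='glove.6B.100d.txt',
               word2vec_output_file='glove.6B.100d.w2v.txt'):
    # one-off conversion of the GloVe text file to word2vec text format
    glove2word2vec(glove_path, word2vec_output_file)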
Example No. 29
import os

import numpy as np
import torch
import torch.optim as optim
from tqdm import tqdm
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# RNN, fetch_data and vectorize_data are project-specific helpers (not shown here)


def main(hidden_dim, batch_size):
    global device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if not os.path.exists('glove.6B.50d.w2v.txt'):
        print("w2v file not found, generating...")
        glove2word2vec(glove_input_file='glove.6B.50d.txt',
                       word2vec_output_file='glove.6B.50d.w2v.txt')
    global w2v
    w2v = KeyedVectors.load_word2vec_format('glove.6B.50d.w2v.txt',
                                            binary=False)

    print("Fetching data...")
    train_data, valid_data = fetch_data(
    )  # X_data is a list of pairs (document, y); y in {0,1,2,3,4}

    model = RNN(50, hidden_dim, 5, batch_size)
    model.double()
    model.to(device)  # move to GPU only when one is available

    print("Vectorizing data...")
    train_vecs, train_labs = vectorize_data(train_data)
    valid_vecs, valid_labs = vectorize_data(valid_data)
    print("Finished vectorizing data")

    optimizer = optim.SGD(model.parameters(),
                          lr=0.01,
                          momentum=0.9,
                          nesterov=False)
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    iters = 10
    while iters > 0:  # How will you decide to stop training and why
        model.train()
        optimizer.zero_grad()
        minibatch_size = 16
        N = len(train_data)
        perm = np.random.permutation(N)
        train_vecs = [train_vecs[i] for i in perm]
        train_labs = train_labs[perm]
        total = 0
        correct = 0
        epoch = 10 - iters
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                gold_label = train_labs[minibatch_index * minibatch_size +
                                        example_index].long()
                predicted_vector = model(
                    train_vecs[minibatch_index * minibatch_size +
                               example_index].to(device))
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(
                    predicted_vector.view(1, -1),
                    torch.tensor([gold_label]).to(device))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()

        optimizer.zero_grad()
        N = len(valid_data)
        total = 0
        correct = 0
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                gold_label = valid_labs[minibatch_index * minibatch_size +
                                        example_index].long()
                predicted_vector = model(
                    valid_vecs[minibatch_index * minibatch_size +
                               example_index].to(device))
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(
            epoch + 1, correct / total))
        #scheduler.step()
        iters -= 1
Example No. 30
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# in_docker is expected to be defined elsewhere (e.g. read from the environment)


def get_predictive_model(classifier):
    # set directories based on run-time environment
    if in_docker == 'True':
        model_dir = '/data/models/'
        data_dir = '/data/data/'
    else:
        model_dir = 'models/'
        data_dir = 'data/'

    # get model and convert to w2v
    glove_input_file = model_dir + 'w2v_glove_300.txt'

    word2vec_output_file = '/tmp/w2v.txt'
    glove2word2vec(glove_input_file, word2vec_output_file)
    model = KeyedVectors.load_word2vec_format(word2vec_output_file,
                                              binary=False)

    # get stop words

    sw = data_dir + "/stopwords.txt"
    with open(sw) as f:
        stop_words = f.read().splitlines()

    def get_sentence_vector(sentence):
        word_list = word_tokenize(sentence)
        word_list = [
            word.lower() for word in word_list
            if word.lower() not in stop_words
        ]
        word_vectors = []

        for x in word_list:
            try:
                w_vec = model.get_vector(x)
                word_vectors.append(w_vec)
            except KeyError:
                pass

        return sum(word_vectors) / len(word_vectors)

    # load prepartitioned train/test sets
    test = pd.read_csv(data_dir + "/test.csv")
    train = pd.read_csv(data_dir + "/train.csv")

    test['vec'] = [get_sentence_vector(x) for x in test.text]
    train['vec'] = [get_sentence_vector(x) for x in train.text]

    train_grouped_abbr = train.groupby('abbrev')
    test_grouped_abbr = test.groupby('abbrev')

    # load full data set
    frames = [test, train]
    df = pd.concat(frames)

    # Loop through different abbreviations
    for abbr in train.abbrev.unique():

        train_abbr = train_grouped_abbr.get_group(abbr)
        test_abbr = test_grouped_abbr.get_group(abbr)

        X_train = np.array(list(train_abbr.vec))
        y_train = train_abbr.expansion

        X_test = np.array(list(test_abbr.vec))
        y_test = test_abbr.expansion

        if classifier == 'svm':
            # set up SVM
            clf = SVC(C=1.0, kernel='linear', degree=1).fit(X_train, y_train)

        elif classifier == 'log':
            clf = LogisticRegression().fit(X_train, y_train)

        elif classifier == 'mlp':
            clf = MLPClassifier().fit(X_train, y_train)

        elif classifier == 'bag':
            clf = BaggingClassifier(tree.DecisionTreeClassifier(
                random_state=1)).fit(X_train, y_train)

        elif classifier == 'boost':
            num_trees = 70
            clf = AdaBoostClassifier(n_estimators=num_trees,
                                     random_state=1032).fit(X_train, y_train)

        elif classifier == 'rf':
            clf = RandomForestClassifier().fit(X_train, y_train)

        else:
            print('INVALID OPTION!')

        pred = clf.predict(X_test)
        output_dir = Path(data_dir + "output")
        output_dir.mkdir(parents=True, exist_ok=True)
        (pd.DataFrame({'predictions': pred})).to_csv(
            output_dir / "{}_{}.csv".format(classifier, abbr))

        cm = confusion_matrix(y_test, pred, labels=list(set(df.expansion)))
        print()
        print("MODEL -> ", classifier)
        print("##" * 20)
        print(" " * 20 + abbr)
        print("##" * 20)

        print(classification_report(y_test, pred))
        print()

        print(f'examples (first 5 cases)\t\t\t\t\t\ttrue_abbr\t\t\tpred_abbr')

        # Print first 5 cases
        i = 0
        for input_row, true_abbr, pred_abbr in zip(test_abbr.iterrows(),
                                                   y_test, pred):

            sn_start = max(input_row[1].start - 25, 0)
            sn_end = min(input_row[1].end + 25, len(input_row[1].text))

            example_text = input_row[1].text[sn_start:sn_end]
            print(f'... {example_text} ...\t{true_abbr:<35}\t{pred_abbr}')

            if i == 4:
                break

            i += 1
Example No. 31
inverted_idx = data_dict['inverted_idx']
qlist_seg = data_dict['qlist_seg']
alist = data_dict['alist']
X = data_dict['X']

# load NLTK's built-in English stopwords; the list is fairly generic, so a domain-specific one may work better
sw = set(stopwords.words('english'))
# for questions, these interrogative words should not be removed
sw -= {'who', 'when', 'why', 'where', 'how'}
# also strip a few punctuation-like tokens
sw.update(['\'s', '``', '\'\''])
ps = PorterStemmer()

# convert GloVe to word2vec format (only if not done already), then load it
if not os.path.exists('./data/glove2word2vec.6B.100d.txt'):
    _ = glove2word2vec('./data/glove.6B.100d.txt',
                       './data/glove2word2vec.6B.100d.txt')
model = KeyedVectors.load_word2vec_format(
    './data/glove2word2vec.6B.100d.txt')


def text_preprocessing(text):
    """
    对单条文本进行处理。
    text: str类型

    return: 分词后的list
    """
Example No. 32
 def transform_glove_in_word2vec(glove_input_file: str,
                                 word2vec_output_file: str):
     glove2word2vec(glove_input_file, word2vec_output_file)
Example No. 33
def build_word2vec():
    info('converting from glove to word2vec format')
    glove2word2vec(glove_file, w2v_file)
    info('loading word2vec vectors')
    model = KeyedVectors.load_word2vec_format(w2v_file, binary=False)
    model.save(w2v_model)
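Here too, glove_file, w2v_file, w2v_model and info come from elsewhere in the module; a self-contained sketch with explicit arguments (names and paths are illustrative):

import logging

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

def build_word2vec(glove_file='glove.6B.300d.txt',
                   w2v_file='glove.6B.300d.w2v.txt',
                   w2v_model_path='glove.6B.300d.kv'):
    logging.info('converting from glove to word2vec format')
    glove2word2vec(glove_file, w2v_file)
    logging.info('loading word2vec vectors')
    model = KeyedVectors.load_word2vec_format(w2v_file, binary=False)
    model.save(w2v_model_path)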