Example #1
def load_w2v(w2vdim):
    # return {}
    model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))

    return model
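Note that Word2Vec.load_word2vec_format is the pre-1.0 gensim API (later examples use the bundled word2vecReader module instead); in gensim 4.x the same binary file is opened through KeyedVectors. A minimal sketch, assuming a current gensim install and a 100-dimensional file at the path used above:

from gensim.models import KeyedVectors

# gensim >= 4.0: load_word2vec_format lives on KeyedVectors, and .vocab became .key_to_index
model_path = '../data/emory_w2v/w2v-100.bin'  # assumes w2vdim == 100
model = KeyedVectors.load_word2vec_format(model_path, binary=True)
print("The vocabulary size is: " + str(len(model.key_to_index)))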
Example #2
def load_w2v():
    model_path = '../data/word2vec_twitter_model/word2vec_twitter_model.bin'
    with Timer("load w2v"):
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
        print("The vocabulary size is: " + str(len(model.vocab)))

    return model
Example #3
def load_w2v(w2vdim):
    # return {}
    model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))

    return model
Example #4
def load_w2v():
    model_path = '/Users/bong/works/data/word2vec_twitter_model/word2vec_twitter_model.bin'
    with Timer("load w2v"):
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
        print("The vocabulary size is: " + str(len(model.vocab)))

    return model
Example #5
 def load_word2vec(self, fname="../data/word2vec_twitter_model/word2vec_twitter_model.bin"):
     """
     Loads the word vectors.
     """
     model = Word2Vec.load_word2vec_format(fname, binary=True)
     average_emb = np.mean([model[w] for w in model.vocab], axis=0)
     self.word_vectors = model
     self.avg_embd = average_emb
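The average embedding stored above is typically kept as a fallback for out-of-vocabulary tokens. A small sketch of such a lookup on the same class; the method name get_vector is hypothetical and not part of the original code:

 def get_vector(self, word):
     # Known token: return its stored vector; unknown token: fall back to the average embedding.
     if word in self.word_vectors.vocab:
         return self.word_vectors[word]
     return self.avg_embd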
Example #6
def text_to_embeddings_store(model_path, input_file, output_file, vector_dimension):
    """
    Takes an input_file and converts it to an output_file by replacing each word with its embedding vector from the model at model_path.
    """
    # TODO: Refactor to call to text_to_embeddings
        #model_path = args.model_path_
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("Loafing spaCy model, this can take some time...")
    nlp=English()
    #print(("The vocabulary size is: "+str(len(model.vocab))))
    #print("Vector for 'Shubham': " + str(model['Shubham']))
    #print("Embedding dimension: " + str(len(model['Shubham'])))
    #f1=open("embedding_vectors_400.txt","w")
    f1 = open(output_file, 'w')
    zero = np.zeros((vector_dimension,), dtype=float)
    #Specify encoding with io.open (careful io.open is slow, don't use for large files)
    #latin-1 is usually the culprit if the files aren't utf-8 encoded
    #with io.open("dataset_latin-1.txt", "r", encoding='latin-1') as f:
    count = 0
    max_length = 0
    with io.open(input_file, "r", encoding='utf-8') as f:
        for line in f:
            #spaCy would do this better :)
            #row=line.split()
            doc = nlp(line)                
            arr = []
            #for i in range(0,len(doc)):
            for token in doc:
                try:
                    embedding = model[token.text]
                    #print("Success for:\t" + token.text)
                except KeyError:
                    #print("Fail for:\t" + token.text)
                    # out-of-vocabulary token: fall back to the zero vector
                    embedding = zero
                #temp=str(model[row[i]])
                #temp.replace('\n',' ')
                #f1.write(temp)
                arr.append(embedding)
                #TODO: write as one line using join method
                #f1.write(str(embedding))
                #f1.write(" ")
            if len(arr) == 0:  # skip lines that yielded no tokens and move on to the next one
                count = count + 1
                continue
            rows, cols = np.shape(arr)
            temp = arr[0]
            if rows > max_length:  # maximum number of words seen in a sentence
                max_length = rows
            for i in range(1, rows):
                temp = np.concatenate((temp, arr[i]), axis=0)
            rand=' '.join(map(str,temp))
            f1.write(rand)
            f1.write("\n")
    print("There are"+str(count)+"out of vocabulary sentences.")
    print(max_length)   
    return max_length
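A possible call to the function above; the input and output file names are assumptions, and the 400-dimensional Twitter model matches the commented-out hints in the code:

# Hypothetical usage; adjust the paths to your own data.
max_len = text_to_embeddings_store(
    model_path='../data/word2vec_twitter_model/word2vec_twitter_model.bin',
    input_file='tweets.txt',
    output_file='embedding_vectors_400.txt',
    vector_dimension=400)
print("Longest sentence seen: " + str(max_len) + " tokens")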
Example #7
def load_w2v(w2vdim, sample_test=True):
    if sample_test:
        return {'a': np.array([np.float32(0.0)] * w2vdim)}

    else:
        model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
        with Timer("load w2v"):
            model = Word2Vec.load_word2vec_format(model_path, binary=True)
            print("The vocabulary size is: " + str(len(model.vocab)))

        return model
Example #8
def load_w2v(w2vdim, sample_test=True):
    if sample_test:
        return {'a': np.array([np.float32(0.0)] * w2vdim)}

    else:
        model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
        with Timer("load w2v"):
            model = Word2Vec.load_word2vec_format(model_path, binary=True)
            print("The vocabulary size is: " + str(len(model.vocab)))

        return model
Example #9
def load_embeddings(filename, binary=False):
    if "word2vec_twitter_model.bin" not in filename:
        model = gensim.models.Word2Vec.load_word2vec_format(filename, binary=binary)
    else:
        from word2vecReader import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    w2v = dict()
    vocabs = model.vocab.keys()
    print("Vocabulary size before pre-processing: %s." % len(vocabs))

    for key in model.vocab.keys():
        w2v[key] = model[key]

    return w2v
Example #10
def load_embeddings(filename, binary=False):
    if "word2vec_twitter_model.bin" not in filename:
        model = gensim.models.Word2Vec.load_word2vec_format(filename,
                                                            binary=binary)
    else:
        from word2vecReader import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    w2v = dict()
    vocabs = model.vocab.keys()
    print("Vocabulary size before pre-processing: %s." % len(vocabs))

    for key in model.vocab.keys():
        w2v[key] = model[key]

    return w2v
Example #11
 def train_word2vec(
     self,
     domain_corpus,
     feature_length,
     sg=1,
     min_count=5,
     workers=3
 ):
     """
     Trains the word2vec model on a corpus, by default using the skip-gram
     model.
     """
     tokenized = [twokenize.tokenize(text) for text in domain_corpus]
     # pass the requested dimensionality and skip-gram flag through to gensim (pre-4.0 API uses size=)
     model = Word2Vec(tokenized, size=feature_length, min_count=min_count,
                      sg=sg, workers=workers)
     word_vectors = model.wv
     del model
     self.word_vectors = word_vectors
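Once trained, the KeyedVectors object stored in self.word_vectors can be queried directly; a short sketch, with the query word chosen only for illustration:

# Nearest neighbours by cosine similarity in the trained space.
print(self.word_vectors.most_similar('good', topn=5))
# Raw embedding for a single token; a KeyError means the token was pruned by min_count.
vec = self.word_vectors['good']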
Example #12
def load_model(pretrained=True):
    import os
    os.chdir(
        '/Users/lukasmalik/Desktop/Praktikum CSH/project-internship/scripts')
    import datetime
    start_time = datetime.datetime.now()
    if pretrained:
        import os
        import sys
        from word2vecReader import Word2Vec

        os.environ['PYTHONINSPECT'] = 'True'

        model_path = "../models/word2vec_twitter_model.bin"
        print("Loading the model, this can take some time...")
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
        print("The vocabulary size is: " + str(len(model.vocab)))
        print("--- %s seconds ---" % (datetime.datetime.now() - start_time))
        return model
    raise ValueError("load_model only supports pretrained=True; no training path is implemented")
#!/usr/bin/env python

import os
import sys
from word2vecReader import Word2Vec

os.environ['PYTHONINSPECT'] = 'True'

model_path = "./word2vec_twitter_model.bin"
print("Loading the model, this can take some time...")
model = Word2Vec.load_word2vec_format(model_path, binary=True)
print("The vocabulary size is: " + str(len(model.vocab)))
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
single_label = np.asarray(labels)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

###standardize
timeInfo = preprocessing.StandardScaler().fit_transform(timeInfo)
postInfo = preprocessing.StandardScaler().fit_transform(postInfo)
HAN_pre = []
HAN_reca = []
HAN_f1 = []
HAN_AUC = []
embeddings_index = Word2Vec.load_word2vec_format("word2vec_twitter_model.bin",
                                                 binary=True)

# print('Total %s word vectors.' % len(embeddings_index))
embedding_matrix = np.random.random((len(word_index) + 1, POST_DIM))
outword_dic = dict()
for word, i in word_index.items():
    if word in embeddings_index.vocab:
        embedding_vector = embeddings_index[word]
        embedding_matrix[i] = embedding_vector
    else:
        new_vector = np.random.rand(POST_DIM, )
        outword_dic.setdefault(word, new_vector)
        embedding_matrix[i] = outword_dic[word]

for j in range(10):
    indices = np.arange(data.shape[0])
word_index_test = tokenizer_test.word_index
print('Found %s unique tokens.' % len(word_index_test))
x_test = pad_sequences(sequences_test, maxlen=12)
print(x_test.shape)

all_sentences = text_
tokenizer = Tokenizer()  # nb_words=MAX_NB_WORDS
tokenizer.fit_on_texts(all_sentences)
sequences = tokenizer.texts_to_sequences(all_sentences)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

x_train = pad_sequences(sequences)
print(x_train.shape)
model = Word2Vec.load_word2vec_format(
    "/home/pengyuan/PycharmProjects/Multimodal_Study/Twitter_word2vec/word2vec_twitter_model.bin",
    binary=True)
pretrained_weights = model.syn0
vocab_size, embedding_size = pretrained_weights.shape
print(vocab_size, embedding_size)
embedding_matrix = np.zeros((len(word_index) + 1, 400))
for word, i in word_index.items():
    if word in model:
        embedding_matrix[i] = model[word]
    else:
        embedding_matrix[i] = np.random.rand(1, 400)[0]
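
# Hypothetical follow-up (an assumption, not shown in the original listing): the
# embedding_matrix built above would typically initialise a frozen Keras Embedding layer.
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1, 400,
                            weights=[embedding_matrix],
                            input_length=x_train.shape[1],
                            trainable=False)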

image_data = process_images(df)
print(image_data.shape)
image_data_test = process_images(df_2)
from __future__ import division
import sys
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

sys.path.append('models/word2vec_twitter_model')
from word2vecReader import Word2Vec

model_path = 'models/word2vec_twitter_model/word2vec_twitter_model.bin'

print('Loading the model...')
model = Word2Vec.load_word2vec_format(model_path, binary=True)

def preprocess(text):
    special_chars = """.,?!:;(){}[]#"""
    for c in special_chars:
        text = text.replace(c, ' %s '%c)
    words = text.lower().split()

    return words

def get_vector(text, model=model, size=400):
    words = preprocess(text)
    vec = np.zeros(size)
    count = 0.
    for word in words:
        try:
            vec += model[word]
            count += 1.
        except KeyError:  # skip out-of-vocabulary tokens
            continue
    # assumed completion of the truncated excerpt: average over the in-vocabulary tokens
    if count != 0:
        vec /= count
    return vec
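The imports at the top of this script (train_test_split, scale, LogisticRegression, classification_report) point to a classification step on top of the averaged vectors. A hedged sketch of that step, where texts and y stand in for data not shown in the excerpt:

# Hypothetical continuation: average word vectors per text, then train and evaluate the classifier.
X = scale(np.array([get_vector(t) for t in texts]))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))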
Example #17
def load_w2v():
    model_path = '../data/word2vec_twitter_model/word2vec_twitter_model.bin'
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))

    return model