def load_w2v(w2vdim):
    model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))
    return model
def load_w2v():
    model_path = '../data/word2vec_twitter_model/word2vec_twitter_model.bin'
    with Timer("load w2v"):
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))
    return model
def load_w2v():
    model_path = '/Users/bong/works/data/word2vec_twitter_model/word2vec_twitter_model.bin'
    with Timer("load w2v"):
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))
    return model
def load_word2vec(self, fname="../data/word2vec_twitter_model/word2vec_twitter_model.bin"):
    """Loads the word vectors and the average embedding over the vocabulary."""
    model = Word2Vec.load_word2vec_format(fname, binary=True)
    average_emb = np.mean([model[w] for w in model.vocab], axis=0)
    self.word_vectors = model
    self.avg_embd = average_emb
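# A minimal usage sketch for the loader above; `lookup` is a hypothetical
# helper, not part of the original class. It shows the usual reason for
# keeping avg_embd around: a fallback vector for out-of-vocabulary words.
def lookup(self, word):
    if word in self.word_vectors.vocab:
        return self.word_vectors[word]
    # OOV tokens map to the mean embedding computed in load_word2vec.
    return self.avg_embd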
def text_to_embeddings_store(model_path, input_file, output_file, vector_dimension):
    """
    Converts input_file to output_file by replacing each word with its
    embedding vector from the model at model_path.
    """
    # TODO: Refactor to call text_to_embeddings
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("Loading spaCy model, this can take some time...")
    nlp = English()
    f1 = open(output_file, 'w')
    zero = np.zeros((vector_dimension,), dtype=np.float64)
    count = 0
    max_length = 0
    # Specify the encoding with io.open (careful: io.open is slow, avoid it
    # for large files). latin-1 is usually the culprit if the files aren't
    # utf-8 encoded.
    with io.open(input_file, "r", encoding='utf-8') as f:
        for line in f:
            doc = nlp(line)
            arr = []
            for token in doc:
                try:
                    embedding = model[token.text]
                except KeyError:
                    # Out-of-vocabulary words get the zero vector.
                    embedding = zero
                arr.append(embedding)
            if len(arr) == 0:
                # Skip lines that produced no tokens and move to the next one.
                count = count + 1
                continue
            rows, cols = np.shape(arr)
            if rows > max_length:
                # Track the maximum number of words in a sentence.
                max_length = rows
            temp = arr[0]
            for i in range(1, rows):
                temp = np.concatenate((temp, arr[i]), axis=0)
            f1.write(' '.join(map(str, temp)))
            f1.write("\n")
    f1.close()
    print("There are " + str(count) + " skipped sentences.")
    print(max_length)
    return max_length
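# A hedged driver for text_to_embeddings_store; the paths and the 400-dim
# setting are assumptions matching the Twitter model used elsewhere in this
# file, not part of the original function.
if __name__ == '__main__':
    max_len = text_to_embeddings_store(
        model_path='../data/word2vec_twitter_model/word2vec_twitter_model.bin',
        input_file='tweets.txt',
        output_file='embedding_vectors_400.txt',
        vector_dimension=400)
    print("Longest sentence: " + str(max_len) + " tokens")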
def load_w2v(w2vdim, sample_test=True):
    if sample_test:
        return {'a': np.array([np.float32(0.0)] * w2vdim)}
    else:
        model_path = '../data/emory_w2v/w2v-%d.bin' % w2vdim
        with Timer("load w2v"):
            model = Word2Vec.load_word2vec_format(model_path, binary=True)
        print("The vocabulary size is: " + str(len(model.vocab)))
        return model
def load_embeddings(filename, binary=False):
    if "word2vec_twitter_model.bin" not in filename:
        model = gensim.models.Word2Vec.load_word2vec_format(filename, binary=binary)
    else:
        # The Twitter model needs the patched reader that ships with it.
        from word2vecReader import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    w2v = dict()
    vocabs = model.vocab.keys()
    print("Vocabulary size before pre-processing: %s." % len(vocabs))
    for key in model.vocab.keys():
        w2v[key] = model[key]
    return w2v
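# Example call (the relative path is an assumption): the returned plain dict
# decouples downstream code from the gensim/word2vecReader model objects.
w2v = load_embeddings('../data/word2vec_twitter_model/word2vec_twitter_model.bin')
some_word = next(iter(w2v))
print("Dimension: %d" % len(w2v[some_word]))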
def train_word2vec(self, domain_corpus, feature_length, sg=1, min_count=5, workers=3):
    """
    Trains a word2vec model on a corpus, by default using the skip-gram model.
    """
    tokenized = [twokenize.tokenize(text) for text in domain_corpus]
    model = Word2Vec(tokenized, size=feature_length, min_count=min_count,
                     sg=sg, workers=workers)
    # Keep only the trained vectors; the full model is no longer needed.
    word_vectors = model.wv
    del model
    self.word_vectors = word_vectors
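# Hypothetical call; `trainer` stands in for whatever object owns the method,
# and `tweets` is an assumed list of raw tweet strings. Note that min_count=5
# drops rare words, so toy corpora may produce an empty vocabulary.
trainer.train_word2vec(domain_corpus=tweets, feature_length=100)
vectors = trainer.word_vectors  # gensim KeyedVectors: vectors['word'], vectors.most_similar(...)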
def load_model(pretrained=True):
    import os
    import datetime
    os.chdir('/Users/lukasmalik/Desktop/Praktikum CSH/project-internship/scripts')
    start_time = datetime.datetime.now()
    if pretrained:
        from word2vecReader import Word2Vec
        os.environ['PYTHONINSPECT'] = 'True'
        model_path = "../models/word2vec_twitter_model.bin"
        print("Loading the model, this can take some time...")
        model = Word2Vec.load_word2vec_format(model_path, binary=True)
        print("The vocabulary size is: " + str(len(model.vocab)))
        print("--- %s seconds ---" % (datetime.datetime.now() - start_time))
        return model
#!/usr/bin/env python
import os
from word2vecReader import Word2Vec

os.environ['PYTHONINSPECT'] = 'True'

model_path = "./word2vec_twitter_model.bin"
print("Loading the model, this can take some time...")
model = Word2Vec.load_word2vec_format(model_path, binary=True)
print("The vocabulary size is: " + str(len(model.vocab)))
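# A quick sanity check after loading; the probe word is an assumption,
# substitute any token expected in the Twitter vocabulary.
probe = 'hello'
if probe in model.vocab:
    print("Vector dimension: " + str(len(model[probe])))
else:
    print("'" + probe + "' is out of vocabulary")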
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))
single_label = np.asarray(labels)
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Standardize the auxiliary features.
timeInfo = preprocessing.StandardScaler().fit_transform(timeInfo)
postInfo = preprocessing.StandardScaler().fit_transform(postInfo)

HAN_pre = []
HAN_reca = []
HAN_f1 = []
HAN_AUC = []

embeddings_index = Word2Vec.load_word2vec_format("word2vec_twitter_model.bin", binary=True)
# print('Total %s word vectors.' % len(embeddings_index))

embedding_matrix = np.random.random((len(word_index) + 1, POST_DIM))
outword_dic = dict()
for word, i in word_index.items():
    if word in embeddings_index.vocab:
        embedding_vector = embeddings_index[word]
        embedding_matrix[i] = embedding_vector
    else:
        # Out-of-vocabulary words get one fixed random vector, reused on repeats.
        new_vector = np.random.rand(POST_DIM, )
        outword_dic.setdefault(word, new_vector)
        embedding_matrix[i] = outword_dic[word]

for j in range(10):
    indices = np.arange(data.shape[0])
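# The matrix built above is typically handed to a frozen Keras Embedding
# layer; a minimal sketch, assuming MAX_SEQ_LENGTH is defined elsewhere:
from keras.layers import Embedding
embedding_layer = Embedding(len(word_index) + 1,
                            POST_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=False)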
word_index_test = tokenizer_test.word_index
print('Found %s unique tokens.' % len(word_index_test))
x_test = pad_sequences(sequences_test, maxlen=12)
print(x_test.shape)

all_sentences = text_
tokenizer = Tokenizer()  # nb_words=MAX_NB_WORDS
tokenizer.fit_on_texts(all_sentences)
sequences = tokenizer.texts_to_sequences(all_sentences)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
x_train = pad_sequences(sequences)
print(x_train.shape)

model = Word2Vec.load_word2vec_format(
    "/home/pengyuan/PycharmProjects/Multimodal_Study/Twitter_word2vec/word2vec_twitter_model.bin",
    binary=True)
pretrained_weights = model.syn0
vocab_size, embedding_size = pretrained_weights.shape
print(vocab_size, embedding_size)

embedding_matrix = np.zeros((len(word_index) + 1, 400))
for word, i in word_index.items():
    if word in model:
        embedding_matrix[i] = model[word]
    else:
        embedding_matrix[i] = np.random.rand(1, 400)[0]

image_data = process_images(df)
print(image_data.shape)
image_data_test = process_images(df_2)
from __future__ import division
import sys
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import scale

sys.path.append('models/word2vec_twitter_model')
from word2vecReader import Word2Vec

model_path = 'models/word2vec_twitter_model/word2vec_twitter_model.bin'
print('Loading the model...')
model = Word2Vec.load_word2vec_format(model_path, binary=True)


def preprocess(text):
    # Pad punctuation with spaces so it splits off as separate tokens.
    special_chars = """.,?!:;(){}[]#"""
    for c in special_chars:
        text = text.replace(c, ' %s ' % c)
    words = text.lower().split()
    return words


def get_vector(text, model=model, size=400):
    # Average the embeddings of the in-vocabulary words in the text.
    words = preprocess(text)
    vec = np.zeros(size)
    count = 0.
    for word in words:
        try:
            vec += model[word]
            count += 1.
        except KeyError:
            continue
    if count:
        vec /= count
    return vec
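# The imports above imply the downstream pipeline; a hedged sketch, assuming
# `texts` (list of strings) and `labels` (array) are loaded elsewhere:
X = scale(np.array([get_vector(t) for t in texts]))
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))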
def load_w2v():
    model_path = '../data/word2vec_twitter_model/word2vec_twitter_model.bin'
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print("The vocabulary size is: " + str(len(model.vocab)))
    return model