def test_pythonObj(self):
    """
    :return:
    """
    class A():
        def __init__(self):
            self.b1 = B()
            self.b2 = B()
            self.list = [1000, 23424.2, 'asdf0', u'unicode编码', self.b1]
            self.dic = {
                132323423412312311: 'utf8编码',
                '232': self.b2,
                self.b2: set([1, 2]),
                u'unicode编码': None,
                123: (11, 1, 111),
                11: (11, 1, 111),
            }

    class B():
        def __init__(self):
            self.none = None
            self.str = '1111'
            self.int = 15151515151515155151
            self.float = 11231231231212342323.
            self.list = [1, 2, 3, 4]
            self.dict = {1: 2, 2: 3}
            self.tuple = (1, 2, 3, 4)

    glove = Glove(A())
    glove.meaure()  # 'meaure' [sic]: spelled this way throughout this suite
    print(glove.report)
def word_embedding(sentences, embedding_size, windows_len):
    """
    Build a GloVe model over the given sentences and return the model
    together with its dictionary and inverse dictionary.
    """
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=windows_len)

    # NOTE: epochs=0 performs no training updates, so the word vectors
    # stay at their random initialisation.
    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix, epochs=0, no_threads=2)

    corpus_dict = corpus_model.dictionary
    corpus_inverse_dict = dict(map(reversed, corpus_dict.items()))

    return glove_model, corpus_dict, corpus_inverse_dict
class MyGloVe:
    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        first = self.__average_feature_vector(first_text)
        second = self.__average_feature_vector(second_text)
        return 1 - spatial.distance.cosine(first, second)

    def __read_corpus(self, input_corpus):
        for line in input_corpus:
            yield line

    def __average_feature_vector(self, text):
        words = text.split()
        words_no = 0
        feature_vector = numpy.zeros((100,), dtype="float32")
        for word in words:
            if word in self.glove.dictionary:
                word_idx = self.glove.dictionary[word]
                words_no += 1
                feature_vector = numpy.add(feature_vector,
                                           self.glove.word_vectors[word_idx])
        if words_no > 0:
            feature_vector = numpy.divide(feature_vector, words_no)
        return feature_vector
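A minimal usage sketch for the class above, assuming `Corpus` and `Glove` come from glove_python, `spatial` from scipy, and `numpy` are imported at module level; the tokenised corpus here is illustrative, not from the original repo.

# Hypothetical usage of MyGloVe; each corpus entry is a token list.
corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['cats', 'and', 'dogs']]
model = MyGloVe()
model.initiate_model(corpus)
print(model.cosine_similarity('the cat', 'the dog'))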
def __init__(self):
    # Corpus model built from a pre-existing vocabulary.
    vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
    self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)

    # Model
    self.glove = Glove(no_components=args.no_components,
                       learning_rate=args.learning_rate)
def build_model_glove(args):
    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
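A hedged invocation sketch for the function above; `CONFIG` and `get_sentences()` are assumed to be defined in the surrounding module, and the file paths are illustrative.

# Hypothetical call; the corpus is rebuilt only when an input file is
# newer than the cached corpus model.
from argparse import Namespace

args = Namespace(input=['corpus.txt'], corpus_model='corpus.model',
                 workers=4, verbose=True)
glove = build_model_glove(args)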
def Myself_Model(self, corpus_path, save=None, back_corpus=None, epochs=10,
                 no_threads=8, no_components=100, learning_rate=0.05):
    """
    Train a GloVe model on the corpus at corpus_path and optionally save it.
    """
    self.get_data = self.read_corpus(corpus_path)
    corpus_model = Corpus()
    corpus_model.fit(self.get_data, window=10)

    # Caution: this yield turns the whole method into a generator, so
    # callers must iterate it to drive the training steps below.
    if back_corpus is not None:
        yield corpus_model

    self.glove = Glove(no_components=no_components, learning_rate=learning_rate)
    self.glove.fit(corpus_model.matrix, epochs=epochs,
                   no_threads=no_threads, verbose=True)
    self.glove.add_dictionary(corpus_model.dictionary)

    if save is not None:
        # e.g. save = 'model/articles_glove.model'
        self.glove.save(save)

    self.model = self.glove
    return self.glove
def trainShake2(self):
    corpus = Corpus()
    shakespeare_words = self.shakespeare_lines()
    # corpus.fit(shakespeare_corpus + sonnets_corpus, window=10)
    corpus.fit(shakespeare_words, window=10)

    self.glove = Glove(no_components=100, learning_rate=0.05)
    self.glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    self.glove.add_dictionary(corpus.dictionary)
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    corpus = Corpus()
    corpus.fit(seqs, window=window)

    glove = Glove(no_components=size, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=epochs, verbose=True)

    return corpus.dictionary, glove.word_vectors
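A quick sketch of calling the helper above on a toy token corpus; the sequences and hyperparameters are illustrative.

# Illustrative call; seqs is a list of token lists.
seqs = [['hello', 'world'], ['hello', 'there', 'world']]
dictionary, vectors = getGloveEmbedding(seqs, size=50, window=5, epochs=10)
print(vectors[dictionary['hello']][:5])  # first few dimensions of 'hello'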
def test_word_vector_by_word_without_fitting():
    glove_model = Glove()
    glove_model.dictionary = {"word": 0}

    with pytest.raises(Exception) as ex:
        glove_model.word_vector_by_word("word")
    assert str(ex.value) == "Model must be fit before querying"
def get_embedding(words):
    from glove import Glove
    global glove_obj
    if not glove_obj:
        glove_obj = Glove()
    if words not in glove_cache:
        glove_cache[words] = glove_obj.get_vector(words)
    return glove_cache[words]
def test_word_vector_by_word_without_dictionary():
    glove_model = Glove()
    glove_model.word_vectors = [[1, 2, 3]]

    with pytest.raises(Exception) as ex:
        glove_model.word_vector_by_word("word")
    assert str(ex.value) == "No word dictionary supplied"
def get_glove_model(co_occurrence_dic, epochs=25):
    assert isinstance(co_occurrence_dic, dict)
    model = Glove(co_occurrence_dic)
    for epoch in range(epochs):
        # Train the model for one epoch and report the error.
        err = model.train()
        print("epoch %d, error %.3f" % (epoch, err), flush=True)
    return model.W
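Note that this snippet uses a dict-based GloVe implementation (a constructor taking a co-occurrence dictionary and exposing train() and W), not the glove_python Corpus/Glove API used elsewhere in this section. A hedged usage sketch, assuming the implementation accepts a nested word -> {context_word: count} mapping:

# Hypothetical co-occurrence dictionary; the exact key format depends on
# the implementation behind Glove(co_occurrence_dic).
co_occurrences = {
    'ice': {'cold': 3.0, 'water': 2.0},
    'steam': {'hot': 3.0, 'water': 2.0},
}
embeddings = get_glove_model(co_occurrences, epochs=5)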
def fit_matrix(self, coo_mat):
    model = Glove(no_components=self.n_features, learning_rate=0.05)
    model.fit(coo_mat, epochs=self.n_epochs,
              no_threads=self.n_threads, verbose=True)
    self.model = model
    return self
def Pre_Existing_Model(self, corpus_path):
    self.get_data = self.read_corpus(corpus_path)
    glove = Glove()
    stanford = glove.load_stanford(self.model_kind)
    self.model = stanford
    return stanford
def build_model_glove(self, file_model_glove, vector_size=300, window=100,
                      learning_rate=0.05, workers=4, epochs=100,
                      use_stored_model=True):
    """
    Build GloVe model.

    Args:
    --------
    file_model_glove: str,
        Filename to save model (or load model if it exists under this name).
    vector_size: int,
        Dimensions of word vectors (default = 300).
    window: int,
        Window size for context words (small for local context,
        larger for global context, default = 100).
    learning_rate: float,
        Learning rate for training of GloVe model (default = 0.05).
    workers: int,
        Number of threads to run the training on (should not be more
        than number of cores/threads, default = 4).
    epochs: int,
        Number of training iterations (default = 100).
    use_stored_model: bool,
        Load stored model if True, else train new model.
    """
    from glove import Corpus, Glove

    # Check if model already exists and should be loaded
    if os.path.isfile(file_model_glove) and use_stored_model:
        print("Load stored GloVe model ...")
        self.model_glove = Glove.load(file_model_glove)
    else:
        if use_stored_model:
            print("Stored GloVe model not found!")
        print("Calculating new GloVe model...")

        # Create a corpus object and fit it to generate the co-occurrence
        # matrix which is used in GloVe.
        corpus = Corpus()
        MS_docs = self.corpus
        corpus.fit(MS_docs, window=window)

        self.model_glove = Glove(no_components=vector_size,
                                 learning_rate=learning_rate)
        self.model_glove.fit(corpus.matrix, epochs=epochs,
                             no_threads=workers, verbose=True)
        self.model_glove.add_dictionary(corpus.dictionary)

        # Save model
        self.model_glove.save(file_model_glove)
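A hedged usage sketch for the method above, assuming `obj` is an instance of the surrounding class with its tokenised documents already stored in `self.corpus`; the documents and filename are illustrative.

# Hypothetical call; self.corpus must be an iterable of token lists.
obj.corpus = [['peak_100', 'peak_231'], ['peak_100', 'peak_550']]
obj.build_model_glove('glove_spectra.model', vector_size=300,
                      window=100, use_stored_model=False)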
def glove_benchmark(sentences_iterable, window=5, size=100, alpha=0.05,
                    epochs=10, num_threads=1):
    """
    Wrapper around the GloVe model-building steps so that performance
    measurements can be taken.
    """
    corpus_model = Corpus()
    corpus_model.fit(sentences_iterable, window=window)

    glove = Glove(no_components=size, learning_rate=alpha)
    # Pass num_threads through (the original hard-coded no_threads=1,
    # silently ignoring the parameter).
    glove.fit(corpus_model.matrix, epochs=epochs,
              no_threads=num_threads, verbose=False)
def __init__(self, glove_components=300, min_df=5, max_df=0.4):
    self.glove_model = Glove(no_components=glove_components)
    # Raw string avoids the invalid-escape-sequence warning for \s.
    self.tf_idf_model = TfidfVectorizer(min_df=min_df, max_df=max_df,
                                        token_pattern=r'[^\s]+',
                                        lowercase=False)
    self.word_mapping = None
    self.embedding_dim = glove_components
    self.wmd = None
    self._r = None
def __init__(self, options):
    super(TextModel, self).__init__()
    self.options = options

    self.glove = Glove()
    self._GloVe = self.glove.get_embeddings()

    self.embeddings = nn.EmbeddingBag(constants.VOCAB_SIZE,
                                      constants.EMBED_DIM,
                                      mode=constants.REDUCE_TYPE)
    self.embeddings.weight = nn.Parameter(torch.tensor(self._GloVe))
def main():
    """
    Load the GloVe models, then extract and save their information as txt.
    :return:
    """
    glove_words_model = Glove.load('glove_words_0.001_1_win1.model')
    glove_chars_model = Glove.load('glove_chars_0.001_1_win3.model')

    save_words_path = 'glove_words_embed.txt'
    save_chars_path = 'glove_chars_embed.txt'

    etl_glove(glove_ana=glove_words_model, save_txt_path=save_words_path)
    etl_glove(glove_ana=glove_chars_model, save_txt_path=save_chars_path)
def prepare_data(self):
    test, train, val = utils.load_test_train_val(self.data_num)  # DataFrames
    train_texts = list(train.posts)

    glove = Glove()
    glove.create_custom_embedding(
        [word for text in train_texts for word in text.split()])

    self.train_tuple = utils.process_data(train, glove, self.max_words, self.max_posts)
    self.test_tuple = utils.process_data(test, glove, self.max_words, self.max_posts)
    self.val_tuple = utils.process_data(val, glove, self.max_words, self.max_posts)
def glove_vectors(x, embedding_size, epochs=50, lr=0.05, alpha=0.75,
                  max_count=100, tmp_loc='glove.w2vmodel'):
    # Create the dict ourselves so that the ids correspond to their location
    # in the df, counting from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)

    # Fit the corpus to generate the co-occurrence matrix used by GloVe.
    # Distance scaling: standard GloVe down-weights the co-occurrence count
    # based on how far a context word is from the focus word. It should not
    # be used here, since distance has no meaning for purely categorical
    # variables.
    corpus = Corpus(dictionary=word_id_dict)
    corpus.fit(df.values.tolist(), window=len(df.columns), distance_scaling=False)

    # alpha is the weighting of the loss based on how likely a co-occurrence
    # is (X_ij); less likely = less weight.
    glove = Glove(no_components=embedding_size, learning_rate=lr,
                  alpha=alpha, max_count=max_count)
    # GloVe paper: 50 epochs for dimensionality < 300, 100 otherwise.
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Round-trip through word2vec format to obtain a gensim KeyedVectors.
    glove.save_word2vec_format(tmp_loc)
    model = KeyedVectors.load_word2vec_format(tmp_loc)
    if os.path.exists(tmp_loc):
        os.remove(tmp_loc)
    return model
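An illustrative call for the categorical-embedding helper above. Note it relies on `create_vocab_dict` from the same module and on a glove build that supports `distance_scaling` and `save_word2vec_format` (not stock glove_python); the input table is hypothetical.

# Hypothetical input: each row is an observation, each column a categorical
# variable; the category values themselves act as 'words'.
x = [['red', 'small'], ['blue', 'large'], ['red', 'large']]
kv = glove_vectors(x, embedding_size=8, epochs=10)
print(kv.most_similar('red'))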
def main():
    # The Corpus() instance created here in the original was immediately
    # overwritten by the load, so it is dropped.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def get_embeddings(prepared_input):
    corpus = Corpus()
    corpus.fit(prepared_input, window=10)

    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
def train_glove_fashionrec(dimensionality, context, epochs):
    """ Train with Glove on IG corpora """
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(total_count, vocab_size))

    fileName = "results/training/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_" + ".txt"

    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)

    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)

    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(epochs), no_threads=8, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time

    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)

    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + \
        str(context) + " context, " + str(epochs) + " epochs \n" + \
        "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)

    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Example: train_glove(target_group='words', glove_para=glove_para_word).
    After saving the model, you can reload it with:
        glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size': 4, 'no_components': 300,
        'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
    :return:
    """
    corpus_model = Corpus()
    # Average word count is 6 per sentence.
    corpus_model.fit(read_corpus(src_file=src_file, words_or_chars=target_group),
                     window=glove_para['window_size'])
    corpus_model.save('corpus_model_{}.model'.format(target_group))

    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(save_model_name)
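A usage sketch using the parameter dict from the docstring above; the source-file path is hypothetical.

glove_para_word = {'window_size': 4, 'no_components': 300,
                   'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
train_glove(target_group='words', glove_para=glove_para_word,
            src_file='data/sentences.txt', save_model_name='glove_words.model')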
def train_glove(src_filename, dim=100):
    corpus = Corpus()
    corpus.fit(get_lines(src_filename), window=10)

    glove = Glove(no_components=dim, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=100, no_threads=20, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(DATA_DIR + 'glove.{}d.model'.format(dim))
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """
    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=0, no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)
    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=500, no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)
    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
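The helper `_reproduce_input_matrix` is not shown in this section. A plausible sketch follows, assuming it rebuilds the model's approximation of the log co-occurrence matrix from the fitted parameters (GloVe fits log X_ij ≈ w_i·w_j + b_i + b_j); `word_vectors` and `word_biases` are the attribute names on a fitted glove_python model, and zeroing everything outside the strict upper triangle is an assumption about how the stored co-occurrence matrix is shaped.

import numpy as np

def _reproduce_input_matrix(glove_model):
    # Model's approximation of log(X): dot products plus the two bias terms.
    vectors = glove_model.word_vectors
    biases = glove_model.word_biases
    out = vectors.dot(vectors.T) + biases[:, np.newaxis] + biases[np.newaxis, :]
    # Keep only the strict upper triangle to match the stored co-occurrence
    # matrix (assumption about the test setup).
    return np.triu(out, k=1)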
def train_and_save_model(data_dir, model_name='LeGlove', num_epochs=10, parallel_threads=1):
    '''
    This function processes all the data into a training corpus
    and fits a GloVe model to this corpus.

    Parameters:
        data_dir (string):         master directory containing all jurisdiction-level directories
        model_name (string):       name of model to be used for output
        num_epochs (int):          number of epochs for which to train model
        parallel_threads (int):    number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    glove = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    glove.fit(corpus_model.matrix, epochs=num_epochs,
              no_threads=parallel_threads, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(model_name + '.model')
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    # Convert to word2vec format; the same conversion can be run from the
    # command line:
    #   python -m gensim.scripts.glove2word2vec -i <file_name> -o <file_name>_modified
    glove2word2vec(file_name, file_name + '_modified')

    print('Finished')
    return glove
def __init__(self, data):
    self.data = data
    self.corpus = None
    self.liu = LiuLexicon()
    self.subj = SubjLexicon()
    self.buildTweetCorpus()
    self.word_vec_model = Word2Vec(self.corpus)
    self.glove_vec_model = Glove(100, self.corpus)
    self.clusters = Cluster(100)
    self.initEncoders()
    self.topicVecs = self.word_vec_model.getVectorsForTopics(self.topicenc.classes_)
    self.collectTopUnigrams()
    self.collectTopBigrams()
def train(path, freq, window, dim, lr, epochs):
    lines = []
    dic = {}
    print("Start of train method")
    try:
        # Build a dictionary of word frequencies.
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            for word in text:
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        print("Created dictionary for frequencies of words.")

        # Convert the preprocessed text into the input format
        # (a list of lists of words), keeping only frequent words.
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            text = [word for word in text if dic[word] > freq]
            lines.append(text)
        print("Converted preprocessed text data into input format "
              "of array of array of words.")

        corpus = Corpus()
        corpus.fit(lines, window=window)

        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as e:
        print("Error occurred in training glove model: %s" % e)
def train_model(line):
    corpus = Corpus()
    corpus.fit(line)

    glove = Glove(no_components=5, learning_rate=0.05, random_state=0)
    glove.fit(corpus.matrix, epochs=10, no_threads=100, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    return glove
def get_model():
    '''Lazy initialization for the GloVe model so it works in a pool.'''
    global model
    if model is None:
        print('loading the glove model...')
        model = Glove.load('w2v/glove_lemma_stopwords')
    return model
def __init__(self, data_src, num_features=100, window=10,
             learning_rate=0.05, epochs=10):
    self.learning_rate = learning_rate
    self.num_features = num_features
    self.window = window
    self.epochs = epochs
    self.pretrain(data_src)
    self.model = Glove.load("glove.model")
def train_glove(sentences):
    print('training glove model...')
    t0 = time()

    num_features = 300    # Word vector dimensionality
    context = 5           # Context window size
    learning_rate = 0.05

    corpus = Corpus()
    corpus.fit(sentences, window=context)

    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('took %0.5fs.' % (time() - t0))
    return glove
def run_glove(self):
    """ Run GloVe (global vectors). """
    # sentences = [["hi", "good", "to"], ["see", "u"]]
    sentences = self.get_sentences()

    print('\n' + '-' * 80)
    print("Fitting words into corpus")
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    print("Running Glove")
    glove = Glove(no_components=200, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print("Fitting words and vectors into unique_words and vectors200")
    unique_words = []
    vectors200 = []
    cnt1 = 0
    length1 = len(glove.inverse_dictionary)
    for word_id in glove.inverse_dictionary:
        cnt1 += 1
        unique_words.append(glove.inverse_dictionary[word_id])
        vectors200.append(glove.word_vectors[word_id])
        sys.stdout.write("\rStatus: %s / %s" % (cnt1, length1))
        sys.stdout.flush()

    print('\n' + "Processing vectors200")
    processed_vectors200 = []
    cnt2 = 0
    length2 = len(vectors200)
    for vector in vectors200:
        cnt2 += 1
        # Reset the per-vector buffer on every iteration (the original
        # initialised it once, accumulating all vectors into one list).
        processed_vector = []
        for float_num in vector:
            processed_vector.append(float_num)
        processed_vectors200.append(processed_vector)
        sys.stdout.write("\rStatus: %s / %s" % (cnt2, length2))
        sys.stdout.flush()

    return unique_words, processed_vectors200
def load_wv_model(word_vector_file, word_vector_type):
    if word_vector_type == WordVectorTypes.glove.name:
        from glove import Glove
        glove_model = Glove.load_stanford(word_vector_file)
        wv_model = GloveWrapper(glove_model)
    else:
        import word2vec
        w2v_model = word2vec.load(word_vector_file)
        wv_model = W2VWrapper(w2v_model)
    return wv_model
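A hedged one-line usage of the loader above; `glove.6B.100d.txt` is a standard Stanford GloVe text file, and `WordVectorTypes`, `GloveWrapper`, and `W2VWrapper` are assumed to be defined in the surrounding module.

# Hypothetical call selecting the GloVe branch.
wv_model = load_wv_model('glove.6B.100d.txt', WordVectorTypes.glove.name)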
def create_vectors_dataset(input_files, vector_files, max_len=500):
    print('Creating word vectors file')

    training_set_file, test_set_file = input_files
    train_word_file, test_word_file = vector_files

    # Binary mode is required for pickle files.
    train_stories = pickle.load(open(training_set_file, 'rb'))
    test_stories = pickle.load(open(test_set_file, 'rb'))

    train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q)
                     for fact, q in train_stories]
    test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q)
                    for fact, q in test_stories]

    vocab = sorted(reduce(lambda x, y: x | y,
                          (set(story + [answer])
                           for story, answer in train_stories + test_stories)))

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Story max length:', story_maxlen, 'words')
    print('Number of training stories:', len(train_stories))
    print('Number of test stories:', len(test_stories))
    print('-')
    print('Here\'s what a "story" tuple looks like (input, query, answer):')
    print(train_stories[0])
    print('-')
    print('Vectorizing the word sequences...')

    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
    answer_vocab = sorted(reduce(lambda x, y: x | y,
                                 (set([answer])
                                  for _, answer in train_stories + test_stories)))
    # Reserve 0 for masking via pad_sequences
    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
    print('Answers dict len: {0}'.format(len(answer_dict)))

    # TODO: check that this file exists before loading it.
    word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
    word_vectors_model = Glove.load_stanford(word_vectors_dir)

    inputs_train, answers_train = get_word_vectors(train_stories, answer_dict,
                                                   max_len, word_vectors_model)
    inputs_test, answers_test = get_word_vectors(test_stories, answer_dict,
                                                 max_len, word_vectors_model)

    with h5py.File(train_word_file, 'w') as train_f:
        _ = train_f.create_dataset('inputs', data=inputs_train)
        _ = train_f.create_dataset('answers', data=answers_train)
    with h5py.File(test_word_file, 'w') as test_f:
        _ = test_f.create_dataset('inputs', data=inputs_test)
        _ = test_f.create_dataset('answers', data=answers_test)

    return (inputs_train, answers_train), (inputs_test, answers_test)
def test_measure(self):
    """
    :return:
    """
    class A():
        pass

    a = A()
    for i in range(100):
        a1 = A()
        for j in range(100):
            a2 = A()
            setattr(a1, 'a%s' % j, a2)
        setattr(a, 'a%s' % i, a1)

    glove = Glove(a)
    glove.meaure()  # 'meaure' [sic]: spelled this way throughout this suite
    print(glove.report)
def pretrain(self, data_src):
    if not os.path.isfile("glove.model"):
        data_src = DataClean([
            ["[^a-z]", " "],   # only letters
            [" [ ]+", " "],    # remove extra spaces
        ], html_clean=True, split_words=True).fit(data_src).transform(data_src)

        corpus_model = Corpus()
        corpus_model.fit(data_src, window=self.window)

        glove = Glove(no_components=self.num_features,
                      learning_rate=self.learning_rate)
        glove.fit(corpus_model.matrix, epochs=self.epochs, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save("glove.model")
def glove_vector_download_and_save(url, outdir, maxmegabytes):
    # construct filenames
    filename_full = os.path.basename(url)
    filename_name = os.path.splitext(filename_full)[0]

    # create file-specific output directory
    dirname_file = "{}/{}".format(outdir, filename_name)
    if not os.path.isdir(dirname_file):
        os.mkdir(dirname_file)

    # download file
    filename_save = "{}/{}".format(dirname_file, filename_full)
    if not os.path.isfile(filename_save):
        print("downloading {}...".format(filename_save))
        urllib.urlretrieve(url, filename_save)

    # extract zip
    print("extracting {}...".format(filename_save))
    with zipfile.ZipFile(filename_save, "r") as z:
        z.extractall(dirname_file)

    # build model for each file
    file_pattern = "{}/*.txt".format(dirname_file)
    for file_glove_in in glob.glob(file_pattern):
        try:
            # ensure file isn't too big
            filesize = os.path.getsize(file_glove_in) / 1024 / 1024
            if filesize > maxmegabytes:
                print("skipping {}M file {}...".format(filesize, file_glove_in))
            else:
                # load vectors
                print("importing glove vectors from {}".format(file_glove_in))
                model = Glove.load_stanford(file_glove_in)

                # save model object
                file_glove_out = "{}.obj".format(os.path.splitext(file_glove_in)[0])
                print("saving glove model to {}...".format(file_glove_out))
                model.save_obj(file_glove_out)

                # delete extracted file
                os.remove(file_glove_in)
        except MemoryError as e:
            # MemoryError has no strerror attribute; print the exception itself.
            print(e)
def test_stanford_loading():
    model = Glove.load_stanford('glove/tests/stanford_test.txt')

    assert model.word_vectors is not None
    assert model.word_vectors.shape == (100, 25)
    assert len(model.dictionary) == 100

    # Python 2/3 compatibility. Check that the ellipsis
    # character is in the dictionary.
    try:
        # Python 2
        assert unichr(8230) in model.dictionary
    except NameError:
        # Python 3
        assert '…' in model.dictionary
def from_glove_model(cls, vector_file):
    """
    WARNING: `glove_python` is required to use this function!

    Load a GloVe vector model.

    :param vector_file: path to a GloVe model file, or an already-loaded model
    :return: a `Vectors` object
    """
    from glove import Glove

    model = Glove.load_stanford(vector_file) if isinstance(vector_file, str) else vector_file
    vocab = model.dictionary.keys()

    vectors = {}
    dims = model.no_components  # vector dimensionality
    dimension_names = ['f%02d' % i for i in range(dims)]

    for word in vocab:
        vectors[word] = zip(dimension_names,
                            model.word_vectors[model.dictionary[word]])

    return Vectors(vectors)
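A hedged usage line for the classmethod above; `glove.6B.50d.txt` is a standard Stanford GloVe file, and `Vectors` is assumed to be the class this method is defined on.

# Hypothetical call; accepts either a path or an already-loaded Glove model.
vectors = Vectors.from_glove_model('glove.6B.50d.txt')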
def download_and_save_vectors_glove(self, url, outdir, datafile=None, maxmegabytes=None):
    '''
    Download and save a pre-trained GloVe model.
    '''
    # download and extract file
    dirname_file = self.download_and_extract_file(url, outdir)

    # input file extracted from the archive
    file_in = "{}/{}.txt".format(dirname_file, datafile)

    # build output filename
    fullpath_out = self.download_fullpath(outdir, datafile)

    # catch memory exceptions
    try:
        # ensure file isn't too big
        filesize = os.path.getsize(file_in) / 1024 / 1024
        filesize_ok = (not maxmegabytes or filesize <= int(maxmegabytes))

        # import specific file and/or files under the size limit
        if filesize_ok:
            print("importing glove vectors from {}".format(file_in))
            model = Glove.load_stanford(file_in)

            # save model object to specified output directory
            print("saving glove model to {}...".format(fullpath_out))
            model.save_obj(fullpath_out)
        else:
            print("skipping file {}...".format(file_in))
    except MemoryError as e:
        # MemoryError has no strerror attribute; print the exception itself.
        print(e)

    # remove extracted directory
    shutil.rmtree(dirname_file)
def get_data(args):
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec']) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]
    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- we have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    if CONFIG['pretrain']['algorithm'] == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif CONFIG['pretrain']['algorithm'] == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})

    # get feature vectors
    if 'doc2vec' in CONFIG['train']['features']:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in CONFIG['train']['features']:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s"
                           % CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
def get_data(args):
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['embedding']) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get input data
    sentences, y_labels = sample_by_y(args)
    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- we have a sparse matrix here
        X = get_bow_features(sentences)
        return False, (X, y_labels)

    # load embedding
    if CONFIG['pretrain']['algorithm'] == 'word2vec':
        from gensim.models import word2vec
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif CONFIG['pretrain']['algorithm'] == 'glove':
        from glove import Glove
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})

    # get feature vectors
    if 'embedding' in CONFIG['train']['features']:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s"
                           % CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        X, y_labels = get_mixed_features(sentences, embedding_vectors, y_labels)
        return True, (X, y_labels)
    else:
        # matrix is dense -- drop NaNs
        X, y_labels = drop_nans(embedding_vectors, y_labels)
        return False, (X, y_labels)
def build_glove_embeddings(training, testing, args):
    '''
    Trains the model on the sentiment140 dataset

    @Arguments:
        training: the training split of the loaded sentiment140 dataset,
            as (text, label) pairs
        testing: the testing split, in the same format
        args: configuration carrying the number of epochs and threads,
            the number of components, the learning rate, the window size
            for word co-occurrence, the PCA flag, and verbosity

    @Return:
        A trained glove model
    '''
    # initialize model
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # texts from both splits, labels dropped (the original used Python 2's
    # itertools.imap with tuple-unpacking lambdas)
    txtSource = chain((txt for txt, lbl in training),
                      (txt for txt, lbl in testing))

    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit(map(preprocess.tokenize, txtSource), window=args.window)

    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    transformer = lambda words: glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)

    return fromTraining, fromTesting
import argparse

from glove import Glove

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=('Load GloVe model and test similarity scores'))
    parser.add_argument('--model', '-m', action='store', required=True,
                        help='The saved GloVe model object')
    args = parser.parse_args()

    # load the model
    glove = Glove.load_obj(args.model)

    # give me the 5 words most similar to each word in the words list in this
    # corpus and show me how similar the words are in this corpus to each word
    # in the words list in general
    words = ['sky', 'queen', 'car', 'run', 'medical']
    for word in words:
        # get most similar words
        similarities = glove.most_similar(word)

        # print out results
        print("Most similar to {}:".format(word))
        for match, score in similarities:
            print("\t{0:15} {1:.2f}".format(match, score))
def loadGloveModel(self, modelFile=MODEL_FILE):
    # The original called .format() on print's return value (None);
    # the format call belongs inside the print argument.
    print("Loading pre-trained GloVe model \"{}\"...".format(modelFile))
    self.glove = Glove.load(modelFile)
    print("Done loading.")
    print("")
mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)
mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

# 3. CNN
# GloVe vectors from reviews
c = [review.split() for review in data.data]
corpus = Corpus()
corpus.fit(c, window=10)

glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glv.add_dictionary(corpus.dictionary)

embeddings_index = glv.dictionary

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

texts = []         # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
def fit(self, X, y=None):
    self.model = Glove.load("glove.model")
    return self
print('Collocations: %s' % corpus_model.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(args.train),
              no_threads=args.parallelism, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('glove.model')

if args.query:
    # Finally, query the model for most similar words.
    if not args.train:
        print('Loading pre-trained GloVe model')
        glove = Glove.load('glove.model')

    print('Querying for %s' % args.query)
    pprint.pprint(glove.most_similar(args.query, number=10))
def embedding_func(gridded_words_overall, embedding_size):
    """*************** GloVe for video ***************"""
    glove_bins = np.asarray(gridded_words_overall)
    print(glove_bins)
    glove_shape = glove_bins.shape
    glove_weights = np.ones(glove_shape)

    # Build a weighted co-occurrence matrix over the words in each frame.
    dictionary = {}
    rows = []
    cols = []
    data = array.array('f')
    k = 0
    for frame in glove_bins:
        for i, first_word in enumerate(frame):
            first_word_idx = dictionary.setdefault(first_word, len(dictionary))
            w1 = glove_weights[k, i]
            for j, second_word in enumerate(frame):
                second_word_idx = dictionary.setdefault(second_word, len(dictionary))
                w2 = glove_weights[k, j]
                distance = 1
                w = w1 * w2
                if first_word_idx == second_word_idx:
                    pass
                elif first_word_idx < second_word_idx:
                    rows.append(first_word_idx)
                    cols.append(second_word_idx)
                    data.append(np.double(w * np.double(1.0) / distance))
                else:
                    rows.append(second_word_idx)
                    cols.append(first_word_idx)
                    data.append(np.double(w * np.double(1.0) / distance))
        # k indexes frames, so advance it once per frame.
        k = k + 1

    x = sp.coo_matrix((data, (rows, cols)),
                      shape=(len(dictionary), len(dictionary)),
                      dtype=np.double).tocsr().tocoo()
    print(dictionary)

    xarr = x.toarray()
    xarr /= np.amax(xarr)
    print("co-occurrence matrix")
    print(xarr)
    xsparse = sp.coo_matrix(xarr)

    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(xsparse, epochs=500, no_threads=2)

    new_word_representation = glove_model.word_vectors
    return new_word_representation, dictionary
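A minimal sketch of calling the function above on a toy grid of quantised visual words; the input shape (frames x words per frame) is an assumption inferred from how glove_weights is indexed.

# Hypothetical input: 3 frames, each containing 4 quantised visual words.
gridded_words_overall = [[9, 10, 9, 11],
                         [10, 12, 11, 9],
                         [12, 9, 10, 10]]
reps, word_dict = embedding_func(gridded_words_overall, embedding_size=16)
print(reps.shape)  # (len(word_dict), 16)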
for line in file:
    d = json.loads(line)
    uris.append(d[0])
    questions.append(d[1])
    answers.append(d[2])
    cats.append(d[3])

def get_lines():
    for a in answers:
        yield a.split()

# Build the corpus dictionary and cooccurrence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Fine-tune pre-trained Stanford vectors on this corpus
# glove = Glove(no_components=no_comp, learning_rate=0.05)
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save in word2vec text format
with open('model.glove', 'w+') as file:
    file.write('%i %i \n' % (len(glove.dictionary), no_comp))
    for word, idx in glove.dictionary.items():
        file.write('%s %s \n' % (word,
                                 ' '.join(str(n) for n in glove.word_vectors[idx])))
                    required=True,
                    help='The filename of the stored GloVe model.')
parser.add_argument('--encode', '-e', action='store_true',
                    default=False,
                    help=('If True, words from the '
                          'evaluation set will be utf-8 encoded '
                          'before looking them up in the '
                          'model dictionary'))
parser.add_argument('--parallelism', '-p', action='store',
                    default=1,
                    help='Number of parallel threads to use')
args = parser.parse_args()

# Load the GloVe model
glove = Glove.load(args.model)

if args.encode:
    encode = lambda words: [x.lower().encode('utf-8') for x in words]
else:
    encode = lambda words: [unicode(x.lower()) for x in words]

# Load the analogy task dataset. One example can be obtained at
# https://word2vec.googlecode.com/svn/trunk/questions-words.txt
sections = defaultdict(list)
evaluation_words = [sections[section].append(encode(words))
                    for section, words in metrics.read_analogy_file(args.test)]

section_ranks = []
'''
@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'
model_file = '/home/dannl/tmp/newstech/glove/glove.model'

oldtime = time.time()

# get a cooccurrence matrix
corpus_cooc = Corpus.load(cooc_file)

# get a model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

# count = 0
# for word, wid in corpus_cooc.dictionary.items():
#     count += 1
#     if count > 100:
#         break
#     print(word, wid)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)
print('time cost:%.2f' % (time.time() - oldtime))