def train_glove_fashionrec(dimensionality, context, epochs):
    """Train GloVe on the IG fashion corpora."""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(epochs) \
        + "_d" + str(dimensionality) + "_c" + str(context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = [line.split(" ") for line in lines]
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(epochs) \
        + "_d" + str(dimensionality) + "_c" + str(context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(epochs), no_threads=8, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) \
        + "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + str(context) \
        + " context, " + str(epochs) + " epochs \n" \
        + "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) \
        + "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
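# A minimal usage sketch for train_glove_fashionrec above. The parameter values
# are illustrative assumptions, and the helpers it calls (corpus_stats,
# readCorpus, save_to_file, save_glove_bin_to_vec) plus the data/, results/ and
# trained/ directories must already exist in this codebase.
train_glove_fashionrec(dimensionality=300, context=10, epochs=30)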
def main():
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION:
            Train GloVe (Global Vectors) word embeddings on the tokens in the data set
    INPUT:
            |--- data: list of tweets
            |--- embed_dim: [int] embedding dimensionality
            |--- window_size: [int] size of the window of tokens considered during training for each token
            |--- epochs_: [int] number of epochs for GloVe training
            |--- step_size: [float] learning rate for the SGD used in GloVe training
    OUTPUT:
            |--- embeddings: [np.array] matrix of shape (vocab_size, embed_dim), one feature vector per token id
            |--- vocab: [dict] dictionary with tokens as keys and the index of each token in the vocabulary as values
            |--- glove: [Glove] GloVe model trained on data
    '''
    sentences = get_tokens(data)
    model = Corpus()
    model.fit(sentences, window=window_size)
    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)
    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    for _, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]
    # Use the model's own token ids so vocab indices match embedding rows.
    vocab = {token.strip(): id_ for token, id_ in glove.dictionary.items()}
    return embeddings, vocab, glove
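# A usage sketch for glove_embed, assuming get_tokens turns each string into a
# token list; the toy tweets and hyperparameters are illustrative only.
toy_tweets = ["the cat sat on the mat", "the dog sat on the rug"]
embeddings, vocab, glove = glove_embed(toy_tweets, embed_dim=16,
                                       window_size=3, epochs_=5, step_size=0.05)
print(embeddings[vocab["cat"]])  # the 16-dimensional vector for one token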
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Example: train_glove(target_group='words', glove_para=glove_para_word)
    After saving the model, you can use it via: glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size': 4, 'no_components': 300,
                       'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
    :return:
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file, words_or_chars=target_group),
                     window=glove_para['window_size'])  # avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')
    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(save_model_name)
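# As the docstring above notes, the saved model can be reloaded and queried
# later. A minimal sketch (the query token is an illustrative assumption):
from glove import Glove

glove_ana = Glove.load('glove_words.model')
print(glove_ana.most_similar('king', number=5))  # top-5 nearest neighbours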
def getWordEmbeddings(processed_text):
    corpus = Corpus()
    corpus.fit(processed_text, window=3)
    glove = Glove(no_components=500, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=300000, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print(glove.most_similar('price'))
    # Return the trained model so callers can actually use the embeddings
    # (the original trained and printed but returned nothing).
    return glove
def get_embeddings(prepared_input):
    corpus = Corpus()
    corpus.fit(prepared_input, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
def train_glove(save_dir, size):
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    word_dict = corpus.dictionary.keys()
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    dict_in_bin = dict()
    print('START SAVE')
    for item in word_dict:
        word_indx = glove.dictionary[item]
        dict_in_bin[item] = glove.word_vectors[word_indx]
    with open(save_dir, "wb") as file:
        pickle.dump(dict_in_bin, file)
    print('COMMON TEST')
    while True:
        try:
            s = input("Enter a string: ")
            print(glove.most_similar(s, number=10))
            word_indx = glove.dictionary[s]
            print(glove.word_vectors[word_indx])
        except KeyError:
            # The word is not in the vocabulary; prompt again.
            continue
def build_model_glove(args):
    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)
    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def build_glove_embeddings(corpus):
    """
    DESCRIPTION:
        Applies the GloVe SGD algorithm provided by the glove_python library and
        builds the word embeddings from our training set.
    INPUT:
        corpus: a list of lists where each sub-list represents a tweet. The outer
                list represents the whole training dataset.
    OUTPUT:
        words: python dictionary of the form (word, [vector of embeddings])
    """
    words = load_glove_embeddings_from_txt_file(MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    if words is not None:
        return words
    model = Corpus()
    model.fit(corpus, window=algorithm['options']['WE']['window_size'])
    glove = Glove(no_components=algorithm['options']['WE']['we_features'],
                  learning_rate=algorithm['options']['WE']['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=algorithm['options']['WE']['epochs'])
    glove.add_dictionary(model.dictionary)
    words = {}
    for w, id_ in glove.dictionary.items():
        words[w] = np.array(glove.word_vectors[id_])
    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
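# A sketch of consuming the returned (word -> vector) dictionary, e.g. to build
# an averaged tweet representation; the tokens below are illustrative.
words = build_glove_embeddings(corpus)
tweet = ['great', 'movie']
vecs = [words[t] for t in tweet if t in words]
if vecs:
    tweet_vector = np.mean(vecs, axis=0)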
def train_and_save_model(data_dir, model_name='LeGlove', num_epochs=10, parallel_threads=1):
    '''
    This function processes all the data into a training corpus
    and fits a GloVe model to this corpus.

    Parameters:
        data_dir (string):      master directory containing all jurisdiction-level directories
        model_name (string):    name of model to be used for output
        num_epochs (int):       number of epochs for which to train model
        parallel_threads (int): number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)
    glove = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    glove.fit(corpus_model.matrix, epochs=num_epochs,
              no_threads=parallel_threads, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(model_name + '.model')
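# Minimal train-then-query sketch for train_and_save_model; the data directory
# and the query token are illustrative assumptions.
train_and_save_model('data/jurisdictions', model_name='LeGlove',
                     num_epochs=10, parallel_threads=4)
model = Glove.load('LeGlove.model')
print(model.most_similar('court', number=5))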
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    # Alternative: shell out to the gensim conversion script instead:
    # command = 'python -m gensim.scripts.glove2word2vec -i ' + file_name + ' -o ' + file_name + '_modified'
    # os.system(command)
    # with open(file_name + '_modified', mode='rb') as file:  # b is important -> binary
    #     fileContent = file.read()
    #     print('Content', fileContent)
    print('Finished')
    return glove
def train_glove(corpus, vocabulary, zero_init_indices=0, rand_init_indices=1, embedding_dim=300):
    """Use GloVe to train on corpus to obtain embeddings.

    Here we use a python implementation of GloVe, but the official C implementation
    is also highly recommended:
    https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use zero-initialization.
            These indices usually represent the padding token.
        rand_init_indices: int or a list, the indices which use random initialization.
            These indices usually represent other special tokens, such as the "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns:
        np.array, a word embedding matrix.
    """
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=10)
    glove = Glove(no_components=embedding_dim, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    word_vectors = dict((w, glove.word_vectors[glove.dictionary[w]])
                        for w in glove.dictionary)
    emb = filter_embeddings(word_vectors, embedding_dim, vocabulary,
                            zero_init_indices, rand_init_indices)
    return emb
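# A toy usage sketch for train_glove above; filter_embeddings comes from this
# codebase, and the tiny corpus/vocabulary here are illustrative.
toy_corpus = [['i', 'like', 'tea'], ['i', 'like', 'coffee']]
toy_vocab = {'<pad>': 0, '<unk>': 1, 'i': 2, 'like': 3, 'tea': 4, 'coffee': 5}
emb = train_glove(toy_corpus, toy_vocab, embedding_dim=50)
print(emb.shape)  # expected: (len(toy_vocab), 50)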
def trainGlove(path, no_components=100, learning_rate=0.05, epochs=100,
               no_threads=1, verbose=True, context_window=5, save_path='outputs/Glove'):
    # Load the data and train a GloVe model on it.
    print('Training Glove Model...')
    glove = Glove(no_components=no_components, learning_rate=learning_rate)
    corpus = buildCorpus(path, context_window)
    # Honor the verbose parameter (the original hard-coded verbose=1).
    glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=verbose)
    glove.add_dictionary(corpus.dictionary)
    # glove.save(save_path + '/glove.model')
    with open('data/words.txt', 'r') as f:
        words = f.read().split('\n')[:-1]
    shared_words = list(set(words).intersection(set(corpus.dictionary)))
    glove_dict = {}
    for word in shared_words:
        glove_dict[word] = glove.word_vectors[glove.dictionary[word], :]
    np.save('DSMs/glove.npy', glove_dict)
def train_glove(src_filename, dim=100):
    corpus = Corpus()
    corpus.fit(get_lines(src_filename), window=10)
    glove = Glove(no_components=dim, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=100, no_threads=20, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(DATA_DIR + 'glove.{}d.model'.format(dim))
class GloveEmbeding:
    def __init__(self, no_components=128):
        self.no_components = no_components
        self.model = Glove(no_components=self.no_components, learning_rate=0.05)

    def train(self, lines):
        # lines: list of token lists
        corpus = Corpus()
        corpus.fit(lines, window=10)
        self.model.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
        self.model.add_dictionary(corpus.dictionary)

    def save(self, path):
        self.model.save(path)

    def load(self, path):
        # Load a previously saved model (the original called self.load(path),
        # which recursed forever).
        self.model = Glove.load(path)

    def embeddings(self, sentences, max_len):
        # sentences: list of token lists
        matrix = np.zeros((len(sentences), max_len, self.no_components))
        for nr, sentence in enumerate(sentences):
            for nr_tok, tok in enumerate(sentence):
                try:
                    emb_nr = self.model.dictionary[tok]
                    matrix[nr, nr_tok, :] = self.model.word_vectors[emb_nr]
                except KeyError:
                    print('word "{}" not in dictionary'.format(tok))
                except IndexError:
                    pass
        return matrix
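# A usage sketch for GloveEmbeding; the token lists and sizes are illustrative.
emb = GloveEmbeding(no_components=64)
emb.train([['hello', 'world'], ['hello', 'there']])
mat = emb.embeddings([['hello', 'world']], max_len=4)
print(mat.shape)  # (1, 4, 64); positions beyond each sentence length stay zero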
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    dictionary = load_glove_dictionary(exp_id, save_dir)
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if os.path.exists(dict_path):
        corpus_model = Corpus.load(dict_path)
    else:
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus, window=params['window'] * 2, ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix, epochs=50,
              no_threads=params['workers'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
class GloVeFilter(object):
    def __init__(self):
        # Corpus model
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        print('Reading corpus statistics...')
        # texts = [self.pp.preprocessing(l.strip().decode("utf8", "ignore")) for l in open(args.data_path)]
        texts = [l.strip().decode("utf8", "ignore").split(" ")
                 for l in open(args.data_path)]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix, epochs=args.epochs, verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
class MyGloVe:
    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)
        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        first = self.__average_feature_vector(first_text)
        second = self.__average_feature_vector(second_text)
        return 1 - spatial.distance.cosine(first, second)

    def __read_corpus(self, input_corpus):
        for line in input_corpus:
            yield line

    def __average_feature_vector(self, text):
        words = text.split()
        words_no = 0
        feature_vector = numpy.zeros((100,), dtype="float32")
        for word in words:
            if word in self.glove.dictionary:
                word_idx = self.glove.dictionary[word]
                words_no += 1
                feature_vector = numpy.add(feature_vector,
                                           self.glove.word_vectors[word_idx])
        if words_no > 0:
            feature_vector = numpy.divide(feature_vector, words_no)
        return feature_vector
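# A usage sketch for MyGloVe; the toy corpus lines (token lists) are illustrative.
model = MyGloVe()
model.initiate_model([['the', 'cat', 'sat'], ['the', 'dog', 'ran']])
print(model.cosine_similarity('cat sat', 'dog ran'))  # value in [-1, 1]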
def parse_Word2Vec(full_content):
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    # Open file to write the results
    f2 = open('/home/ubuntu/corpus/results.txt', 'w')
    # Loop through all the article types in the file
    with open('/home/ubuntu/corpus/article_types.csv', 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            article_type = row[0]
            translator = str.maketrans({key: '' for key in string.punctuation})
            article_type_no_punctuation = article_type.translate(translator)
            wordnet = WordNetLemmatizer()
            article_type_clean = wordnet.lemmatize(article_type_no_punctuation)
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
                print(article_type_clean + ' -> ' + str(matched_item))
                f2.write(article_type + '\n')
                f2.write(str(matched_item + '\n'))
            except KeyError:
                # The cleaned article type is not in the GloVe vocabulary.
                print('failed for: ' + article_type)
    f2.close()
def glove_vectors(x, embedding_size, epochs=50, lr=0.05, alpha=0.75,
                  max_count=100, tmp_loc='glove.w2vmodel'):
    # Create the dict ourselves so that the ids correspond to their location in
    # the df, counting from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)
    # Creating a corpus object
    corpus = Corpus(dictionary=word_id_dict)
    # Fit the corpus to generate the co-occurrence matrix used by GloVe.
    # Distance scaling: standard GloVe down-weights the co-occurrence count based
    # on how far a context word is from the focus word. It should not be used
    # here, since distance has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(), window=len(df.columns), distance_scaling=False)
    # alpha is the weighting of the loss based on how likely a co-occurrence
    # (X_ij) is: less likely = less weight.
    glove = Glove(no_components=embedding_size, learning_rate=lr,
                  alpha=alpha, max_count=max_count)
    # GloVe paper: 50 epochs for dimensionality < 300, 100 otherwise.
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save_word2vec_format(tmp_loc)
    model = KeyedVectors.load_word2vec_format(tmp_loc)
    if os.path.exists(tmp_loc):
        os.remove(tmp_loc)
    return model
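# A usage sketch for glove_vectors on a small categorical table; the rows and
# sizes are illustrative, and create_vocab_dict is assumed from this codebase.
x = [['red', 'small'], ['blue', 'small'], ['red', 'large']]
kv = glove_vectors(x, embedding_size=8, epochs=10)
print(kv['red'])  # gensim KeyedVectors lookup of one category embedding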
def train(path, freq, window, dim, lr, epochs):
    lines = []
    dic = {}
    print("Start of train method")
    try:
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            for word in text:
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        print("Created dictionary of word frequencies.")
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            text = [word for word in text if dic[word] > freq]
            lines.append(text)
        print("Converted preprocessed text data into the input format: a list of lists of words.")
        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as e:
        print("Error occurred while training the glove model: {}".format(e))
def train_model(line):
    corpus = Corpus()
    corpus.fit(line)
    glove = Glove(no_components=5, learning_rate=0.05, random_state=0)
    glove.fit(corpus.matrix, epochs=10, no_threads=100, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    return glove
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size, niter):
    corpus = Corpus(vectorizer.vocabulary_)
    corpus.fit(data_frame.post, window=window_size, ignore_missing=True)
    glove = Glove(no_components=vec_dim, learning_rate=0.01)
    glove.fit(corpus.matrix, epochs=niter, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    return glove
def glove_feat(df, feat, length):
    corpus = Corpus()
    corpus.fit(df[feat], window=20)
    glove = Glove(no_components=length, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=10, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    return glove
def glove_embed(texts, victor_size):
    corpus_model = Corpus()
    # Fix the misspelled keyword (the original passed ignore_mising=False,
    # which raises a TypeError).
    corpus_model.fit(texts, window=5, ignore_missing=False)
    glove = Glove(no_components=victor_size, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=20, no_threads=1, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('embed_model/glove.embed_model')
    return glove
def train_model(self, data, embeddings_size, context_size=10, learning_rate=0.05,
                alpha=0.75, max_count=100, max_loss=10.0, dictionary=None,
                workers=5, epochs=20, seed=1111):
    assert len(data) != 0
    print(f'The passed data consists of {len(data)} sentences.')
    print('Fitting the corpus with the given training sentences')
    start_time = time.time()
    if os.path.exists(self.corpus_path):
        print(f'Loading corpus {self.corpus_path} from disk')
        corpus = Corpus.load(self.corpus_path)
    else:
        corpus = Corpus(dictionary=dictionary)
        # The dataset is represented as a list of lists; `window` is the length
        # of the (symmetric) context window used for co-occurrence.
        corpus.fit(data, window=context_size)
        corpus.save(self.corpus_path)
    print('| Corpus fit time: {:5.2f}s |'.format(time.time() - start_time))
    print(f'The vocabulary size of the trained model is {len(corpus.dictionary)} unique tokens')
    print(f'The number of collocations is {corpus.matrix.nnz}')

    print('Training GloVe model')
    start_time = time.time()
    model = Glove(
        no_components=embeddings_size,  # number of latent dimensions
        alpha=alpha,
        max_count=max_count,
        max_loss=max_loss,
        learning_rate=learning_rate,
        random_state=seed,
    )
    # Fit to the corpus and add the standard dictionary to the object.
    model.fit(corpus.matrix, epochs=epochs, no_threads=workers, verbose=True)
    model.add_dictionary(corpus.dictionary)
    print('| GloVe model training time: {:5.2f}s |'.format(time.time() - start_time))
    model.save(self.model_path)
    self.word_vectors = model
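# A usage sketch for train_model; GloveTrainer is a hypothetical stand-in for
# the enclosing class, which must define corpus_path and model_path.
trainer = GloveTrainer(corpus_path='cache/corpus.model', model_path='cache/glove.model')
trainer.train_model([['a', 'b', 'c'], ['a', 'c', 'd']], embeddings_size=32, epochs=5)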
def generateModel(traces):
    linesentence = ""
    myFile = open("sentences.txt", 'w')
    for trace in traces:
        linesentence = ""
        directionCursor = None
        dataCursor = 0
        for packet in trace.getPackets():
            if directionCursor is None:
                directionCursor = packet.getDirection()
            if packet.getDirection() != directionCursor:
                # dataKey = 'S' + str(directionCursor) + '-' + str(GloveClassifier.roundArbitrary(dataCursor, 600))
                dataKey = 'S' + str(directionCursor) + '-' + str(dataCursor)
                linesentence = linesentence + " " + dataKey
                directionCursor = packet.getDirection()
                dataCursor = 0
            dataCursor += packet.getLength()
            linesentence = linesentence + " " + str(packet.getLength()) + "_" + str(packet.getDirection())
        if dataCursor > 0:
            key = 'S' + str(directionCursor) + '-' + str(dataCursor)
            linesentence = linesentence + " " + key
        myFile.write(linesentence)
        myFile.write("\n")
    myFile.close()

    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence("sentences.txt")
        corpus = Corpus()
        corpus.fit(sentences, window=8)
        glove = Glove(no_components=300, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save("mygloveModel")
    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        txt = open("sentences.txt")
        if len(txt.read()) > 0:
            print("in here")
            txt.close()
            sentences = models.word2vec.LineSentence("sentences.txt")
            model = models.word2vec.Word2Vec(sentences, size=50, window=15,
                                             min_count=1, workers=4)
            model.save("word2vecModel")
        txt.close()
def fit(self, sents):
    corpus = Corpus()
    corpus.fit(sents, window=self.window_size)
    model = Glove(no_components=self.n_features, learning_rate=0.05)
    model.fit(corpus.matrix, epochs=self.n_epochs,
              no_threads=self.n_threads, verbose=True)
    model.add_dictionary(corpus.dictionary)
    self.model = model
    return self
def generate_glove_text8(EMBEDDING_DIM, saveTo='models/glovetext8.model'):
    import itertools
    sentences = list(itertools.islice(word2vec.Text8Corpus('data/text8'), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(saveTo)
    print('DONE! Saved to', saveTo)
def glove(lines):
    # Create a corpus object.
    corpus = Corpus()
    # Fit the corpus to generate the co-occurrence matrix used by GloVe.
    corpus.fit(lines, window=10)
    # Create a Glove object, which will use the matrix created above to build
    # the embeddings. We can set the learning rate (it uses gradient descent)
    # and the number of components.
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
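# The saved model can be reloaded for similarity queries; a minimal sketch
# (the query token is an illustrative assumption):
reloaded = Glove.load('glove.model')
print(reloaded.most_similar('queen', number=4))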
def pretrain(self, data_src):
    if not os.path.isfile("glove.model"):
        data_src = DataClean([
            ["[^a-z]", " "],  # only letters
            [" [ ]+", " "],   # remove extra spaces
        ], html_clean=True, split_words=True).fit(data_src).transform(data_src)
        corpus_model = Corpus()
        corpus_model.fit(data_src, window=self.window)
        glove = Glove(no_components=self.num_features, learning_rate=self.learning_rate)
        glove.fit(corpus_model.matrix, epochs=self.epochs, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save("glove.model")
def run_glove(self):
    """Run GloVe (global vectors)."""
    # sentences = [["hi", "good", "to"], ["see", "u"]]
    sentences = self.get_sentences()
    print('\n' + '-' * 80)
    print("Fitting words into corpus")
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    print("Running Glove")
    glove = Glove(no_components=200, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print("Fitting words and vectors into unique_words and vectors200")
    unique_words = []
    vectors200 = []
    cnt1 = 0
    length1 = len(glove.inverse_dictionary)
    for word_id in glove.inverse_dictionary:
        cnt1 += 1
        unique_words.append(glove.inverse_dictionary[word_id])
        vectors200.append(glove.word_vectors[word_id])
        sys.stdout.write("\rStatus: %s / %s" % (cnt1, length1))
        sys.stdout.flush()
    print('\n' + "Processing vectors200")
    processed_vectors200 = []
    cnt2 = 0
    length2 = len(vectors200)
    for vector in vectors200:
        cnt2 += 1
        # Reset the buffer for every vector; the original initialized it once
        # outside the loop, so each entry also contained all previous floats.
        processed_vector = []
        for float_num in vector:
            processed_vector.append(float_num)
        processed_vectors200.append(processed_vector)
        sys.stdout.write("\rStatus: %s / %s" % (cnt2, length2))
        sys.stdout.flush()
    return unique_words, processed_vectors200
def train_glove(sentences):
    print('training glove model...')
    t0 = time()
    num_features = 300   # word vector dimensionality
    context = 5          # context window size
    learning_rate = 0.05
    corpus = Corpus()
    corpus.fit(sentences, window=context)
    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('took %0.5fs.' % (time() - t0))
    return glove
def build_glove_embeddings(training, testing, args):
    '''
    Trains the model on the sentiment140 dataset

    @Arguments:
        training: the loaded training portion of the sentiment140 dataset
        testing: the loaded testing portion of the sentiment140 dataset
        args: namespace carrying vecsize (number of components), learningRate,
              window (context window size for word co-occurrence), epochs,
              parallelism (number of threads), verbose, and pca

    @Return:
        The training and testing sets, vectorized with the trained GloVe embeddings
    '''
    # Initialize the model.
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # Python 3: generator expressions replace the Python 2 imap/tuple-unpacking lambdas.
    txtSource = chain((txt for txt, lbl in training), (txt for txt, lbl in testing))

    # Read in the data to train on.
    corpus_model = Corpus()
    corpus_model.fit(map(preprocess.tokenize, txtSource), window=args.window)

    # Fit the model using the given parameters.
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # Add a dictionary just to make it easier for similarity queries.
    glove.add_dictionary(corpus_model.dictionary)

    transformer = lambda words: glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)
    return fromTraining, fromTesting
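# A sketch of the argparse namespace build_glove_embeddings expects; the field
# values are illustrative assumptions.
from argparse import Namespace

args = Namespace(vecsize=100, learningRate=0.05, window=10, epochs=15,
                 parallelism=4, verbose=True, pca=False)
X_train, X_test = build_glove_embeddings(training, testing, args)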
print("Loading pretrained corpus...") corpus = Corpus.load("cache/corpus.p") except: print("Training corpus...") corpus.fit(texts, window=max_sentence_length) corpus.save("cache/corpus.p") glove = Glove(no_components=number_components, learning_rate=0.05) try: print("Loading pretrained GloVe vectors...") glove = Glove.load("cache/glove.p") except: print("Training GloVe vectors...") # More epochs seems to make it worse glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True) glove.add_dictionary(corpus.dictionary) glove.save("cache/glove.p") # Convert input text print("Vectorizing input sentences...") X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual) y = np.array([x == u'1' for x in classes]).astype(np.int32) X, y, texts = X[:207458], y[:207458], texts[:207458] def print_accurate_forwards(net, history): X_train, X_valid, y_train, y_valid = net.train_split(X, y, net) y_classified = net.predict(X_valid) acc_fwd = np.mean([x == y_ and y_ == 1 for x, y_ in zip(y_valid, y_classified)])/np.mean(y_valid) fls_pos = np.mean([x != y_ and y_ == 0 for x, y_ in zip(y_classified, y_valid)])/(np.mean(y_valid)) print('Accurately forwarded: {:.4f}'.format(acc_fwd) + ', False Positives: {:.4f}'.format(fls_pos) + ', Valid forwards: {:.4f}'.format((acc_fwd / (acc_fwd + fls_pos))) )
print('Collocations: %s' % corpus_model.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(args.train),
              no_threads=args.parallelism, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')

if args.query:
    # Finally, query the model for most similar words.
    if not args.train:
        print('Loading pre-trained GloVe model')
        glove = Glove.load('glove.model')
    print('Querying for %s' % args.query)
    pprint.pprint(glove.most_similar(args.query, number=10))
mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

# 3. CNN
# GloVe vectors from the reviews
c = [review.split() for review in data.data]
corpus = Corpus()
corpus.fit(c, window=10)
glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glv.add_dictionary(corpus.dictionary)

embeddings_index = glv.dictionary

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

texts = []          # list of text samples
labels_index = {}   # dictionary mapping label name to numeric id
labels = []         # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
'''
@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'
model_file = '/home/dannl/tmp/newstech/glove/glove.model'

oldtime = time.time()
# Load a co-occurrence matrix.
corpus_cooc = Corpus.load(cooc_file)

# Train a model on it.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

# count = 0
# for word, wid in corpus_cooc.dictionary.items():
#     count += 1
#     if count > 100:
#         break
#     print(word, wid)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)
print('time cost:%.2f' % (time.time() - oldtime))