def parse_Word2Vec(full_content):
    # Train a GloVe model on the full corpus.
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Open file to write the results
    f2 = open('/home/ubuntu/corpus/results.txt', 'w')

    # Loop through all the article types in the file
    with open('/home/ubuntu/corpus/article_types.csv', 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            article_type = row[0]
            translator = str.maketrans({key: '' for key in string.punctuation})
            article_type_no_punctuation = article_type.translate(translator)
            wordnet = WordNetLemmatizer()
            article_type_clean = wordnet.lemmatize(article_type_no_punctuation)
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
                print(article_type_clean + ' -> ' + str(matched_item))
                f2.write(article_type + '\n')
                f2.write(str(matched_item) + '\n')
            except KeyError:
                # most_similar raises KeyError for words missing from the vocabulary.
                print('failed for: ' + article_type)
    f2.close()
def Myself_Model(self, corpus_path, save=None, back_corpus=None, epochs=10,
                 no_threads=8, no_components=100, learning_rate=0.05):
    """Train a GloVe model on the corpus at corpus_path and return it."""
    self.get_data = self.read_corpus(corpus_path)
    corpus_model = Corpus()
    corpus_model.fit(self.get_data, window=10)

    if back_corpus is not None:
        # Keep a reference to the fitted corpus instead of yielding it:
        # a `yield` here would turn the whole method into a generator and
        # prevent the code below from ever running on an ordinary call.
        self.back_corpus = corpus_model

    self.glove = Glove(no_components=no_components, learning_rate=learning_rate)
    self.glove.fit(corpus_model.matrix, epochs=epochs,
                   no_threads=no_threads, verbose=True)
    self.glove.add_dictionary(corpus_model.dictionary)

    if save is not None:
        # e.g. save = 'model/articles_glove.model'
        self.glove.save(save)

    self.model = self.glove
    return self.glove
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION:
            Train Global Vectors (GloVe) word embeddings for the tokens in the data set.
    INPUT:
            |--- data: list of tweets
            |--- embed_dim: [int] embedding dimension
            |--- window_size: [int] size of the window of tokens considered during training for each token
            |--- epochs_: [int] number of epochs for GloVe training
            |--- step_size: [float] learning rate for the SGD used in GloVe training
    OUTPUT:
            |--- embeddings: [np.array] matrix of shape (vocab_size, embed_dim), one feature vector per token
            |--- vocab: [dict] dictionary with tokens as keys and the index of each token as values
            |--- glove: [Glove] GloVe model trained on data
    '''
    sentences = get_tokens(data)
    model = Corpus()
    model.fit(sentences, window=window_size)
    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)

    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    for w, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]

    # Use the model's own word->index mapping so vocab indices match the rows
    # of `embeddings` (enumerating the keys could disagree with the ids).
    vocab = {w.strip(): id_ for w, id_ in glove.dictionary.items()}
    return embeddings, vocab, glove
def get_embeddings(prepared_input):
    corpus = Corpus()
    corpus.fit(prepared_input, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
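# A minimal sketch of reloading the model saved by get_embeddings above
# (assumes the 'glove.model' path that function writes; 'example' is a
# placeholder query word and must exist in the trained vocabulary).
from glove import Glove

model = Glove.load('glove.model')
print(model.most_similar('example', number=5))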
def train_glove(save_dir, size):
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    word_dict = corpus.dictionary.keys()
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    dict_in_bin = dict()
    print('START SAVE')
    for item in word_dict:
        word_indx = glove.dictionary[item]
        dict_in_bin[item] = glove.word_vectors[word_indx]
    with open(save_dir, "wb") as file:
        pickle.dump(dict_in_bin, file)
    print('COMMON TEST')
    while True:
        try:
            s = input("Enter a string: ")
            print(glove.most_similar(s, number=10))
            word_indx = glove.dictionary[s]
            print(glove.word_vectors[word_indx])
        except KeyError:
            # Unknown word: prompt again (a bare except would also swallow Ctrl+C).
            continue
def build_model_glove(args):
    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)
        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def train_glove(src_filename, dim=100):
    corpus = Corpus()
    corpus.fit(get_lines(src_filename), window=10)
    glove = Glove(no_components=dim, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=100, no_threads=20, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(DATA_DIR + 'glove.{}d.model'.format(dim))
def word_embedding(sentences, embedding_size, windows_len):
    """
    Build co-occurrence statistics for `sentences` and return an initialized
    (untrained: epochs=0) GloVe model together with the corpus dictionary
    and its inverse.
    """
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=windows_len)

    # epochs=0 only initializes the model; no actual fitting is performed.
    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix, epochs=0, no_threads=2)

    # Log co-occurrence matrix (kept from the original; not used in the return value).
    log_cooc_mat = corpus_model.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    corpus_dict = corpus_model.dictionary
    corpus_inverse_dict = dict(map(reversed, corpus_dict.items()))

    return glove_model, corpus_dict, corpus_inverse_dict
def generate_glove_corpus():
    global article_info_path, output_path

    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)
        if (sentence_header is None) or (sentence_body is None):
            continue
        words = []
        #for sentence in sentence_header + sentence_body:
        for sentence in sentence_header:
            for word in sentence.split(' '):
                words.append(word)
        sentences.append(words)
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    corpus.save(output_path)
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Example: train_glove(target_group='words', glove_para=glove_para_word)
    After saving the model, you can load it with:
        glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size': 4, 'no_components': 300,
        'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
    :return:
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file, words_or_chars=target_group),
                     window=glove_para['window_size'])  # avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
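# A minimal usage sketch for train_glove above, following its docstring.
# The src_file path and the query word are placeholders, and read_corpus must
# be in scope; the parameter dict mirrors the one shown in the docstring.
glove_para_word = {'window_size': 4, 'no_components': 300,
                   'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
train_glove(target_group='words', glove_para=glove_para_word,
            src_file='corpus_words.txt', save_model_name='glove_words.model')

# Reload and query the saved model, as the docstring suggests.
glove_ana = Glove.load('glove_words.model')
print(glove_ana.most_similar('news', number=5))  # any word in the vocabulary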
class MyGloVe:

    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)
        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        first = self.__average_feature_vector(first_text)
        second = self.__average_feature_vector(second_text)
        return 1 - spatial.distance.cosine(first, second)

    def __read_corpus(self, input_corpus):
        for line in input_corpus:
            yield line

    def __average_feature_vector(self, text):
        words = text.split()
        words_no = 0
        feature_vector = numpy.zeros((100, ), dtype="float32")
        for word in words:
            if word in self.glove.dictionary:
                word_idx = self.glove.dictionary[word]
                words_no += 1
                feature_vector = numpy.add(feature_vector, self.glove.word_vectors[word_idx])
        if words_no > 0:
            feature_vector = numpy.divide(feature_vector, words_no)
        return feature_vector
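# A minimal usage sketch for MyGloVe above. The toy corpus (a list of token
# lists) and the query sentences are illustrative assumptions.
model = MyGloVe()
model.initiate_model([['the', 'cat', 'sat'], ['the', 'dog', 'sat']])
print(model.cosine_similarity('the cat', 'the dog'))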
def build_glove_embeddings(corpus):
    """
    DESCRIPTION:
            Applies the GloVe SGD algorithm provided by the glove_python
            library and builds the word embeddings from our training set.
    INPUT:
            corpus: a list of lists where each sub-list represents a tweet.
                    The outer list represents the whole training dataset.
    OUTPUT:
            words: python dictionary of the form (word, [vector of embeddings])
    """
    words = load_glove_embeddings_from_txt_file(MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    if words is not None:
        return words
    model = Corpus()
    model.fit(corpus, window=algorithm['options']['WE']['window_size'])
    glove = Glove(no_components=algorithm['options']['WE']['we_features'],
                  learning_rate=algorithm['options']['WE']['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=algorithm['options']['WE']['epochs'])
    glove.add_dictionary(model.dictionary)
    words = {}
    for w, id_ in glove.dictionary.items():
        words[w] = np.array(glove.word_vectors[id_])
    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
def train_glove(corpus, vocabulary, zero_init_indices=0, rand_init_indices=1,
                embedding_dim=300):
    """Use glove to train on corpus to obtain embedding

    Here we use a python implementation of Glove, but the official glove
    implementation of C version is also highly recommended:
    https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use zero-initialization.
            These indices usually represent the padding token.
        rand_init_indices: int or a list, the indices which use random
            initialization. These indices usually represent other special tokens,
            such as the "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns:
        np.array, a word embedding matrix.
    """
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=10)

    glove = Glove(no_components=embedding_dim, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    word_vectors = dict((w, glove.word_vectors[glove.dictionary[w]])
                        for w in glove.dictionary)
    emb = filter_embeddings(word_vectors, embedding_dim, vocabulary,
                            zero_init_indices, rand_init_indices)
    return emb
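# A minimal usage sketch for train_glove above. The toy corpus, the vocabulary
# layout (0 = padding, 1 = "unk"), and filter_embeddings being available in
# scope are all assumptions for illustration.
corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
vocabulary = {'<pad>': 0, '<unk>': 1, 'the': 2, 'cat': 3, 'dog': 4}
emb = train_glove(corpus, vocabulary, embedding_dim=50)
print(emb.shape)  # (len(vocabulary), 50) per the docstring's return contract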
def train(path, freq, window, dim, lr, epochs):
    lines = []
    dic = {}
    print("Start of train method")
    try:
        for f in os.listdir(path):
            with open(path + '/' + f, 'r') as infile:
                text = infile.read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            for word in text:
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        print("Created dictionary of word frequencies.")

        for f in os.listdir(path):
            with open(path + '/' + f, 'r') as infile:
                text = infile.read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            text = [word for word in text if dic[word] > freq]
            lines.append(text)
        print("Converted preprocessed text data into the input format "
              "(a list of lists of words).")

        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as e:
        # Report the error instead of silently swallowing it.
        print("Error occurred while training the glove model: {}".format(e))
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove
    #import os
    #import struct

    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    """
    command = 'python -m gensim.scripts.glove2word2vec -i ' + file_name + ' -o ' + file_name + '_modified'
    os.system(command)
    with open(file_name + '_modified', mode='rb') as file:  # b is important -> binary
        fileContent = file.read()
    print('Content', fileContent)
    """
    print('Finished')
    return glove
class GloVeFilter(object):

    def __init__(self):
        # Corpus model
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        print('Reading corpus statistics...')
        #texts = [self.pp.preprocessing(l.strip().decode("utf8", "ignore")) for l in open(args.data_path)]
        texts = [l.strip().decode("utf8", "ignore").split(" ")
                 for l in open(args.data_path)]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix, epochs=args.epochs, verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
def getWordEmbeddings(processed_text):
    corpus = Corpus()
    corpus.fit(processed_text, window=3)
    glove = Glove(no_components=500, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=300000, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print(glove.most_similar('price'))
def train_and_save_model(data_dir, model_name='LeGlove', num_epochs=10, parallel_threads=1):
    '''
    This function processes all the data into a training corpus
    and fits a GloVe model to this corpus.

    Parameters:
        data_dir (string):      master directory containing all jurisdiction-level directories
        model_name (string):    name of model to be used for output
        num_epochs (int):       number of epochs for which to train model
        parallel_threads (int): number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    glove = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    glove.fit(corpus_model.matrix, epochs=num_epochs,
              no_threads=parallel_threads, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(model_name + '.model')
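# A minimal usage sketch for train_and_save_model above. The data directory
# path is a placeholder; CONTEXT_WINDOW, NUM_COMPONENTS, LEARNING_RATE and
# read_corpus are assumed to be defined at module level, as the function implies.
train_and_save_model('data/jurisdictions', model_name='LeGlove',
                     num_epochs=10, parallel_threads=2)
loaded = Glove.load('LeGlove.model')  # reload the saved model for queries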
def feature_extract(path_dataset):
    feature_extract_dataset = []
    speeches = read_csv(path_dataset, sep="|")
    speeches['Classe'] = speeches['Classe'].replace(1, 1)  # mixed speeches of Estamira and her family
    speeches['Classe'] = speeches['Classe'].replace(0, -1)

    # For each speech
    for indice, fala in enumerate(speeches.Fala):
        # initialize the tools used to extract co-occurrence features
        dataset = Corpus()
        lsa = TruncatedSVD(n_components=1)

        tolkenizado = [simple_preprocess(str(fala), deacc=True)]
        quantas_palavras = shape(tolkenizado)[1]

        dataset.fit(tolkenizado, window=79)
        graph = Graph(dataset.matrix)

        values_lsa = lsa.fit_transform(dataset.matrix)
        values_mean = mean(values_lsa, axis=0)
        values_std = std(values_lsa, axis=0)

        feature_extract_dataset.append([
            average_clustering(G=graph),
            average_shortest_path_length(G=graph),
            speeches.comprimento[indice],
            values_mean.item(),
            values_std.item(),
            quantas_palavras
        ])

    return DataFrame(feature_extract_dataset), speeches['Classe'].values
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=0, no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=500, no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
def train_glove_fashionrec(dimensionality, context, epochs):
    """ Train with Glove on IG corpora """
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(epochs), no_threads=8, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + \
        str(context) + " context, " + str(epochs) + " epochs \n" + \
        "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
def glove_vectors(x, embedding_size, epochs=50, lr=0.05, alpha=0.75,
                  max_count=100, tmp_loc='glove.w2vmodel'):
    # Create the dict ourselves so that the ids correspond to their location
    # in the df, counting from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)

    # Creating a corpus object
    corpus = Corpus(dictionary=word_id_dict)

    # Training the corpus to generate the co-occurrence matrix which is used in GloVe.
    # Distance scaling: standard GloVe reduces the occurrence count based on how far
    # a context word is from the focus word. It should not be used here, since
    # distance has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(), window=len(df.columns), distance_scaling=False)

    # alpha is the weighting of the loss, based on how likely a cooccurrence is
    # (Xij); less likely = less weight.
    glove = Glove(no_components=embedding_size, learning_rate=lr,
                  alpha=alpha, max_count=max_count)
    # glove paper: 50 epochs for dimensionality < 300, 100 otherwise
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save_word2vec_format(tmp_loc)
    model = KeyedVectors.load_word2vec_format(tmp_loc)
    if os.path.exists(tmp_loc):
        os.remove(tmp_loc)
    return model
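# A minimal usage sketch for glove_vectors above. The toy categorical rows and
# embedding size are illustrative assumptions; create_vocab_dict and a glove
# build supporting distance_scaling/save_word2vec_format are assumed available,
# as the function itself requires.
rows = [['red', 'small'], ['blue', 'small'], ['red', 'large']]
model = glove_vectors(rows, embedding_size=8, epochs=10)
print(model['red'])  # KeyedVectors lookup of a category's embedding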
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    dictionary = load_glove_dictionary(exp_id, save_dir)

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if os.path.exists(dict_path):
        corpus_model = Corpus.load(dict_path)
    else:
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus, window=params['window'] * 2, ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix, epochs=50,
              no_threads=params['workers'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def main(args):
    ############################
    # corpus_model = Corpus()
    # corpus_model.fit(read_corpus(args.corpus))
    # corpus_model.save('corpus_select.model')
    ############################
    # corpus_model = Corpus().load('corpus_select.model')
    # print('Dict size: %s' % len(corpus_model.dictionary))
    # print('Collocations: %s' % corpus_model.matrix.nnz)
    # with open('global_vocab.pkl', 'wb') as handle:
    #     pickle.dump(corpus_model.dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
    ############################

    # opening vocab to create the corpus object
    with open('global_vocab.pkl', 'rb') as f:
        vocab_dict = pickle.load(f)
    doc_model = Corpus(dictionary=vocab_dict)
    texts = list(read_corpus(args.corpus))

    # opening weight csv; col 2 is science/arts, col 3 is weapons/instruments
    diff_bias = pd.read_csv(args.diff_bias, header=0)

    total = {}
    # for i in range(10):
    for i in tqdm(range(len(texts))):
        doc = [texts[i]]
        doc_model.fit(doc)
        # we might not even need to save it, just put it into one matrix and save that
        coo = doc_model.matrix.todok()
        weight = diff_bias.iloc[i, 2]
        coo = {k: weight * v for k, v in coo.items()}
        total = Counter(coo) + Counter(total)

    def _dict_to_csr(term_dict):
        term_dict_v = term_dict.values()
        term_dict_k = term_dict.keys()
        term_dict_k_zip = zip(*term_dict_k)
        term_dict_k_zip_list = list(term_dict_k_zip)
        shape = (len(term_dict_k_zip_list[0]), len(term_dict_k_zip_list[1]))
        csr = sp.csr_matrix((list(term_dict_v), list(map(list, zip(*term_dict_k)))),
                            shape=shape)
        coo = csr.tocoo()
        return coo

    total = dict(total)
    total = _dict_to_csr(total)
    print(total.get_shape())
    with open('doc_matrices_weighted.pkl', 'wb') as handle:
        pickle.dump(total, handle, protocol=pickle.HIGHEST_PROTOCOL)
def trainShake2(self):
    corpus = Corpus()
    shakespeare_words = self.shakespeare_lines()
    # corpus.fit(shakespeare_corpus + sonnets_corpus, window=10)
    corpus.fit(shakespeare_words, window=10)
    self.glove = Glove(no_components=100, learning_rate=0.05)
    self.glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    self.glove.add_dictionary(corpus.dictionary)
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    corpus = Corpus()
    corpus.fit(seqs, window=window)
    glove = Glove(no_components=size, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=epochs, verbose=True)
    return corpus.dictionary, glove.word_vectors
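# A minimal usage sketch for getGloveEmbedding above: the returned dictionary
# maps each token to the row of its vector in word_vectors. The toy sequences
# are an illustrative assumption.
dictionary, vectors = getGloveEmbedding([['a', 'b', 'c'], ['a', 'c']],
                                        size=16, window=2, epochs=5)
vec_a = vectors[dictionary['a']]  # embedding for token 'a'
print(vec_a.shape)  # (16,)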
def train_model(line):
    corpus = Corpus()
    corpus.fit(line)
    glove = Glove(no_components=5, learning_rate=0.05, random_state=0)
    glove.fit(corpus.matrix, epochs=10, no_threads=100, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    return glove
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size, niter):
    corpus = Corpus(vectorizer.vocabulary_)
    corpus.fit(data_frame.post, window=window_size, ignore_missing=True)
    glove = Glove(no_components=vec_dim, learning_rate=0.01)
    glove.fit(corpus.matrix, epochs=niter, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    return glove
def buildCorpus(data_path=None, context_window=5):
    # function that loads in wikipedia data and fits corpus model
    print('Fitting data...')

    # initialize and fit corpus
    corpus = Corpus()
    corpus.fit(textGenerator(data_path), window=context_window)

    return corpus
import pytest

def test_supplied_dict_missing():
    dictionary = {'a': 1, 'naïve': 0}
    corpus = [['a', 'naïve', 'fox']]
    model = Corpus(dictionary=dictionary)
    # 'fox' is missing from the supplied dictionary, so fitting without
    # ignore_missing=True is expected to raise a KeyError (compare the
    # ignore_missing variant of this test below).
    with pytest.raises(KeyError):
        model.fit(corpus, max_map_size=0, window=10)
def glove_feat(df, feat, length):
    corpus = Corpus()
    corpus.fit(df[feat], window=20)
    glove = Glove(no_components=length, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=10, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    return glove
def pretrain(self, data_src):
    if not os.path.isfile("glove.model"):
        data_src = DataClean([
            ["[^a-z]", " "],  # only letters
            [" [ ]+", " "],   # remove extra spaces
        ], html_clean=True, split_words=True).fit(data_src).transform(data_src)
        corpus_model = Corpus()
        corpus_model.fit(data_src, window=self.window)
        glove = Glove(no_components=self.num_features, learning_rate=self.learning_rate)
        glove.fit(corpus_model.matrix, epochs=self.epochs, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save("glove.model")
def train_glove(sentences):
    print('training glove model...')
    t0 = time()

    num_features = 300    # word vector dimensionality
    context = 5           # context window size
    learning_rate = 0.05

    corpus = Corpus()
    corpus.fit(sentences, window=context)
    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('took %0.5fs.' % (time() - t0))
    return glove
def run_glove(self):
    """ run global vector """
    #sentences = [["hi","good","to"],["see","u"]]
    sentences = self.get_sentences()

    print('\n' + '-' * 80)
    print("Fitting words into corpus")
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    print("Running Glove")
    glove = Glove(no_components=200, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print("Fitting words and vectors into unique_words and vectors200")
    unique_words = []
    vectors200 = []
    cnt1 = 0
    length1 = len(glove.inverse_dictionary)
    for word_id in glove.inverse_dictionary:
        cnt1 += 1
        unique_words.append(glove.inverse_dictionary[word_id])
        vectors200.append(glove.word_vectors[word_id])
        sys.stdout.write("\rStatus: %s / %s" % (cnt1, length1))
        sys.stdout.flush()

    print('\n' + "Processing vectors200")
    processed_vectors200 = []
    cnt2 = 0
    length2 = len(vectors200)
    for vector in vectors200:
        cnt2 += 1
        # Reset per vector: in the original, one shared list was initialized
        # outside the loop, so every appended entry aliased the same
        # ever-growing list.
        processed_vector = []
        for float_num in vector:
            processed_vector.append(float_num)
        processed_vectors200.append(processed_vector)
        sys.stdout.write("\rStatus: %s / %s" % (cnt2, length2))
        sys.stdout.flush()

    return unique_words, processed_vectors200
def test_supplied_dictionary():
    dictionary = {'a': 2, 'naïve': 1, 'fox': 0}
    corpus = [['a', 'naïve', 'fox']]
    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10)

    assert model.dictionary == dictionary
    assert model.matrix.shape == (len(dictionary), len(dictionary))
    assert (model.matrix.tocsr()[2]).sum() == 0
def test_corpus_construction():
    corpus_words = ['a', 'naïve', 'fox']
    corpus = [corpus_words]
    model = Corpus()
    model.fit(corpus, max_map_size=0, window=10)

    for word in corpus_words:
        assert word in model.dictionary

    assert model.matrix.shape == (len(corpus_words), len(corpus_words))

    expected = [[0.0, 1.0, 0.5],
                [0.0, 0.0, 1.0],
                [0.0, 0.0, 0.0]]
    assert (model.matrix.todense().tolist() == expected)
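# The expected matrix above shows the cooccurrence weighting scheme: each pair
# of words inside the window contributes 1/distance, stored in the upper
# triangle. A minimal sketch of that property (assuming the same weighting)
# with a four-token sentence:
def test_window_weighting_sketch():
    corpus = [['w0', 'w1', 'w2', 'w3']]
    model = Corpus()
    model.fit(corpus, max_map_size=0, window=10)
    dense = model.matrix.todense()
    assert dense[0, 1] == 1.0                  # adjacent words: weight 1
    assert abs(dense[0, 3] - 1.0 / 3) < 1e-6   # three positions apart: weight 1/3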
def build_glove_embeddings(training, testing, args):
    ''' Trains the model on the sentiment140 dataset

    @Arguments:
        data: the loaded sentiment140 dataset from module
        num_epochs: the number of epochs to train on
        num_threads: the number of threads to use
        num_components: the number of components the glove model should use
        learning_rate: the model's learning rate
        window_size: the size of the window to use when looking for word co-occurence
        verbose: boolean for whether or not extensive output should be printed to screen

    @Return:
        A trained glove model
    '''

    # initialize model
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # chain the training and testing texts together (Python 3: the original
    # used imap and tuple-unpacking lambdas, which Python 3 removed)
    txtSource = chain((txt for txt, lbl in training),
                      (txt for txt, lbl in testing))

    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit(map(preprocess.tokenize, txtSource), window=args.window)

    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    transformer = lambda words: glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)

    return fromTraining, fromTesting
def test_supplied_dict_missing_ignored():
    dictionary = {'a': 0, 'fox': 1}
    corpus = [['a', 'naïve', 'fox']]
    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10, ignore_missing=True)

    assert model.dictionary == dictionary
    assert model.matrix.shape == (len(dictionary), len(dictionary))

    # Ensure that context windows and context window
    # weights are preserved.
    full_model = Corpus()
    full_model.fit(corpus, window=10)

    assert (full_model.matrix.todense()[0, 2]
            == model.matrix.todense()[0, 1]
            == 0.5)
def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
                    help='Get closest words to this word.')
args = parser.parse_args()

if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_cooc = Corpus()
    corpus_cooc.fit(get_data(args.create), window=10)
    corpus_cooc.save('corpus.model')

    print('Dict size: %s' % len(corpus_cooc.dictionary))
    print('Collocations: %s' % corpus_cooc.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_cooc = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_cooc.dictionary))
        print('Collocations: %s' % corpus_cooc.matrix.nnz)
mlp10_accuracy = train_test(mlp10, x, y, folds)
mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)
mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

#3CNN
#Glove Vectors from reviews
c = [review.split() for review in data.data]
corpus = Corpus()
corpus.fit(c, window=10)
glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glv.add_dictionary(corpus.dictionary)
# glv.dictionary maps word -> row index and contains no vectors; build the
# word -> vector mapping that an embeddings index is expected to hold.
embeddings_index = {w: glv.word_vectors[i] for w, i in glv.dictionary.items()}

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
    delchars = ''.join(delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            # list of tokenized words (Python 3 form of the original
            # Python 2 `translate(None, delchars)`)
            yield line.lower().translate(str.maketrans('', '', delchars)).split(' ')


if __name__ == '__main__':
    # initialize glove object
    glove = Glove(no_components=100, learning_rate=0.05)

    # read in the data to train on; this file is shakespeare text
    corpus_model = Corpus()
    corpus_model.fit(read_corpus("data/input.txt"), window=10)

    # fit the model using the given parameters
    glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    # save glove object to file
    glove.save_obj('glove.model.obj')

    # give me the 5 words most similar to each word in the words list in this
    # corpus and show me how similar the words are in this corpus to each word
    # in the words list in general
    words = ['sky', 'queen', 'car']
from glove import Glove
from glove import Corpus
from gensim import corpora
import time

dic_file = r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file = '/home/dannl/tmp/newstech/news.txt'
cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'

def read_corpus(filename):
    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.split()[1:]

# get a cooccurrence matrix
oldtime = time.time()
dictionary = corpora.Dictionary.load(dic_file)

# corpus_cooc = Corpus()
# corpus_cooc.fit(read_corpus(corpus_file), window=10)

corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10, ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)
print('time cost:%.2f' % (time.time() - oldtime,))
def fit_corpus(corpus):
    model = Corpus()
    model.fit(corpus)
    # Return the fitted Corpus model, not the input that was passed in.
    return model
with open('yahoo_train.txt', 'r') as file:
    for line in file:
        d = json.loads(line)
        uris.append(d[0])
        questions.append(d[1])
        answers.append(d[2])
        cats.append(d[3])

def get_lines():
    for a in answers:
        yield a.split()

# Build the corpus dictionary and cooccurence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train GloVe model
#glove = Glove(no_components = no_comp, learning_rate=0.05)
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save
no_comp = 100  # dimensionality of the loaded 100d Stanford vectors
with open('model.glove', 'w+') as file:
    file.write('%i %i \n' % (len(glove.dictionary), no_comp))
    for (word, idx) in glove.dictionary.items():
        file.write('%s %s \n' % (word, ' '.join(str(n) for n in glove.word_vectors[idx])))
import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove

# To install the text8 corpus, run these commands:
#   wget http://mattmahoney.net/dc/text8.zip -P /tmp
#   unzip text8.zip

sentences = list(itertools.islice(Text8Corpus('/tmp/text8'), None))
corpus = Corpus()
corpus.fit(sentences, window=10)

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

print(glove.most_similar('frog', number=10))
print(glove.most_similar('girl', number=10))
print(glove.most_similar('car', number=10))
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts])
                          + (norm.ppf(0.95) * np.std([len(x) for x in texts])))
print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")