def build_gensim(self, docs, model=None):
    dp = DocumentPreprocessor()
    docs_tokenized = (dp.tokenizer(doc) for doc in docs)
    # Get the word co-occurrence matrix -- needs lots of RAM!!
    cooccur = glove.Corpus()
    cooccur.fit(docs_tokenized, window=10)
    # wiki_generator = lambda: (filter_text(text) for text in wiki)
    # cooccur.fit(wiki_generator(), window=10)
    # and train the GloVe model itself, using 10 epochs
    if model is None:
        model = glove.Glove(no_components=600, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10)
    # Attach the vocabulary so words can be looked up by index below.
    model.add_dictionary(cooccur.dictionary)
    doc_vectors = []
    # The generator above is exhausted; tokenize again for the second pass.
    docs_tokenized = (dp.tokenizer(doc) for doc in docs)
    for doc in docs_tokenized:
        doc_vector = np.zeros(model.word_vectors.shape[1], dtype=float)
        if len(doc):
            for word in doc:
                try:
                    doc_vector += model.word_vectors[model.dictionary[word]]
                except KeyError:
                    log.debug("Word: {} doesn't appear in model.".format(word))
        else:
            log.debug('Empty document in data')
        doc_vectors.append(doc_vector)
    return np.array(doc_vectors), model
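A hypothetical call sketch: the enclosing class and DocumentPreprocessor are external to this excerpt, and `vectorizer` is an assumed instance of that class.

# Each returned row is the sum of a document's 600-d GloVe word vectors.
docs = ["the dog eats food", "people eat food"]
doc_vectors, model = vectorizer.build_gensim(docs)
print(doc_vectors.shape)  # (2, 600)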
def train_all(self):
    """Builds the vocab and trains the model.

    :return:
    """
    documents = list(self.read_input())
    corpus = glove.Corpus()
    corpus.fit(documents, window=self.window_size)
    self.model = glove.Glove(no_components=self.vector_size,
                             learning_rate=self.learning_rate,
                             alpha=self.alpha)
    self.model.fit(corpus.matrix, epochs=self.iterations,
                   no_threads=self.workers, verbose=True)
    self.dictionary = corpus.dictionary
    self.model.add_dictionary(corpus.dictionary)
def fit(self, corpus=None, size=2):  # , distance_metric=0, zero_out_diag=False):
    if corpus is not None:
        self.corpus = corpus
    assert self.token2id is not None, "Fit with no vocabulary!"
    assert self.corpus is not None, "Fit with no corpus!"
    glove_corpus = glove.Corpus(dictionary=self.token2id)
    # Fit on self.corpus so a corpus set earlier still works when the
    # argument is omitted.
    glove_corpus.fit(self.corpus, window=size)
    self.nw_xy = glove_corpus.matrix
    return self
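A small sketch of how a pre-supplied vocabulary interacts with glove.Corpus (toy tokens; ignore_missing is the library flag for skipping out-of-vocabulary words):

import glove

token2id = {"dog": 0, "eat": 1, "food": 2}
gc = glove.Corpus(dictionary=token2id)
# With a fixed dictionary, unknown tokens raise unless explicitly ignored.
gc.fit([["dog", "eat", "food"], ["dog", "eat", "treats"]],
       window=2, ignore_missing=True)
print(gc.matrix.toarray())  # co-occurrence counts over the fixed vocab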
def train(word2id, id2word, corpus, win, dim):
    cooccur = glove.Corpus(dictionary=word2id)
    cooccur.fit(corpus(), window=win)
    logger.info("creating glove model")
    logger.info('Dict size: %s' % len(cooccur.dictionary))
    logger.info('Collocations: %s' % cooccur.matrix.nnz)
    model = glove.Glove(no_components=dim, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10, no_threads=5, verbose=True)
    model.add_dictionary(cooccur.dictionary)
    model.word2id = dict((utils.to_unicode(w), id)
                         for w, id in model.dictionary.items())
    model.id2word = gensim.utils.revdict(model.word2id)
    utils.pickle(model, './model/glove.model')
def glove2vec(text_sentence, win=10, noc=1, lr=0.05, epochs=10,
              nothr=1, verbose=True):
    corpus_model = glove.Corpus()
    corpus_model.fit(text_sentence, window=win)
    glove_model = glove.Glove(no_components=noc, learning_rate=lr)
    glove_model.fit(corpus_model.matrix, epochs=epochs,
                    no_threads=nothr, verbose=verbose)
    glove_model.add_dictionary(corpus_model.dictionary)
    return glove_model
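A minimal usage sketch for glove2vec on toy, pre-tokenized sentences (the parameter values are illustrative):

sentences = [["dog", "eat", "food"], ["people", "eat", "food"],
             ["dog", "eat", "people"]]
model = glove2vec(sentences, win=2, noc=16, epochs=20)
# add_dictionary was called inside, so similarity queries work directly.
print(model.most_similar("dog", number=3))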
def generate_embedding_dictionary(docs_tokens, embedding_dim, iters,
                                  window=2, learning_rate=0.05):
    time_start = time()
    corpus_model = glove.Corpus()
    corpus_model.fit(docs_tokens, window=window)
    glove_model = glove.Glove(no_components=embedding_dim,
                              learning_rate=learning_rate)
    glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
    end_time = time()
    glove_model.add_dictionary(corpus_model.dictionary)
    word_to_index = glove_model.dictionary
    index_word = glove_model.inverse_dictionary
    embedding_dictionary = {index_word[i]: vector
                            for i, vector in enumerate(glove_model.word_vectors)}
    # embedding_dictionary["<<UNKNOWN>>"] = np.zeros(embedding_dim)
    return embedding_dictionary, embedding_dim, word_to_index, end_time - time_start
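Illustrative use of the returned values (toy data; the dimensions are chosen arbitrarily):

tokens = [["dog", "eat", "food"], ["people", "eat", "food"]]
emb_dict, dim, word_to_index, secs = generate_embedding_dictionary(
    tokens, embedding_dim=16, iters=20)
print(emb_dict["dog"].shape)  # (16,)
print(word_to_index["dog"])   # integer id in the GloVe dictionary
print("trained in %.2fs" % secs)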
def glove(windows, num_components=16, glove_window=10, epochs=20, verbose=False):
    import glove
    import hdbscan
    import multiprocessing
    ws = [[template_id for template_id in w] for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=epochs,
                    no_threads=multiprocessing.cpu_count(), verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)
    labels = []
    vectors = []
    # Pull labels and vectors straight from the model's public attributes.
    for key, word_vector_index in glove_model.dictionary.items():
        labels.append(key)
        vectors.append(list(glove_model.word_vectors[word_vector_index]))
    # Clustering
    output_events = defaultdict(list)
    for i, val in enumerate(
            hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])
    # Create event objects
    events = []
    for item in output_events:
        # Materialize the map() so len() works under Python 3.
        event = Event(id=str(uuid.uuid4()),
                      template_ids=list(map(int, output_events[item])))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
def __init__(self, docs_tokens, emb_dim, iters, window, learn_rate):
    self.time = time()
    corpus_model = glove.Corpus()
    corpus_model.fit(docs_tokens, window=window)
    glove_model = glove.Glove(no_components=emb_dim, learning_rate=learn_rate)
    glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
    glove_model.add_dictionary(corpus_model.dictionary)
    self.time = time() - self.time
    word_to_index = glove_model.dictionary
    index_word = glove_model.inverse_dictionary
    embedding_dictionary = {index_word[i]: vector
                            for i, vector in enumerate(glove_model.word_vectors)}
    super(EmbeddingModel, self).get_from_data(
        embedding_dictionary, emb_dim, word_to_index, self)
    self.name = 'glove'
def train_glove(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(
            itertools.islice(word2vec.Text8Corpus('./data/text8'), None))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200
    corpus = glove.Corpus()
    print("start fitting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix, epochs=10,
           no_threads=multiprocessing.cpu_count(), verbose=True)
    # Attach the vocabulary so the saved model supports word lookups.
    gl.add_dictionary(corpus.dictionary)
    corpus.save("./data/corpus.model")
    gl.save(save_name)  # honor save_name instead of a hard-coded path
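The saved artifacts can be reloaded later with the library's load classmethods; a sketch using the paths from above (the query word assumes the text8 vocabulary):

corpus = glove.Corpus.load("./data/corpus.model")
gl = glove.Glove.load("./data/glove.model")
# The dictionary was attached before saving, so lookups work directly.
print(gl.most_similar("king", number=5))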
def glove_pro(df_raw, sentence_id, word_id, emb_size=128, window=50,
              dropna=False, n_jobs=16, learning_rate=0.05, epoch=8,
              return_model=False):
    """
    conda create -y -n TF1.14 python=3.6
    pip install glove_python
    ------
    test_glove = datalog.head(10000)
    sentence_id = 'user_id'
    word_id = 'industry'
    res = glove_pro(test_glove, sentence_id, word_id, emb_size=32, window=20,
                    dropna=False, n_jobs=16, learning_rate=0.05, epoch=8,
                    return_model=True)
    res.keys()
    res['sentence_emb_df'].info()
    res['model'].most_similar("6", number=10)
    """
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    logger.info(f"========== GloVE: {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id]].copy()
    if df[sentence_id].isnull().sum() > 0:
        logger.warning("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df = df.fillna('NULL_zhangqibot')
    df = df.astype(str)
    tmp = df.groupby(sentence_id, as_index=False)[word_id].agg({list_col_nm: list})
    sentences = tmp[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del tmp[list_col_nm]
    gc.collect()
    matrix = glv.Corpus()
    matrix.fit(corpus=sentences, window=window)
    model = glv.Glove(no_components=emb_size, learning_rate=learning_rate,
                      alpha=0.75, max_count=100, max_loss=10.0,
                      random_state=666)
    model.fit(matrix.matrix, epochs=epoch, no_threads=n_jobs, verbose=1)
    model.add_dictionary(matrix.dictionary)
    # get word embedding matrix; unseen words fall back to zero vectors
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.dictionary:
            emb_dict[word_i] = model.word_vectors[model.dictionary[word_i]]
        else:
            emb_dict[word_i] = np.zeros(emb_size, dtype="float32")
    # NOTE: the docstring's 'sentence_emb_df' key comes from a longer version
    # of this function; this excerpt only builds the word-level embeddings.
    res = {"word_emb_dict": emb_dict}
    if return_model:
        res["model"] = model
    return res
        (w, v.index) for w, v in model.vocab.items())
    model.id2word = utils.revdict(model.word2id)
    model.word_vectors = model.syn0norm
    utils.pickle(model, outf('w2v'))

if 'glove' in program:
    if os.path.exists(outf('glove')):
        logger.info("glove model found, loading")
        model = utils.unpickle(outf('glove'))
    else:
        if os.path.exists(outf('glove_corpus')):
            logger.info("glove corpus matrix found, loading")
            cooccur = utils.unpickle(outf('glove_corpus'))
        else:
            logger.info("glove corpus matrix not found, creating")
            cooccur = glove.Corpus(dictionary=word2id)
            cooccur.fit(corpus(), window=WINDOW)
            utils.pickle(cooccur, outf('glove_corpus'))
        logger.info("glove model not found, creating")
        model = glove.Glove(no_components=DIM, learning_rate=0.05)
        model.fit(cooccur.matrix, epochs=10, no_threads=WORKERS, verbose=True)
        model.add_dictionary(cooccur.dictionary)
        model.word2id = dict((utils.to_unicode(w), id)
                             for w, id in model.dictionary.items())
        model.id2word = gensim.utils.revdict(model.word2id)
        utils.pickle(model, outf('glove'))

if 'pmi' in program:
def train(self, train_file):
    corpus_model = glove.Corpus()
    corpus_model.fit(self.read_custom_corpus(train_file),
                     window=self.window_size)
    self.model = DGlove(no_components=self.dimensions, learning_rate=0.05)
    self.model.fit(corpus_model.matrix, no_threads=self.workers,
                   epochs=self.epochs, verbose=True)
    self.model.add_dictionary(corpus_model.dictionary)
# coding: utf-8
# Interactive exploration of the glove API on a toy corpus.
import glove

c = glove.Corpus()
# Corpus.fit expects pre-tokenized documents (lists of tokens);
# raw strings would be iterated character by character.
c.fit([["dog", "eat", "food"], ["people", "eat", "food"],
       ["dog", "eat", "people"]], window=3)
c.matrix            # sparse co-occurrence counts (scipy COO matrix)
c.matrix.toarray()  # dense view for inspection

s = glove.Glove(2, 0.05)  # 2 components, learning rate 0.05
s.fit(c.matrix, epochs=10, no_threads=1, verbose=True)
s.add_dictionary(c.dictionary)
c.dictionary
# most_similar is a method of the trained model, not the module.
s.most_similar('dog')
from gensim import utils, corpora, matutils, models
import glove

# Restrict dictionary to the 30k most common words.
corpus_file_name = ''
wiki = models.word2vec.LineSentence(corpus_file_name)
id2word = corpora.Dictionary(wiki)
id2word.filter_extremes(keep_n=30000)
word2id = dict((word, id) for id, word in id2word.items())

# Filter all wiki documents to contain only those 30k words.
filter_text = lambda text: [word for word in text if word in word2id]
filtered_wiki = lambda: (filter_text(text) for text in wiki)  # generator

# Get the word co-occurrence matrix -- needs lots of RAM!!
cooccur = glove.Corpus()
cooccur.fit(filtered_wiki(), window=10)

# and train GloVe model itself, using 10 epochs
model_glove = glove.Glove(no_components=600, learning_rate=0.05)
model_glove.fit(cooccur.matrix, epochs=10)
model_glove.add_dictionary(cooccur.dictionary)  # keep the vocab with the model
model_glove.save('glove_default_30k.model')
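And later, reloading the saved 30k-vocabulary model for lookups (a sketch; the query word is illustrative):

model_glove = glove.Glove.load('glove_default_30k.model')
# Look up a single word's 600-d vector via the attached dictionary.
vec = model_glove.word_vectors[model_glove.dictionary['king']]
print(vec.shape)  # (600,)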
# NOTE: this snippet begins mid-way through the SRC Field definition;
# its opening lines are truncated in the source.
            batch_first=True)
TRG = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>',
            lower=True, batch_first=True)
train, valid, test = torchtext.datasets.WMT14.splits(exts=('.en', '.de'),
                                                     fields=(SRC, TRG))
length = len(train.examples)
src_sentences = []
trg_sentences = []
for i in range(length):
    src_sentences.append(vars(train.examples[i])['src'])
    trg_sentences.append(vars(train.examples[i])['trg'])

corpus = glove.Corpus()
corpus2 = glove.Corpus()
corpus.fit(src_sentences, window=10)
corpus2.fit(trg_sentences, window=10)

# Name the model glove_model rather than `glove`, which would shadow the module.
glove_model = Glove(no_components=512, learning_rate=0.05)
glove_model.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_model.add_dictionary(corpus.dictionary)
glove_model.save('src_glove.model')
# Glove.fit re-initializes the vectors, so the same object can be reused
# for the target side once the source model has been saved.
glove_model.fit(corpus2.matrix, epochs=30, no_threads=4, verbose=True)
glove_model.add_dictionary(corpus2.dictionary)
glove_model.save('trg_glove.model')