def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    """Fit a GloVe model for experiment ``exp_id`` and return it.

    A co-occurrence model stored at ``<save_dir>/glove_dict_<exp_id>.model``
    is reused when it exists; otherwise one is built from ``corpus`` with the
    dictionary returned by ``load_glove_dictionary``.

    :param corpus: iterable handed to ``Corpus.fit`` when no cached model exists.
    :param params: mapping read for the keys ``'window'``, ``'alpha'`` and
        ``'workers'``.
    :param exp_id: experiment identifier used in the cache file name.
    :param save_dir: directory holding the cached co-occurrence model.
    :param save_dict: when true, a freshly built co-occurrence model is saved.
    :return: the fitted ``Glove`` instance with its dictionary attached.
    """
    # NOTE(review): the dictionary is loaded even when the cached model is
    # reused below -- confirm whether load_glove_dictionary has side effects
    # before making this lazy.
    dictionary = load_glove_dictionary(exp_id, save_dir)

    print('Pre-processing corpus')
    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if not os.path.exists(dict_path):
        # No cache yet: build the co-occurrence matrix over the known dictionary.
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus, window=params['window'] * 2, ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    else:
        corpus_model = Corpus.load(dict_path)

    vocabulary = corpus_model.dictionary
    cooccurrences = corpus_model.matrix
    print('Dict size: %s' % len(vocabulary))
    print('Collocations: %s' % cooccurrences.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(cooccurrences, epochs=50, no_threads=params['workers'], verbose=True)
    glove.add_dictionary(vocabulary)
    return glove
def main():
    """Train GloVe vectors from a saved co-occurrence corpus and save the model.

    Reads ``bioc-corpus-AZ2.model`` and writes ``bioc-glove-AZ2.model``, both
    relative to the current working directory.
    """
    # The loaded object is used directly; no throwaway Corpus() is created first.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    # Attach the word -> id mapping so the saved model can be queried by word.
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def build_model_glove(args):
    """Build (or reload) the co-occurrence corpus and fit a GloVe model on it.

    The corpus at ``args.corpus_model`` is rebuilt when it does not exist or
    when any file in ``args.input`` has a modification time at least as recent
    as the corpus file; otherwise it is loaded from disk.

    :param args: namespace read for ``corpus_model``, ``input``, ``workers``
        and ``verbose``; it is also passed to ``get_sentences``.
    :return: the fitted ``Glove`` instance with its dictionary attached. The
        model is returned, not saved.
    """
    corpus_is_stale = (
        not os.path.exists(args.corpus_model)
        or max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model)
    )
    if corpus_is_stale:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    # Reported once for both paths; arguments are passed lazily to logging.
    logging.info('Dict size: %s', len(corpus_model.dictionary))
    logging.info('Collocations: %s', corpus_model.matrix.nnz)

    # Train the GloVe model; persisting it is left to the caller.
    logging.info('Training the GloVe model')
    glove_config = CONFIG['glove']
    glove = Glove(no_components=glove_config['size'],
                  learning_rate=glove_config['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_config['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def build_model_glove(args):
    """Build (or reload) the co-occurrence corpus and fit a GloVe model on it.

    ``glove`` is imported inside the function, so the package is only needed
    when this builder is actually called.

    The corpus at ``args.corpus_model`` is rebuilt when it does not exist or
    when any file in ``args.input`` has a modification time at least as recent
    as the corpus file; otherwise it is loaded from disk.

    :param args: namespace read for ``corpus_model``, ``input``, ``workers``
        and ``verbose``; it is also passed to ``get_sentences``.
    :return: the fitted ``Glove`` instance with its dictionary attached. The
        model is returned, not saved.
    """
    from glove import Glove, Corpus

    corpus_missing = not os.path.exists(args.corpus_model)
    if corpus_missing or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    # Reported once for both paths; arguments are passed lazily to logging.
    logging.info('Dict size: %s', len(corpus_model.dictionary))
    logging.info('Collocations: %s', corpus_model.matrix.nnz)

    # Train the GloVe model; persisting it is left to the caller.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def generate_glove_map():
    """Train GloVe vectors and pickle a ``{word: vector}`` map.

    Reads the module-level settings ``article_info_path``, ``corpus_path``,
    ``embedding_dimension`` and ``output_path``. The words are collected from
    the ``sentence_header`` entries of the article-info JSON; the vectors come
    from a GloVe model fitted on the corpus loaded from ``corpus_path``.

    :return: none
    """
    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate set of words : Start')
    words = set()
    for dict_info in article_info.values():
        sentence_header = dict_info.get('sentence_header')
        sentence_body = dict_info.get('sentence_body')
        # Articles missing either field are skipped, even though only the
        # header sentences contribute words.
        if sentence_header is None or sentence_body is None:
            continue
        for sentence in sentence_header:
            words.update(sentence.split(' '))
    write_log('GloVe Generate set of words - {} : End'.format(len(words)))

    write_log('GloVe Load corpus from {}: Start'.format(corpus_path))
    corpus = Corpus.load(corpus_path)
    write_log('GloVe Load corpus : End')

    write_log('GloVe learning : Start')
    glove = Glove(no_components=embedding_dimension, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=400, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    write_log('GloVe learning : End')

    dict_a2g = {}
    for word in words:
        # NOTE(review): raises KeyError for a header word that is absent from
        # the corpus dictionary -- confirm the corpus covers all headers.
        word_vector = glove.word_vectors[glove.dictionary[word]].tolist()
        assert (len(word_vector) == embedding_dimension)
        dict_a2g[word] = word_vector

    write_log('GloVe result dump : Start')
    with open(output_path, 'wb') as f_out:
        pickle.dump(dict_a2g, f_out)
    write_log('GloVe result dump : End')
def create_glove_corpus():
    """Return the GloVe co-occurrence corpus, building and caching it if needed.

    The corpus lives at ``<folder>/Embedding/glove/glove.corpus`` (``folder``
    and ``embedding_corpus`` are module-level names). When the file exists it
    is loaded; otherwise a new corpus is fitted on ``embedding_corpus`` and
    saved to that path.

    :return: the loaded or newly built ``Corpus``.
    """
    corpus_path = '%s/Embedding/glove/glove.corpus' % (folder)
    if os.path.exists(corpus_path):
        corpus = Corpus.load(corpus_path)
        print('Already get glove corpus')
        return corpus

    # Create the GloVe corpus object and set the matrix scan window size.
    corpus = Corpus()
    # Fit exactly once: a second identical call only repeats the work, and
    # would presumably see an exhausted iterator if embedding_corpus is a
    # generator.
    corpus.fit(embedding_corpus, window=10)
    print('Dict size: %s' % len(corpus.dictionary))
    print('Collocations: %s' % corpus.matrix.nnz)
    corpus.save(corpus_path)  # save the dictionary / co-occurrence corpus
    return corpus
def get_glove_corpus_model(setting):
    """Return the GloVe co-occurrence model for ``setting``, building it if needed.

    A model cached under ``models/`` is reused unless the module-level flag
    ``force_gen`` is set. Otherwise the token-to-index map is read from
    ``derived_data/``, the corpus is restricted to tokens present in that map,
    and the fitted model is saved to the cache path.

    :param setting: mapping expanded into ``setting_string`` and read for the
        keys ``'granularity'``, ``'token_method'`` and ``'data_basis'``.
    :return: the loaded or newly fitted ``Corpus``.
    :raises ValueError: if ``setting['granularity']`` is neither
        ``'documents'`` nor ``'paragraphs'``.
    """
    model_path = "models/" + setting_string(**setting) + "__glove_corpus_model"
    if not force_gen and os.path.isfile(model_path):
        return Corpus.load(model_path)

    map_path = "derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"
    with open(map_path) as map_file:
        token2index_map = json.load(map_file)

    granularity = setting['granularity']
    if granularity == 'documents':
        item_generator = get_all_documents_as_token_list(setting['token_method'],
                                                         setting['data_basis'])
    elif granularity == 'paragraphs':
        item_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'],
                                                               setting['data_basis'])
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))

    # Each document becomes a concrete token list (doc[1] holds the tokens);
    # documents are still streamed lazily from item_generator.
    corpus = ([token for token in doc[1] if token in token2index_map]
              for doc in item_generator)
    corpus_model = Corpus(dictionary=token2index_map)
    corpus_model.fit(corpus)
    corpus_model.save(model_path)
    return corpus_model
qs = []
ts = []
ds = []
sentences = []
# Tokenise both question columns; every question also feeds the GloVe corpus.
for q, t in zip(data_all['question1'].values.tolist(), data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

# Build the co-occurrence corpus, persist it, and read it back from disk.
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')
corpus_model = Corpus.load(path + 'corpus.mdl')

# Train the embeddings, persist them, and read them back from disk.
glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
# Call form is valid on both Python 2 and Python 3.
print(glove)

qt_sims_dists = []
qt_diff = []


def calc_cosine_dist(text_a, text_b, metric='euclidean'):
    """Return the pairwise distance between two vectors.

    Despite the function name, the default ``metric`` is ``'euclidean'``;
    pass ``metric='cosine'`` explicitly for a cosine distance.

    :param text_a: first vector.
    :param text_b: second vector.
    :param metric: metric name forwarded to ``pairwise_distances``.
    :return: the single distance value for the pair.
    """
    return pairwise_distances([text_a], [text_b], metric=metric)[0][0]
def train_glove(inst, meta_data=None):
    """Train one GloVe model per parameter combination and record metadata.

    For every entry of ``ParameterGrid(settings.GLOVE_PARAMS)`` the
    co-occurrence corpus for that window size is loaded from
    ``settings.WVEC_OPT_DIRP`` (or built from ``inst`` and saved there), a
    GloVe model is fitted and saved, and the most similar words for each of
    ``settings.TESTSIM_WORDS`` are queried.

    :param inst: corpus iterable handed to ``Corpus.fit``.
    :param meta_data: optional dict to update in place; a fresh dict is used
        when omitted. The ``"models"`` and ``"most_similar"`` sub-dicts are
        created if missing.
    :return: ``meta_data`` with training parameters, durations and similarity
        results filled in.
    """
    # A fresh dict per call: a shared mutable default would leak between calls.
    if meta_data is None:
        meta_data = {}
    start_total = datetime.now()
    meta_data["glove_params"] = settings.GLOVE_PARAMS
    # Both sub-dicts are indexed below, so make sure they exist.
    meta_data.setdefault("models", {})
    meta_data.setdefault("most_similar", {})

    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)
    for params in glove_paramgrid:
        start = datetime.now()

        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format(
            settings.DATASET, params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new coocurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)

        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])
        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)

        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(
            settings.DATASET, params["window"], params["lr"],
            params["epochs"], params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))

        duration = (datetime.now() - start).total_seconds()
        # The params dict itself is stored, so the duration is added to it and
        # also shows up in the log line below.
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration
        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name, duration, params))

        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}

            logging.info("Querying model {} for {} most similar to \'{}\':".format(
                model_name, settings.N_TESTSIM, test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim
            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration
    return meta_data
get_data = read_corpus corpus_cooc = Corpus() corpus_cooc.fit(get_data(args.create), window=10) corpus_cooc.save('corpus.model') print('Dict size: %s' % len(corpus_cooc.dictionary)) print('Collocations: %s' % corpus_cooc.matrix.nnz) if args.train: # Train the GloVe model and save it to disk. if not args.create: # Try to load a corpus from disk. print('Reading corpus statistics') corpus_cooc = Corpus.load('corpus.model') print('Dict size: %s' % len(corpus_cooc.dictionary)) print('Collocations: %s' % corpus_cooc.matrix.nnz) print('Training the GloVe model') glove = Glove(no_components=100, learning_rate=0.05) glove.fit(corpus_cooc.matrix, epochs=int(args.train), no_threads=args.parallelism, verbose=True) glove.add_dictionary(corpus_cooc.dictionary) glove.save('glove.model') if args.query: # Finally, query the model for most similar words.
def build_model():
    """Command-line entry point: pre-process a corpus, train GloVe, query it.

    The three stages are independent and selected by flags:

    * ``--create FILE`` builds the co-occurrence corpus from ``FILE`` and
      saves it as ``corpus.model``.
    * ``--train N`` trains for ``N`` epochs (loading ``corpus.model`` when
      ``--create`` was not given) and saves ``glove.model``.
    * ``--query WORD`` prints the 10 most similar words (loading
      ``glove.model`` when ``--train`` was not given).
    """
    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Fit a GloVe model.')
    parser.add_argument('--create', '-c', action='store', default=None,
                        help=('The filename of the corpus to pre-process. '
                              'The pre-processed corpus will be saved '
                              'and will be ready for training.'))
    # type=int: command-line values arrive as strings otherwise.
    parser.add_argument('--train', '-t', action='store', default=0, type=int,
                        help=('Train the GloVe model with this number of epochs. '
                              'If not supplied, '
                              'we\'ll attempt to load a trained model'))
    parser.add_argument('--parallelism', '-p', action='store', default=1, type=int,
                        help='Number of parallel threads to use for training')
    parser.add_argument('--query', '-q', action='store', default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')
        data = read_data(args.create)
        corpus_model = Corpus()
        corpus_model.fit(data, window=10)
        corpus_model.save('corpus.model')
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.
        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')
            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')
        glove = Glove(no_components=50, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=args.train,
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')
        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', default=25, type=int)
args = parser.parse_args()

DATA = 'data/articles.csv'
PATH = 'custom'
CORPUS_PATH = os.path.join(PATH, 'corpus.pkl')
SEED = 34985734958
epochs = args.epochs

# Create the output directory without a separate check-then-create step.
os.makedirs(PATH, exist_ok=True)

if os.path.exists(CORPUS_PATH):
    print('Found existing corpus.')
    corpus = Corpus.load(CORPUS_PATH)
else:
    print('Could not find existing corpus. Creating new one.')

    class Iterable:
        """Re-iterable view over one DataFrame column, preprocessing each entry."""

        def __init__(self, df, col='text'):
            self.df = df
            self.col = col

        def __iter__(self):
            # Articles are preprocessed lazily, one at a time.
            for article in self.df[self.col].values:
                yield preprocess(article)

    corpus = Corpus()
    start = time.time()
    corpus.fit(Iterable(pd.read_csv(DATA)))
texts = []
classes = []
# Column 3 holds the message text and column 0 the class label.
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages.
text_lengths = [len(x) for x in texts]
max_sentence_length = int(np.mean(text_lengths) + (norm.ppf(0.95) * np.std(text_lengths)))
print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    # Best-effort cache: any load failure falls back to training, but
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    # Same best-effort fallback as for the corpus cache.
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")
get_data = read_corpus corpus_model = Corpus() corpus_model.fit(get_data(args.create), window=10) corpus_model.save('corpus.model', 'corpus.pmi') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) if args.train: # Train the GloVe model and save it to disk. if not args.create: # Try to load a corpus from disk. print('Reading corpus statistics') corpus_model = Corpus.load('corpus.model') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) # import pdb; pdb.set_trace() print('Training the GloVe model') glove = Glove(no_components=100, learning_rate=0.05) glove.fit(corpus_model.matrix, epochs=int(args.train), no_threads=args.parallelism, verbose=True) glove.add_dictionary(corpus_model.dictionary) glove.save('glove.model')
print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) glove = Glove(no_components=300, learning_rate=0.05) glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True) glove.add_dictionary(corpus_model.dictionary) glove.save('%s/Embedding/glove/glove.model' % (folder)) # 存模型 corpus_model.save('%s/Embedding/glove/corpus.model' % (folder)) # 存字典 #透過gensim以text_data建立字典 dictionary = corpora.Dictionary(embedding_corpus) dictionary.save('%s/Embedding/glove/dictionary.gensim' % (folder)) glove = Glove.load('%s/Embedding/glove/glove.model' % (folder)) corpus_model = Corpus.load('%s/Embedding/glove/corpus.model' % (folder)) dictionary = gensim.corpora.Dictionary.load( '%s/Embedding/glove/dictionary.gensim' % (folder)) # write vocab to file vocab_file = "%s/Embedding/glove/word.vocab" % (folder) if not os.path.exists(vocab_file): # vocab_count = len(glove.dictionary) vocab_count = 0 print("Writing vocab file...") with open(vocab_file, 'w', encoding='utf-8') as writer: for word, idx in glove.dictionary.items(): try: word_id = dictionary.token2id[word] word_freq = dictionary.dfs[word_id] if word_freq < 2: continue
corpus_model.fit(sentences, window=10) #corpus_model.save('corpus.model') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) glove = Glove(no_components=300, learning_rate=0.05) glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True) glove.add_dictionary(corpus_model.dictionary) glove.save('Embedding/main_cat/glove/glove.model') # 存模型 corpus_model.save('Embedding/main_cat/glove/corpus.model') # 存字典 glove = Glove.load('Embedding/main_cat/glove/glove.model') corpus_model = Corpus.load('Embedding/main_cat/glove/corpus.model') # In[ ]: vocab_file = "Embedding/main_cat/glove/word.vocab" if not os.path.exists(vocab_file): # vocab_count = len(glove.dictionary) vocab_count = 0 print("Writing vocab file...") with open(vocab_file, 'w',encoding='utf-8') as writer: for word,idx in glove.dictionary.items(): if word in vocab._word_to_id.keys(): vocab_count += 1
#-*- coding:utf-8 -*- ''' Created on 2016-3-12 @author: dannl ''' from glove import Glove from glove import Corpus import time cooc_file='/home/dannl/tmp/newstech/glove/word.cooc' model_file='/home/dannl/tmp/newstech/glove/glove.model' oldtime=time.time() # get a cooccurrence matrix corpus_cooc = Corpus.load(cooc_file) # get a model glove = Glove(no_components=100, learning_rate=0.05) glove.fit(corpus_cooc.matrix, epochs=5,no_threads=4, verbose=True) glove.add_dictionary(corpus_cooc.dictionary) glove.save(model_file) # count=0 # for word,wid in corpus_cooc.dictionary.items(): # count+=1 # if count>100: # break # print word,wid print('Dict size: %s' % len(corpus_cooc.dictionary))
parser = argparse.ArgumentParser(description='Related artists PCA demo') parser.add_argument('QUERY', action='store', default='', help='Demo PCA using this artist') parser.add_argument('--corpus', '-c', default=CORPUS_FILE, help='Specify corpus file to read') parser.add_argument('--glove', '-g', default=GLOVE_MODEL_FILE, help='Specify glove model file to read') args = parser.parse_args() CORPUS_FILE = args.corpus GLOVE_MODEL_FILE = args.glove if not os.path.exists(RESULT_DIR): os.mkdir(RESULT_DIR) # MAIN if os.path.exists(CORPUS_FILE): print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE))) corpus = Corpus.load(CORPUS_FILE) else: print('[{}] Error reading corpus file.'.format(chalk.red(CORPUS_FILE))) quit(0) if os.path.exists(GLOVE_MODEL_FILE): print('[{}] Reading glove model from file...'.format(chalk.yellow(GLOVE_MODEL_FILE))) glove = Glove.load(GLOVE_MODEL_FILE) else: print('[{}] Error reading glove file.'.format(chalk.red(GLOVE_MODEL_FILE))) quit(0) matrix = glove.word_vectors dictionary = glove.dictionary if args.QUERY not in dictionary:
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
#corpus_model.save('corpus.model')
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

glove.save('Embedding/category/glove/glove.model')  # save the model
corpus_model.save('Embedding/category/glove/corpus.model')  # save the dictionary

# Read both artefacts back from disk.
glove = Glove.load('Embedding/category/glove/glove.model')
corpus_model = Corpus.load('Embedding/category/glove/corpus.model')

# In[56]:

vocab_file = "Embedding/category/glove/word.vocab"
# The vocab file is only written when it does not exist yet.
if not os.path.exists(vocab_file):
    # vocab_count = len(glove.dictionary)
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for word, idx in glove.dictionary.items():
            # Keep only words known to the external vocabulary; one
            # "<word> <index>" line is written per kept word.
            if word in vocab._word_to_id:
                vocab_count += 1
                writer.write(word + ' ' + str(idx) + '\n')
    # Output vocab count
def load_corpus_from_model(self):
    """Load the saved co-occurrence corpus into ``self.corpus_model``.

    The path comes from ``args.corpus_model_path`` (``args`` is resolved from
    the enclosing scope, not from ``self``). The dictionary size and the
    number of non-zero co-occurrence entries are printed afterwards.
    """
    print('Reading corpus statistics...')
    self.corpus_model = Corpus.load(args.corpus_model_path)

    dict_size = len(self.corpus_model.dictionary)
    print('Dict size: %s' % dict_size)
    collocations = self.corpus_model.matrix.nnz
    print('Collocations: %s' % collocations)
get_data = read_corpus corpus_model = Corpus() corpus_model.fit(get_data(args.create), window=10) corpus_model.save('corpus.model') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) if args.train: # Train the GloVe model and save it to disk. if not args.create: # Try to load a corpus from disk. print('Reading corpus statistics') corpus_model = Corpus.load('corpus.model') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) print('Training the GloVe model') glove = Glove(no_components=100, learning_rate=0.05) glove.fit(corpus_model.matrix, epochs=int(args.train), no_threads=args.parallelism, verbose=True) glove.add_dictionary(corpus_model.dictionary) glove.save('glove.model') if args.query: # Finally, query the model for most similar words.
from glove import Glove, Corpus

inputFile = "/media/charles/data/nlp/zzz1000"
corpusModelFile = "/media/charles/data/nlp/corpus_wiki.model"
outputFile = "/media/charles/data/nlp/glove_wiki.model"
epochs = 10
nb_threads = 4


def get_text(fin):
    """Yield each line of the file ``fin`` as a list of space-separated tokens.

    Only the trailing newline is removed, so a final line without a newline
    keeps its last character. The file is closed once iteration ends.
    """
    with open(fin) as f:
        for line in f:
            yield line.rstrip('\n').split(' ')


# Building the co-occurrence corpus is disabled; the saved one is loaded below.
#corpus_model = Corpus()
#print("computing coocurrence matrix...")
#corpus_model.fit(get_text(inputFile), window=10)
#print("saving coocurrence matrix...")
#corpus_model.save(corpusModelFile)

corpus_model = Corpus.load(corpusModelFile)
print("fitting model...")
glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=epochs, no_threads=nb_threads, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
print("saving model to "+outputFile+" ...")
glove.save(outputFile)
from __future__ import print_function
import argparse
import pprint
import gensim
from glove import Glove
from glove import Corpus

# Script entry point: train GloVe vectors from the saved 'corpus.model' and
# write them to 'glove.model'.
if __name__ == '__main__':
    # Load the saved co-occurrence corpus and report its size.
    print('Reading corpus statistics')
    corpus_model = Corpus.load('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    # Fit 100-dimensional vectors for 10 epochs; no thread count is passed.
    print('Training the GloVe model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, verbose=True)
    # Attach the word -> id mapping before saving the model.
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')
    print('Training finished')
def load(glove_corpus_filename, d, p=None):
    """Build an ``SVDModelFromGloVeCorpus`` from a saved GloVe corpus file.

    The stored co-occurrence matrix is densified and added to its transpose
    before being handed to the model, together with the corpus dictionary.

    :param glove_corpus_filename: path of the saved ``Corpus``.
    :param d: forwarded unchanged to ``SVDModelFromGloVeCorpus``.
    :param p: forwarded unchanged to ``SVDModelFromGloVeCorpus``.
    :return: the constructed ``SVDModelFromGloVeCorpus``.
    """
    corpus_model = Corpus.load(glove_corpus_filename)
    # An upper triangular matrix with diagonal values of zero.
    upper = corpus_model.matrix.todense()
    # Mirror it across the diagonal to obtain a symmetric matrix.
    symmetric = upper + upper.T
    vocabulary = corpus_model.dictionary
    return SVDModelFromGloVeCorpus(np.asarray(symmetric), vocabulary, d, p)