def build_model_glove(args):
    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Example: train_glove(target_group='words', glove_para=glove_para_word)
    After saving the model, you can load it with:
        glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size': 4, 'no_components': 300,
                       'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
    :param src_file: path to the source corpus file
    :param save_model_name: filename under which to save the trained model
    :return: None; the trained model is saved to save_model_name
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file, words_or_chars=target_group),
                     window=glove_para['window_size'])  # average sentence length is ~6 words
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'], verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(save_model_name)
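# A minimal usage sketch for train_glove above, assuming read_corpus and the
# glove imports are in scope; the corpus path is hypothetical:
glove_para_word = {'window_size': 4, 'no_components': 300,
                   'learning_rate': 0.05, 'no_epochs': 2, 'parallelism': 4}
train_glove(target_group='words', glove_para=glove_para_word,
            src_file='corpus.txt', save_model_name='glove_words.model')
glove_ana = Glove.load('glove_words.model')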
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    dictionary = load_glove_dictionary(exp_id, save_dir)

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if os.path.exists(dict_path):
        corpus_model = Corpus.load(dict_path)
    else:
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus, window=params['window'] * 2, ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix, epochs=50, no_threads=params['workers'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
class GloVeFilter(object):
    def __init__(self):
        # Corpus model
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        print('Reading corpus statistics...')
        # texts = [self.pp.preprocessing(l.strip().decode("utf8", "ignore")) for l in open(args.data_path)]
        texts = [l.strip().decode("utf8", "ignore").split(" ")
                 for l in open(args.data_path)]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix, epochs=args.epochs, verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
def train_glove_fashionrec(dimensionality, context, epochs):
    """Train GloVe on the IG corpora."""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(epochs), no_threads=8, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + \
        str(context) + " context, " + str(epochs) + " epochs \n" + \
        "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + \
        "_d" + str(dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
def generate_glove_corpus():
    global article_info_path, output_path

    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)
        if (sentence_header is None) or (sentence_body is None):
            continue
        words = []
        # for sentence in sentence_header + sentence_body:
        for sentence in sentence_header:
            for word in sentence.split(' '):
                words.append(word)
        sentences.append(words)
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    corpus.save(output_path)
def glove_single(domain_name):
    corpus_model = Corpus()
    corpus_model.fit(labeled_reviews(domain_name), window=10)
    corpus_model.save('../work/%s/corpus.model' % domain_name)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')
    model = Glove(no_components=300, learning_rate=0.05)
    model.fit(corpus_model.matrix, epochs=10, no_threads=6, verbose=True)
    model.add_dictionary(corpus_model.dictionary)
    model.save('../work/%s/glove.model' % domain_name)
    return
def train_glove(sequence_file_path, output_folder, no_components, epochs):
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(sequence_file_path), window=10)
    corpus_model.save(get_corpus_model_path(output_folder))
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    glove = Glove(no_components=int(no_components), learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=int(epochs), no_threads=50, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(get_glove_model_path(output_folder))
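# A minimal invocation sketch for train_glove above; the paths are hypothetical,
# and read_corpus / get_glove_model_path are assumed to be defined in this module:
train_glove('data/sequences.txt', 'output', no_components=128, epochs=30)
glove = Glove.load(get_glove_model_path('output'))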
def create_glove_corpus():
    corpus_path = '%s/Embedding/glove/glove.corpus' % (folder)

    if not os.path.exists(corpus_path):
        corpus = Corpus()
        # Build the glove Corpus object and set the co-occurrence matrix scan window size.
        corpus.fit(embedding_corpus, window=10)
        print('Dict size: %s' % len(corpus.dictionary))
        print('Collocations: %s' % corpus.matrix.nnz)
        corpus.save('%s/Embedding/glove/glove.corpus' % (folder))  # save the dictionary
    else:
        corpus = Corpus.load('%s/Embedding/glove/glove.corpus' % (folder))
        print('Loaded existing glove corpus')

    return corpus
def get_glove_corpus_model(setting):
    if not force_gen and os.path.isfile("models/" + setting_string(**setting) +
                                        "__glove_corpus_model"):
        return Corpus.load("models/" + setting_string(**setting) + "__glove_corpus_model")
    else:
        token2index_map = json.load(open("derived_data/" + setting_string(**setting) +
                                         "__processed_token2index_map.json"))
        if setting['granularity'] == 'documents':
            item_generator = get_all_documents_as_token_list(setting['token_method'],
                                                             setting['data_basis'])
        elif setting['granularity'] == 'paragraphs':
            item_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'],
                                                                   setting['data_basis'])
        else:
            raise ValueError("Unknown granularity: {}".format(setting['granularity']))

        corpus = (filter(lambda token: token in token2index_map, doc[1])
                  for doc in item_generator)
        corpus_model = Corpus(dictionary=token2index_map)
        corpus_model.fit(corpus)
        corpus_model.save("models/" + setting_string(**setting) + "__glove_corpus_model")
        return corpus_model
def prepare_corpus(args):
    logging.info('Preparing corpus')
    word_counts = Counter()
    for tokens in map(str.split, open(args.data_path)):
        word_counts.update(tokens)
    logging.info('Counted {} unique words.'.format(len(word_counts)))
    logging.info('Truncating vocabulary at min_count {}, max_tokens {}'.format(
        args.min_count, args.max_tokens))

    tokens = {token for token, count in word_counts.most_common(args.max_tokens)
              if count >= args.min_count}
    dictionary = {token: i for i, token in enumerate(tokens)}
    logging.info('Using vocabulary of size {}'.format(len(dictionary)))

    corpus = Corpus(dictionary)
    logging.info('Counting co-occurrences. Window size {}'.format(args.window))
    corpus.fit(map(str.split, open(args.data_path)), window=args.window,
               ignore_missing=True)
    corpus.save(args.co_path)
    return corpus
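# A minimal sketch of driving prepare_corpus above; the attribute names mirror
# what the function reads, and the paths/thresholds are hypothetical:
from argparse import Namespace

corpus = prepare_corpus(Namespace(data_path='data/corpus.txt',
                                  min_count=5, max_tokens=100000,
                                  window=10, co_path='data/cooc.model'))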
len_train = train.shape[0]

qs = []
ts = []
ds = []
sentences = []
for q, t in zip(data_all['question1'].values.tolist(),
                data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')
corpus_model = Corpus.load(path + 'corpus.mdl')

glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
print(glove)

qt_sims_dists = []
qt_diff = []


def calc_cosine_dist(text_a, text_b, metric='euclidean'):
def train_glove(inst, meta_data=None):
    # Avoid a shared mutable default argument and make sure the expected keys exist.
    if meta_data is None:
        meta_data = {}
    meta_data.setdefault("models", {})
    meta_data.setdefault("most_similar", {})

    start_total = datetime.now()
    meta_data["glove_params"] = settings.GLOVE_PARAMS
    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)
    for params in glove_paramgrid:
        start = datetime.now()
        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP,
                                 '{}_window{}.glovecorpus'.format(settings.DATASET,
                                                                  params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new cooccurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)
        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])
        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)
        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(
            settings.DATASET, params["window"], params["lr"],
            params["epochs"], params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))
        duration = (datetime.now() - start).total_seconds()
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration
        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name, duration, params))

        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}
            logging.info("Querying model {} for {} most similar to '{}':".format(
                model_name, settings.N_TESTSIM, test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim
            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration
    return meta_data
def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
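# Hypothetical entry point for the snippet above; itertexts is assumed to be
# defined elsewhere in the same module.
if __name__ == '__main__':
    main()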
args = parser.parse_args()

if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_cooc = Corpus()
    corpus_cooc.fit(get_data(args.create), window=10)
    corpus_cooc.save('corpus.model')

    print('Dict size: %s' % len(corpus_cooc.dictionary))
    print('Collocations: %s' % corpus_cooc.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_cooc = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_cooc.dictionary))
        print('Collocations: %s' % corpus_cooc.matrix.nnz)
'''
from glove import Glove
from glove import Corpus
from gensim import corpora
import time

dic_file = r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file = '/home/dannl/tmp/newstech/news.txt'
cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'


def read_corpus(filename):
    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.split()[1:]


# get a cooccurrence matrix
oldtime = time.time()
dictionary = corpora.Dictionary.load(dic_file)

# corpus_cooc = Corpus()
# corpus_cooc.fit(read_corpus(corpus_file), window=10)
corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10, ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)
print('time cost:%.2f' % (time.time() - oldtime,))
                    action='store',
                    type=int,
                    default=10,
                    help='The length of the (symmetric) context window used for co-occurrence.')
parser.add_argument('--max_count', '-m',
                    action='store',
                    type=int,
                    default=100,
                    help='The max co-occurrence count.')
args = parser.parse_args()

print('Pre-processing corpus')
corpus_model = Corpus()
corpus_model.fit(read_corpus(args.corpus), window=args.window)
corpus_model.save('%s.corpus.model' % args.out)
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

print('Training the GloVe model')
glove = Glove(no_components=args.components,
              learning_rate=args.learning_rate,
              max_count=args.max_count)
glove.fit(corpus_model.matrix, epochs=int(args.train),
          no_threads=args.parallelism, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save('%s.glove.model' % args.out)
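# Note: max_count above corresponds to the x_max cap in the GloVe weighting
# function, f(x) = (x / x_max)**alpha for x < x_max and 1 otherwise, so any
# co-occurrence count at or above max_count receives full weight during training.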
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for the 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts]) +
                          (norm.ppf(0.95) * np.std([len(x) for x in texts])))
print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")
X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual)
                    help='Get closest words to this word.')
args = parser.parse_args()

if args.create:
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    if args.wiki:
        print('Using wikipedia corpus')
        get_data = read_wikipedia_corpus
    else:
        get_data = read_corpus

    corpus_model = Corpus()
    corpus_model.fit(get_data(args.create), window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

if args.train:
    # Train the GloVe model and save it to disk.
    if not args.create:
        # Try to load a corpus from disk.
        print('Reading corpus statistics')
        corpus_model = Corpus.load('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)
# importing the glove library
from glove import Corpus, Glove
import pandas as pd
from tqdm import tqdm

pruned_tagset = pd.read_csv("termstr_all.csv", index_col=0)
pruned_tagset = pruned_tagset[pruned_tagset['termstr'].notnull()]
tqdm.pandas(desc="split tagset string")
pruned_tagset = list(
    pruned_tagset['termstr'].progress_apply(lambda x: x.split(';')))

# creating a corpus object
corpus = Corpus()
# training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(pruned_tagset, window=3)
corpus.save('corpus.model')

# creating a Glove object which will use the matrix created in the above lines
# to create embeddings. We can set the learning rate (it uses gradient descent)
# and the number of components.
glove = Glove(no_components=150, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
# print(glove.dictionary)

termvec = glove.word_vectors
termdic = glove.dictionary

temp1 = glove.most_similar('rock', number=10)
print(temp1)

import pickle
if not os.path.exists("Embedding/main_cat/glove/glove.model"): corpus_model = Corpus() corpus_model.fit(sentences, window=10) #corpus_model.save('corpus.model') print('Dict size: %s' % len(corpus_model.dictionary)) print('Collocations: %s' % corpus_model.matrix.nnz) glove = Glove(no_components=300, learning_rate=0.05) glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True) glove.add_dictionary(corpus_model.dictionary) glove.save('Embedding/main_cat/glove/glove.model') # 存模型 corpus_model.save('Embedding/main_cat/glove/corpus.model') # 存字典 glove = Glove.load('Embedding/main_cat/glove/glove.model') corpus_model = Corpus.load('Embedding/main_cat/glove/corpus.model') # In[ ]: vocab_file = "Embedding/main_cat/glove/word.vocab" if not os.path.exists(vocab_file): # vocab_count = len(glove.dictionary) vocab_count = 0 print("Writing vocab file...")
""" Created on Fri Sep 14 12:45:30 2018 @author: charlie """ import itertools from gensim.models.word2vec import Text8Corpus from glove import Corpus, Glove import os cur_dir = os.getcwd() glove_fname = '/glove.model' corpus_fname = "/corpus.model" if os.path.exists(cur_dir + glove_fname): glove = Glove.load(cur_dir+glove_fname) # corpus = Corpus.load(cur_dir+corpus_fname) else: sentences = list(itertools.islice(Text8Corpus('text/text8'), None)) corpus = Corpus() corpus.fit(sentences, window = 10) glove = Glove(no_components=100, learning_rate = 0.05) glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True) glove.add_dictionary(corpus.dictionary) glove.save(cur_dir + glove_fname) corpus.save(cur_dir+corpus_fname) glove.most_similar('men') # Parameters are hashable string not list glove.word_vectors[glove.dictionary['perfect']]
    print('Could not find existing corpus. Creating new one.')

    class Iterable:
        def __init__(self, df, col='text'):
            self.df = df
            self.col = col

        def __iter__(self):
            for article in self.df[self.col].values:
                yield preprocess(article)

    corpus = Corpus()
    start = time.time()
    corpus.fit(Iterable(pd.read_csv(DATA)))
    print(f'finished co_occur in {int(time.time() - start)} seconds.')
    corpus.save(CORPUS_PATH)


def train_dim(size):
    """Trains and saves a SIZE-dimensional glove embedding."""
    # Pass the seed itself: random.seed(SEED) returns None, so passing its
    # result would leave random_state unset.
    glove = Glove(no_components=size, random_state=SEED)
    start = time.time()
    glove.fit(corpus.matrix, epochs=epochs, no_threads=12, verbose=True)
    print(f'finished {size}d vectors in {(time.time() - start)/60:.2f} minutes.')
    with open(f'{PATH}/custom.{size}d.txt', 'w') as f:
        for word, i in corpus.dictionary.items():
            word += ' '
def build_model():
    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Fit a GloVe model.')
    parser.add_argument('--create', '-c', action='store',
                        default=None,
                        help=('The filename of the corpus to pre-process. '
                              'The pre-processed corpus will be saved '
                              'and will be ready for training.'))
    parser.add_argument('--train', '-t', action='store',
                        default=0,
                        help=('Train the GloVe model with this number of epochs. '
                              'If not supplied, '
                              'we\'ll attempt to load a trained model.'))
    parser.add_argument('--parallelism', '-p', action='store',
                        default=1,
                        help='Number of parallel threads to use for training.')
    parser.add_argument('--query', '-q', action='store',
                        default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')
        data = read_data(args.create)
        corpus_model = Corpus()
        corpus_model.fit(data, window=10)
        corpus_model.save('corpus.model')
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.
        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')
            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')
        glove = Glove(no_components=50, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=int(args.train),
                  no_threads=int(args.parallelism), verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')
        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
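# A hypothetical invocation of build_model() above (the script name is made up;
# corpus.txt is whatever tokenised corpus read_data expects):
#
#   python glove_example.py --create corpus.txt --train 10 --parallelism 4 --query queen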
if not os.path.exists('%s/Embedding/glove' % (folder)):
    os.makedirs('%s/Embedding/glove' % (folder))

if not os.path.exists("%s/Embedding/glove/glove.model" % (folder)):
    corpus_model = Corpus()
    corpus_model.fit(embedding_corpus, window=10)
    # corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('%s/Embedding/glove/glove.model' % (folder))  # save the model
    corpus_model.save('%s/Embedding/glove/corpus.model' % (folder))  # save the dictionary
    # Build a dictionary from text_data via gensim.
    dictionary = corpora.Dictionary(embedding_corpus)
    dictionary.save('%s/Embedding/glove/dictionary.gensim' % (folder))

glove = Glove.load('%s/Embedding/glove/glove.model' % (folder))
corpus_model = Corpus.load('%s/Embedding/glove/corpus.model' % (folder))
dictionary = gensim.corpora.Dictionary.load(
    '%s/Embedding/glove/dictionary.gensim' % (folder))

# write vocab to file
vocab_file = "%s/Embedding/glove/word.vocab" % (folder)
if not os.path.exists(vocab_file):
    # vocab_count = len(glove.dictionary)
    vocab_count = 0
# In[55]:

if not os.path.exists("Embedding/category/glove/glove.model"):
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    # corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('Embedding/category/glove/glove.model')  # save the model
    corpus_model.save('Embedding/category/glove/corpus.model')  # save the dictionary

glove = Glove.load('Embedding/category/glove/glove.model')
corpus_model = Corpus.load('Embedding/category/glove/corpus.model')


# In[56]:

vocab_file = "Embedding/category/glove/word.vocab"
if not os.path.exists(vocab_file):
    # vocab_count = len(glove.dictionary)
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for word, idx in glove.dictionary.items():
            if word in vocab._word_to_id.keys():
GLOVE_MODEL_FILE = args.glove

if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)

# MAIN
if os.path.exists(CORPUS_FILE):
    print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE)))
    corpus = Corpus.load(CORPUS_FILE)
else:
    nx_G = util.get_nx_graph()
    walks = util.get_node2vec_walks(nx_G)
    corpus = Corpus()
    corpus.fit(walks, window=WINDOW_SIZE)
    print('[{}] Writing corpus file...'.format(chalk.green(CORPUS_FILE)))
    corpus.save(CORPUS_FILE)

if os.path.exists(GLOVE_MODEL_FILE) and not args.train:
    print('[{}] Reading glove model from file...'.format(
        chalk.yellow(GLOVE_MODEL_FILE)))
    glove = Glove.load(GLOVE_MODEL_FILE)
else:
    glove = Glove(no_components=VECTOR_DIMENSION, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=GLOVE_EPOCHS,
              no_threads=PARALLEL_WORKER_COUNT, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('[{}] Writing glove file...'.format(chalk.green(GLOVE_MODEL_FILE)))
    glove.save(GLOVE_MODEL_FILE)

if args.query:
    return sents_token


if __name__ == '__main__':
    # Set up parameters.
    train = 1
    parallelism = 1
    query = 'brave'

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(), window=10)
    corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    print('Training the GloVe model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=train,
              no_threads=parallelism, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')
            text = text.replace('Trump', 'Trump_Pre_Election')
        elif date and date >= ELECTION_DATE:
            text = text.replace('Trump', 'Trump_Post_Election')
        text = text.replace("\xa0", " ").replace('“', '"').replace('”', '"')
        sents = sent_tokenize(text)
        for sent in sents:
            yield self.tokenizer.tokenize(sent)


dirname = os.path.expanduser('./output/articles')
sentences = SentencesIterator(dirname)

print('Building Corpus...')
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(OUTPUT_DIR + 'corpus.model')
print('Built and saved!')
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

print('Training the GloVe model')
glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=25, no_threads=10, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(OUTPUT_DIR + 'glove.model')

# model = gensim.models.Word2Vec(sentences, size=300, min_count=5, iter=10, workers=10, sg=1)
# model.save('./vectors/trump_preprocess_skipgram/w2v_foxnews')