def test_load_dataset(self):
    dataset_path = os.path.join(
        base_dir, "__testing_matrix-synopsis", "__testing_matrix-synopsis.gz")
    if os.path.isdir(base_dir):
        shutil.rmtree(base_dir)
    self.assertEqual(api.load("__testing_matrix-synopsis", return_path=True), dataset_path)
    shutil.rmtree(base_dir)
    self.assertEqual(len(list(api.load("__testing_matrix-synopsis"))), 1)
    shutil.rmtree(base_dir)
def test_multipart_load(self):
    dataset_path = os.path.join(
        base_dir, '__testing_multipart-matrix-synopsis',
        '__testing_multipart-matrix-synopsis.gz'
    )
    if os.path.isdir(base_dir):
        shutil.rmtree(base_dir)
    self.assertEqual(dataset_path, api.load("__testing_multipart-matrix-synopsis", return_path=True))
    shutil.rmtree(base_dir)
    dataset = api.load("__testing_multipart-matrix-synopsis")
    self.assertEqual(len(list(dataset)), 1)
def main(clues_path: str):
    count = Counter()
    total = 0
    multi_word = 0
    # model = gensim_models.load("glove-wiki-gigaword-100")
    model = gensim_models.load("word2vec-google-news-300")
    for clue_json in clues_iterator(clues_path):
        if clue_json["separatorLocations"]:
            multi_word += 1
            continue
        clue = clue_json["clue"]
        length = int(clue_json["length"])
        solution = clue_json["solution"]
        results = process_clue(clue, length, solution, model)
        count.update(results)
        total += 1
    print(total)
    for k, v in count.items():
        print(k, ":\t", v / total)
    print("multi word: ", multi_word / (multi_word + total))
def testMallet2ModelOn20NewsGroups(self):
    corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
    dictionary = Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    lda_mallet_model = ldamallet.LdaMallet(
        self.mallet_path, corpus=corpus,
        num_topics=20, id2word=dictionary, iterations=500)
    lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
    self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
def test_load_model(self):
    if os.path.isdir(base_dir):
        shutil.rmtree(base_dir)
    vector_dead = np.array([
        0.17403787, -0.10167074, -0.00950371, -0.10367849, -0.14034484,
        -0.08751217, 0.10030612, 0.07677923, -0.32563496, 0.01929072,
        0.20521086, -0.1617067, 0.00475458, 0.21956187, -0.08783089,
        -0.05937332, 0.26528183, -0.06771874, -0.12369668, 0.12020949,
        0.28731, 0.36735833, 0.28051138, -0.10407482, 0.2496888,
        -0.19372769, -0.28719661, 0.11989869, -0.00393865, -0.2431484,
        0.02725661, -0.20421691, 0.0328669, -0.26947051, -0.08068217,
        -0.10245913, 0.1170633, 0.16583319, 0.1183883, -0.11217165,
        0.1261425, -0.0319365, -0.15787181, 0.03753783, 0.14748634,
        0.00414471, -0.02296237, 0.18336892, -0.23840059, 0.17924534
    ])
    dataset_path = os.path.join(
        base_dir, "__testing_word2vec-matrix-synopsis",
        "__testing_word2vec-matrix-synopsis.gz"
    )
    model = api.load("__testing_word2vec-matrix-synopsis")
    vector_dead_calc = model["dead"]
    self.assertTrue(np.allclose(vector_dead, vector_dead_calc))
    shutil.rmtree(base_dir)
    self.assertEqual(api.load("__testing_word2vec-matrix-synopsis", return_path=True), dataset_path)
    shutil.rmtree(base_dir)
man_df.Description = man_df.Description.apply(lambda x: remove_empty(x))
man_df.Description = man_df.Description.apply(
    lambda x: list(map(remove_stopwords, x)))
man_df.Description = man_df.Description.apply(lambda x: flatten(x))
man_df.Description = man_df.Description.apply(lambda x: remove_non_english(x))
# man_df.Description = man_df.Description.apply(lambda x: reduce_lemma(x))

# Dictionary with the Code and Description -> Dictionary<Code, Description>
man_desc_dict = build_dict(man_df)

# No need to recompute the CSV unless something changed; loading the model is also very time-consuming
if not os.path.exists(MATCH_DIR + METHOD):
    print("Downloading model")
    # Pretrained Word2Vec model
    model = downloader.load('word2vec-google-news-300')
    # L2-normalize the vectors in place
    model.init_sims(replace=True)
    sims_dict = {}
    print("Computing distance")
    for cap_code, cap_desc in cap_desc_dict.items():
        # List of tuples -> (MAN_CODE, DISTANCE)
        sims = []
        for man_code, man_desc in man_desc_dict.items():
#!/usr/bin/python3
'''
This file calculates pagerank vectors for small-scale webgraphs.
'''

import math
import gzip
import csv
import logging
from collections import defaultdict

import torch
import gensim.downloader as api

model = api.load("glove-twitter-25")


class WebGraph():

    def __init__(self, filename, max_nnz=None, filter_ratio=None):
        self.url_dict = {}
        indices = []
        target_counts = defaultdict(lambda: 0)

        # loop through filename to extract the indices
        logging.debug('computing indices')
        with gzip.open(filename, newline='', mode='rt') as f:
            for i, row in enumerate(csv.DictReader(f)):
                if max_nnz is not None and i > max_nnz:
                    break
def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(TaggedDocument(v, [label]))
    return labelized


X_train = labelizeTweets(X_train, 'TRAIN')
X_test = labelizeTweets(X_test, 'TEST')

# tweet_w2v = Word2Vec(size=n_dim, min_count=1)
# tweet_w2v.build_vocab([x.words for x in tqdm(X_train)])
# tweet_w2v.train([x.words for x in tqdm(X_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs)

# api.load returns a KeyedVectors object, so no .wv wrapper is needed
tweet_w2v = api.load("glove-twitter-200")
print("pre-trained vocab size: ", len(tweet_w2v.vocab))
# print(tweet_w2v.vocab)
# print("VIRUS EMBEDDING: ", tweet_w2v['virus'])
# print("Virus most similar words: ", tweet_w2v.most_similar('virus'))

#### Tweet embeddings
print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=3)
# matrix = vectorizer.fit_transform([x.words for x in X_train])
matrix = vectorizer.fit_transform(tweet_w2v.vocab)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))
import os

import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from gensim.models import KeyedVectors
import gensim.downloader as api

nltk.download('wordnet')

# fixes weird OMP mac bug
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

gensim_emb_name = "glove-wiki-gigaword-50"  # 'glove-twitter-25'
gensim_fn = './data/{}.vk'.format(gensim_emb_name)

# we only want to download this once
if os.path.exists(gensim_fn):
    # mmap makes this significantly faster (lazy loading)
    gensim_model = KeyedVectors.load(gensim_fn, mmap='r')
else:
    gensim_model = api.load(gensim_emb_name)
    gensim_model.save(gensim_fn)

gensim_embedder = Word2VecEmbedder(model=gensim_model)


def write_lexicon(words, fn):
    defs = []
    for word in words:
        synsets = wn.synsets(word)
        if len(synsets) > 0:
            word = synsets[0]
            defs.append(word.definition())
        else:
            defs.append(None)
    lexicon = pd.DataFrame({"Word": words, 'Definition': defs})
    lexicon.to_csv(fn, index=False)
def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    # ----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print(
            "Usage: word2vec-api.py --model path/to/the/model [--host host --port 1234]"
        )
    model = modelDownloader.load("word2vec-google-news-300")
    model.init_sims(replace=True)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(WMDistance, path + '/wmd')
    app.run(host=host, port=port)
def __init__(self, config, dataset):
    super(JOINTSRMFNEGS, self).__init__(config, dataset)

    # load dataset info
    self.LABEL = config['LABEL_FIELD']
    self.embedding_dim = config['embedding_dimension']
    self.alpha = config["alpha"]
    item_description_fields = config['item_description_fields']
    LM_neg_samples = config["LM_neg_samples"]
    self.logger.info(f"embedding_dimension = {self.embedding_dim}")
    self.logger.info(f"alpha = {self.alpha}")
    self.logger.info(f"item_description_fields = {item_description_fields}")
    self.logger.info(f"LM_neg_samples = {LM_neg_samples}")

    self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
    self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
    self.user_bias = nn.Parameter(torch.zeros(self.n_users))
    self.item_bias = nn.Parameter(torch.zeros(self.n_items))
    self.bias = nn.Parameter(torch.zeros(1))
    self.apply(self._init_weights)

    gensim_cache = open('gensim_cache_path', 'r').read().strip()
    os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
    # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
    # the size must be 50, the same as the embedding dimension
    pretrained_embedding_name = "glove-wiki-gigaword-50"
    model_path = api.load(pretrained_embedding_name, return_path=True)
    model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
    self.vocab_size = len(model.key_to_index)
    weights = torch.FloatTensor(model.vectors)  # formerly syn0, which is soon deprecated
    self.logger.info(f"pretrained_embedding shape: {weights.shape}")
    self.word_embedding = nn.Embedding.from_pretrained(weights, freeze=True)

    # THIS IS NOT POSSIBLE BECAUSE OF THE MEMORY SIZE!
    # noise_dist = {}  # This is the noise distribution!
    # self.lm_gt = torch.zeros((self.n_items, len(model.key_to_index)), device=self.device)
    # item_LM_file = os.path.join(dataset.dataset.dataset_path, f"{dataset.dataset.dataset_name}.item")
    # item_desc_fields = []
    # if "item_description" in item_description_fields:
    #     item_desc_fields.append(3)
    # if "item_genres" in item_description_fields:
    #     item_desc_fields.append(4)
    # # TODO other fields? e.g. review? have to write another piece of code
    # with open(item_LM_file, 'r') as infile:
    #     next(infile)
    #     for line in infile:
    #         split = line.split("\t")
    #         item_id = dataset.token2id("item_id", split[0])
    #         for fi in item_desc_fields:
    #             desc = split[fi]
    #             for term in desc.split():
    #                 if term in model.key_to_index:
    #                     wv_term_index = model.key_to_index[term]
    #                 else:
    #                     wv_term_index = model.key_to_index["unk"]
    #                 self.lm_gt[item_id][wv_term_index] += 1
    #                 if wv_term_index not in noise_dist:
    #                     noise_dist[wv_term_index] = 0
    #                 noise_dist[wv_term_index] += 1
    # self.logger.info(f"Done with lm_gt construction!")

    noise_dist = {}
    self.lm_gt_keys = [[] for i in range(self.n_items)]
    self.lm_gt_values = [[] for i in range(self.n_items)]
    item_LM_file = os.path.join(dataset.dataset.dataset_path,
                                f"{dataset.dataset.dataset_name}.item")
    item_desc_fields = []
    if "item_description" in item_description_fields:
        item_desc_fields.append(3)
    if "item_genres" in item_description_fields:
        item_desc_fields.append(4)
    # TODO other fields? e.g. review? have to write another piece of code
    with open(item_LM_file, 'r') as infile:
        next(infile)
        for line in infile:
            split = line.split("\t")
            item_id = dataset.token2id_exists("item_id", split[0])
            if item_id == -1:
                continue
            for fi in item_desc_fields:
                desc = split[fi]
                for term in desc.split():
                    if term in model.key_to_index:
                        wv_term_index = model.key_to_index[term]
                        if wv_term_index not in self.lm_gt_keys[item_id]:
                            self.lm_gt_keys[item_id].append(wv_term_index)
                            self.lm_gt_values[item_id].append(1)
                        else:
                            idx = self.lm_gt_keys[item_id].index(wv_term_index)
                            self.lm_gt_values[item_id][idx] += 1
                        if wv_term_index not in noise_dist:
                            noise_dist[wv_term_index] = 0
                        noise_dist[wv_term_index] += 1
    self.logger.info("Done with lm_gt construction!")

    # keys_sum = 0
    # zeros = 0
    # max_len = 0
    # for lm in self.lm_gt_keys:
    #     if len(lm) == 0:
    #         zeros += 1
    #     else:
    #         keys_sum += len(lm)
    #         if len(lm) > max_len:
    #             max_len = len(lm)
    # print(keys_sum)
    # print(zeros)
    # print(keys_sum / (self.n_items - zeros))
    # print(max_len)
    # print(len(max(self.lm_gt_keys)))
    # exit(1)

    self.sigmoid = nn.Sigmoid()
    self.loss_rec = nn.BCELoss()
    # noise distribution raised to the power of 3/4
    self.loss_lm = SoftCrossEntropyLossByNegSampling(
        LM_neg_samples, noise_dist, 0.75, self.device)
import gensim.downloader as api

print("downloading")
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
print("downloaded")

import numpy as np
import os


def get_vects(questions):
    list_of_vectors = []
    model = dict()
    # iterate through the questions and make them a list of words
    for i in range(len(questions)):
        # print(questions[i], "\n")  # print each question
        list_of_words = questions[i].split(" ")
        list_of_vectors = []
        # iterate through each word and get the vector
        if len(list_of_words) > 0:
            for j in list_of_words:
                # api.load returns KeyedVectors, so query it directly
                if j in fasttext_model300.vocab:
                    list_of_vectors.append(fasttext_model300.word_vec(j))
        two_dim = np.vstack(tuple(list_of_vectors))
        # get mean across cols
        """filename = "Documents/WT2/w2voutput.txt"
        if os.path.exists(filename):
            append = 'a'
        else:
            append = 'w'
        with open(filename, append) as fp:
            fp.write(str(questions[i]) + str(" ") + str(np.mean(two_dim, axis=0)) + str("\n"))"""
        model[questions[i]] = np.mean(two_dim, axis=0)
        sentiment_terms.append([])
    dataset['sentiment_terms'] = sentiment_terms
    return dataset


trainset = read(filename='TP2/data/traindata.csv')
devset = read(filename='TP2/data/devdata.csv')

le = LabelEncoder()
le.fit(trainset['sentiment'])
trainset['sentiment_label'] = le.transform(trainset['sentiment'])
devset['sentiment_label'] = le.transform(devset['sentiment'])

import gensim.downloader as api

twitter_model = api.load('glove-twitter-50')

words = []
from itertools import islice

with open('TP2/resources/negative-words.txt') as fin:
    for line in islice(fin, 36, 4818):
        words.append(line[:-1])
with open('TP2/resources/positive-words.txt') as fin:
    for line in islice(fin, 36, 2041):
        words.append(line[:-1])

vec_corpus = []
for i in range(len(trainset)):
    sentiment_terms = trainset.iloc[i, :]['sentiment_terms']
    for term in sentiment_terms:
from datetime import datetime

import numpy as np
import pandas as pd
import gensim.downloader as api

# SETTINGS
AFFECT_NORMS_PATH = '../Norms/AffectiveNorms/BRM-emot-submit.csv'
ASSOCIATION_NORMS_PATH = '../Norms/AssociationNorms/association_matrix.csv'
MATERIAL_PATH = '../Materials/'
ITERATION_N = 10000  # For random list creation
TRAINED_DATA = "word2vec-google-news-300"  # word2vec model

affect_df = pd.read_csv(AFFECT_NORMS_PATH)
print('LOADED affect norms')

# Loading the pretrained word2vec model takes time.
# Comment out the two lines below when you don't use word2vec data.
word_vectors = api.load(TRAINED_DATA)
print('LOADED word2vec vectors')

association_df = pd.read_csv(ASSOCIATION_NORMS_PATH, index_col=0)
print('LOADED association norms')
cues_in_norms = set(association_df.index)


def cos_sim(v1, v2):
    """
    Calculate cosine similarity.
    If vector's length == 0, it returns np.nan.
    """
    if (len(v1) == 0) or (len(v2) == 0):
        return np.nan
    # standard cosine similarity: dot(v1, v2) / (||v1|| * ||v2||)
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
def word2vec_padding(list_of_embeddings, length, embedding_length):
    zero_vec = np.zeros(embedding_length)
    for _ in range(length - len(list_of_embeddings)):
        list_of_embeddings.append(zero_vec)
    return list_of_embeddings[:length]


def word2vec_sum(list_of_embeddings, embedding_length):
    ret_value = np.zeros(embedding_length)
    for embedding in list_of_embeddings:
        ret_value += embedding
    return ret_value


word2vec_model = glove_api.load('glove-wiki-gigaword-50')
embedding_size = word2vec_model.vector_size
word2vec_parser = Word2Vec(
    type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(0),
    word2vec_model=word2vec_model,
    return_func=lambda x: word2vec_padding(x, 65, embedding_size))

with open(os.getcwd() + '/data/zork_walkthrough_' + task + '.txt', 'rb') as f:
    data = pickle.load(f)

states = [word2vec_parser(state) for state in data['states']]
raw_actions = data['actions']
actions = []
bows = []
noise = MultivariateNormal(torch.zeros(50), torch.eye(50))
import gensim.downloader as gensim
import numpy as np

# download/load model
model = gensim.load("glove-twitter-25")

v1 = model["flower"]
v2 = model["flowers"]
v3 = model["animal"]
v4 = model["cow"]
v5 = model["cat"]

print(np.dot(v1, v2))
print(np.dot(v3, v4))
print(np.dot(v3, v5))
print(np.dot(v4, v5))

nubes = model["cloud"]
negro = model["black"]
tormenta = model["storm"]

effect = tormenta
cause = np.add(nubes, negro)
print(cause)
print(effect)
print(len(cause))
print(len(effect))

# Caution: np.cross is only defined for 2- and 3-dimensional vectors,
# so these calls raise a ValueError on 25-dimensional embeddings.
relation = np.cross(cause.tolist(), effect.tolist())
event = np.add(cause, np.cross(cause, effect))
for dataset in [data_training, data_test]:
    for index, row in dataset.iterrows():
        for answer in answer_cols:
            answer_to_numb = []
            for word in text_to_word_list(row[answer]):
                if word not in vocab:
                    vocab[word] = len(inverse_vocabulary)
                    answer_to_numb.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    answer_to_numb.append(vocab[word])
            # pandas removed DataFrame.set_value; .at is the current equivalent
            dataset.at[index, answer] = answer_to_numb

embed_size = 300
# download the model and return as object ready for use;
# api.load already returns the KeyedVectors themselves, so no .wv is needed
word_vectors = api.load("word2vec-google-news-300")

embedding_matrix = np.zeros((len(vocab) + 1, embed_size))
for word, i in vocab.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embed_size)
del word_vectors

maxlen_1 = max(
    data_training.StudentAnswer.map(lambda x: len(x)).max(),
    data_training.ReferenceAnswers.map(lambda x: len(x)).max(),
def __init__(self, config, dataset):
    super(JOINTSRMFFULL, self).__init__(config, dataset)

    # load dataset info
    self.LABEL = config['LABEL_FIELD']
    self.embedding_dim = config['embedding_dimension']
    self.alpha = config["alpha"]
    item_description_fields = config['item_description_fields']
    if "number_of_reviews_to_use" in config:
        max_number_of_reviews = config['number_of_reviews_to_use']
    else:
        max_number_of_reviews = 1
    self.variant = config["debug_variant"]
    self.logger.info(f"embedding_dimension = {self.embedding_dim}")
    self.logger.info(f"alpha = {self.alpha}")
    self.logger.info(f"item_description_fields = {item_description_fields}")

    self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
    self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
    self.user_bias = nn.Parameter(torch.zeros(self.n_users))
    self.item_bias = nn.Parameter(torch.zeros(self.n_items))
    self.bias = nn.Parameter(torch.zeros(1))
    self.apply(self._init_weights)

    gensim_cache = open('gensim_cache_path', 'r').read().strip()
    os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
    # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
    # the size must be 50, the same as the embedding dimension
    pretrained_embedding_name = "glove-wiki-gigaword-50"
    model_path = api.load(pretrained_embedding_name, return_path=True)
    model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
    weights = torch.FloatTensor(model.vectors)  # formerly syn0, which is soon deprecated
    self.logger.info(f"pretrained_embedding shape: {weights.shape}")
    self.word_embedding = nn.Embedding.from_pretrained(weights, freeze=True)
    self.vocab_size = len(model.key_to_index)

    s = time.time()
    self.lm_gt = torch.zeros((self.n_items, self.vocab_size), dtype=torch.uint8)
    self.lm_gt_len = torch.ones(self.n_items, dtype=torch.int16)
    item_desc_fields = []
    if "item_description" in item_description_fields:
        item_desc_fields.append(3)
    if "item_genres" in item_description_fields:
        item_desc_fields.append(4)
    if "tags" in item_description_fields:
        item_desc_fields.append(4)
    if len(item_desc_fields) > 0:
        item_LM_file = os.path.join(dataset.dataset.dataset_path,
                                    f"{dataset.dataset.dataset_name}.item")
        with open(item_LM_file, 'r') as infile:
            next(infile)
            for line in infile:
                split = line.split("\t")
                item_id = dataset.token2id_exists("item_id", split[0])
                if item_id == -1:
                    continue
                if item_id == 0:
                    print("Isn't that padding?")
                for fi in item_desc_fields:
                    if fi >= len(split):
                        continue
                    desc = split[fi]
                    for term in desc.split():
                        if term in model.key_to_index:
                            wv_term_index = model.key_to_index[term]
                            self.lm_gt[item_id][wv_term_index] += 1
                            self.lm_gt_len[item_id] += 1
    if "review" in item_description_fields:
        num_of_used_revs = {}
        item_desc_fields = [3]
        item_LM_file = os.path.join(dataset.dataset.dataset_path,
                                    f"{dataset.dataset.dataset_name}.inter")
        with open(item_LM_file, 'r') as infile:
            next(infile)
            for line in infile:
                split = line.split("\t")
                item_id = dataset.token2id_exists("item_id", split[1])
                if item_id == -1:
                    continue
                if item_id == 0:
                    print("Isn't that padding?")
                if item_id not in num_of_used_revs:
                    num_of_used_revs[item_id] = 0
                elif num_of_used_revs[item_id] >= max_number_of_reviews:
                    continue
                for fi in item_desc_fields:
                    desc = split[fi]
                    if len(desc.split()) > 0:
                        num_of_used_revs[item_id] += 1
                    for term in desc.split():
                        # the vocabulary check was duplicated here; one check suffices
                        if term in model.key_to_index:
                            wv_term_index = model.key_to_index[term]
                            self.lm_gt[item_id][wv_term_index] += 1
                            self.lm_gt_len[item_id] += 1
    # avoid division by zero for items with empty language models
    self.lm_gt_len[(self.lm_gt_len == 0).nonzero(as_tuple=True)] = 1
    e = time.time()
    self.logger.info(f"{e - s}s")
    self.logger.info("Done with lm_gt construction!")

    self.sigmoid = nn.Sigmoid()
    self.loss_rec = nn.BCELoss()
    self.loss_lm = SoftCrossEntropyLoss()
# We import Google's word2vec model. It contains over 3 million words.
# This import can take a while.
import gensim.downloader as api

print("Loading in the model. Please give the computer at least 2 minutes. \n")
wv = api.load('word2vec-google-news-300')
print("Finished loading in the model.\n")
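# A quick sanity check once the vectors are loaded; a minimal sketch, where
# the query words are illustrative rather than from the original script:
print(wv.most_similar('coffee', topn=3))  # nearest neighbours by cosine similarity
print(wv.similarity('coffee', 'tea'))     # cosine similarity of a word pair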
def load_model(val, key):
    model = gensim_api.load(gensim_model_name)
    cprint('MODEL LOADED', 'green')
    val[key] = model
def load_gensim_embedding_model(model_name):
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(
        model_name, ', '.join(available_models))
    return gensim_data_downloader.load(model_name)
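# Hypothetical usage of the helper above, assuming the requested name appears
# in gensim_data_downloader.info()['models']:
glove_vectors = load_gensim_embedding_model('glove-wiki-gigaword-100')
print(glove_vectors.most_similar('computer', topn=5))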
import gensim
from gensim import corpora
import gensim.downloader as api
from pprint import pprint

dataset = api.load("fake-news")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Construct bigram model
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)
# Construct bigrams
pprint(bigram[dataset[0]])

# Construct trigram model
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)
# Construct trigrams
pprint(trigram[bigram[dataset[0]]])
import gensim.downloader as api

word_vectors = api.load("glove-wiki-gigaword-100")
sim = word_vectors.similarity("silent", "singing")
print(sim)
    google_embedding = google_utils.embedding_finder(word)
    wiki_embedding = wiki_utils.embedding_finder(word.lower())
    if google_embedding is not None:
        return google_scikit_wrapper.model_predictor(google_embedding, 'google')
    elif wiki_embedding is not None:
        return wiki_scikit_wrapper.model_predictor(wiki_embedding, 'wiki')
    elif random_string_utils.random_string_finder(word):
        return 'Random string'
    else:
        return 'Other'


if __name__ == '__main__':
    print("\nPlease wait, the embeddings repository is loading...\n")
    embeddings_repository = models.KeyedVectors \
        .load_word2vec_format('../data/pre_trained_models/GoogleNews-vectors-negative300.bin.gz', binary=True)
    dates_repository = api.load("glove-wiki-gigaword-100")
    classifier = Classifier()
    while True:
        try:
            value = input("Please enter a string that you want to classify:\n")
            if value == 'no' or value == 'n':
                break
            print(f'You entered: {value}')
            print(f'The classifier predicted that the class of {value} is '
                  f'{classifier.run(value, embeddings_repository, dates_repository)}')
            print('\nIf you would like to stop, type no or n\n')
        except (KeyboardInterrupt, SystemExit):
            raise
import gensim
from gensim import downloader

text8 = downloader.load('text8')
model = gensim.models.word2vec.Word2Vec(text8, max_vocab_size=100000)
model.wv.save_word2vec_format("gensim.model")
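# The saved word2vec-format file can later be reloaded without retraining;
# a minimal sketch using the file name saved above:
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("gensim.model")
print(wv.most_similar('king', topn=3))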
for i in range(len(target_vectors)):
    print(top_context_words[i], cos(flat(word_vector), flat(target_vectors[i])))

"""# TSNE"""

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

top_context_words.append(wrd)
target_vectors.append(h1)

tsne = TSNE()
tv = []
for i in target_vectors:
    tv.append(flat(i))
embed_tsne = tsne.fit_transform(tv)

fig, ax = plt.subplots(figsize=(5, 5))
for idx in range(len(target_vectors)):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(top_context_words[idx],
                 (embed_tsne[idx, 0], embed_tsne[idx, 1]),
                 alpha=0.7)

"""# Gensim Comparison with word vector error calculation"""

from gensim.models import Word2Vec
import gensim.downloader as api

corpus = api.load('text8')
model = Word2Vec(corpus)

gensim_vec = model.wv[wrd]
print("The Mean Error between Gensim Embedding and My Embedding for the word", wrd)
print((np.square(gensim_vec - word_vector)).mean())
def load_file_of_vectors(self):
    self.__fasttext_model = api.load('fasttext-wiki-news-subwords-300')
import re

import numpy as np
from scipy import spatial
import gensim.downloader as api

alphanum = re.compile(r"[\W_]+")
model = api.load("word2vec-google-news-300")
embedding_dim = 300


def split_composite(w):
    m = re.split(r", | & | and |\s", w)
    return set([alphanum.sub("", s.lower()) for s in m])


def _get_embedding(label_set):
    # start from an ndarray: `+=` on a plain Python list would extend the
    # list with 300 extra elements instead of adding the vectors elementwise
    result = np.zeros(embedding_dim)
    num_words = len(label_set)
    for label in label_set:
        try:
            result += model[label]
        except KeyError:
            num_words -= 1
    if num_words == 0:
        return np.zeros(embedding_dim)
    return result / num_words


def get_class_vector(class_hier):
    """
    get_class_vector takes a string class label and returns an
    embedding_dim-dimensional vector.
from pythonosc import osc_message_builder
from pythonosc import osc_bundle_builder
from pythonosc import udp_client
from gensim.models import Word2Vec
import gensim.downloader as api
import gensim
import time
import subprocess

# send OSC messages to port 6448 on localhost
client = udp_client.SimpleUDPClient("127.0.0.1", 6448)
print("Sends to port 6448 with OSC message name /wek/inputs")

# Download model if necessary:
model_location = api.load("glove-twitter-25", return_path=True)

# Load model into variable:
print("Loading model", " ...")
wv_model = gensim.models.KeyedVectors.load_word2vec_format(model_location)


# function for importing text file as list of words
def read_words(words_file):
    return [word for line in open(words_file, 'r') for word in line.split()]


song_corpus = read_words("smashmouth.txt")

for word in song_corpus:
    print(word)
    subprocess.run(["say", word])
en_model.most_similar("wood", topn=20)

### GloVe

As described [before](05representations.md), GloVe is another method for calculating word embeddings. Pre-trained GloVe vectors can be downloaded from [GloVe](https://nlp.stanford.edu/projects/glove/) and imported into Python. However, gensim already provides a downloader for several word embeddings, including GloVe embeddings of different lengths trained on different corpora. The corpora and embeddings available via the gensim downloader can be queried as follows:

import gensim.downloader as api
api.info(name_only=True)

We select the GloVe word embeddings `glove-wiki-gigaword-100` for download:

word_vectors = api.load("glove-wiki-gigaword-100")  # load pre-trained word vectors from gensim-data
type(word_vectors)

As the previous output shows, the downloaded data is available as a `KeyedVectors` object. Hence the same methods can be applied as in the case of the FastText word embedding in the previous section. In what follows we apply not only the methods used above, but also new ones.

Word analogy questions like *man is to king as woman is to ?* can be solved as in the code cell below:

result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

Outliers within sets of words can be determined as follows:

print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))

Similarity between a pair of words:
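A minimal example, with an illustrative word pair:

word_vectors.similarity('woman', 'man')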
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np
import csv
from nltk.stem import PorterStemmer
import sys
import gensim.downloader as api
import pickle
import enchant

enchant_dict = enchant.Dict("en_US")

model = api.load("glove-wiki-gigaword-50")
# glove_input_file = '../model/glove.6B.100d.txt'
# word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)
# filename = 'glove.6B.100d.txt.word2vec'
# model = KeyedVectors.load_word2vec_format(filename)

keywords = open("./keywords", mode='rb')
keywords_dict = pickle.load(keywords)
# print(keywords_dict)

ps = PorterStemmer()
movies_vector_dict = {}
for movie in keywords_dict.keys():
    vectors = []
    for word in keywords_dict[movie]:
        if type(word) is tuple:
from re import sub

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import word_tokenize
import gensim.downloader as ap
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity
import numpy as np
import ipfsapi

glove = ap.load("glove-wiki-gigaword-50")
api = ipfsapi.connect('127.0.0.1', 5001)
STOPWORDS = set(stopwords.words('english'))


class Reputation:

    def __init__(self, author_address, author_reputation, hashes, timestamp, m, T):
        self.cs = 1
        self.author_address = author_address
        self.author_reputation = author_reputation
        self.hashes = hashes
        self.timestamp = timestamp
        self.m = m
import logging
import json

import gensim.downloader as api

MODEL = api.load('fasttext-wiki-news-subwords-300')

# info = api.info()
# logging.basicConfig(
#     format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# for model_name, model_data in sorted(info['models'].items()):
#     print(
#         '%s (%d records): %s' % (
#             model_name,
#             model_data.get('num_records', -1),
#             model_data['description'][:40] + '...',
#         )
#     )


def get_relavant_entities(query, topn, restrict_vocab=None):
    '''
    query: list of keywords
    topn: number of entities to return
    restrict_vocab: an optional integer which limits the range of vectors
        searched for most-similar values. For example, restrict_vocab=10000
        would only check the first 10000 word vectors in the vocabulary order.
        (This may be meaningful if you've sorted the vocabulary by descending
        frequency.)
    '''
    # use the arguments rather than a hard-coded example query
    return MODEL.most_similar(positive=query, topn=topn,
                              restrict_vocab=restrict_vocab)
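# Example call of the function above, reusing the query that was previously
# hard-coded in its body:
print(get_relavant_entities(['ireland', 'IBM', 'Trafficking'], topn=50))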
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors

#%%
model = api.load('glove-wiki-gigaword-300')
model.save("E:\\py599\\wikiEmbedder.npz")

#%%
print("woman-->man == king-->? <take1>")
result = model.most_similar(positive=['woman', 'king'], negative=['man'])
print(result[0])
print("\n")

print("woman-->man == king-->? <take2>")
result = model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
print(result[0])
print("\n")

print("paris-->france == london-->? <take3>")
result = model.most_similar_cosmul(positive=['paris', 'france'], negative=['london'])
print(result[0])
print("\n")

print("man-->men == woman-->? <take4>")
result = model.most_similar_cosmul(positive=['man', 'woman'], negative=['men'])
print(result[0])
print("\n")

print("similar to carolina? <take5>")
result = model.similar_by_word('carolina')
print(result[0])
print("\n")

print("similar to carolina? <take6>")
# Hard-coded context window size.
c = 2
neg_samples = 2 * c

# Check if cuda is enabled
is_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Reading dataset
dat_file = './data/cleaned_documents.csv'
dat = pd.read_csv(dat_file, converters={"text": ast.literal_eval})
print("Read cleaned doc, size = ", len(dat))

# Build vocabulary
# If experiment 3 or 4, then use a partial vocabulary.
if len(sys.argv) > 1 and sys.argv[1] in ['3', '4']:
    gen_model = api.load('glove-wiki-gigaword-100')
    E = 100  # size of embedding

words = list(itertools.chain.from_iterable(dat.text))
if len(sys.argv) > 1 and sys.argv[1] in ['3', '4']:
    words = [x for x in words if x in gen_model]
uni_freq = Counter(words)
words = set(words)
word2idx = {word: idx for idx, word in enumerate(words)}
idx2word = {idx: word for idx, word in enumerate(words)}
W = len(set(words))
print("Vocab size : ", W)

for each in uni_freq: