def test_info(self):
    data = api.info("text8")
    self.assertEqual(data["parts"], 1)
    self.assertEqual(data["file_name"], 'text8.gz')
    data = api.info()
    self.assertEqual(sorted(data.keys()), sorted(['models', 'corpora']))
    self.assertTrue(len(data['models']))
    self.assertTrue(len(data['corpora']))
def load_gensim_embedding_model(model_name): available_models = gensim_data_downloader.info()["models"].keys() assert (model_name in available_models ), "Invalid model_name: {}. Choose one from {}".format( model_name, ", ".join(available_models)) return gensim_data_downloader.load(model_name)
def load_gensim_embedding_model(model_name): """ Function to load and select the word embeddings model, using the Gensim lirbary. """ available_models = gensim_data_downloader.info()['models'].keys() assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(model_name, ', '.join(available_models)) model_path = gensim_data_downloader.load(model_name, return_path=True) return KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
def __init__(self):
    info = api.info()
    self.__type = 0  # default
    self.__ListSelection = 5  # default
    self.__model = None
    self.__address = "/Users/markhan/UCL_CS/System_Engineering/final/bias-detect/bias_backend/bias_backend/bias_backend/bias_backend_app/Algorithm/GoogleNews-vectors-negative300.bin.gz"  # default
    self.__preTrainedModelList = list(info['models'].keys())
    self.__corporaList = list(info['corpora'].keys())
def get_gensim_pretrained_info(entity, desc_len=None):
    """
    :param entity: either 'corpora' or 'models'
    :param desc_len: description length of each entity, entire description is printed if this is None
    :return: None
    """
    info = api.info()
    for entity_name, entity_data in sorted(info[entity].items()):
        print(f"{entity_name:<40} {entity_data.get('num_records', -1)} records: "
              f"{entity_data['description'][:desc_len] + '...'}")
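# A minimal usage sketch for the helper above, assuming gensim.downloader is already
# imported as `api` in the same module (as the function body implies); the arguments
# are illustrative only.
get_gensim_pretrained_info('models', desc_len=40)  # list downloadable models, descriptions truncated to 40 chars
get_gensim_pretrained_info('corpora')              # list downloadable corpora with full descriptions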
def test_api_model_loading(sample_corpus_embedding):
    """Test embeddings loaded through the Gensim download API."""
    embedder = EmbeddingTransformer(
        model=list(DEFAULT_PRETRAINED_EMBEDDINGS.keys())[0])
    embeddings = embedder.transform(sample_corpus_embedding)
    assert embeddings.shape[0] == len(sample_corpus_embedding)
    assert np.all(embeddings[1] == embeddings[2])

    embedder = EmbeddingTransformer(model=list(api.info()["models"].keys())[0])
    embeddings = embedder.transform(sample_corpus_embedding)
    assert embeddings.shape[0] == len(sample_corpus_embedding)
    assert np.all(embeddings[1] == embeddings[2])
def __init__(self, model: str = "glove", aggregation: str = "average"):
    """
    Load pre-trained embeddings, either locally if model is a local file path
    or a Word2VecKeyedVector object, or downloaded from the gensim API if a
    string is provided.
    """
    if aggregation not in {"average", "sum", "minmax"}:
        raise ValueError(
            f"Unknown embeddings aggregation mode: {aggregation}, the available "
            "ones are: average, sum, or minmax.")
    if isinstance(model, str):
        model = model.lower()
        if model in DEFAULT_PRETRAINED_EMBEDDINGS.keys():
            model_gensim_name = DEFAULT_PRETRAINED_EMBEDDINGS[model]
            self.model = api.load(model_gensim_name)
        elif model in api.info()["models"].keys():
            self.model = api.load(model)  # pragma: no cover
        elif os.path.exists(model):
            logger.info("Loading local model")
            self.model = Word2VecKeyedVectors.load(model)
            if not isinstance(self.model, Word2VecKeyedVectors):
                raise TypeError(
                    "The input model should be a Word2VecKeyedVectors object but "
                    f"it is a {type(self.model)} object.")
        else:
            raise KeyError(
                f"Unknown pre-trained model name: {model}. Available models are"
                + ", ".join(api.info()["models"].keys()))
        logger.info("Loaded model keyed vectors: " + model)
    elif isinstance(model, Word2VecKeyedVectors):
        self.model = model
        logger.info("Loaded model keyed vectors.")
    else:
        raise TypeError(
            "Input pre-trained model should be a string or a gensim "
            "Word2VecKeyedVectors object")
    self.aggregation = aggregation
    self.embedding_dimension = self.model.vector_size
    if self.aggregation == "minmax":
        self.embedding_dimension *= 2
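# Illustrative usage sketch for the transformer above; EmbeddingTransformer, the "glove"
# alias, and transform() come from the surrounding snippets, while the sentences are
# made up for the example.
embedder = EmbeddingTransformer(model="glove", aggregation="minmax")
vectors = embedder.transform(["a short example sentence", "another example sentence"])
print(vectors.shape)  # with "minmax" aggregation, the embedding dimension is doubled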
def load_gensim_embedding_model(model_name): """ Load word embeddings (gensim KeyedVectors) """ available_models = gensim_data_downloader.info()['models'].keys() assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format( model_name, ', '.join(available_models)) # gensim throws some nasty warnings about vocabulary with warnings.catch_warnings(): warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim') model = gensim_data_downloader.load(model_name) return model
def CheckModel():
    print("initialize")
    my_file = Path("model1/glove-wiki-gigaword-200")
    if my_file.is_file():
        print("File is present")
        filepath = os.getcwd() + "/model1/glove-wiki-gigaword-200"
        model = KeyedVectors.load(filepath)
    else:
        print("Downloading file")
        info = api.info()
        model = api.load("glove-wiki-gigaword-200")
        filepath = os.getcwd() + "/model1/glove-wiki-gigaword-200"
        model.save(filepath)
        print("Downloading complete")
    return model
def find_similar_words(word_embedding, words):
    '''
    Given a list of words, find the ten most similar words to each. Return a
    dictionary mapping each word in the input list to the similar words along
    with their similarity. Common word embeddings include
    'fasttext-wiki-news-subwords-300' and 'glove-wiki-gigaword-200'.

    Example:
    {'adventurous': [['adventuresome', 0.673180103302002],
                     ['inventive', 0.5974040627479553],
                     ['imaginative', 0.5858909487724304],
                     ['enterprising', 0.5562216639518738],
                     ['musically', 0.5521135330200195],
                     ['impetuous', 0.5404343008995056],
                     ['inquisitive', 0.5328224897384644],
                     ['venturesome', 0.5321169495582581],
                     ['enjoyable', 0.5309233069419861],
                     ['offbeat', 0.5194555521011353]],
     'affectionate': [['playful', 0.6456809639930725],
                      ['respectful', 0.6125648021697998],
                      ['sarcastic', 0.6028381586074829],
                      ['affection', 0.5752658247947693],
                      ['sardonic', 0.5718863010406494],
                      ['loving', 0.5700308084487915],
                      ['endearing', 0.5636808276176453],
                      ['polite', 0.5526844263076782],
                      ['wry', 0.5466963648796082],
                      ['irreverent', 0.5442217588424683]]}

    Keyword arguments:
    word_embedding -- the gensim word embedding model, see https://github.com/RaRe-Technologies/gensim-data
    words -- the list of words to find similar words of
    '''
    info = api.info()  # show info about available models/datasets
    model = api.load(word_embedding)  # download the model and return as object ready for use
    embedding_similarity = {}
    for word in words:
        embedding_similarity[word] = model.most_similar(word)
    with open('embedding-data.json', 'w') as outfile:
        json.dump(embedding_similarity, outfile)
    return embedding_similarity
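# Illustrative call of the function above, assuming gensim.downloader (api) and json are
# already imported in the module; the embedding name and word list mirror the docstring.
similar = find_similar_words('glove-wiki-gigaword-200', ['adventurous', 'affectionate'])
print(similar['adventurous'][0])  # the single most similar word with its similarity score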
def _init_gen_dir(self, name):
    name = EMBEDDING_SHORTHANDS.get(name, name)
    data_root = EMBEDDING_DATA_PATH
    gen_dir = join(data_root, name)
    gensim_models = gensim_data.info(name_only=True)["models"]
    if not exists(gen_dir) and name not in gensim_models:
        offline_models = list_folders(data_root)
        shorthand_names = [*EMBEDDING_SHORTHANDS]
        available_embeddings = set(
            offline_models + gensim_models + shorthand_names
        )
        raise ValueError(
            "Expected Embedding name to be one of {0}, got {1}.".format(
                available_embeddings, name
            )
        )
    else:
        self._gen_dir = gen_dir
        self._name = name
def __init__(self, batch_size, train_data_x, train_data_y, embeddings_index, word_index, config):
    """Create object that will be passed to fit_generator, the native Keras function

    :param batch_size: Size of the batch that is being used in the fit_generator
    :type batch_size: int
    :param train_data_x: Training data, tweets as sequences of word indexes, already padded, preprocessed etc., training ready
    :type train_data_x: 2D array of floats
    :param train_data_y: One-hot encoded ground-truth labels
    :type train_data_y: 2D array of floats
    :param embeddings_index: Dictionary in which a word is the key and the value is an n-dim embedding array
    :type embeddings_index: dict
    :param word_index: Look-up table that returns word_id given the word
    :type word_index: dict
    :param config: Configuration object that defines some useful hyper-parameters
    :type config: dict
    """
    self.batch_size = batch_size
    self.train_data_x = train_data_x
    self.train_data_y = train_data_y
    self.embeddings_index = embeddings_index
    self.word_index = word_index
    self.word_index_keys_as_list = list(word_index.keys())
    self.word_index_keys_as_arr = np.array(list(word_index.keys()))
    self.config = config
    # TODO: Consider shuffling data on_epoch_end()
    # Load model straight from gensim to efficiently find most_similar words
    info = api.info()  # show info about available models/datasets
    self.model_emb = api.load("glove-twitter-25")  # download the model and return as object ready for use
    self.on_epoch_end()
    print("self.__len__() = {}".format(self.__len__()))
    return
def get_model():
    info = api.info()  # show info about available models/datasets
    model = api.load("glove-twitter-25")  # download the model and return as object ready for use
    return model
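# A small, hedged example of calling get_model() above; most_similar() is the standard
# KeyedVectors query, and the word "tweet" is chosen only for illustration.
twitter_vectors = get_model()
print(twitter_vectors.most_similar("tweet", topn=5))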
model = Word2Vec(corpus)

###############################################################################
#
# Now that we have our word2vec model, let's find words that are similar to 'tree'.
#
print(model.most_similar('tree'))

###############################################################################
#
# You can use the API to download many corpora and models. You can get the list
# of all the models and corpora that are provided, by using the code below:
#
import json
info = api.info()
print(json.dumps(info, indent=4))

###############################################################################
# There are two types of data: corpora and models.
print(info.keys())

###############################################################################
# Let's have a look at the available corpora:
for corpus_name, corpus_data in sorted(info['corpora'].items()):
    print('%s (%d records): %s' % (
        corpus_name,
        corpus_data.get('num_records', -1),
        corpus_data['description'][:40] + '...',
    ))
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
import json

info = api.info()
# corpus = api.load('text8')
for model_name, model_data in sorted(info['models'].items()):
    print('%s (%d records): %s' % (
        model_name,
        model_data.get('num_records', -1),
        model_data['description'][:40] + '...',
    ))

fake_news_info = api.info('glove-wiki-gigaword-50')
print(json.dumps(fake_news_info, indent=4))

model = api.load("glove-wiki-gigaword-50")
print(model.most_similar("cat"))

model.init_sims(replace=True)
model.save(
    "C:/Users/KEVINBONYTHEKKANATH-/Desktop/Projects/Chatbot/pretrained_word_similarity"
)
print(model.similarity('kevin', 'bonita'))
print(model.similarity('cat', 'meow'))
## Use this script to retrain the popular pre-trained models on your custom corpus
import os
import re

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn import datasets
from nltk.tokenize import word_tokenize
import gensim.downloader as api

api.info()         # return dict with info about available models/datasets
api.info("text8")


## If you have downloaded the files then load using the following method
def load_glove_files_local(glove_path):
    # glove_path = "../scoring_file/w2v_glove/glove.6B.300d.txt"
    file_path = os.path.join(os.getcwd(), glove_path)
    glove_file = datapath(os.path.join(os.getcwd(), glove_path))
    tmp_file = get_tmpfile("w2v.txt")
    # convert from glove to word2vec
    _ = glove2word2vec(glove_file, tmp_file)
    # load the keyed vectors
    model_glove = KeyedVectors.load_word2vec_format(tmp_file)
    return model_glove


def check_coverage(vocab, embeddings):
    '''Checks what percentage of vocab is covered by the embedding model'''
    vocab_length = len(vocab)
# download articles' texts
articles_m = get_articles.get_articles_for_period(urls_m, TEXTS_MEDUZA, period)
articles_v = get_articles.get_articles_for_period(urls_v, TEXTS_VEDOMOSTI, period)
articles_k = get_articles.get_articles_for_period(urls_k, TEXTS_KOMMERSANT, period)

### or read articles from file
articles_k = file_handling.read_from_file(TEXTS_KOMMERSANT)
articles_v = file_handling.read_from_file(TEXTS_VEDOMOSTI)
articles_m = file_handling.read_from_file(TEXTS_MEDUZA)

#####################################
# Get information about the model or dataset
api.info('word2vec-ruscorpora-300')

# Download model
w2v_model = api.load("word2vec-ruscorpora-300")

# add tags to texts
tagged_m = add_tags.add_tags_to_articles(articles_m, TAGGED_MEDUZA, period)
tagged_v = add_tags.add_tags_to_articles(articles_v, TAGGED_VEDOMOSTI, period)
tagged_k = add_tags.add_tags_to_articles(articles_k, TAGGED_KOMMERSANT, period)

### or read tagged articles from file
tagged_m = file_handling.read_from_file(TAGGED_MEDUZA)
tagged_v = file_handling.read_from_file(TAGGED_VEDOMOSTI)
tagged_k = file_handling.read_from_file(TAGGED_KOMMERSANT)

# convert texts to vectors
vectors_m = add_tags.article_to_vector(tagged_m, w2v_model, period,
def run():
    print('Loading data...')
    train_data = load_imdb(IMDB_DATA_PATH, test=False)
    # test_data = load_imdb(IMDB_DATA_PATH, test=True)
    train_tokenized = [tokenize(sentence) for sentence, _ in train_data]
    # test_tokenized = [tokenize(sentence) for sentence, _ in test_data]
    vocab = set(chain(*train_tokenized))
    vocab_size = len(vocab)
    print('Data loaded with vocabulary size: ', vocab_size)

    print('Building embedding dict...')
    api.info('glove-wiki-gigaword-100')
    word2vec = api.load("glove-wiki-gigaword-100")
    # word2vec = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=False, encoding='utf-8')
    word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}
    word2idx[UNK_STR] = UNK_TOKEN
    idx2word = {idx + 1: word for idx, word in enumerate(vocab)}
    idx2word[UNK_TOKEN] = UNK_STR
    print('Done')

    print('Generating ready-to-train data')
    train_features = torch.tensor(
        pad_sentences(idx_sentences(train_tokenized, word2idx)))
    train_labels = torch.tensor([label for _, label in train_data])
    # test_features = torch.tensor(pad_sentences(idx_sentences(test_tokenized, word2idx)))
    # test_labels = torch.tensor([label for _, label in test_data])
    print('Done')

    # build the embedding weight matrix from the pre-trained vectors
    embed_size = 100
    weight = torch.zeros(vocab_size + 1, embed_size)
    for i in range(len(word2vec.index2word)):
        try:
            index = word2idx[word2vec.index2word[i]]
        except KeyError:
            continue
        weight[index, :] = torch.from_numpy(
            word2vec.get_vector(idx2word[index]))

    n_epochs = 5
    embed_size = 100
    hidden_size = 100
    n_layers = 2
    bidirectional = True
    batch_size = 512
    n_labels = 2
    learning_rate = 0.5

    model = SentimentNet(embed_size=embed_size,
                         hidden_size=hidden_size,
                         n_layers=n_layers,
                         bidirectional=bidirectional,
                         weight=weight,
                         n_labels=n_labels)
    if os.path.exists(MODEL_PATH):
        model = torch.load(MODEL_PATH)
    model = model.to(DEVICE)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    train_set = TensorDataset(train_features, train_labels)
    # test_set = TensorDataset(test_features, test_labels)
    train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # test_iter = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    print('Start training...')
    for epoch in range(n_epochs):
        # start = time.time()
        # train_loss, test_loss = 0, 0
        # train_acc, test_acc = 0, 0
        n, m = 0, 0
        for feature, label in train_iter:
            n += 1
            model.zero_grad()
            feature = feature.to(DEVICE)
            label = label.to(DEVICE)
            pred = model(feature)
            loss = loss_func(pred, label)
            print('Train step: %d, loss: %.3f' % (n, loss.item()))
            loss.backward()
            optimizer.step()
        torch.save(model, MODEL_PATH)
'''
This module contains functions to do a semantic search on Wikipedia
'''
from pprint import pprint

if __name__ == '__main__':
    print('Wikipedia Semantic Search')

    import gensim.downloader as api
    # print(api.load("fake-news", return_path=True))
    pprint(api.info())
def get_model_info(self, name: str = "glove-twitter-25") -> None:  # // for pre-made
    "Print some info about a model."
    print(api.info(name))
def get_model_info(self, name="glove-twitter-25"): # // for pre-made api.info(name)
def set_embedding(self):
    info = api.info()  # show info about available models/datasets
    self.model = api.load("glove-wiki-gigaword-50")
    self.embedding_weights = np.vstack(
        [np.zeros(self.model.vectors.shape[1]), self.model.vectors])
def models():
    # model names from https://github.com/RaRe-Technologies/gensim-data
    info = api.info()  # show info about available models/datasets
    return info
def get_available_embeddings(self):
    return list(api.info()["models"].keys())
import gensim.downloader as api

print("importing stuff")
info = api.info()  # show info about available models/datasets
model = api.load("glove-twitter-100")  # download the model and return as object ready for use
output = model.most_similar(["sales", "orders"])
print(output)

# from gensim.models.word2vec import Word2Vec
# import gensim.downloader as api
# corpus = api.load('text8')  # download the corpus and return it opened as an iterable
# model = Word2Vec(corpus)    # train a model from the corpus
# model.most_similar("car")
import logging

import gensim.downloader as api

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

info = api.info()
for model_name, model_data in sorted(info['models'].items()):
    print('%s (%d records): %s' % (
        model_name,
        model_data.get('num_records', -1),
        model_data['description'][:40] + '...',
    ))
def load_model(self):
    # if the reference isn't in the possible models, FileNotFoundError is raised
    if self.reference in downloader.info()['models']:
        return downloader.load(self.reference)
    else:
        raise FileNotFoundError
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

# gensim
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors
import gensim.downloader as api

# nltk stop words, used in cleantxt() below
from nltk.corpus import stopwords

api.info("text8")  # retrieve information about text8 dataset
# {u'checksum': u'68799af40b6bda07dfa47a32612e5364',
#  u'description': u'Cleaned small sample from wikipedia',
#  u'file_name': u'text8.gz',
#  u'parts': 1,
#  u'source': u'http://mattmahoney.net/dc/text8.zip'}
api.info()


def cleantxt(txt):
    # collecting english stop words from nltk-library
    stpw = stopwords.words('english')
    # Adding custom stop-words
    stpw.extend(['www', 'http', 'utc'])
def load_gensim_embedding_model(model_name):
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(
        model_name, ', '.join(available_models))
    return gensim_data_downloader.load(model_name)
import gensim
from gensim import corpora
import gensim.downloader as api

# Get information about the model or dataset
api.info('text8')  # text8 is a dataset consisting of the first 100,000,000 bytes of plain text from Wikipedia

# Download the text8 dataset
dataset = api.load("text8")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Build the bigram model
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)  # Construct bigram
print(bigram[dataset[0]])

# After generating bigrams, we can pass the output to train a new Phrases model
# by applying the bigrammed corpus to the trained bigram model

# Build the trigram model
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)  # Construct trigram
print(trigram[bigram[dataset[0]]])