Example #1
 def test_info(self):
     data = api.info("text8")
     self.assertEqual(data["parts"], 1)
     self.assertEqual(data["file_name"], 'text8.gz')
     data = api.info()
     self.assertEqual(sorted(data.keys()), sorted(['models', 'corpora']))
     self.assertTrue(len(data['models']))
     self.assertTrue(len(data['corpora']))
Example #2
def load_gensim_embedding_model(model_name):
    available_models = gensim_data_downloader.info()["models"].keys()
    assert (model_name in available_models
            ), "Invalid model_name: {}. Choose one from {}".format(
                model_name, ", ".join(available_models))

    return gensim_data_downloader.load(model_name)
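A minimal usage sketch for the helper above, assuming gensim.downloader was imported as gensim_data_downloader; the model name is only illustrative:

# Hypothetical call: download the GloVe Twitter vectors and query them.
vectors = load_gensim_embedding_model("glove-twitter-25")
print(vectors.most_similar("python", topn=3))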
Example #3
def load_gensim_embedding_model(model_name):
    """ 
    Function to load and select the word embeddings model, using the Gensim lirbary. 
    """
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(model_name, ', '.join(available_models))
    model_path = gensim_data_downloader.load(model_name, return_path=True)
    return KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
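Examples #2 and #3 differ only in how they fetch the vectors; a hedged sketch of the two download styles (the model name is illustrative):

import gensim.downloader as api

# Either load directly as a ready-to-use KeyedVectors object...
vectors = api.load("glove-wiki-gigaword-50")

# ...or only download the data and get its on-disk path, then load it manually,
# as the helper above does with return_path=True.
path = api.load("glove-wiki-gigaword-50", return_path=True)
print(path)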
Example #4
    def __init__(self):

        info = api.info()
        self.__type = 0  #default
        self.__ListSelection = 5  #default
        self.__model = None
        self.__address = "/Users/markhan/UCL_CS/System_Engineering/final/bias-detect/bias_backend/bias_backend/bias_backend/bias_backend_app/Algorithm/GoogleNews-vectors-negative300.bin.gz"  #default
        self.__preTrainedModelList = list(info['models'].keys())
        self.__corporaList = list(info['corpora'].keys())
Example #5
def get_gensim_pretrained_info(entity, desc_len=None):
    """
    :param entity: either 'corpora' or 'models'
    :param desc_len: description length for each entity; the entire description is printed if this is None
    :return: None
    """
    info = api.info()
    for entity_name, entity_data in sorted(info[entity].items()):
        description = entity_data['description']
        if desc_len is not None:
            description = description[:desc_len] + '...'
        print(f"{entity_name:<40} {entity_data.get('num_records', -1)} records: {description}")
Example #6
def test_api_model_loading(sample_corpus_embedding):
    """Test embeddings loaded through the Gensim download API."""
    embedder = EmbeddingTransformer(
        model=list(DEFAULT_PRETRAINED_EMBEDDINGS.keys())[0])
    embeddings = embedder.transform(sample_corpus_embedding)
    assert embeddings.shape[0] == len(sample_corpus_embedding)
    assert np.all(embeddings[1] == embeddings[2])

    embedder = EmbeddingTransformer(model=list(api.info()["models"].keys())[0])
    embeddings = embedder.transform(sample_corpus_embedding)
    assert embeddings.shape[0] == len(sample_corpus_embedding)
    assert np.all(embeddings[1] == embeddings[2])
Example #7
 def __init__(self, model: str = "glove", aggregation: str = "average"):
     """ Load pre-trained embeddings, either locally if model is a local file path
     or a Word2VecKeyedVector object, or downloaded from the gensim API if a string
     is provided.
     """
     if aggregation not in {"average", "sum", "minmax"}:
         raise ValueError(
             f"Unknown embeddings aggregation mode: {aggregation}, the available "
             "ones are: average, sum, or minmax.")
     if isinstance(model, str):
         model = model.lower()
         if model in DEFAULT_PRETRAINED_EMBEDDINGS.keys():
             model_gensim_name = DEFAULT_PRETRAINED_EMBEDDINGS[model]
             self.model = api.load(model_gensim_name)
         elif model in api.info()["models"].keys():
             self.model = api.load(model)  # pragma: no cover
         elif os.path.exists(model):
             logger.info("Loading local model")
             self.model = Word2VecKeyedVectors.load(model)
             if not isinstance(self.model, Word2VecKeyedVectors):
                 raise TypeError(
                     "The input model should be a Word2VecKeyedVectors object but "
                     f"it is a {type(self.model)} object.")
         else:
             raise KeyError(
                 f"Unknown pre-trained model name: {model}. Available models are"
                 + ", ".join(api.info()["models"].keys()))
         logger.info("Loaded model keyed vectors: " + model)
     elif isinstance(model, Word2VecKeyedVectors):
         self.model = model
         logger.info("Loaded model keyed vectors.")
     else:
         raise TypeError(
             "Input pre-trained model should be a string or a gensim "
             "Word2VecKeyedVectors object")
     self.aggregation = aggregation
     self.embedding_dimension = self.model.vector_size
     if self.aggregation == "minmax":
         self.embedding_dimension *= 2
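The minmax mode doubles the embedding dimension because the token vectors are reduced twice, once with an element-wise minimum and once with a maximum, and the two results are concatenated. The transform step is not shown above, so the following is only a sketch of what the three aggregation modes could look like on a (n_tokens, vector_size) NumPy array:

import numpy as np

def aggregate(token_vectors, aggregation="average"):
    # token_vectors: array of shape (n_tokens, vector_size)
    if aggregation == "average":
        return token_vectors.mean(axis=0)
    if aggregation == "sum":
        return token_vectors.sum(axis=0)
    if aggregation == "minmax":
        # Concatenating min and max doubles the output size, matching the
        # `embedding_dimension *= 2` line in the constructor above.
        return np.concatenate([token_vectors.min(axis=0), token_vectors.max(axis=0)])
    raise ValueError(f"Unknown embeddings aggregation mode: {aggregation}")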
Example #8
def load_gensim_embedding_model(model_name):
    """
    Load word embeddings (gensim KeyedVectors) 
    """
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(
        model_name, ', '.join(available_models))

    # gensim throws some nasty warnings about vocabulary
    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore',
                                category=UserWarning,
                                module='gensim')
        model = gensim_data_downloader.load(model_name)
    return model
Example #9
def CheckModel():
    print("initialize")
    my_file = Path("model1/glove-wiki-gigaword-200")
    if my_file.is_file():
        print("File is present")
        filepath = os.getcwd() + "/model1/glove-wiki-gigaword-200"
        model = KeyedVectors.load(filepath)
    else:
        print("Downloading file")
        info = api.info()
        model = api.load("glove-wiki-gigaword-200")
        filepath = os.getcwd() + "/model1/glove-wiki-gigaword-200"
        model.save(filepath)
        print("Downloading complete")

    return model
Example #10
def find_similar_words(word_embedding, words):
    '''
    Given a list of a words, find the ten most similar words to each. Return a dictionary
    mapping each word in the input list to the similar words along with their similarity.
    Common word embeddings include 'fasttext-wiki-news-subwords-300' and 'glove-wiki-gigaword-200'.

    Example:
        {'adventurous':              [['adventuresome', 0.673180103302002],
                                     ['inventive', 0.5974040627479553],
                                     ['imaginative', 0.5858909487724304],
                                     ['enterprising', 0.5562216639518738],
                                     ['musically', 0.5521135330200195],
                                     ['impetuous', 0.5404343008995056],
                                     ['inquisitive', 0.5328224897384644],
                                     ['venturesome', 0.5321169495582581],
                                     ['enjoyable', 0.5309233069419861],
                                     ['offbeat', 0.5194555521011353]],
         'affectionate':              [['playful', 0.6456809639930725],
                                      ['respectful', 0.6125648021697998],
                                      ['sarcastic', 0.6028381586074829],
                                      ['affection', 0.5752658247947693],
                                      ['sardonic', 0.5718863010406494],
                                      ['loving', 0.5700308084487915],
                                      ['endearing', 0.5636808276176453],
                                      ['polite', 0.5526844263076782],
                                      ['wry', 0.5466963648796082],
                                      ['irreverent', 0.5442217588424683]]}

    Keyword arguments:
    word_embedding -- the gensim word embedding model, see https://github.com/RaRe-Technologies/gensim-data
    words -- the list of words to find similar words of
    '''

    info = api.info()  # show info about available models/datasets
    model = api.load(word_embedding)  # download the model and return as object ready for use

    embedding_similarity = {}

    for word in words:
        embedding_similarity[word] = model.most_similar(word)

    with open('embedding-data.json', 'w') as outfile:
        json.dump(embedding_similarity, outfile)

    return embedding_similarity
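A hypothetical invocation of the function above, using one of the embeddings named in its docstring:

# Query two adjectives against the GloVe vectors; the result is also written
# to embedding-data.json by the function itself.
similar = find_similar_words('glove-wiki-gigaword-200', ['adventurous', 'affectionate'])
print(similar['adventurous'][:3])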
Example #11
 def _init_gen_dir(self, name):
     name = EMBEDDING_SHORTHANDS.get(name, name)
     data_root = EMBEDDING_DATA_PATH
     gen_dir = join(data_root, name)
     gensim_models = gensim_data.info(name_only=True)["models"]
     if not exists(gen_dir) and name not in gensim_models:
         offline_models = list_folders(data_root)
         shorthand_names = [*EMBEDDING_SHORTHANDS]
         available_embeddings = set(
             offline_models + gensim_models + shorthand_names
         )
         raise ValueError(
             "Expected Embedding name to be one of {0}, got {1}.".format(
                 available_embeddings, name
             )
         )
     else:
         self._gen_dir = gen_dir
         self._name = name
Example #12
    def __init__(self, batch_size, train_data_x, train_data_y,
                 embeddings_index, word_index, config):
        """Create object that will be passed to fit_generator, native keras function
        :param batch_size: Size of the batch that is being used in the fit_generator
        :type batch_size: int
        :param train_data_x: Training data, tweets as sequences of word_indexes, already padded, preprocessed etc. training ready
        :type train_data_x: 2D array of floats
        :param train_data_y: One-hot encoded ground-truth labels
        :type train_data_y: 2D array of floats
        :param embeddings_index: Dictionary in which there is word as a key, and value is n-dim embedding array
        :type embeddings_index: dict
        :param word_index: Look-up Table that returns word_id given the word
        :type word_index: dict
        :param config: Configuration object that defines some useful hyper-parameters
        :type config: dict
        """

        self.batch_size = batch_size
        self.train_data_x = train_data_x
        self.train_data_y = train_data_y
        self.embeddings_index = embeddings_index
        self.word_index = word_index
        self.word_index_keys_as_list = list(word_index.keys())
        self.word_index_keys_as_arr = np.array(list(word_index.keys()))

        self.config = config
        # TODO: Consider shuffling data on_epoch_end()

        # Load model straight from gensim to efficiently find most_similar words
        info = api.info()  # show info about available models/datasets
        self.model_emb = api.load(
            "glove-twitter-25"
        )  # download the model and return as object ready for use

        self.on_epoch_end()
        print("self.__len__() = {}".format(self.__len__()))
        return
Example #13
def get_model():
    info = api.info()  # show info about available models/datasets
    model = api.load("glove-twitter-25")  # download the model and return as object ready for use
    return model
Example #14
model = Word2Vec(corpus)

###############################################################################
#
# Now that we have our word2vec model, let's find words that are similar to 'tree'
#

print(model.most_similar('tree'))

###############################################################################
#
# You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided by using the code below:
#

import json
info = api.info()
print(json.dumps(info, indent=4))

###############################################################################
# There are two types of data: corpora and models.
print(info.keys())

###############################################################################
# Let's have a look at the available corpora:
for corpus_name, corpus_data in sorted(info['corpora'].items()):
    print('%s (%d records): %s' % (
        corpus_name,
        corpus_data.get('num_records', -1),
        corpus_data['description'][:40] + '...',
    ))
Example #15
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

import json
info = api.info()
#corpus = api.load('text8')
for model_name, model_data in sorted(info['models'].items()):
    print('%s (%d records): %s' % (
        model_name,
        model_data.get('num_records', -1),
        model_data['description'][:40] + '...',
    ))

fake_news_info = api.info('glove-wiki-gigaword-50')
print(json.dumps(fake_news_info, indent=4))

model = api.load("glove-wiki-gigaword-50")
print(model.most_similar("cat"))
model.init_sims(replace=True)
model.save(
    "C:/Users/KEVINBONYTHEKKANATH-/Desktop/Projects/Chatbot/pretrained_word_similarity"
)

print(model.similarity('kevin', 'bonita'))
print(model.similarity('cat', 'meow'))
Example #16
## Use this script to retrain the popular pre-trained models on your custom corpus

import os
import re

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn import datasets
from nltk.tokenize import word_tokenize

import gensim.downloader as api
api.info()  # return dict with info about available models/datasets
api.info("text8")

## If you have downloaded the files then load using the following method
def load_glove_files_local(glove_path):
    #glove_path = "../scoring_file/w2v_glove/glove.6B.300d.txt"
    file_path = os.path.join(os.getcwd(), glove_path)
    glove_file = datapath(os.path.join(os.getcwd(), glove_path))
    tmp_file = get_tmpfile("w2v.txt")
    #convert from glove to word2vec
    _ = glove2word2vec(glove_file, tmp_file)
    #load the keyed vectors
    model_glove = KeyedVectors.load_word2vec_format(tmp_file)
    return model_glove
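A hypothetical call to the loader above, assuming a local GloVe file at the path mentioned in the comment:

# Convert the local GloVe vectors to word2vec format and load them as KeyedVectors.
model_glove = load_glove_files_local("../scoring_file/w2v_glove/glove.6B.300d.txt")
print(model_glove.most_similar("king", topn=5))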

def check_coverage(vocab, embeddings):
    '''Checks what percentage of vocab is covered by the embedding model'''
    vocab_length = len(vocab)
Example #17
# download articles' texts
articles_m = get_articles.get_articles_for_period(urls_m, TEXTS_MEDUZA, period)
articles_v = get_articles.get_articles_for_period(urls_v, TEXTS_VEDOMOSTI,
                                                  period)
articles_k = get_articles.get_articles_for_period(urls_k, TEXTS_KOMMERSANT,
                                                  period)

### or read articles from file
articles_k = file_handling.read_from_file(TEXTS_KOMMERSANT)
articles_v = file_handling.read_from_file(TEXTS_VEDOMOSTI)
articles_m = file_handling.read_from_file(TEXTS_MEDUZA)

#####################################

# Get information about the model or dataset
api.info('word2vec-ruscorpora-300')
# Download model
w2v_model = api.load("word2vec-ruscorpora-300")

# add tags to texts
tagged_m = add_tags.add_tags_to_articles(articles_m, TAGGED_MEDUZA, period)
tagged_v = add_tags.add_tags_to_articles(articles_v, TAGGED_VEDOMOSTI, period)
tagged_k = add_tags.add_tags_to_articles(articles_k, TAGGED_KOMMERSANT, period)

### or read tagged articles from file
tagged_m = file_handling.read_from_file(TAGGED_MEDUZA)
tagged_v = file_handling.read_from_file(TAGGED_VEDOMOSTI)
tagged_k = file_handling.read_from_file(TAGGED_KOMMERSANT)

# convert texts to vectors
vectors_m = add_tags.article_to_vector(tagged_m, w2v_model, period,
Example #18
def run():
    print('Loading data...')
    train_data = load_imdb(IMDB_DATA_PATH, test=False)
    # test_data = load_imdb(IMDB_DATA_PATH, test=True)

    train_tokenized = [tokenize(sentence) for sentence, _ in train_data]
    # test_tokenized = [tokenize(sentence) for sentence, _ in test_data]

    vocab = set(chain(*train_tokenized))
    vocab_size = len(vocab)
    print('Data loaded with vocabulary size: ', vocab_size)

    print('Building embedding dict...')
    api.info('glove-wiki-gigaword-100')
    word2vec = api.load("glove-wiki-gigaword-100")
    # word2vec = gensim.models.KeyedVectors.load_word2vec_format(EMBEDDING_PATH, binary=False, encoding='utf-8')
    word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}
    word2idx[UNK_STR] = UNK_TOKEN
    idx2word = {idx + 1: word for idx, word in enumerate(vocab)}
    idx2word[UNK_TOKEN] = UNK_STR
    print('Done')

    print('Generating ready-to-train data')
    train_features = torch.tensor(
        pad_sentences(idx_sentences(train_tokenized, word2idx)))
    train_labels = torch.tensor([label for _, label in train_data])
    # test_features = torch.tensor(pad_sentences(idx_sentences(test_tokenized, word2idx)))
    # test_labels = torch.tensor([label for _, label in test_data])
    print('Done')

    embed_size = 100
    weight = torch.zeros(vocab_size + 1, embed_size)
    for i in range(len(word2vec.index2word)):
        try:
            index = word2idx[word2vec.index2word[i]]
        except KeyError:
            continue
        weight[index, :] = torch.from_numpy(
            word2vec.get_vector(idx2word[index]))

    n_epochs = 5
    embed_size = 100
    hidden_size = 100
    n_layers = 2
    bidirectional = True
    batch_size = 512
    n_labels = 2
    learning_rate = 0.5

    model = SentimentNet(embed_size=embed_size,
                         hidden_size=hidden_size,
                         n_layers=n_layers,
                         bidirectional=bidirectional,
                         weight=weight,
                         n_labels=n_labels)
    if os.path.exists(MODEL_PATH):
        model = torch.load(MODEL_PATH)
    model = model.to(DEVICE)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    train_set = TensorDataset(train_features, train_labels)
    # test_set = TensorDataset(test_features, test_labels)
    train_iter = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # test_iter = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    print('Start training...')
    for epoch in range(n_epochs):
        # start = time.time()
        # train_loss, test_less = 0, 0
        # train_acc, test_acc = 0, 0
        n, m = 0, 0
        for feature, label in train_iter:
            n += 1
            model.zero_grad()
            feature = feature.to(DEVICE)
            label = label.to(DEVICE)
            pred = model(feature)
            loss = loss_func(pred, label)
            print('Train step: %d, loss: %.3f' % (n, loss.item()))
            loss.backward()
            optimizer.step()

    torch.save(model, MODEL_PATH)
Example #19
'''
This module contains functions to do a semantic search on Wikipedia
'''

from pprint import pprint

if __name__ == '__main__':
    print('Wikipedia Semantic Search')

    import gensim.downloader as api

    # print(api.load("fake-news", return_path=True))

    pprint(api.info())
Example #20
 def get_model_info(self,
                    name: str = "glove-twitter-25"
                    ) -> None:  # // for pre-made
     "Print some info about a model."
     print(api.info(name))
Example #21
 def get_model_info(self, name="glove-twitter-25"):  # // for pre-made
     api.info(name)
Example #22
    def set_embedding(self):
        info = api.info()  # show info about available models/datasets
        self.model = api.load("glove-wiki-gigaword-50")

        self.embedding_weights = np.vstack(
            [np.zeros(self.model.vectors.shape[1]), self.model.vectors])
Example #23
 def models():
     # model names from https://github.com/RaRe-Technologies/gensim-data
     info = api.info()  # show info about available models/datasets
     return info
Example #24
 def get_available_embeddings(self):
     return (list(api.info()["models"].keys()))
Example #25
import gensim.downloader as api
print("importing stuff")
info = api.info()  # show info about available models/datasets
model = api.load("glove-twitter-100"
                 )  # download the model and return as object ready for use
output = model.most_similar(["sales", "orders"])
print(output)

#from gensim.models.word2vec import Word2Vec
#import gensim.downloader as api

#corpus = api.load('text8')  # download the corpus and return it opened as an iterable
#model = Word2Vec(corpus)  # train a model from the corpus
#model.most_similar("car"
Example #26
import logging
import gensim.downloader as api

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
info = api.info()

for model_name, model_data in sorted(info['models'].items()):
    print('%s (%d records): %s' % (
        model_name,
        model_data.get('num_records', -1),
        model_data['description'][:40] + '...',
    ))
Example #27
 def load_model(self):
     # if the reference isn't in the possible models, FileNotFoundError is raised
     if self.reference in downloader.info()['models']:
         return downloader.load(self.reference)
     else:
         raise FileNotFoundError
Example #28
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

# gensim
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

import gensim.downloader as api

api.info("text8")  # retrieve information about text8 dataset
# which returns metadata along the lines of:
# {'checksum': '68799af40b6bda07dfa47a32612e5364',
#  'description': 'Cleaned small sample from wikipedia',
#  'file_name': 'text8.gz',
#  'parts': 1,
#  'source': 'http://mattmahoney.net/dc/text8.zip'}

api.info()

def cleantxt(txt):
  
    # collecting english stop words from nltk-library
    stpw = stopwords.words('english')
    
    # Adding custom stop-words
    stpw.extend(['www','http','utc'])
Example #29
def load_gensim_embedding_model(model_name):
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(
        model_name, ', '.join(available_models))
    return gensim_data_downloader.load(model_name)
Example #30
import gensim
from gensim import corpora
import gensim.downloader as api

# Get information about the model or dataset
api.info('text8')

# text8 is a corpus consisting of the first 100,000,000 bytes of plain text from Wikipedia
# Download the text8 corpus
dataset = api.load("text8")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Build the bigram models
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Construct bigram
print(bigram[dataset[0]])

# After generating bigrams, we can train a new Phrases model on the bigrammed corpus to detect trigrams
# Build the trigram models
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Construct trigram
print(trigram[bigram[dataset[0]]])