Code Example #1
File: test_api.py Project: abs51295/gensim
 def test_load_dataset(self):
     dataset_path = os.path.join(base_dir, "__testing_matrix-synopsis", "__testing_matrix-synopsis.gz")
     if os.path.isdir(base_dir):
         shutil.rmtree(base_dir)
     self.assertEqual(api.load("__testing_matrix-synopsis", return_path=True), dataset_path)
     shutil.rmtree(base_dir)
     self.assertEqual(len(list(api.load("__testing_matrix-synopsis"))), 1)
     shutil.rmtree(base_dir)
Code Example #2
File: test_api.py Project: abs51295/gensim
 def test_multipart_load(self):
     dataset_path = os.path.join(
         base_dir, '__testing_multipart-matrix-synopsis', '__testing_multipart-matrix-synopsis.gz'
     )
     if os.path.isdir(base_dir):
         shutil.rmtree(base_dir)
     self.assertEqual(dataset_path, api.load("__testing_multipart-matrix-synopsis", return_path=True))
     shutil.rmtree(base_dir)
     dataset = api.load("__testing_multipart-matrix-synopsis")
     self.assertEqual(len(list(dataset)), 1)
Code Example #3
File: baseline.py Project: DeNeutoy/crosswords
def main(clues_path: str):

    count = Counter()
    total = 0
    multi_word = 0
    #model = gensim_models.load("glove-wiki-gigaword-100")
    model = gensim_models.load("word2vec-google-news-300")
    for clue_json in clues_iterator(clues_path):
        
        if clue_json["separatorLocations"]:
            multi_word += 1
            continue
        clue = clue_json["clue"]
        length = int(clue_json["length"])
        solution = clue_json["solution"]

        results = process_clue(clue, length, solution, model)

        count.update(results)
        total += 1
        print(total)


    for k, v in count.items():

        print(k, ":\t", v/total)


    print("multi word: ", multi_word/ (multi_word + total))
Code Example #4
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(
            self.mallet_path, corpus=corpus,
            num_topics=20, id2word=dictionary, iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
Code Example #5
File: test_api.py Project: abs51295/gensim
 def test_load_model(self):
     if os.path.isdir(base_dir):
         shutil.rmtree(base_dir)
     vector_dead = np.array([
         0.17403787, -0.10167074, -0.00950371, -0.10367849, -0.14034484,
         -0.08751217, 0.10030612, 0.07677923, -0.32563496, 0.01929072,
         0.20521086, -0.1617067, 0.00475458, 0.21956187, -0.08783089,
         -0.05937332, 0.26528183, -0.06771874, -0.12369668, 0.12020949,
         0.28731, 0.36735833, 0.28051138, -0.10407482, 0.2496888,
         -0.19372769, -0.28719661, 0.11989869, -0.00393865, -0.2431484,
         0.02725661, -0.20421691, 0.0328669, -0.26947051, -0.08068217,
         -0.10245913, 0.1170633, 0.16583319, 0.1183883, -0.11217165,
         0.1261425, -0.0319365, -0.15787181, 0.03753783, 0.14748634,
         0.00414471, -0.02296237, 0.18336892, -0.23840059, 0.17924534
     ])
     dataset_path = os.path.join(
         base_dir, "__testing_word2vec-matrix-synopsis", "__testing_word2vec-matrix-synopsis.gz"
     )
     model = api.load("__testing_word2vec-matrix-synopsis")
     vector_dead_calc = model["dead"]
     self.assertTrue(np.allclose(vector_dead, vector_dead_calc))
     shutil.rmtree(base_dir)
     self.assertEqual(api.load("__testing_word2vec-matrix-synopsis", return_path=True), dataset_path)
     shutil.rmtree(base_dir)
Code Example #6
man_df.Description = man_df.Description.apply(lambda x: remove_empty(x))
man_df.Description = man_df.Description.apply(
    lambda x: list(map(remove_stopwords, x)))
man_df.Description = man_df.Description.apply(lambda x: flatten(x))
man_df.Description = man_df.Description.apply(lambda x: remove_non_english(x))
# man_df.Description = man_df.Description.apply(lambda x: reduce_lemma(x))

# Dictionary with the Code and Description -> Dictionary<Code, Description>
man_desc_dict = build_dict(man_df)

# Loading the model is very time consuming, and the CSV only needs to be recomputed when something changes
if not os.path.exists(MATCH_DIR + METHOD):
    print("Downloading model")

    # Pretrained Word2Vec model
    model = downloader.load('word2vec-google-news-300')

    # Normalize vectors to unit length (L2 norm)
    model.init_sims(replace=True)

    sims_dict = {}

    print("Computing distance")

    for cap_code, cap_desc in cap_desc_dict.items():

        # List of Tuples -> Tuple(MAN_CODE, DISTANCE)
        sims = []

        for man_code, man_desc in man_desc_dict.items():
Code Example #7
#!/usr/bin/python3
'''
This file calculates pagerank vectors for small-scale webgraphs.
'''

import math
import torch
import gzip
import csv
from collections import defaultdict

import logging
import gensim.downloader as api

model = api.load("glove-twitter-25")


class WebGraph():
    def __init__(self, filename, max_nnz=None, filter_ratio=None):

        self.url_dict = {}
        indices = []

        target_counts = defaultdict(int)

        # loop through filename to extract the indices
        logging.debug('computing indices')
        with gzip.open(filename, newline='', mode='rt') as f:
            for i, row in enumerate(csv.DictReader(f)):
                if max_nnz is not None and i > max_nnz:
                    break
Code Example #8
def labelizeTweets(tweets, label_type):
    labelized = []
    for i, v in tqdm(enumerate(tweets)):
        label = '%s_%s' % (label_type, i)
        labelized.append(TaggedDocument(v, [label]))
    return labelized


X_train = labelizeTweets(X_train, 'TRAIN')
X_test = labelizeTweets(X_test, 'TEST')

# tweet_w2v = Word2Vec(size=n_dim, min_count=1)
# tweet_w2v.build_vocab([x.words for x in tqdm(X_train)])
# tweet_w2v.train([x.words for x in tqdm(X_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs)
tweet_w2v = api.load("glove-twitter-200")  # api.load returns a KeyedVectors object directly
print("pre-trained vocab size: ", len(tweet_w2v.vocab))

# print(tweet_w2v.wv.vocab)
# print("VIRUS EMBEDDING: ", tweet_w2v['virus'])
# print("Virus most similar words: ", tweet_w2v.wv.most_similar('virus'))

#### Tweet embeddings
print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=3)
# matrix = vectorizer.fit_transform([x.words for x in X_train])
matrix = vectorizer.fit_transform(tweet_w2v.vocab)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

Code Example #9
import os
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from gensim.models import KeyedVectors
import gensim.downloader as api

nltk.download('wordnet')

# fixes weird OMP mac bug
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

gensim_emb_name = "glove-wiki-gigaword-50" # 'glove-twitter-25' 
gensim_fn = './data/{}.vk'.format(gensim_emb_name)

# we only want to do this once
if os.path.exists(gensim_fn):
    gensim_model = KeyedVectors.load(gensim_fn, mmap='r')  # mmap makes loading significantly faster (lazy loading)
else:
    gensim_model = api.load(gensim_emb_name)
    gensim_model.save(gensim_fn)

gensim_embedder = Word2VecEmbedder(model=gensim_model)

def write_lexicon(words, fn):
    defs = []
    for word in words:
        synsets = wn.synsets(word)
        if len(synsets) > 0:
            word = synsets[0]
            defs.append(word.definition())   
        else:
            defs.append(None)
    lexicon = pd.DataFrame({"Word": words, 'Definition': defs})
    lexicon.to_csv(fn, index=False)
Code Example #10
File: word2vec-api.py Project: tum-esi/wade
def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    #----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = bool(args.binary)
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print(
            "Usage: word2vec-apy.py --model path/to/the/model [--host host --port 1234]"
        )
    model = modelDownloader.load("word2vec-google-news-300")
    model.init_sims(replace=True)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(WMDistance, path + '/wmd')
    app.run(host=host, port=port)
Code Example #11
File: jointsrmfnegs.py Project: ghazalehnt/RecBole
    def __init__(self, config, dataset):
        super(JOINTSRMFNEGS, self).__init__(config, dataset)

        # load dataset info
        self.LABEL = config['LABEL_FIELD']

        self.embedding_dim = config['embedding_dimension']
        self.alpha = config["alpha"]
        item_description_fields = config['item_description_fields']

        LM_neg_samples = config["LM_neg_samples"]

        self.logger.info(f"embedding_dimension = {self.embedding_dim}")
        self.logger.info(f"alpha = {self.alpha}")
        self.logger.info(
            f"item_description_fields = {item_description_fields}")
        self.logger.info(f"LM_neg_samples = {LM_neg_samples}")

        self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
        self.user_bias = nn.Parameter(torch.zeros(self.n_users))
        self.item_bias = nn.Parameter(torch.zeros(self.n_items))
        self.bias = nn.Parameter(torch.zeros(1))
        self.apply(self._init_weights)

        gensim_cache = open('gensim_cache_path', 'r').read().strip()
        os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
        # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
        pretrained_embedding_name = "glove-wiki-gigaword-50"  # the vector size must be 50 to match the embedding dimension
        model_path = api.load(pretrained_embedding_name, return_path=True)
        model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
        self.vocab_size = len(model.key_to_index)
        weights = torch.FloatTensor(
            model.vectors)  # formerly syn0, which is soon deprecated
        self.logger.info(f"pretrained_embedding shape: {weights.shape}")
        self.word_embedding = nn.Embedding.from_pretrained(weights,
                                                           freeze=True)

        # This is not possible because of the memory it would require:
        # noise_dist = {}  # This is the noise distribution!
        # self.lm_gt = torch.zeros((self.n_items, len(model.key_to_index)), device=self.device)
        # item_LM_file = os.path.join(dataset.dataset.dataset_path, f"{dataset.dataset.dataset_name}.item")
        # item_desc_fields = []
        # if "item_description" in item_description_fields:
        #     item_desc_fields.append(3)
        # if "item_genres" in item_description_fields:
        #     item_desc_fields.append(4)
        # #TODO other fields? e.g. review? have to write another piece of code
        # with open(item_LM_file, 'r') as infile:
        #     next(infile)
        #     for line in infile:
        #         split = line.split("\t")
        #         item_id = dataset.token2id("item_id", split[0])
        #         for fi in item_desc_fields:
        #             desc = split[fi]
        #             for term in desc.split():
        #                 if term in model.key_to_index:
        #                     wv_term_index = model.key_to_index[term]
        #                 else:
        #                     wv_term_index = model.key_to_index["unk"]
        #                 self.lm_gt[item_id][wv_term_index] += 1
        #                 if wv_term_index not in noise_dist:
        #                     noise_dist[wv_term_index] = 0
        #                 noise_dist[wv_term_index] += 1
        # self.logger.info(f"Done with lm_gt construction!")

        noise_dist = {}
        self.lm_gt_keys = [[] for i in range(self.n_items)]
        self.lm_gt_values = [[] for i in range(self.n_items)]
        item_LM_file = os.path.join(dataset.dataset.dataset_path,
                                    f"{dataset.dataset.dataset_name}.item")
        item_desc_fields = []
        if "item_description" in item_description_fields:
            item_desc_fields.append(3)
        if "item_genres" in item_description_fields:
            item_desc_fields.append(4)
        # TODO other fields? e.g. review? have to write another piece of code
        with open(item_LM_file, 'r') as infile:
            next(infile)
            for line in infile:
                split = line.split("\t")
                item_id = dataset.token2id_exists("item_id", split[0])
                if item_id == -1:
                    continue
                for fi in item_desc_fields:
                    desc = split[fi]
                    for term in desc.split():
                        if term in model.key_to_index:
                            wv_term_index = model.key_to_index[term]
                            if wv_term_index not in self.lm_gt_keys[item_id]:
                                self.lm_gt_keys[item_id].append(wv_term_index)
                                self.lm_gt_values[item_id].append(1)
                            else:
                                idx = self.lm_gt_keys[item_id].index(
                                    wv_term_index)
                                self.lm_gt_values[item_id][idx] += 1
                            if wv_term_index not in noise_dist:
                                noise_dist[wv_term_index] = 0
                            noise_dist[wv_term_index] += 1
        self.logger.info(f"Done with lm_gt construction!")

        # keys_sum = 0
        # zeros = 0
        # max_len = 0
        # for lm in self.lm_gt_keys:
        #     if len(lm) == 0:
        #         zeros += 1
        #     else:
        #         keys_sum += len(lm)
        #         if len(lm) > max_len:
        #             max_len = len(lm)
        # print(keys_sum)
        # print(zeros)
        # print(keys_sum / (self.n_items - zeros))
        # print(max_len)
        # print(len(max(self.lm_gt_keys)))
        # exit(1)

        self.sigmoid = nn.Sigmoid()
        self.loss_rec = nn.BCELoss()
        self.loss_lm = SoftCrossEntropyLossByNegSampling(
            LM_neg_samples, noise_dist, 0.75,
            self.device)  # dist to the power of 3/4
Code Example #12
import gensim.downloader as api
print("downloading")
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')
print("downloaded")
import numpy as np
import os


def get_vects(questions):
    list_of_vectors = []
    model = dict()
    #iterate through the questions and make them a list of words
    for i in range(len(questions)):
        #print(questions[i],"\n") #print each question
        list_of_words = questions[i].split(" ")
        list_of_vectors = []
        #iterate through each word and get the vector
        if len(list_of_words) > 0:
            for j in list_of_words:
                # api.load returns a KeyedVectors object, so query it directly
                if j in fasttext_model300:
                    list_of_vectors.append(fasttext_model300[j])
            two_dim = np.vstack(tuple(list_of_vectors))
            #get mean across cols
            """filename = "Documents/WT2/w2voutput.txt"
            if os.path.exists(filename):
                append = 'a'
            else:
                append = 'w'
            with open(filename, append) as fp:
                fp.write(str(questions[i])+str(" ")+str(np.mean(two_dim, axis = 0))+str("\n"))"""
            model[questions[i]] = np.mean(two_dim, axis=0)
Code Example #13
                sentiment_terms.append([])
    dataset['sentiment_terms'] = sentiment_terms

    return dataset

trainset = read(filename='TP2/data/traindata.csv')
devset = read(filename='TP2/data/devdata.csv')

le = LabelEncoder()
le.fit(trainset['sentiment'])
trainset['sentiment_label'] = le.transform(trainset['sentiment'])
devset['sentiment_label'] = le.transform(devset['sentiment'])


import gensim.downloader as api
twitter_model = api.load('glove-twitter-50')

words = []
from itertools import islice
with open('TP2/resources/negative-words.txt') as fin:
    for line in islice(fin, 36, 4818):
        words.append(line[:-1])

with open('TP2/resources/positive-words.txt') as fin:
    for line in islice(fin, 36, 2041):
        words.append(line[:-1])

vec_corpus = []
for i in range(len(trainset)):
    sentiment_terms = trainset.iloc[i, :]['sentiment_terms']
    for term in sentiment_terms:
Code Example #14
from datetime import datetime

import numpy as np
import pandas as pd
import gensim.downloader as api

# SETTINGS
AFFECT_NORMS_PATH = '../Norms/AffectiveNorms/BRM-emot-submit.csv'
ASSOCIATION_NORMS_PATH = '../Norms/AssociationNorms/association_matrix.csv'
MATERIAL_PATH = '../Materials/'
ITERATION_N = 10000  # For random list creation
TRAINED_DATA = "word2vec-google-news-300"  # word2vec model

affect_df = pd.read_csv(AFFECT_NORMS_PATH)
print('LOADED affect norms')

# Loading pretrained word2vec model takes time.
# Comment out the two lines below when you don't use word2vec data.
word_vectors = api.load(TRAINED_DATA)
print('LOADED word2vec vectors')

association_df = pd.read_csv(ASSOCIATION_NORMS_PATH, index_col=0)
print('LOADED association norms')

cues_in_norms = set(association_df.index)


def cos_sim(v1, v2):
    """
    Calculate cosine similarity.
    If vector's length == 0, it returns np.nan.
    """
    if (len(v1) == 0) or (len(v2) == 0):
        return np.nan
Code Example #15
def word2vec_padding(list_of_embeddings, length, embedding_length):
    zero_vec = np.zeros(embedding_length)
    for _ in range(length - len(list_of_embeddings)):
        list_of_embeddings.append(zero_vec)
    return list_of_embeddings[:length]


def word2vec_sum(list_of_embeddings, embedding_length):
    ret_value = np.zeros(embedding_length)
    for embedding in list_of_embeddings:
        ret_value += embedding
    return ret_value


word2vec_model = glove_api.load('glove-wiki-gigaword-50')
embedding_size = word2vec_model.vector_size
word2vec_parser = Word2Vec(
    type_func=lambda x: torch.FloatTensor(x).to(device).unsqueeze(0),
    word2vec_model=word2vec_model,
    return_func=lambda x: word2vec_padding(x, 65, embedding_size))

with open(os.getcwd() + '/data/zork_walkthrough_' + task + '.txt', 'rb') as f:
    data = pickle.load(f)

states = [word2vec_parser(state) for state in data['states']]
raw_actions = data['actions']
actions = []
bows = []

noise = MultivariateNormal(torch.zeros(50), torch.eye(50))
Code Example #16
File: add_cross.py Project: valntinaf/nltk-utils
import gensim.downloader as gensim
import numpy as np

#download/load model
model = gensim.load("glove-twitter-25")

v1 = model["flower"]
v2 = model["flowers"]
v3 = model["animal"]
v4 = model["cow"]
v5 = model["cat"]
print(np.dot(v1, v2))
print(np.dot(v3, v4))
print(np.dot(v3, v5))
print(np.dot(v4, v5))

nubes = model["cloud"]
negro = model["black"]
tormenta = model["storm"]
effect = tormenta

cause = np.add(nubes, negro)
print(cause)
print(effect)

print(len(cause))
print(len(effect))

# NOTE: np.cross is only defined for 2- or 3-dimensional vectors, so this call
# (and the one below) raises a ValueError for 25-dimensional GloVe embeddings.
relation = np.cross(cause.tolist(), effect.tolist())

event = np.add(cause, np.cross(cause, effect))
Code Example #17
for dataset in [data_training, data_test]:
    for index, row in dataset.iterrows():
        for answer in answer_cols:
            answer_to_numb = []
            for word in text_to_word_list(row[answer]):
                if word not in vocab:
                    vocab[word] = len(inverse_vocabulary)
                    answer_to_numb.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    answer_to_numb.append(vocab[word])

            # DataFrame.set_value was removed in newer pandas; .at is the replacement
            dataset.at[index, answer] = answer_to_numb

embed_size = 300
model = api.load("word2vec-google-news-300")  # download the model; api.load returns a KeyedVectors object
word_vectors = model
embedding_matrix = np.zeros((len(vocab) + 1, embed_size))

for word, i in vocab.items():
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embed_size)

del (word_vectors)

maxlen_1 = max(
    data_training.StudentAnswer.map(lambda x: len(x)).max(),
    data_training.ReferenceAnswers.map(lambda x: len(x)).max(),
Code Example #18
File: jointsrmffull.py Project: ghazalehnt/RecBole
    def __init__(self, config, dataset):
        super(JOINTSRMFFULL, self).__init__(config, dataset)
        # load dataset info
        self.LABEL = config['LABEL_FIELD']

        self.embedding_dim = config['embedding_dimension']
        self.alpha = config["alpha"]
        item_description_fields = config['item_description_fields']
        if "number_of_reviews_to_use" in config:
            max_number_of_reviews = config['number_of_reviews_to_use']
        else:
            max_number_of_reviews = 1
        self.variant = config["debug_variant"]

        self.logger.info(f"embedding_dimension = {self.embedding_dim}")
        self.logger.info(f"alpha = {self.alpha}")
        self.logger.info(
            f"item_description_fields = {item_description_fields}")

        self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
        self.item_embedding = nn.Embedding(self.n_items, self.embedding_dim)
        self.user_bias = nn.Parameter(torch.zeros(self.n_users))
        self.item_bias = nn.Parameter(torch.zeros(self.n_items))
        self.bias = nn.Parameter(torch.zeros(1))
        self.apply(self._init_weights)

        gensim_cache = open('gensim_cache_path', 'r').read().strip()
        os.environ['GENSIM_DATA_DIR'] = str(gensim_cache)
        # pretrained_embedding_name = "conceptnet-numberbatch-17-06-300"
        pretrained_embedding_name = "glove-wiki-gigaword-50"  # the vector size must be 50 to match the embedding dimension
        model_path = api.load(pretrained_embedding_name, return_path=True)
        model = gensim.models.KeyedVectors.load_word2vec_format(model_path)
        weights = torch.FloatTensor(
            model.vectors)  # formerly syn0, which is soon deprecated
        self.logger.info(f"pretrained_embedding shape: {weights.shape}")
        self.word_embedding = nn.Embedding.from_pretrained(weights,
                                                           freeze=True)
        self.vocab_size = len(model.key_to_index)

        s = time.time()
        self.lm_gt = torch.zeros((self.n_items, self.vocab_size),
                                 dtype=torch.uint8)
        self.lm_gt_len = torch.ones(self.n_items, dtype=torch.int16)
        item_desc_fields = []
        if "item_description" in item_description_fields:
            item_desc_fields.append(3)
        if "item_genres" in item_description_fields:
            item_desc_fields.append(4)
        if "tags" in item_description_fields:
            item_desc_fields.append(4)
        if len(item_desc_fields) > 0:
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.item")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    item_id = dataset.token2id_exists("item_id", split[0])
                    if item_id == -1:
                        continue
                    if item_id == 0:
                        print("Isnt that padding?")
                    for fi in item_desc_fields:
                        if fi >= len(split):
                            continue
                        desc = split[fi]
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                self.lm_gt[item_id][wv_term_index] += 1
                                self.lm_gt_len[item_id] += 1
        if "review" in item_description_fields:
            num_of_used_revs = {}
            item_desc_fields = [3]
            item_LM_file = os.path.join(
                dataset.dataset.dataset_path,
                f"{dataset.dataset.dataset_name}.inter")
            with open(item_LM_file, 'r') as infile:
                next(infile)
                for line in infile:
                    split = line.split("\t")
                    item_id = dataset.token2id_exists("item_id", split[1])
                    if item_id == -1:
                        continue
                    if item_id == 0:
                        print("Isnt that padding?")
                    if item_id not in num_of_used_revs:
                        num_of_used_revs[item_id] = 0
                    elif num_of_used_revs[item_id] >= max_number_of_reviews:
                        continue
                    for fi in item_desc_fields:
                        desc = split[fi]
                        if len(desc.split()) > 0:
                            num_of_used_revs[item_id] += 1
                        for term in desc.split():
                            if term in model.key_to_index:
                                wv_term_index = model.key_to_index[term]
                                self.lm_gt[item_id][wv_term_index] += 1
                                self.lm_gt_len[item_id] += 1
        self.lm_gt_len[(self.lm_gt_len == 0).nonzero(as_tuple=True)] = 1
        e = time.time()
        self.logger.info(f"{e - s}s")
        self.logger.info(f"Done with lm_gt construction!")

        self.sigmoid = nn.Sigmoid()
        self.loss_rec = nn.BCELoss()
        self.loss_lm = SoftCrossEntropyLoss()
Code Example #19
# We import Google's word2vec model. It contains over 3 million words.
# This import can take a while.

import gensim.downloader as api
print("Loading in the model. Please give the computer at least 2 minutes. \n")
wv = api.load('word2vec-google-news-300')
print("Finished loading in the model.\n")
Code Example #20
def load_model(val, key):
    model = gensim_api.load(gensim_model_name)
    cprint('MODEL LOADED', 'green')
    val[key] = model
Code Example #21
def load_gensim_embedding_model(model_name):
    available_models = gensim_data_downloader.info()['models'].keys()
    assert model_name in available_models, 'Invalid model_name: {}. Choose one from {}'.format(
        model_name, ', '.join(available_models))
    return gensim_data_downloader.load(model_name)
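
A hedged usage sketch of the helper above; the model name is just one valid gensim-data identifier:

word_vectors = load_gensim_embedding_model('glove-wiki-gigaword-50')
print(word_vectors.most_similar('computer', topn=3))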
Code Example #22
import gensim
from gensim import corpora
import gensim.downloader as api
from pprint import pprint

dataset = api.load("fake-news")
dataset = [wd for wd in dataset]

dct = corpora.Dictionary(dataset)
corpus = [dct.doc2bow(line) for line in dataset]

# Construct bigrams model
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Apply the bigram model to the first document
pprint(bigram[dataset[0]])

# Construct trigrams model
trigram = gensim.models.phrases.Phrases(bigram[dataset], threshold=10)

# Apply the trigram model to the first document
pprint(trigram[bigram[dataset[0]]])
Code Example #23
import gensim.downloader as api

word_vectors = api.load("glove-wiki-gigaword-100")

sim = word_vectors.similarity("silent", "singing")
print(sim)
Code Example #24
        google_embedding = google_utils.embedding_finder(word)
        wiki_embedding = wiki_utils.embedding_finder(word.lower())

        if google_embedding is not None:
            return google_scikit_wrapper.model_predictor(google_embedding, 'google')
        elif wiki_embedding is not None:
            return wiki_scikit_wrapper.model_predictor(wiki_embedding, 'wiki')
        elif random_string_utils.random_string_finder(word):
            return 'Random string'
        else:
            return 'Other'


if __name__ == '__main__':
    print("\nPlease wait the embeddings repository is loading...\n")
    embeddings_repository = models.KeyedVectors\
        .load_word2vec_format('../data/pre_trained_models/GoogleNews-vectors-negative300.bin.gz', binary=True)
    dates_repository = api.load("glove-wiki-gigaword-100")
    classifier = Classifier()
    while True:
        try:
            value = input("Please enter a string that you want to classify:\n")
            if value == 'no' or value == 'n':
                break
            print(f'You entered: {value}')
            print(f'The classifier predicted that the class of {value} is '
                  f'{classifier.run(value, embeddings_repository, dates_repository)}')
            print('\nIf you would like to stop type no or n\n')
        except (KeyboardInterrupt, SystemExit):
            raise
Code Example #25
import gensim
from gensim import downloader

text8 = downloader.load('text8')
model = gensim.models.word2vec.Word2Vec(text8, max_vocab_size=100000)

model.wv.save_word2vec_format("gensim.model")
Code Example #26
for i in range(len(target_vectors)):
  print(top_context_words[i],cos(flat(word_vector),flat(target_vectors[i])))

"""# TSNE"""

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
top_context_words.append(wrd)
target_vectors.append(h1) 

tsne = TSNE()
tv=[]
for i in target_vectors:
  tv.append(flat(i))
embed_tsne = tsne.fit_transform(tv)

fig, ax = plt.subplots(figsize=(5, 5))
for idx in range(len(target_vectors)):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(top_context_words[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)

"""# Gensim Comparison with word vector error calculation"""

from gensim.models import Word2Vec
import gensim.downloader as api
corpus = api.load('text8')
model = Word2Vec(corpus)

gensim_vec = model.wv[wrd]
print("The Mean Error between Gensim Embedding and My Embedding for The word",wrd)
print((np.square(gensim_vec - word_vector)).mean())
Code Example #27
 def load_file_of_vectors(self):
     self.__fasttext_model = api.load('fasttext-wiki-news-subwords-300')
Code Example #28
import re
import numpy as np
from scipy import spatial
import gensim.downloader as api

alphanum = re.compile(r"[\W_]+")
model = api.load("word2vec-google-news-300")
embedding_dim = 300


def split_composite(w):
    m = re.split(r", | & | and |\s", w)
    return set([alphanum.sub("", s.lower()) for s in m])


def _get_embedding(label_set):
    # use a NumPy vector (not a Python list) so that "+=" adds element-wise
    # instead of extending a list
    result = np.zeros(embedding_dim)
    num_words = len(label_set)
    for label in label_set:
        try:
            result += model[label]
        except KeyError:
            num_words -= 1
    if num_words == 0:
        return np.zeros(embedding_dim)
    return result / num_words


def get_class_vector(class_hier):
    f"""
    get_class_vector takes a string class label and returns a {embedding_dim} dimensional vector.
Code Example #29
from pythonosc import osc_message_builder
from pythonosc import osc_bundle_builder
from pythonosc import udp_client
from gensim.models import Word2Vec
import gensim.downloader as api
import gensim
import time
import subprocess
#send OSC messages to port 6448 on localhost
client = udp_client.SimpleUDPClient("127.0.0.1", 6448)

print("Sends to port 6448 with OSC message name /wek/inputs")

# Download model if necessary:
model_location = api.load("glove-twitter-25", return_path=True)

#Load model into variable:
print("Loading model", " ...")
wv_model = gensim.models.KeyedVectors.load_word2vec_format(model_location)


#function for importing text file as list of words
def read_words(words_file):
    return [word for line in open(words_file, 'r') for word in line.split()]


song_corpus = read_words("smashmouth.txt")
for word in song_corpus:
    print(word)
    subprocess.run(["say", word])
Code Example #30
en_model.most_similar("wood",topn=20)

### GloVe
As described [before](05representations.md), GloVe is another method for calculating word embeddings. Pre-trained GloVe vectors can be downloaded from
[GloVe](https://nlp.stanford.edu/projects/glove/) and imported into Python. However, gensim already provides a downloader for several word embeddings, including GloVe embeddings of different lengths trained on different data.

The corpora and embeddings available via the gensim downloader can be queried as follows:

import gensim.downloader as api

api.info(name_only=True)

We select the GloVe word embeddings `glove-wiki-gigaword-100` for download:

word_vectors = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors from gensim-data

type(word_vectors)

As can be seen in the previous output, the downloaded data is available as a `KeyedVectors` object. Hence the same methods can now be applied as in the case of the FastText word embedding in the previous section. In what follows we will apply not only the methods used above, but also new ones.

Word analogy questions like *man is to king as woman is to ?* can be solved as in the code cell below:

result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

Outliers within sets of words can be determined as follows:

print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))

Similarity between a pair of words:
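
A minimal sketch of how such a pair similarity can be computed with the `KeyedVectors` API; the word pair is an arbitrary choice:

word_vectors.similarity('coffee', 'tea')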
Code Example #31
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np
import csv
from nltk.stem import PorterStemmer
import sys
import gensim.downloader as api
import pickle
import enchant

enchant_dict = enchant.Dict("en_US")

model = api.load("glove-wiki-gigaword-50")

# glove_input_file = '../model/glove.6B.100d.txt'
# word2vec_output_file = 'glove.6B.100d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)
# filename = 'glove.6B.100d.txt.word2vec'
# model = KeyedVectors.load_word2vec_format(filename)

keywords = open("./keywords", mode='rb')
keywords_dict = pickle.load(keywords)

# print(keywords_dict)
ps = PorterStemmer()

movies_vector_dict = {}
for movie in keywords_dict.keys():
    vectors = []
    for word in keywords_dict[movie]:
        if type(word) is tuple:
Code Example #32
from re import sub
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk import word_tokenize
import gensim.downloader as ap
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity
import numpy as np
import ipfsapi

glove = ap.load("glove-wiki-gigaword-50")
api = ipfsapi.connect('127.0.0.1', 5001)
STOPWORDS = set(stopwords.words('english'))



class Reputation:

    

    def __init__(self,author_address,author_reputation,hashes,timestamp,m, T):

        self.cs = 1
        self.author_address = author_address
        self.author_reputation = author_reputation
        self.hashes = hashes
        self.timestamp = timestamp
        self.m = m
Code Example #33
import logging
import json
import gensim.downloader as api

MODEL = api.load('fasttext-wiki-news-subwords-300')

# info = api.info()
# logging.basicConfig(
#     format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# for model_name, model_data in sorted(info['models'].items()):
#     print(
#         '%s (%d records): %s' % (
#             model_name,
#             model_data.get('num_records', -1),
#             model_data['description'][:40] + '...',
#         )
#     )


def get_relavant_entities(query, topn, restrict_vocab=None):
    '''
    query: list of keywords
    topn: number of entities to return
    restrict_vocab: an optional integer which limits the range of vectors
    which are searched for most-similar values. For example,
    restrict_vocab=10000 would only check the first 10000 word vectors in the vocabulary order.
    (This may be meaningful if you've sorted the vocabulary by descending frequency.)
    '''
    # use the documented parameters instead of hard-coded test values
    return MODEL.most_similar(positive=query, topn=topn, restrict_vocab=restrict_vocab)

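A hedged usage sketch of the helper above; the keyword list reuses words from the original hard-coded call:

print(get_relavant_entities(['ireland', 'IBM'], topn=10))
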
Code Example #34
File: hwEmbeddings.py Project: singerm0/AI-things
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors
#%%
model = api.load('glove-wiki-gigaword-300')
model.save("E:\\py599\\wikiEmbedder.npz")
#%%
print("woman-->man == king-->? <take1>")
result = model.most_similar(positive=['woman','king'],negative=['man',])
print(result[0])
print("\n")
print("woman-->man == king-->? <take2>")
result = model.most_similar_cosmul(positive=['woman','king'],negative=['man',])
print(result[0])
print("\n")
print("paris-->france == london-->? <take3>")
result = model.most_similar_cosmul(positive=['paris','france'],negative=['london',])
print(result[0])
print("\n")
print("man-->men == woman-->? <take4>")
result = model.most_similar_cosmul(positive=['man','woman'],negative=['men',])
print(result[0])
print("\n")
print("similar to carolina? <take5>")
result = model.similar_by_word('carolina')
print(result[0])
print("\n")
print("similar to carolina? <take6>")
Code Example #35
# Hard coding context window size.
c = 2
neg_samples = 2 * c

# Check if cuda enabled
is_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Reading  dataset
dat_file = './data/cleaned_documents.csv'
dat = pd.read_csv(dat_file, converters={"text": ast.literal_eval})
print("Read cleaned doc, size = ", len(dat))

#  Build vocabulary
# If experiment 3 or 4 then partial vocabulary.
if len(sys.argv) > 1 and sys.argv[1] in ['3', '4']:
    gen_model = api.load('glove-wiki-gigaword-100')

E = 100  # size of embedding
words = list(itertools.chain.from_iterable(dat.text))

if len(sys.argv) > 1 and sys.argv[1] in ['3', '4']:
    words = [x for x in words if x in gen_model]

uni_freq = Counter(words)
words = set(words)
word2idx = {word: idx for idx, word in enumerate(words)}
idx2word = {idx: word for idx, word in enumerate(words)}
W = len(set(words))

print("Vocab size : ", W)
for each in uni_freq: