def test_ngram_oov_subword_stability(self):
    """OOV subword vectors must be identical across two independent
    model instances and stable over repeated queries."""
    self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                   case_insensitive=True, eager=False)
    self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                   case_insensitive=True, eager=False)
    # Query each OOV key several times; both instances must always agree.
    for _ in range(5):
        for length in range(1, 8):
            key = "*" + "<" * length
            self.assertTrue(
                isclose(self.vectors_oov_1.query(key),
                        self.vectors_oov_2.query(key)).all())
    self.vectors_oov_1.close()
    self.vectors_oov_2.close()
def get_word_vector(word):
    """Return the Magnitude embedding vector for *word*.

    The model (English or Vietnamese, selected via the LANGUAGE environment
    variable, defaulting to English) is loaded lazily into the module-level
    ``model`` on first use and reused afterwards.

    Removed: a large block of commented-out dead code from the earlier
    fasttext-based implementation.
    """
    global model
    if model is None:
        from pymagnitude import Magnitude
        if os.environ.get('LANGUAGE', 'en').lower() == 'en':
            print('Loading English word vectors')
            model = Magnitude('data/cc.en.300.magnitude', language='en',
                              lazy_loading=20000)
        else:
            print('Loading Vietnamese word vectors')
            model = Magnitude('data/cc.vi.300.magnitude', language='vi',
                              lazy_loading=20000)
        print('Loading completed')
    return model.query(word)
def test_oov_subword_values(self):
    """Spot-check first vector components for OOV words with ngram_oov
    disabled, on two independent model instances."""
    expected = [
        ("discriminatoryy", -0.059116619334669426),
        ("*<", -0.0759614511397),
        ("*<<", 0.00742723997271),
        ("uberx", 0.0952671681336),
        ("misssipi", 0.0577835297955),
    ]
    self.vectors_oov_1 = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH, case_insensitive=True,
        ngram_oov=False, eager=False)
    self.vectors_oov_2 = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH, case_insensitive=True,
        ngram_oov=False, eager=False)
    for vectors in (self.vectors_oov_1, self.vectors_oov_2):
        for word, value in expected:
            self.assertTrue(isclose(vectors.query(word)[0], value))
    self.vectors_oov_1.close()
    self.vectors_oov_2.close()
def get_simlex_and_metrics():
    """Load the Polish MSimLex999 dataset together with Euclidean and
    cosine similarity metrics backed by the NKJP+wiki embeddings."""
    embeddings_path = '../data/nkjp+wiki-lemmas-restricted-300-skipg-ns.magnitude'
    simlex_data = load_simlex_data('../data/MSimLex999_Polish.txt')
    # Each metric gets its own Magnitude handle over the same file.
    euklidean_metric = EuclideanMetric(Magnitude(embeddings_path))
    cosine_metric = CosineMetric(Magnitude(embeddings_path))
    return simlex_data, euklidean_metric, cosine_metric
def setUp(self):
    """Build the Magnitude fixtures used across the test cases and cache
    query vectors for a handful of common words."""
    self.vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                             case_insensitive=True, eager=True)
    self.vectors_cs = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                case_insensitive=False, eager=False)
    self.vectors_sw = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                case_insensitive=True, eager=False)
    self.vectors_approx = Magnitude(MagnitudeTest.MAGNITUDE_APPROX_PATH,
                                    case_insensitive=True, eager=False)
    self.tmp_vectors = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                 case_insensitive=True, eager=False)
    self.concat_1 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                              case_insensitive=True, eager=False)
    self.concat_2 = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                              case_insensitive=True, eager=False)
    self.concat = Magnitude(self.concat_1, self.concat_2)
    self.vectors_feat = FeaturizerMagnitude(100, case_insensitive=True)
    # Pre-computed vectors reused by the individual tests.
    words = ('I', 'saw', 'a', 'cat', 'He', 'went', 'to', 'the', 'mall',
             'blah123')
    self.v = {word: self.tmp_vectors.query(word) for word in words}
    self.v['padding'] = self.tmp_vectors._padding_vector()
def __init__(self, embeddings_source=EMBEDDINGS,
             out_embeddings_source=OUT_EMBEDDINGS,
             formulas_source=FORMULAS,
             phraser_source=PHRASER):
    """
    :param embeddings_source: a Magnitude instance, or a url/path to a
        serialized Magnitude object (hidden-layer embeddings W)
    :param out_embeddings_source: a Magnitude instance, or a url/path to a
        serialized Magnitude object (output-layer embeddings O)
    :param formulas_source: url or path to a JSON-serialized dict of
        formulae; a default file is loaded when not supplied
    """
    self.embeddings = Magnitude(embeddings_source, eager=False)  # W
    self.out_embeddings = Magnitude(out_embeddings_source)       # O
    # Pre-trained formula statistics shipped alongside the embeddings.
    with open(formulas_source, 'r') as f:
        self.formulas_with_abbreviations = load(f)
    self.dp = DataPreparation(local=False)
    self.es = ElasticConnection()
    # Formula dictionary with known abbreviations removed.
    self.formulas = {
        formula: counts
        for formula, counts in self.formulas_with_abbreviations.items()
        if formula not in self.ABBR_LIST
    }
    # Total occurrence count per root formula.
    self.formula_counts = {
        root: sum(counts.values())
        for root, counts in self.formulas.items()
    }
    # Canonical surface form per formula group: the group name itself for
    # chemical elements, otherwise the most frequent variant.
    self.most_common_forms = {
        group: (group if group in self.dp.ELEMENTS
                else max(variants.items(), key=operator.itemgetter(1))[0])
        for group, variants in self.formulas_with_abbreviations.items()
    }
    self.phraser = Phraser.load(phraser_source)
def glove_via_magnitude(topn=500, min_similarity=None,
                        filename='glove.6B.100d.magnitude', lang='en_US'):
    """Build a similar-words training set for ambiguous units using GloVe.

    For every ambiguous unit, collect its surface forms, query the GloVe
    Magnitude model for each surface's nearest neighbours, and dump the
    resulting examples to ``train/similars.json`` under the language dir.

    :param topn: number of nearest neighbours to request per surface
    :param min_similarity: optional similarity cutoff passed to Magnitude
    :param filename: Magnitude file (relative to TOPDIR) to load
    :param lang: language directory to write the training set into
    """
    from pymagnitude import Magnitude
    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = list()
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)
    for unit in units:
        print('Processing {}...'.format(unit.name))
        name = unit.name
        # BUG FIX: set(unit.name) built a set of the name's *characters*;
        # the unit name itself must be one of the queried surfaces.
        surfaces = {unit.name}
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(v.query(surface), topn=topn,
                                        min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')
    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
def __init__(self, emdim):
    """Load concatenated GloVe + fastText embeddings of total dimension *emdim*.

    The fastText part is fixed at 300 dimensions, so *emdim* must be one of
    350, 400, 500 or 600 (GloVe part of 50, 100, 200 or 300). Magnitude
    files are downloaded on demand into ``data/magnitude``.

    Fix: corrected the typo "avaialble" in the user-facing message.
    """
    base_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'data')
    self.fasttext_dim = 300
    self.glove_dim = emdim - 300
    assert self.glove_dim in [50, 100, 200, 300], \
        "Embedding dimension must be one of the following: 350, 400, 500, 600"
    print("Will download magnitude files from the server if they aren't "
          "available locally.. So, grab a cup of coffee while the "
          "downloading is under progress..")
    glove = Magnitude(
        MagnitudeUtils.download_model(
            'glove/medium/glove.6B.{}d'.format(self.glove_dim),
            download_dir=os.path.join(base_dir, 'magnitude')),
        case_insensitive=True)
    fasttext = Magnitude(
        MagnitudeUtils.download_model(
            'fasttext/medium/wiki-news-300d-1M-subword',
            download_dir=os.path.join(base_dir, 'magnitude')),
        case_insensitive=True)
    # Concatenated embedding space: emdim = glove_dim + 300.
    self.vectors = Magnitude(glove, fasttext)
def load_data(data_dir='./data'):
    """Loads all data in `data_dir` as a dict

    Each of `dev`, `train` and `test` contains (1) a `raw` folder and
    (2) `relations.json`. Only `relations.json` matters for the
    classification task; the `raw` folder is ignored.

    Args:
        data_dir: str, the root directory of all data

    Returns:
        dict mapping each folder name (`dev`, `train`, `test`) to the list
        of relations loaded from its `relations.json`

    Changes: removed large blocks of commented-out dead code; stray
    non-directory entries in `data_dir` are now skipped instead of being
    passed to `load_relations`.
    """
    assert os.path.exists(data_dir), "`data_dir` does not exist in `load_data`"
    data = {}
    vectors = Magnitude("glove.6B.50d.magnitude")
    # Build the sense dictionary from the training split before loading.
    get_sense_dict(os.path.join(data_dir, "train"))
    for folder in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder)
        if not os.path.isdir(folder_path):
            continue  # skip stray files at the top level
        print("Loading", folder)
        data[folder] = load_relations(folder_path, vectors)
    return data
def get_nearest_words():
    """
    provides words closely related to the keywords

    Parameters:
    keywords -- comma-separated words closely related to the concept

    Returns:
    closest_words -- these are displayed on the right panel of the
    concept screen

    Testing:
    http://localhost:3001/api/get_nearest_words?keywords=lunch,slice,pie,pasta

    Changes: removed commented-out alternative model URLs (dead code),
    split the keywords once instead of twice, and dropped a redundant
    list() wrapper.
    """
    keywords = request.args.get('keywords', '')
    from pymagnitude import Magnitude
    vectors = Magnitude('./pretrained_features/glove.6B.50d.magnitude')
    keyword_list = keywords.split(',')
    # A set de-duplicates neighbours shared by closely related keywords.
    closest_words = set()
    for k in keyword_list:
        # Most similar by key; each result is a (word, similarity) pair.
        for r in vectors.most_similar(k, topn=10):
            closest_words.add(r[0])
    # Don't echo the query terms back to the caller.
    closest_words = closest_words - set(keyword_list)
    return json.dumps(list(closest_words))
def predict(chain, embedding=False, interpolation=False):
    """Rank candidate verbs for the next event in *chain*.

    Scores every verb in the global ``verbs`` list against each event in
    the chain using PMI, embedding similarity, or an ALPHA-weighted
    interpolation of both, then returns candidates sorted by descending
    score with verbs already present in the chain filtered out.

    :param chain: sequence of events; each event is a (verb, ...) tuple
    :param embedding: score by word-vector similarity only
    :param interpolation: mix similarity and PMI with weight ALPHA
    :return: list of (verb, score) pairs, best first
    """
    if embedding or interpolation:
        vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')
    scores = dict()
    for verb in verbs:
        score = 0
        for event in chain:
            if embedding:
                score += vectors.similarity(event[0], verb)
            elif interpolation:
                score += (ALPHA * vectors.similarity(event[0], verb) +
                          (1 - ALPHA) * pmi(event, (verb, None, None)))
            else:
                score += pmi(event, (verb, None, None))
        scores[verb] = score
    # BUG FIX: previously whole event *tuples* were added to chain_verbs,
    # so `candidate not in chain_verbs` (candidate is a verb string) was
    # always true and chain verbs were never filtered from the results.
    chain_verbs = set()
    for event in chain:
        chain_verbs.add(event[0])
    cleaned_scores = {candidate: value
                      for candidate, value in scores.items()
                      if candidate not in chain_verbs}
    return sorted(cleaned_scores.items(), key=lambda x: x[1], reverse=True)
def extract_wordvec_generalization(word, path_to_word_vectors, neighbor_number):
    """Return the word's *neighbor_number*-th nearest neighbour in the
    vector space loaded from *path_to_word_vectors*."""
    vectors = Magnitude(path_to_word_vectors)
    neighbours = vectors.most_similar(word, topn=neighbor_number)
    # most_similar is 0-indexed; pick the last (n-th) neighbour's word.
    return neighbours[neighbor_number - 1][0]
def test_list(self):
    """query() returns plain Python lists when use_numpy is disabled."""
    self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True, use_numpy=False,
                                  eager=False)
    result = self.vectors_list.query("cat")
    self.assertTrue(isinstance(result, list))
    self.vectors_list.close()
def create_magnitude(case_insensitive=True, eager=False, **kwargs):
    """Construct a Magnitude over MAGNITUDE_PATH with the given options."""
    return Magnitude(MAGNITUDE_PATH,
                     case_insensitive=case_insensitive,
                     eager=eager,
                     **kwargs)
def test_lang_none_oov_stem(self):
    """With language=None, OOV stemming must leave words untouched."""
    self.vectors_l = Magnitude(MagnitudeTest.MAGNITUDE_PATH, language=None)
    for word in ('rejumping', 'reuberificationing'):
        self.assertEqual(self.vectors_l._oov_stem(word), word)
    self.vectors_l.close()
def create_vocab_tensors(input_vocab_index):
    """Build a GloVe embedding matrix for the model's vocabulary to speed
    up lookups at runtime. Also used in ESIM.

    Returns a (vocab_size, dim) float64 tensor plus the list of words that
    were not found in the GloVe model.
    """
    print('Creating vocabulary tensors...')
    # GloVe vectors served through the Magnitude package.
    model = Magnitude(config.glove_magnitude_path)
    np.random.seed(config.SEED)
    # Random-normal init; rows for in-vocabulary words are overwritten.
    vocab_tensors = np.random.normal(
        0, 1, (input_vocab_index.n_words, model.dim)).astype('float32')
    vocab_words = list(input_vocab_index.word2index.keys())
    unk_words = []
    for position, token in enumerate(vocab_words):
        if token in model:
            vocab_tensors[position] = model.query(token)
        else:
            unk_words.append(token)
    # Special tokens get a dedicated uniform initialisation instead.
    special_tokens = ['SOS', 'EOS', 'UNK']
    vocab_tensors[:len(special_tokens), :] = np.random.uniform(
        -0.1, 0.1, (len(special_tokens), model.dim)).astype('float32')
    print('Tensor vocabulary complete.')
    print(' Total vocabulary size {}, {} UNK words ({:.2}%)'.format(
        len(vocab_words), len(unk_words),
        (len(unk_words) / len(vocab_words)) * 100))
    return torch.tensor(vocab_tensors, dtype=torch.float64), unk_words
def load(self, path, blocking):
    """Open the Magnitude vector model at *path*.

    Raises IOError (ENOENT) when no usable local file path is given.
    For a training run (no embeddings yet), pass blocking=True so the
    vectors are fully loaded before returning.
    """
    # A readable local model file is required up front.
    missing = not path or not os.path.isfile(path)
    if missing:
        raise IOError(ENOENT, "Vector model file not found", path)
    return Magnitude(path, case_insensitive=True, blocking=blocking)
def embedding_task(self, tweets):
    """Embedding task: attach a GloVe text embedding to every tweet in the
    batch, persist the batch, and return it."""
    logging.info(f"DATAFRAME PASSED FOR EMBEDDING SIZE {len(tweets)} ")
    vectors = Magnitude("glove_vectors/glove.6B.100d.magnitude")
    for item in tweets:
        item.embedding = get_text_embedding(vectors, item.text)
    # Write the enriched tweets back before returning them.
    mongo_task(tweets)
    return tweets
def get_magnitude(self, magnitude):
    """Coerce *magnitude* to a pymagnitude.Magnitude instance.

    Accepts either a path/URL string (opened via Magnitude) or an already
    constructed Magnitude object, which is returned as-is.

    :raises ValueError: for any other argument type

    Fix: replaced ``type(x) == T`` comparisons with ``isinstance`` so
    subclasses of str / Magnitude are accepted too.
    """
    if isinstance(magnitude, str):
        magnitude = Magnitude(magnitude)
    elif isinstance(magnitude, pymagnitude.Magnitude):
        pass
    else:
        raise ValueError("magnitude type not recognized")
    return magnitude
def __init__(self, magnitude_path=None):
    """
    :param str magnitude_path: Path to a .pymagnitude embeddings file.
    """
    # Remember the source path; embeddings stay unset when no path is given.
    self.database = magnitude_path
    if magnitude_path is not None:
        self.embeddings = Magnitude(magnitude_path)
def load(self, path):
    """Open the Magnitude vector model at *path*.

    Raises IOError (ENOENT) unless *path* names an existing file. While
    the config is still uninitialized (a training run) the load blocks
    until the vectors are fully available.
    """
    # Fail fast when no usable model file is present.
    missing = not path or not os.path.isfile(path)
    if missing:
        raise IOError(ENOENT, "Vector model file not found", path)
    return Magnitude(path, case_insensitive=True,
                     blocking=not self.initialized)
def test_oov_subword_dim_placeholders(self):
    """Placeholders widen OOV subword vectors to the same shape as known
    words while keeping the base components stable."""
    self.vectors_placeholders = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH, placeholders=5,
        case_insensitive=True, eager=False)
    word = "*<<<<"
    self.assertEqual(self.vectors_placeholders.query(word).shape,
                     self.vectors_placeholders.query("cat").shape)
    # First component must match the no-placeholder model.
    self.assertTrue(isclose(self.vectors.query(word)[0],
                            self.vectors_placeholders.query(word)[0]))
    self.vectors_placeholders.close()
def test_placeholders(self):
    """placeholders=5 pads query vectors from 300 to 305 dimensions
    without changing the real components."""
    self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                          case_insensitive=True,
                                          placeholders=5, eager=False)
    padded = self.vectors_placeholders.query("cat")
    self.assertEqual(padded.shape, (305, ))
    # Leading components are unchanged; only placeholder zeros are added.
    self.assertEqual(padded[0], self.vectors.query("cat")[0])
    self.vectors_placeholders.close()
def create_magnitude(case_insensitive=True, eager=False, **kwargs):
    """Construct a Magnitude over MAGNITUDE_PATH.

    Remote (http/https) paths are streamed rather than downloaded, and
    streamed runs also enable logging.

    Fix: replaced the redundant ``True if ... else False`` with the
    boolean expression itself.
    """
    stream = 'http://' in MAGNITUDE_PATH or 'https://' in MAGNITUDE_PATH
    log = stream
    return Magnitude(MAGNITUDE_PATH,
                     case_insensitive=case_insensitive,
                     eager=eager,
                     stream=stream,
                     log=log,
                     **kwargs)
def init_model(model_type):
    """
    Function to initialize the pre-trained word embedding model
    :return: (model, model_type) tuple; exits with status 1 for an
        unknown model_type
    """
    if model_type == 'magnitude':
        model = Magnitude('../model/crawl-300d-2M.magnitude')
        return model, model_type
    if model_type == 'gensim':
        model = KeyedVectors.load('../model/pre_trained_word2vec_embeddings.bin')
        return model, model_type
    print("Invalid model type.")
    sys.exit(1)
def put_embeddings(self, rSubmission: RedditSubmission):
    """Compute mean word2vec embeddings for a Reddit post's title and
    (when present) body text, storing them on the submission."""
    word_vectors = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    title_tokens = rSubmission.post_title.split()
    rSubmission.post_title_embedding = np.mean(
        word_vectors.query(title_tokens), axis=0)
    # Self/text posts additionally get a body embedding.
    if len(rSubmission.post_text) > 0:
        body_tokens = rSubmission.post_text.split()
        rSubmission.post_text_embedding = np.mean(
            word_vectors.query(body_tokens), axis=0)
    return rSubmission
def test_oov_values(self):
    """Spot-check first vector components for OOV keys with ngram_oov
    disabled, on two independent model instances."""
    expected = [
        ("*<", -0.0759614511397),
        ("*<<", 0.00742723997271),
        ("*<<<<", -0.0372075283555),
        ("*<<<<<", -0.0201727917272),
        ("*<<<<<<", -0.0475993225776),
        ("*<<<<<<<", 0.0129938352266),
    ]
    self.vectors_oov_1 = Magnitude(
        MagnitudeTest.MAGNITUDE_PATH, case_insensitive=True,
        ngram_oov=False, eager=False)
    self.vectors_oov_2 = Magnitude(
        MagnitudeTest.MAGNITUDE_PATH, case_insensitive=True,
        ngram_oov=False, eager=False)
    for vectors in (self.vectors_oov_1, self.vectors_oov_2):
        for word, value in expected:
            self.assertTrue(isclose(vectors.query(word)[0], value))
    self.vectors_oov_1.close()
    self.vectors_oov_2.close()
def test_embedtext_creation():
    # End-to-end check that EmbedText builds a vocabulary and an embedding
    # matrix consistent with the raw GloVe vectors.
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)
    # Dummy benchmark/collection fixtures provide a tiny known corpus.
    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})
    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection
    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)
    # Wire the extractor's module dependencies before building.
    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer
    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())
    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])
    # Expected vocabulary from the dummy corpus, plus the padding token.
    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world",
        "from", "outer", "space", "<pad>"
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}
    # Vocabulary must match exactly (order-insensitive).
    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())
    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)
    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            # The padding row must be (near) all zeros.
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        # Every other row must reproduce the GloVe vector for its token.
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5
    return extractor
def embedding_similarities(
        self, t1, terms2,
        emb_path="http://magnitude.plasticity.ai/fasttext/medium/wiki-news-300d-1M-subword.magnitude",
        emb_stream=True):
    """Similarity between *t1* and *terms2* using fastText word vectors.

    Magnitude models are cached on the instance keyed by emb_path, so
    repeated calls do not reopen the (possibly streamed) model.
    """
    from pymagnitude import Magnitude
    # Lazily create the per-instance model cache.
    if getattr(self, '_wv', None) is None:
        self._wv = {}
    if emb_path not in self._wv:
        self._wv[emb_path] = Magnitude(emb_path, stream=emb_stream)
    return self._wv[emb_path].similarity(t1, terms2)
def test_list_multiple(self):
    """List-mode queries for sentences and batches return lists whose
    values agree with the numpy-mode model."""
    self.vectors_list = Magnitude(
        MagnitudeTest.MAGNITUDE_PATH, case_insensitive=True,
        use_numpy=False, eager=False)
    batch = [["I", "saw", "a", "cat"],
             ["He", "went", "to", "the", "mall"]]
    # Check both a single sentence and the full batch.
    for query in (batch[0], batch):
        list_result = self.vectors_list.query(query)
        self.assertTrue(isinstance(list_result, list))
        self.assertTrue(isclose(self.vectors.query(query),
                                asarray(list_result)).all())
    self.vectors_list.close()