def test_oov_subword_values(self):
    self.vectors_oov_1 = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
        case_insensitive=True, ngram_oov=False, eager=False)
    self.vectors_oov_2 = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
        case_insensitive=True, ngram_oov=False, eager=False)

    self.assertTrue(isclose(self.vectors_oov_1.query("discriminatoryy")[0],
                            -0.059116619334669426))
    self.assertTrue(isclose(self.vectors_oov_1.query("*<")[0],
                            -0.0759614511397))
    self.assertTrue(isclose(self.vectors_oov_1.query("*<<")[0],
                            0.00742723997271))
    self.assertTrue(isclose(self.vectors_oov_1.query("uberx")[0],
                            0.0952671681336))
    self.assertTrue(isclose(self.vectors_oov_1.query("misssipi")[0],
                            0.0577835297955))
    self.assertTrue(isclose(self.vectors_oov_2.query("discriminatoryy")[0],
                            -0.059116619334669426))
    self.assertTrue(isclose(self.vectors_oov_2.query("*<")[0],
                            -0.0759614511397))
    self.assertTrue(isclose(self.vectors_oov_2.query("*<<")[0],
                            0.00742723997271))
    self.assertTrue(isclose(self.vectors_oov_2.query("uberx")[0],
                            0.0952671681336))
    self.assertTrue(isclose(self.vectors_oov_2.query("misssipi")[0],
                            0.0577835297955))
    self.vectors_oov_1.close()
    self.vectors_oov_2.close()
def extract_wordvec_generalization(word, path_to_word_vectors, neighbor_number):
    '''Extracts the neighbor_number-th nearest neighbor of `word` from the vector space.'''
    vectors = Magnitude(path_to_word_vectors)
    generalized_attribute = vectors.most_similar(
        word, topn=neighbor_number)[neighbor_number - 1][0]
    return generalized_attribute
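# Minimal usage sketch for the helper above; the .magnitude path and the query
# word are placeholders, not files or data shipped with this code.
path = 'vectors/glove.6B.100d.magnitude'
# Second-nearest neighbor of "doctor" in the embedding space.
print(extract_wordvec_generalization('doctor', path, 2))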
def glove_via_magnitude(topn=500,
                        min_similarity=None,
                        filename='glove.6B.100d.magnitude',
                        lang='en_US'):
    from pymagnitude import Magnitude

    v = Magnitude(os.path.join(TOPDIR, filename))
    training_set = list()
    units = set()
    for unit_list in classifier.ambiguous_units():
        for unit in unit_list[1]:
            units.add(unit)

    for unit in units:
        print('Processing {}...'.format(unit.name))
        name = unit.name
        surfaces = set(unit.name)
        if isinstance(unit, classes.Unit):
            surfaces.update(unit.surfaces)
            surfaces.update(unit.symbols)
        for surface in surfaces:
            neighbours = v.most_similar(
                v.query(surface), topn=topn, min_similarity=min_similarity)
            training_set.append({
                'unit': name,
                'text': ' '.join(neighbour[0] for neighbour in neighbours)
            })
    print('Done')

    with language.topdir(lang).joinpath('train/similars.json').open(
            'w', encoding='utf-8') as file:
        json.dump(training_set, file, sort_keys=True, indent=4)
def create_vocab_tensors(input_vocab_index):
    """Creates a matrix of the GloVe embeddings for terms contained in the
    model, to improve runtime. Also used in ESIM."""
    print('Creating vocabulary tensors...')

    # Define GloVe model from the Magnitude package
    model = Magnitude(config.glove_magnitude_path)

    np.random.seed(config.SEED)
    # Randomly initialize the matrix
    vocab_tensors = np.random.normal(
        0, 1, (input_vocab_index.n_words, model.dim)).astype('float32')

    vocab_words = list(input_vocab_index.word2index.keys())
    unk_words = []

    # Get the vector for each word in the vocabulary, if it is in the model
    for idx, word in enumerate(vocab_words):
        if word in model:
            vocab_tensors[idx] = model.query(word)
        else:
            unk_words.append(word)

    # Override the special tokens with small random values
    special_tokens = ['SOS', 'EOS', 'UNK']
    vocab_tensors[:len(special_tokens), :] = np.random.uniform(
        -0.1, 0.1, (len(special_tokens), model.dim)).astype('float32')

    print('Tensor vocabulary complete.')
    print('    Total vocabulary size {}, {} UNK words ({:.2}%)'.format(
        len(vocab_words), len(unk_words),
        (len(unk_words) / len(vocab_words)) * 100))

    return torch.tensor(vocab_tensors, dtype=torch.float64), unk_words
def test_lang_none_oov_stem(self):
    self.vectors_l = Magnitude(MagnitudeTest.MAGNITUDE_PATH, language=None)
    self.assertEqual(self.vectors_l._oov_stem('rejumping'), 'rejumping')
    self.assertEqual(
        self.vectors_l._oov_stem('reuberificationing'),
        'reuberificationing')
    self.vectors_l.close()
def test_ngram_oov_subword_stability(self):
    self.vectors_oov_1 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                   case_insensitive=True, eager=False)
    self.vectors_oov_2 = Magnitude(MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
                                   case_insensitive=True, eager=False)

    for i in range(5):
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<"),
                    self.vectors_oov_2.query("*<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<"),
                    self.vectors_oov_2.query("*<<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<"),
                    self.vectors_oov_2.query("*<<<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<"),
                    self.vectors_oov_2.query("*<<<<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<"),
                    self.vectors_oov_2.query("*<<<<<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<"),
                    self.vectors_oov_2.query("*<<<<<<")).all())
        self.assertTrue(
            isclose(self.vectors_oov_1.query("*<<<<<<<"),
                    self.vectors_oov_2.query("*<<<<<<<")).all())

    self.vectors_oov_1.close()
    self.vectors_oov_2.close()
def predict(chain, embedding=False, interpolation=False):
    if embedding or interpolation:
        vectors = Magnitude('GoogleNews-vectors-negative300.magnitude')

    scores = dict()
    for verb in verbs:
        score = 0
        for event in chain:
            if embedding:
                score += vectors.similarity(event[0], verb)
            elif interpolation:
                score += (ALPHA * vectors.similarity(event[0], verb) +
                          (1 - ALPHA) * pmi(event, (verb, None, None)))
            else:
                score += pmi(event, (verb, None, None))
        scores[verb] = score

    # Drop candidates whose verb already occurs in the chain
    # (the verb is the first element of each event tuple).
    cleaned_scores = dict()
    chain_verbs = set()
    for event in chain:
        chain_verbs.add(event[0])
    for candidate in scores:
        if candidate not in chain_verbs:
            cleaned_scores[candidate] = scores[candidate]

    ranked_scores = sorted(list(cleaned_scores.items()),
                           key=lambda x: x[1], reverse=True)
    return ranked_scores
def get_nearest_words():
    """
    Provides words closely related to the keywords.

    Parameters:
        keywords -- an array of words closely related to the concept

    Returns:
        closest_words -- these are displayed on the right panel of the concept screen

    Testing:
        http://localhost:3001/api/get_nearest_words?keywords=lunch,slice,pie,pasta
    """
    keywords = request.args.get('keywords', '')

    from pymagnitude import Magnitude
    # vectors = Magnitude('http://magnitude.plasticity.ai/word2vec/heavy/GoogleNews-vectors-negative300.magnitude', stream=True)  # full url for streaming from the 10GB model
    # vectors = Magnitude('http://magnitude.plasticity.ai/glove/light/glove.6B.50d.magnitude', stream=True)
    vectors = Magnitude('./pretrained_features/glove.6B.50d.magnitude')

    # there is likely overlap if the concept's words are closely related
    closest_words = set()
    for k in keywords.split(','):
        results = vectors.most_similar(k, topn=10)  # most similar by key
        # vectors.most_similar(vectors.query(k), topn=100)  # most similar by vector
        for r in results:
            # just add the word, not the word's similarity score
            closest_words.add(r[0])

    closest_words = closest_words - set(list(keywords.split(',')))
    return json.dumps(list(closest_words))
def get_word_vector(word):
    global model
    if model is None:
        # import fasttext
        # if os.environ.get('LANGUAGE', 'en').lower() == 'en':
        #     print('Loading English word vectors')
        #     model = fasttext.load_model('data/cc.en.300.bin')
        # else:
        #     print('Loading Vietnamese word vectors')
        #     model = fasttext.load_model('data/cc.vi.300.bin')
        # return model.get_word_vector(word.replace(' ', '_'))
        from pymagnitude import Magnitude
        if os.environ.get('LANGUAGE', 'en').lower() == 'en':
            print('Loading English word vectors')
            model = Magnitude('data/cc.en.300.magnitude',
                              language='en', lazy_loading=20000)
        else:
            print('Loading Vietnamese word vectors')
            model = Magnitude('data/cc.vi.300.magnitude',
                              language='vi', lazy_loading=20000)
        print('Loading completed')
    return model.query(word)
def test_list(self):
    self.vectors_list = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                  case_insensitive=True, use_numpy=False,
                                  eager=False)
    self.assertTrue(isinstance(self.vectors_list.query("cat"), list))
    self.vectors_list.close()
def __init__(self, magnitude_path=None):
    """
    :param str magnitude_path: Path to a .magnitude embeddings file.
    """
    self.database = magnitude_path
    if self.database is not None:
        self.embeddings = Magnitude(self.database)
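# Usage sketch for the wrapper above; the class name EmbeddingStore and the
# file path are placeholders for whatever class this __init__ belongs to.
store = EmbeddingStore(magnitude_path='vectors/glove.6B.100d.magnitude')
vec = store.embeddings.query('cat')  # numpy array of length store.embeddings.dim
print(vec.shape)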
def test_oov_subword_dim_placeholders(self):
    self.vectors_placeholders = Magnitude(
        MagnitudeTest.MAGNITUDE_SUBWORD_PATH,
        placeholders=5, case_insensitive=True, eager=False)
    self.assertEqual(self.vectors_placeholders.query("*<<<<").shape,
                     self.vectors_placeholders.query("cat").shape)
    self.assertTrue(isclose(self.vectors.query("*<<<<")[0],
                            self.vectors_placeholders.query("*<<<<")[0]))
    self.vectors_placeholders.close()
def get_simlex_and_metrics():
    simlex_data = load_simlex_data('../data/MSimLex999_Polish.txt')
    euklidean_metric = EuclideanMetric(
        Magnitude(
            '../data/nkjp+wiki-lemmas-restricted-300-skipg-ns.magnitude'))
    cosine_metric = CosineMetric(
        Magnitude(
            '../data/nkjp+wiki-lemmas-restricted-300-skipg-ns.magnitude'))
    return simlex_data, euklidean_metric, cosine_metric
def test_placeholders(self):
    self.vectors_placeholders = Magnitude(MagnitudeTest.MAGNITUDE_PATH,
                                          case_insensitive=True,
                                          placeholders=5, eager=False)
    self.assertEqual(self.vectors_placeholders.query("cat").shape, (305,))
    self.assertEqual(
        self.vectors_placeholders.query("cat")[0],
        self.vectors.query("cat")[0])
    self.vectors_placeholders.close()
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)
    return rSubmission
def test_embedtext_creation():
    extractor_cfg = {
        "_name": "embedtext",
        "index": "anserini",
        "tokenizer": "anserini",
        "embeddings": "glove6b",
        "zerounk": True,
        "calcidf": True,
        "maxqlen": MAXQLEN,
        "maxdoclen": MAXDOCLEN,
    }
    extractor = EmbedText(extractor_cfg)

    benchmark = DummyBenchmark({"_fold": "s1", "rundocsonly": False})
    collection = DummyCollection({"_name": "dummy"})

    index_cfg = {"_name": "anserini", "indexstops": False, "stemmer": "porter"}
    index = AnseriniIndex(index_cfg)
    index.modules["collection"] = collection

    tok_cfg = {"_name": "anserini", "keepstops": True, "stemmer": "none"}
    tokenizer = AnseriniTokenizer(tok_cfg)

    extractor.modules["index"] = index
    extractor.modules["tokenizer"] = tokenizer

    qids = list(benchmark.qrels.keys())  # ["301"]
    qid = qids[0]
    docids = list(benchmark.qrels[qid].keys())

    extractor.create(qids, docids, benchmark.topics[benchmark.query_type])

    expected_vocabs = [
        "lessdummy", "dummy", "doc", "hello", "greetings", "world",
        "from", "outer", "space", "<pad>",
    ]
    expected_stoi = {s: i for i, s in enumerate(expected_vocabs)}

    assert set(extractor.stoi.keys()) == set(expected_stoi.keys())

    emb_path = "glove/light/glove.6B.300d"
    fullemb = Magnitude(MagnitudeUtils.download_model(emb_path))
    assert extractor.embeddings.shape == (len(expected_vocabs), fullemb.dim)

    for i in range(extractor.embeddings.shape[0]):
        if i == extractor.pad:
            assert extractor.embeddings[i].sum() < 1e-5
            continue
        s = extractor.itos[i]
        assert (extractor.embeddings[i] - fullemb.query(s)).sum() < 1e-5

    return extractor
def test_list_multiple(self):
    self.vectors_list = Magnitude(
        MagnitudeTest.MAGNITUDE_PATH,
        case_insensitive=True, use_numpy=False, eager=False)
    q = [["I", "saw", "a", "cat"],
         ["He", "went", "to", "the", "mall"]]
    self.assertTrue(isinstance(self.vectors_list.query(q[0]), list))
    self.assertTrue(isclose(self.vectors.query(q[0]),
                            asarray(self.vectors_list.query(q[0]))).all())
    self.assertTrue(isinstance(self.vectors_list.query(q), list))
    self.assertTrue(isclose(self.vectors.query(q),
                            asarray(self.vectors_list.query(q))).all())
    self.vectors_list.close()
def __init__(self, embeddings_source=EMBEDDINGS,
             out_embeddings_source=OUT_EMBEDDINGS,
             formulas_source=FORMULAS,
             phraser_source=PHRASER):
    """
    :param embeddings_source: can be an instance of a Magnitude object, or a
        url or path to a serialized Magnitude object
    :param out_embeddings_source: can be an instance of a Magnitude object,
        or a url or path to a serialized Magnitude object
    :param formulas_source: can be a url or path to a JSON-serialized dict of
        formulae; if not supplied, a default file is loaded
    """
    # hidden layer embeddings (W)
    self.embeddings = Magnitude(embeddings_source, eager=False)
    # output layer embeddings (O)
    self.out_embeddings = Magnitude(out_embeddings_source)

    # load pre-trained formulas from embeddings
    with open(formulas_source, 'r') as f:
        self.formulas_with_abbreviations = load(f)

    self.dp = DataPreparation(local=False)
    self.es = ElasticConnection()

    self.formulas = {
        k: v
        for k, v in self.formulas_with_abbreviations.items()
        if k not in self.ABBR_LIST
    }
    self.formula_counts = {
        root_formula: sum(formulas.values())
        for root_formula, formulas in self.formulas.items()
    }
    self.most_common_forms = {
        formula_group_name:
        (formula_group_name if formula_group_name in self.dp.ELEMENTS
         else max(formulae.items(), key=operator.itemgetter(1))[0])
        for formula_group_name, formulae in
        self.formulas_with_abbreviations.items()
    }
    self.phraser = Phraser.load(phraser_source)
def read_magnitude_vectors(magnitude_filepath, vocab_filepath, vocab_size,
                           dim, special_tokens=[UNK]):
    """Read word vectors from a *.magnitude file.

    Args:
        magnitude_filepath (str): magnitude file path
        vocab_filepath (str): vocabulary file path
        vocab_size (int): maximum vocab size (including special tokens)
        dim (int): dimension of the word vectors to load
        special_tokens (list[str])

    Returns:
        words (list[str]): list of length vocab_size
        embeddings (np.array): (vocab_size, dim)
    """
    logging.info('Loading word vectors from %s', magnitude_filepath)

    words = [x for x in special_tokens]
    word_set = set()
    with open(vocab_filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            word = line.strip().split("\t")[0]
            if word in word_set:
                logging.warning(
                    "token must be unique. non-unique token='{}'".format(word))
            elif len(word) > 0:
                word_set.add(word)
                words.append(word)
            if len(words) == vocab_size:
                break

    magnitude = Magnitude(magnitude_filepath, case_insensitive=True,
                          normalized=True)
    vectors = magnitude.query(words[len(special_tokens):])

    # special (random, unit-norm) vectors for UNK and other special tokens
    special_vectors = np.random.normal(size=(len(special_tokens), dim))
    special_vectors /= np.linalg.norm(special_vectors, ord=2, axis=1,
                                      keepdims=True)

    # Concatenate
    vectors = np.vstack([special_vectors, vectors]).astype('float32')
    assert vectors.shape[0] == len(words)
    assert vectors.shape[1] == dim

    logging.info('Loaded %d word vectors; shape = %s',
                 len(words), str(vectors.shape))
    return words, vectors
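# Sketch of how the loader above might be called; the paths, vocab size,
# dimension, and the UNK token value are placeholders.
UNK = '<unk>'
words, embeddings = read_magnitude_vectors(
    magnitude_filepath='data/glove.6B.100d.magnitude',
    vocab_filepath='data/vocab.tsv',
    vocab_size=50000,
    dim=100,
    special_tokens=[UNK])
assert embeddings.shape == (len(words), 100)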
def _build_matrix(self, tokenizer):
    vector = Magnitude('vectors/glove.6B.50d.magnitude')
    GLOVE_VECTOR_DIMENSION = 50
    MAX_NUM_WORDS = 300

    word_index = tokenizer.word_index
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, GLOVE_VECTOR_DIMENSION))
    for word, i in tqdm(word_index.items()):
        if i > MAX_NUM_WORDS:
            continue
        embedding_vector = vector.query(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
def __init__(self, embedding_name):
    """
    If the _is_initialized class property is not set, build the benchmark
    and model (expensive). Else, do nothing.
    """
    self.embedding_name = embedding_name
    self.embedding = Magnitude(
        MagnitudeUtils.download_model(
            self.SUPPORTED_EMBEDDINGS[embedding_name],
            download_dir=os.environ.get("CAPREOLUS_CACHE",
                                        get_default_cache_dir())
        ),
        lazy_loading=-1,
        blocking=True,
    )
    # string to integer: associates an integer value with every token
    self.stoi = {self.PAD: 0}
    self.itos = {0: self.PAD}
class MagnitudeFactory(EmbeddingFactory):

    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)
        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)

        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                ' Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)
        self._embed_file = compressed_file

        logger.info(' Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str],
              **kwargs) -> (List[str], List[str], Dict[str, Dict[str, Any]]):
        oov, iov = [], []
        vec_dict = {}
        for w in vocab_list:
            is_oov = w not in self._magnitude_vecs
            vec = self._magnitude_vecs.query(w)
            vec_dict[w] = {"vec": vec, "trainable": is_oov}
            if is_oov:
                oov.append(w)
            else:
                iov.append(w)
        return oov, iov, vec_dict
def load_data(data_dir='./data'):
    """Loads all data in `data_dir` as a dict.

    Each of `dev`, `train` and `test` contains (1) a `raw` folder and (2)
    `relations.json`. We don't need to worry about the `raw` folder, and
    instead focus on `relations.json`, which contains all the information we
    need for our classification task.

    Args:
        data_dir: str, the root directory of all data

    Returns:
        dict, where the keys are `dev`, `train` and `test` and the values are
        lists of relations data in `relations.json`
    """
    assert os.path.exists(data_dir), "`data_dir` does not exist in `load_data`"
    data = {}

    vectors = Magnitude("glove.6B.50d.magnitude")
    # vectors = Magnitude("glove.6B.300d.magnitude")

    get_sense_dict(os.path.join(data_dir, "train"))
    # print(sense_dict)

    for folder in os.listdir(data_dir):
        # print(folder)
        print("Loading", folder)
        folder_path = os.path.join(data_dir, folder)
        # print(folder_path)
        data[folder] = load_relations(folder_path, vectors)

    '''
    print("Loading", "dev")
    folder_path = os.path.join(data_dir, "dev")
    data["dev"] = load_relations(folder_path, vectors)
    '''
    return data
class GloveEncoder():
    """Encodes an input sentence as a mean or max pooled sentence embedding
    given the individual word embeddings."""

    def __init__(self, pooling='mean'):
        self.name = 'GloveEncoder'
        self.trainable_model = False
        self.pooling = pooling
        self.model = Magnitude(config.glove_magnitude_path)
        self.hidden_size = self.model.dim

    def sentence_embedding(self, input_text):
        words_in_model = [
            word for word in input_text.split() if word in self.model
        ]
        sentence_embedding = np.zeros((len(words_in_model), self.model.dim))
        sentence_embedding.fill(np.nan)
        for idx, token in enumerate(words_in_model):
            sentence_embedding[idx] = self.model.query(token)
        if self.pooling == 'max':
            sentence_embedding = np.max(sentence_embedding, axis=0)
        else:
            sentence_embedding = np.mean(sentence_embedding, axis=0)
        return torch.tensor(sentence_embedding.reshape(1, 1, -1),
                            device=DEVICE)
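# Short usage sketch for the encoder above; it assumes config.glove_magnitude_path
# already points at a valid .magnitude file and DEVICE is a torch device.
encoder = GloveEncoder(pooling='mean')
emb = encoder.sentence_embedding('the cat sat on the mat')
print(emb.shape)  # torch.Size([1, 1, encoder.hidden_size])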
def load(self, path, blocking):
    # Require that the vector path exists. If the path is missing or not
    # found, raise here; otherwise Magnitude would try to download the model
    # from its servers.
    if not path or not os.path.isfile(path):
        raise IOError(ENOENT, "Vector model file not found", path)

    # Load the magnitude model. If this is a training run (no embeddings
    # yet), block until the vectors are fully loaded.
    return Magnitude(path, case_insensitive=True, blocking=blocking)
class MagnitudeFactory(EmbeddingFactory):

    def __init__(self, embedding_type: EmbeddingType):
        super().__init__(embedding_type)
        cache_dir = Path(fs.get_project_root_dir()) / ".magnitude"
        fs.mkdir_if_not_exists(cache_dir)

        embed_file = self._embedding_type.url[
            self._embedding_type.url.rfind("/") + 1:]
        compressed_file = Path(cache_dir) / embed_file
        if not compressed_file.exists():
            logger.info(
                ' Downloading magnitude file ("{}")...'.format(embed_file))
            wget.download(self._embedding_type.url, compressed_file)
        self._embed_file = compressed_file

        logger.info(' Loading Magnitude module...')
        self._magnitude_vecs = Magnitude(self._embed_file)

    def build(self, vocab_list: List[str], h5_file: Path,
              **kwargs) -> (List[str], List[str]):
        oov, iov = [], []
        with h5py.File(h5_file, mode="w") as vec_h5:
            for w in vocab_list:
                is_oov = w not in self._magnitude_vecs
                vec = self._magnitude_vecs.query(w)
                vec_h5.create_dataset("{key}/vec".format(key=w), data=vec)
                vec_h5.create_dataset("{key}/trainable".format(key=w),
                                      data=1 if is_oov else 0)
                if is_oov:
                    oov.append(w)
                else:
                    iov.append(w)
        return oov, iov
def __init__(self, emdim):
    base_dir = os.path.join(os.path.dirname(__file__), os.pardir, 'data')

    self.fasttext_dim = 300
    self.glove_dim = emdim - 300
    assert self.glove_dim in [50, 100, 200, 300], \
        "Embedding dimension must be one of the following: 350, 400, 500, 600"

    print("Will download magnitude files from the server if they aren't "
          "available locally.. So, grab a cup of coffee while the downloading "
          "is in progress..")

    glove = Magnitude(
        MagnitudeUtils.download_model(
            'glove/medium/glove.6B.{}d'.format(self.glove_dim),
            download_dir=os.path.join(base_dir, 'magnitude')),
        case_insensitive=True)
    fasttext = Magnitude(
        MagnitudeUtils.download_model(
            'fasttext/medium/wiki-news-300d-1M-subword',
            download_dir=os.path.join(base_dir, 'magnitude')),
        case_insensitive=True)

    # Passing two Magnitude objects concatenates the GloVe and fastText vectors
    self.vectors = Magnitude(glove, fasttext)
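# Hypothetical usage of the concatenated vectors built above; the class name
# MagnitudeVectors and emdim=400 are placeholders. Queries on the combined
# object return vectors whose length is the sum of the two models' dimensions.
mv = MagnitudeVectors(emdim=400)   # 100d GloVe + 300d fastText
vec = mv.vectors.query('magnitude')
print(vec.shape)                   # (400,)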
def put_embeddings(self, rSubmission: RedditSubmission):
    vecs = Magnitude('word2vec/light/GoogleNews-vectors-negative300')
    # vecs = Magnitude('http://magnitude.plasticity.ai/word2vec/light/GoogleNews-vectors-negative300.magnitude')
    rSubmission.post_title_embedding = np.mean(
        vecs.query(rSubmission.post_title.split()), axis=0)
    if len(rSubmission.post_text) > 0:
        rSubmission.post_text_embedding = np.mean(
            vecs.query(rSubmission.post_text.split()), axis=0)
    logger.info('Embedded submission: %s', rSubmission.post_title)
    return rSubmission
def create_magnitude(case_insensitive=True, eager=False, **kwargs):
    vectors = Magnitude(
        MAGNITUDE_PATH,
        case_insensitive=case_insensitive,
        eager=eager,
        **kwargs)
    return vectors
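# Brief sketch of using the factory above, assuming MAGNITUDE_PATH points at an
# existing .magnitude file; `normalized` is just one example of a keyword
# argument forwarded to Magnitude.
vectors = create_magnitude(case_insensitive=False, normalized=True)
print(vectors.dim, len(vectors))
vectors.close()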