def test_damerau_levenshtein_distance_ndarray(self):
    assert damerau_levenshtein_distance_ndarray(
        'Saturday',
        np.array(['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday'])
    ).tolist() == [3, 5, 5, 6, 4, 5, 0]
    assert damerau_levenshtein_distance_ndarray(
        'Sjöstedt',
        np.array(['Sjöstedt', 'Sjostedt', 'Söstedt', 'Sjöedt'])
    ).tolist() == [0, 1, 1, 2]
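# A minimal standalone sketch of calling the ndarray helper outside the test
# suite, assuming pyxDamerauLevenshtein is installed and still exports
# damerau_levenshtein_distance_ndarray (later releases of the library moved
# to *_seqs variants that operate on plain lists, so the import is an
# assumption about the installed version, not a guarantee):
import numpy as np
from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray

days = np.array(['Sunday', 'Monday', 'Saturday'])
# One source string compared against every element of the array
print(damerau_levenshtein_distance_ndarray('Saturday', days))  # [3 5 0]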
def compact_word_vectors(self, vocab, filename=None, array=None, top=20000):
    """ Retrieve pretrained word vectors for our vocabulary.
    The returned word array has row indices corresponding to the
    compact index of a word, and columns corresponding to the word
    vector.

    Arguments
    ---------
    vocab : dict
        Dictionary where keys are the loose index, and values are
        the word string.
    filename : str
        Filename for the word2vec binary vectors loaded via gensim.
    array : numpy float array, optional
        Pre-allocated array to fill with word vectors instead of the
        randomly initialized one.
    top : int
        Only look up vectors for the `top` most frequent words.

    Returns
    -------
    data : numpy float array
        Array such that data[compact_index, :] = word_vector

    Examples
    --------
    >>> import numpy.linalg as nl
    >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
    >>> word_indices = np.zeros(50).astype('int32')
    >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
    >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
    >>> word_indices[40:46] = 7  # 'cold' appears 6 times
    >>> word_indices[46:] = 3  # 'hot' appears 4 times
    >>> corpus = Corpus()
    >>> corpus.update_word_count(word_indices)
    >>> corpus.finalize()
    >>> v, s, f = corpus.compact_word_vectors(vocab)
    >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
    >>> vocab[corpus.compact_to_loose[2]]
    'shuttle'
    >>> vocab[corpus.compact_to_loose[3]]
    'astronomy'
    >>> vocab[corpus.compact_to_loose[4]]
    'cold'
    >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
    >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
    >>> sim_shuttle_astro > sim_shuttle_cold
    True
    """
    n_words = len(self.compact_to_loose)
    from gensim.models.word2vec import Word2Vec
    model = Word2Vec.load_word2vec_format(filename, binary=True)
    n_dim = model.syn0.shape[1]
    # Randomly initialize, then shift and scale to roughly match the
    # moments of the pretrained vectors
    data = np.random.normal(size=(n_words, n_dim)).astype('float32')
    data -= data.mean()
    data += model.syn0.mean()
    data /= data.std()
    data *= model.syn0.std()
    if array is not None:
        data = array
        n_words = data.shape[0]
    keys_raw = model.vocab.keys()
    keys = [s.encode('ascii', 'ignore') for s in keys_raw]
    lens = [len(s) for s in keys_raw]
    choices = np.array(keys, dtype='S')
    lengths = np.array(lens, dtype='int32')
    s, f = 0, 0
    rep0 = lambda w: w
    rep1 = lambda w: w.replace(' ', '_')
    rep2 = lambda w: w.title().replace(' ', '_')
    reps = [rep0, rep1, rep2]
    for compact in np.arange(top):
        loose = self.compact_to_loose.get(compact, None)
        if loose is None:
            continue
        word = vocab.get(loose, None)
        if word is None:
            continue
        word = word.strip()
        vector = None
        for rep in reps:
            clean = rep(word)
            if clean in model.vocab:
                vector = model[clean]
                break
        if vector is None:
            try:
                word = unicode(word)
                # Only compare against vocabulary words of similar length
                idx = lengths >= len(word) - 3
                idx &= lengths <= len(word) + 3
                sel = choices[idx]
                d = damerau_levenshtein_distance_ndarray(word, sel)
                choice = np.array(keys_raw)[idx][np.argmin(d)]
                # choice = difflib.get_close_matches(word, choices)[0]
                vector = model[choice]
                print compact, word, ' --> ', choice
            except IndexError:
                pass
        if vector is None:
            f += 1
            continue
        s += 1
        data[compact, :] = vector[:]
    return data, s, f
def compact_word_vectors(self, vocab, filename=None, array=None, top=20000):
    """ Retrieve pretrained word vectors for our vocabulary.
    The returned word array has row indices corresponding to the
    compact index of a word, and columns corresponding to the word
    vector.

    Arguments
    ---------
    vocab : dict
        Dictionary where keys are the loose index, and values are
        the word string.
    filename : str
        Filename for the word2vec binary vectors loaded via gensim.
    array : numpy float array, optional
        Pre-allocated array to fill with word vectors instead of the
        randomly initialized one.
    top : int
        Only look up vectors for the `top` most frequent words.

    Returns
    -------
    data : numpy float array
        Array such that data[compact_index, :] = word_vector

    Examples
    --------
    >>> import numpy.linalg as nl
    >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
    >>> word_indices = np.zeros(50).astype('int32')
    >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
    >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
    >>> word_indices[40:46] = 7  # 'cold' appears 6 times
    >>> word_indices[46:] = 3  # 'hot' appears 4 times
    >>> corpus = Corpus()
    >>> corpus.update_word_count(word_indices)
    >>> corpus.finalize()
    >>> v, s, f = corpus.compact_word_vectors(vocab)
    >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
    >>> vocab[corpus.compact_to_loose[2]]
    'shuttle'
    >>> vocab[corpus.compact_to_loose[3]]
    'astronomy'
    >>> vocab[corpus.compact_to_loose[4]]
    'cold'
    >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
    >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
    >>> sim_shuttle_astro > sim_shuttle_cold
    True
    """
    n_words = len(self.compact_to_loose)
    import gensim
    model = gensim.models.KeyedVectors.load_word2vec_format(filename,
                                                            binary=True)
    n_dim = model.syn0.shape[1]
    # Randomly initialize, then shift and scale to roughly match the
    # moments of the pretrained vectors
    data = np.random.normal(size=(n_words, n_dim)).astype('float32')
    data -= data.mean()
    data += model.syn0.mean()
    data /= data.std()
    data *= model.syn0.std()
    if array is not None:
        data = array
        n_words = data.shape[0]
    keys_raw = list(model.vocab.keys())
    # Keep the raw strings; encoding to ASCII would mangle unicode words
    # and break the distance comparison below under Python 3
    keys = [s for s in keys_raw]
    lens = [len(s) for s in keys_raw]
    choices = np.array(keys)
    lengths = np.array(lens, dtype='int32')
    s, f = 0, 0

    def rep0(w):
        return w

    def rep1(w):
        return w.replace(' ', '_')

    def rep2(w):
        return w.title().replace(' ', '_')

    reps = [rep0, rep1, rep2]
    for compact in np.arange(min(top, n_words)):
        loose = self.compact_to_loose.get(compact, None)
        if loose is None:
            continue
        word = vocab.get(loose, None)
        if word is None:
            continue
        word = word.strip()
        vector = None
        for rep in reps:
            clean = rep(word)
            if clean in model.vocab:
                vector = model[clean]
                break
        if vector is None:
            try:
                # Only compare against vocabulary words of similar length
                idx = lengths >= len(word) - 3
                idx &= lengths <= len(word) + 3
                sel = choices[idx]
                d = damerau_levenshtein_distance_ndarray(word, sel)
                choice = np.array(keys_raw)[idx][np.argmin(d)]
                # choice = difflib.get_close_matches(word, choices)[0]
                vector = model[choice]
                print(compact, word, ' --> ', choice)
            except IndexError:
                pass
        if vector is None:
            f += 1
            continue
        s += 1
        data[compact, :] = vector[:]
    return data, s, f
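# Standalone sketch of the clean-up cascade used in compact_word_vectors: try
# progressively transformed spellings until one appears in the pretrained
# vocabulary. `pretrained_vocab` and `lookup_with_reps` are illustrative
# names for this sketch, not part of the original API.
def lookup_with_reps(word, pretrained_vocab):
    reps = [lambda w: w,
            lambda w: w.replace(' ', '_'),
            lambda w: w.title().replace(' ', '_')]
    for rep in reps:
        clean = rep(word)
        if clean in pretrained_vocab:
            return clean
    return None

# e.g. lookup_with_reps('new york', {'New_York'}) returns 'New_York',
# matching word2vec's Title_Case, underscore-joined phrase convention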
def compact_word_vectors(self, vocab, filename=None, array=None, top=20000):
    """ Retrieve pretrained word vectors for our vocabulary.
    The returned word array has row indices corresponding to the
    compact index of a word, and columns corresponding to the word
    vector.

    This is called by data/preprocess.py to map our corpus vocabulary
    to GoogleNews-based vector data.

    Arguments
    ---------
    vocab : dict
        Dictionary where keys are the loose index, and values are
        the word string.
    filename : str
        Filename for the word2vec binary vectors loaded via gensim.
    array : numpy float array, optional
        Pre-allocated array to fill with word vectors instead of the
        randomly initialized one.
    top : int
        Only look up vectors for the `top` most frequent words.

    Returns
    -------
    data : numpy float array
        Array such that data[compact_index, :] = word_vector

    Examples
    --------
    >>> import numpy.linalg as nl
    >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
    >>> word_indices = np.zeros(50).astype('int32')
    >>> word_indices[:25] = 19  # 'shuttle' appears 25 times
    >>> word_indices[25:35] = 5  # 'astronomy' appears 10 times
    >>> word_indices[40:46] = 7  # 'cold' appears 6 times
    >>> word_indices[46:] = 3  # 'hot' appears 4 times
    >>> corpus = Corpus()
    >>> corpus.update_word_count(word_indices)
    >>> corpus.finalize()
    >>> v, s, f = corpus.compact_word_vectors(vocab)
    >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
    >>> vocab[corpus.compact_to_loose[2]]
    'shuttle'
    >>> vocab[corpus.compact_to_loose[3]]
    'astronomy'
    >>> vocab[corpus.compact_to_loose[4]]
    'cold'
    >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
    >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
    >>> sim_shuttle_astro > sim_shuttle_cold
    True
    """
    n_words = len(self.compact_to_loose)
    # Deprecated in gensim 3.3.0
    # --------------------------
    # from gensim.models.word2vec import Word2Vec
    # model = Word2Vec.load_word2vec_format(filename, binary=True)
    from gensim.models import KeyedVectors

    # Load the C binary format
    logger.info('Loading word2vec data from {}'.format(filename))
    model = KeyedVectors.load_word2vec_format(filename, binary=True)
    n_dim = model.syn0.shape[1]
    # Randomly initialize, then shift and scale to roughly match the
    # moments of the pretrained vectors
    data = np.random.normal(size=(n_words, n_dim)).astype('float32')
    data -= data.mean()
    data += model.syn0.mean()
    data /= data.std()
    data *= model.syn0.std()
    if array is not None:
        data = array
        n_words = data.shape[0]
    # model.vocab is the vocabulary of the loaded GoogleNews word2vec data.
    # Extract the word strings of the GoogleNews vocabulary.
    keys_raw = list(model.vocab.keys())
    # keys = [s.encode('ascii', 'ignore') for s in keys_raw]
    keys = [s for s in keys_raw]
    # Extract the string length of each word in the GoogleNews vocabulary.
    lens = [len(s) for s in keys_raw]
    # choices = np.array(keys, dtype='S')
    choices = np.array(keys)
    lengths = np.array(lens, dtype='int32')
    s, f = 0, 0
    # Some clean-up rules
    rep0 = lambda w: w
    rep1 = lambda w: w.replace(' ', '_')
    rep2 = lambda w: w.title().replace(' ', '_')
    reps = [rep0, rep1, rep2]
    # We only keep the first `top` words in the corpus, in descending
    # order of term frequency.
    for compact in tqdm(np.arange(top)):
        # No vectors are needed for the special items
        if compact == 0 or compact == 1:
            s += 1
            f += 1
            data[compact, :] = None
            continue
        # Skip compact indices not associated with a loose hash, and loose
        # hashes not associated with any word in our corpus vocabulary.
        # Normally this should not happen.
        loose = self.compact_to_loose.get(compact, None)
        if loose is None:
            print('ATTN: skipping compact# {} because no loose hash# was '
                  'found'.format(compact))
            continue
        word = vocab.get(loose, None)
        if word is None:
            print('ATTN: skipping loose hash# {} because no word was found '
                  'in the corpus vocab'.format(loose))
            continue
        word = word.strip()
        vector = None
        # Try all clean-up rules to see if we can find the word in the
        # GoogleNews word2vec vocabulary
        for rep in reps:
            clean = rep(word)
            # Note that model is the gensim model loaded from the GoogleNews
            # word2vec data, and model.vocab is its vocabulary.
            # TODO: if we use LEMMA when constructing our own vocabulary,
            # then lemmatized words in our vocabulary may or may not be
            # available in the GoogleNews vocabulary, unless we have access
            # to LEMMA data in the GoogleNews vocabulary. In that case we
            # currently depend on similarity to look for a replacement word.
            if clean in model.vocab:
                vector = model[clean]
                break
        # Cannot find a word from our corpus vocabulary in the GoogleNews
        # vocabulary? This may happen for reasons such as typos. To recover
        # as much as possible, we search for the most similar word in the
        # GoogleNews vocabulary (similarity is measured by
        # damerau_levenshtein_distance).
        if vector is None:
            # logger.info('No match {} in GoogleNews - look for the most similar'.format(word))
            try:
                # Not required in Python 3
                # word = unicode(word)
                # Select all words in the GoogleNews vocabulary whose length
                # is within [len(word) - 3, len(word) + 3]
                idx = lengths >= len(word) - 3
                idx &= lengths <= len(word) + 3
                sel = choices[idx]
                # Calculate the distance between our word and all selected
                # words in the GoogleNews vocabulary.
                # d = damerau_levenshtein_distance_withNPArray(word, sel)
                # choice = np.array(keys_raw)[idx][np.argmin(d)]
                d = damerau_levenshtein_distance_ndarray(word, sel)
                # Pick the nearest word
                choice = np.array(keys)[idx][np.argmin(d)]
                # choice = difflib.get_close_matches(word, choices)[0]
                vector = model[choice]
                print(compact, word, ' --> ', choice)
            except IndexError:
                pass
        if vector is None:
            f += 1
            continue
        s += 1
        data[compact, :] = vector[:]
    return data, s, f
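# The fuzzy-match fallback above, distilled into a standalone helper for
# clarity. The names (nearest_vocab_word, model_vocab) are illustrative and
# the +/- 3 character length window mirrors the code above; this is a
# sketch, not part of the original class.
import numpy as np
from pyxdameraulevenshtein import damerau_levenshtein_distance_ndarray

def nearest_vocab_word(word, model_vocab, window=3):
    keys = list(model_vocab)
    lengths = np.array([len(k) for k in keys], dtype='int32')
    keys = np.array(keys)
    # Only compare against words of roughly similar length; this keeps the
    # linear distance scan over a multi-million-word vocabulary tractable
    idx = (lengths >= len(word) - window) & (lengths <= len(word) + window)
    sel = keys[idx]
    if sel.size == 0:
        return None
    d = damerau_levenshtein_distance_ndarray(word, sel)
    return sel[np.argmin(d)]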
print("normalized_damerau_levenshtein_distance('{}', '{}') = {}".format( 'gifts', 'profit', normalized_damerau_levenshtein_distance('gifts', 'profit'))) print( "normalized_damerau_levenshtein_distance('{}', '{}') = {} # unicode example\n" .format('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance( 'Sjöstedt', 'Sjostedt'))) # unicode example print('# edit distances for a single sequence against an array of sequences') array = np.array([ 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday' ]) print("damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}".format( 'Saturday', array, damerau_levenshtein_distance_ndarray('Saturday', array))) print( "normalized_damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}\n" .format('Saturday', array, normalized_damerau_levenshtein_distance_ndarray('Saturday', array))) print( '# normalized edit distances for a single sequence against an array of sequences - unicode' ) array = np.array(['Sjöstedt', 'Sjostedt', 'Söstedt', 'Sjöedt']) print("damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}".format( 'Sjöstedt', array, damerau_levenshtein_distance_ndarray('Sjöstedt', array))) print( "normalized_damerau_levenshtein_distance_ndarray('{}', np.array({})) = {}\n"