from collections import defaultdict

from gensim.models.utils_any2vec import ft_ngram_hashes  # gensim 3.x location


def count_buckets(ft, words, new_ngrams_size):
    """Map each new (smaller) ngram bucket to the set of old buckets that collide into it."""
    new_to_old_buckets = defaultdict(set)
    old_hash_count = defaultdict(int)
    for word in words:
        old_hashes = ft_ngram_hashes(word, ft.min_n, ft.max_n, ft.bucket,
                                     fb_compatible=ft.compatible_hash)
        new_hashes = ft_ngram_hashes(word, ft.min_n, ft.max_n, new_ngrams_size,
                                     fb_compatible=ft.compatible_hash)

        for old_hash in old_hashes:
            old_hash_count[old_hash] += 1  # calculate frequency of ngrams for proper weighting

        for old_hash, new_hash in zip(old_hashes, new_hashes):
            new_to_old_buckets[new_hash].add(old_hash)

    return new_to_old_buckets, old_hash_count
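A minimal usage sketch for the function above, assuming a gensim 3.x FastText model whose keyed vectors expose `min_n`, `max_n`, `bucket` and `compatible_hash`; the model path and the new bucket count are placeholders, not values from the original project.

    # Hypothetical usage: remap a model's ngram buckets into a smaller hash
    # space, e.g. as a first step of model compression.
    from gensim.models.fasttext import load_facebook_model

    model = load_facebook_model('cc.en.300.bin')  # placeholder path
    wv = model.wv  # FastTextKeyedVectors: has min_n, max_n, bucket, compatible_hash
    new_to_old, old_counts = count_buckets(wv, list(wv.vocab), new_ngrams_size=100000)
    print('new buckets used:', len(new_to_old))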
def get_ngram_ids(self, word):
    """Return the vocabulary id of an in-vocabulary word, or the ids of its
    ngram buckets (offset by the vocabulary size) for an OOV word."""
    if word in self.vocab:
        return [self.vocab[word]]
    res = []
    for ngram_id in ft_ngram_hashes(word, **self.hash_params):
        res.append(ngram_id + len(self.vocab))
    return res
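The offset by `len(self.vocab)` suggests that word rows and ngram rows live in one concatenated embedding matrix. Below is a hypothetical host class for the method, invented for illustration; the keyword names in `hash_params` assume gensim 3.x's signature `ft_ngram_hashes(word, minn, maxn, num_buckets, fb_compatible=True)`.

    # Hypothetical host class: vocabulary rows first, ngram buckets after them.
    class NgramIndexer:
        def __init__(self, vocab, min_n=3, max_n=6, bucket=2000000):
            self.vocab = vocab  # word -> row index, e.g. {'hello': 0}
            self.hash_params = dict(minn=min_n, maxn=max_n, num_buckets=bucket)

        get_ngram_ids = get_ngram_ids  # reuse the function above as a method

    indexer = NgramIndexer({'hello': 0})
    print(indexer.get_ngram_ids('hello'))  # [0] -- in-vocabulary word
    print(indexer.get_ngram_ids('heck'))   # ngram bucket ids, each offset by len(vocab)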
# Assumes `numpy as np` and a loaded gensim 3.x FastTextKeyedVectors `wv` in the enclosing scope.
def ng_norm_vec(w: str):
    word_vec = np.zeros(wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngram_hashes = ft_ngram_hashes(w, wv.min_n, wv.max_n, wv.bucket, wv.compatible_hash)
    for nh in ngram_hashes:
        word_vec += wv.vectors_ngrams[nh]  # +1 same as in the adjust vecs method
    # word_vec /= len(ngram_hashes)
    # word_vec /= math.log(1 + len(ngram_hashes))
    return word_vec
def standard_vec(w: str):
    word_vec = np.zeros(wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngram_hashes = ft_ngram_hashes(w, wv.min_n, wv.max_n, wv.bucket, wv.compatible_hash)
    for nh in ngram_hashes:
        word_vec += wv.vectors_ngrams[nh]  # +1 same as in the adjust vecs method
    if len(ngram_hashes) == 0:
        word_vec.fill(0)
        return word_vec
    else:
        return word_vec / len(ngram_hashes)
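The two helpers differ only in normalization: `ng_norm_vec` returns the raw sum of the ngram vectors, while `standard_vec` returns their mean, which is what fastText itself uses for OOV words. A quick comparison sketch, under the same assumption of a loaded gensim 3.x `wv`; the test word is arbitrary.

    # For an OOV word the two vectors are parallel and differ in length
    # by exactly a factor of len(ngram_hashes).
    summed = ng_norm_vec('unwordlike')
    averaged = standard_vec('unwordlike')
    n = len(ft_ngram_hashes('unwordlike', wv.min_n, wv.max_n, wv.bucket, wv.compatible_hash))
    assert np.allclose(summed, averaged * n)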
def word_vec(self, word, use_norm=False):
    """Get `word` representations in vector space, as a 1D numpy array.

    Parameters
    ----------
    word : str
        Input word.
    use_norm : bool, optional
        If True, the resulting vector will be L2-normalized (unit Euclidean length).

    Returns
    -------
    numpy.ndarray
        Vector representation of `word`.

    Raises
    ------
    KeyError
        If the word, and all of its ngrams, are not in the vocabulary.

    """
    if word in self.vocab:
        return super(FastTextKeyedVectors, self).word_vec(word, use_norm)
    elif self.bucket == 0:
        raise KeyError('cannot calculate vector for OOV word without ngrams')
    else:
        word_vec = np.zeros(self.vectors_ngrams.shape[1], dtype=np.float32)
        ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
        if len(ngram_hashes) == 0:
            # The word produced no ngrams within [min_n, max_n]; return all zeros.
            return word_vec
        for nh in ngram_hashes:
            word_vec += self.vectors_ngrams[nh]
        result = word_vec / len(ngram_hashes)
        if use_norm:
            result /= np.sqrt(max(sum(result ** 2), EPSILON))
        return result
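This is the gensim 3.x `FastTextKeyedVectors` OOV path: an unseen word gets the average of its ngram bucket vectors. A usage sketch; the model path is a placeholder.

    # OOV lookup: the word itself is absent from the vocabulary, but its
    # character ngrams still hash to trained buckets.
    from gensim.models.fasttext import load_facebook_vectors

    wv = load_facebook_vectors('cc.en.300.bin')  # placeholder path
    vec = wv.word_vec('supercalifragilistic', use_norm=True)  # unit-length OOV vector
    print(vec.shape)  # (300,)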
import sys

import gensim
from gensim.models.utils_any2vec import compute_ngrams, ft_ngram_hashes  # gensim 3.x locations

modelfile = sys.argv[1]            # Original fastText model
freq_threshold = int(sys.argv[2])  # How frequent should the ngrams be? (e.g., 1000)
filename = sys.argv[3]             # File to save ngram vectors

model = gensim.models.KeyedVectors.load(modelfile)
model.init_sims(replace=True)
print(model)

ngram_identifiers = {}
hashes = set()

for word in model.vocab:
    human_ngrams = compute_ngrams(word, model.min_n, model.max_n)
    hash_ngrams = ft_ngram_hashes(word, model.min_n, model.max_n, model.bucket)
    for hum, hsh in zip(human_ngrams, hash_ngrams):
        if hum not in ngram_identifiers:
            ngram_identifiers[hum] = {}
            ngram_identifiers[hum]['hash'] = hsh
            ngram_identifiers[hum]['freq'] = 0
        ngram_identifiers[hum]['freq'] += model.vocab[word].count
        hashes.add(hsh)

print('Unique ngrams:', len(ngram_identifiers))
print('Unique ngram hashes:', len(hashes))

fin_ngram_identifiers = {
    n: ngram_identifiers[n]['hash']
    for n in ngram_identifiers
    if ngram_identifiers[n]['freq'] > freq_threshold
}
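The script keeps only the human-readable ngrams whose summed corpus frequency exceeds the threshold. The original snippet ends there; a hypothetical continuation (not part of the original) could dump the surviving ngram vectors in word2vec text format under `filename`, assuming the loaded model is a FastTextKeyedVectors with a `vectors_ngrams` matrix.

    # Hypothetical continuation: export the frequent ngram vectors.
    with open(filename, 'w', encoding='utf-8') as out:
        dim = model.vectors_ngrams.shape[1]
        out.write('%d %d\n' % (len(fin_ngram_identifiers), dim))
        for ngram, hsh in fin_ngram_identifiers.items():
            vector = model.vectors_ngrams[hsh]
            out.write('%s %s\n' % (ngram, ' '.join('%.6f' % x for x in vector)))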
from typing import List, Tuple

from numpy import ndarray, zeros, float32 as REAL, sum as np_sum, multiply as np_mult, max as np_max

from gensim.models.utils_any2vec import ft_ngram_hashes  # gensim 3.x location

from fse.models.base_s2v import BaseSentence2VecModel


def train_average_np(
    model: BaseSentence2VecModel,
    indexed_sentences: List[tuple],
    target: ndarray,
    memory: ndarray,
) -> Tuple[int, int]:
    """Train on a sequence of sentences and update the target ndarray.

    Called internally from :meth:`~fse.models.average.Average._do_train_job`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler,
    fse will use an optimized code path from :mod:`fse.models.average_inner` instead.

    Parameters
    ----------
    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        The BaseSentence2VecModel model instance.
    indexed_sentences : iterable of tuple
        The sentences used to train the model.
    target : ndarray
        The target ndarray. We use the index from indexed_sentences
        to write into the corresponding row of target.
    memory : ndarray
        Private memory for each working thread.

    Returns
    -------
    int, int
        Number of effective sentences (non-zero) and effective words in the
        vocabulary used during training the sentence embedding.

    """
    size = model.wv.vector_size
    vocab = model.wv.vocab

    w_vectors = model.wv.vectors
    w_weights = model.word_weights

    s_vectors = target

    is_ft = model.is_ft

    mem = memory[0]

    if is_ft:
        # NOTE: For fastText, use wv.vectors_vocab.
        # Using wv.vectors from fastText had horrible effects on the STS results.
        # I suspect this is because wv.vectors are based on the averages of
        # wv.vectors_vocab + wv.vectors_ngrams, which will all point in very
        # similar directions.
        max_ngrams = model.batch_ngrams
        w_vectors = model.wv.vectors_vocab
        ngram_vectors = model.wv.vectors_ngrams
        min_n = model.wv.min_n
        max_n = model.wv.max_n
        bucket = model.wv.bucket
        oov_weight = np_max(w_weights)

    eff_sentences, eff_words = 0, 0

    if not is_ft:
        for obj in indexed_sentences:
            mem.fill(0.0)
            sent = obj[0]
            sent_adr = obj[1]

            word_indices = [vocab[word].index for word in sent if word in vocab]
            eff_sentences += 1
            if not len(word_indices):
                continue
            eff_words += len(word_indices)

            mem += np_sum(
                np_mult(w_vectors[word_indices], w_weights[word_indices][:, None]),
                axis=0,
            )
            mem *= 1 / len(word_indices)
            s_vectors[sent_adr] = mem.astype(REAL)
    else:
        for obj in indexed_sentences:
            mem.fill(0.0)
            sent = obj[0]
            sent_adr = obj[1]

            if not len(sent):
                continue
            mem = zeros(size, dtype=REAL)

            eff_sentences += 1
            eff_words += len(sent)  # Counts everything in the sentence

            for word in sent:
                if word in vocab:
                    word_index = vocab[word].index
                    mem += w_vectors[word_index] * w_weights[word_index]
                else:
                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[:max_ngrams]
                    if len(ngram_hashes) == 0:
                        continue
                    mem += oov_weight * (
                        np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes)
                    )
                    # Implicit addition of zero if the OOV word does not contain any ngrams
            s_vectors[sent_adr] = mem / len(sent)

    return eff_sentences, eff_words
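The fastText branch treats an OOV token as the mean of its ngram bucket vectors, scaled by the largest word weight. A self-contained toy illustration of that fallback; all array sizes, weights, and hash indices are invented, and only the arithmetic mirrors the function above.

    # Toy illustration of the OOV fallback in the fastText branch.
    import numpy as np

    rng = np.random.default_rng(0)
    ngram_vectors = rng.standard_normal((1000, 8), dtype=np.float32)  # fake bucket table
    w_weights = np.array([0.2, 1.5, 0.7], dtype=np.float32)           # fake word weights

    oov_weight = w_weights.max()
    ngram_hashes = [3, 17, 512]  # pretend hashes of the OOV word's ngrams

    oov_vec = oov_weight * ngram_vectors[ngram_hashes].sum(axis=0) / len(ngram_hashes)
    print(oov_vec.shape)  # (8,)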