Example #1
from collections import defaultdict

from gensim.models.utils_any2vec import ft_ngram_hashes  # gensim 3.x home of this helper


def count_buckets(ft, words, new_ngrams_size):
    new_to_old_buckets = defaultdict(set)
    old_hash_count = defaultdict(int)
    for word in words:
        old_hashes = ft_ngram_hashes(word, ft.min_n, ft.max_n, ft.bucket, fb_compatible=ft.compatible_hash)
        new_hashes = ft_ngram_hashes(word, ft.min_n, ft.max_n, new_ngrams_size, fb_compatible=ft.compatible_hash)

        for old_hash in old_hashes:
            old_hash_count[old_hash] += 1  # calculate frequency of ngrams for proper weighting

        for old_hash, new_hash in zip(old_hashes, new_hashes):
            new_to_old_buckets[new_hash].add(old_hash)
    return new_to_old_buckets, old_hash_count
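A hedged usage sketch for count_buckets follows; the model file name and the target bucket size are illustrative, and ft is assumed to be a gensim 3.x FastTextKeyedVectors (the object carrying min_n, max_n, bucket and compatible_hash).

# Hypothetical usage: map a trained model's ngram buckets onto a smaller table.
from gensim.models import FastText

ft = FastText.load("fasttext.model").wv              # assumed model file name
words = list(ft.vocab)                               # gensim 3.x vocabulary access
new_to_old, ngram_freq = count_buckets(ft, words, new_ngrams_size=200000)
# new_to_old: new bucket id -> set of old bucket ids that collide into it
# ngram_freq: old bucket id -> ngram frequency, usable as averaging weights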
Example #2
    def get_ngram_ids(self, word):
        if word in self.vocab:
            return [self.vocab[word]]
        res = []
        for ngram_id in ft_ngram_hashes(word, **self.hash_params):
            res.append(ngram_id + len(self.vocab))

        return res
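The offset by len(self.vocab) suggests that the surrounding class keeps in-vocabulary vectors and ngram-bucket vectors in one concatenated matrix. A minimal sketch of how such ids might be consumed follows; the embedding_matrix attribute and the embed helper are assumptions, not part of the original class.

# Minimal sketch, assuming a matrix laid out as [vocab rows | ngram-bucket rows].
import numpy as np

def embed(lookup, word):
    ids = lookup.get_ngram_ids(word)                # single vocab id, or offset ngram hashes
    vectors = lookup.embedding_matrix[ids]          # hypothetical attribute name
    return vectors.mean(axis=0).astype(np.float32)  # average subword rows for OOV words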
Example #3
def ng_norm_vec(w: str):
    word_vec = np.zeros(wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngram_hashes = ft_ngram_hashes(w, wv.min_n, wv.max_n, wv.bucket, wv.compatible_hash)
    for nh in ngram_hashes:
        word_vec += wv.vectors_ngrams[nh]
    # +1 same as in the adjust vecs method
    #word_vec /= len(ngram_hashes)
    # word_vec /= math.log(1 + len(ngram_hashes))
    return word_vec
Example #4
def standard_vec(w: str):
    word_vec = np.zeros(wv.vectors_ngrams.shape[1], dtype=np.float32)
    ngram_hashes = ft_ngram_hashes(w, wv.min_n, wv.max_n, wv.bucket, wv.compatible_hash)
    for nh in ngram_hashes:
        word_vec += wv.vectors_ngrams[nh]
    # +1 same as in the adjust vecs method
    if len(ngram_hashes) == 0:
        word_vec.fill(0)
        return word_vec

    else:
        return word_vec / len(ngram_hashes)
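The two helpers above differ only in the final normalization: ng_norm_vec returns the raw sum of ngram vectors, while standard_vec divides by the number of ngrams, matching gensim's own OOV handling. A small comparison sketch, assuming wv is a loaded FastTextKeyedVectors and using an arbitrary out-of-vocabulary word:

import numpy as np

oov = "unexpectable"                      # illustrative OOV word
summed = ng_norm_vec(oov)
averaged = standard_vec(oov)
# Same direction, different magnitude, so cosine similarity should be ~1.0.
cos = np.dot(summed, averaged) / (np.linalg.norm(summed) * np.linalg.norm(averaged) + 1e-9)
print(round(float(cos), 4))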
Example #5
    def word_vec(self, word, use_norm=False):
        """Get `word` representations in vector space, as a 1D numpy array.

        Parameters
        ----------
        word : str
            Input word
        use_norm : bool, optional
            If True - resulting vector will be L2-normalized (unit euclidean length).

        Returns
        -------
        numpy.ndarray
            Vector representation of `word`.

        Raises
        ------
        KeyError
            If `word` and all of its ngrams are absent from the vocabulary.

        """
        if word in self.vocab:
            return super(FastTextKeyedVectors, self).word_vec(word, use_norm)
        elif self.bucket == 0:
            raise KeyError(
                'cannot calculate vector for OOV word without ngrams')
        else:
            word_vec = np.zeros(self.vectors_ngrams.shape[1], dtype=np.float32)
            ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n,
                                           self.bucket, self.compatible_hash)
            if len(ngram_hashes) == 0:
                return word_vec
            for nh in ngram_hashes:
                word_vec += self.vectors_ngrams[nh]
            result = word_vec / len(ngram_hashes)
            if use_norm:
                result /= np.sqrt(max(sum(result**2), EPSILON))
            return result
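A short usage sketch for the method above (gensim 3.x); the file name is illustrative. In-vocabulary words take the standard KeyedVectors path, OOV words are rebuilt as the average of their bucketed ngram vectors, and a KeyError is raised only when the model was trained with bucket=0.

from gensim.models import FastText

wv = FastText.load("fasttext.model").wv   # assumed model file name
vec = wv.word_vec("riverbanking")         # likely OOV: built from ngram vectors
print(vec.shape, vec.dtype)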
Example #6
    modelfile = sys.argv[1]  # Original fastText model
    freq_threshold = int(
        sys.argv[2])  # How frequent should the ngrams be? (e.g., 1000)
    filename = sys.argv[3]  # File to save ngram vectors

    model = gensim.models.KeyedVectors.load(modelfile)
    model.init_sims(replace=True)

    print(model)

    ngram_identifiers = {}
    hashes = set()

    for word in model.vocab:
        human_ngrams = compute_ngrams(word, model.min_n, model.max_n)
        hash_ngrams = ft_ngram_hashes(word, model.min_n, model.max_n,
                                      model.bucket)
        for hum, hsh in zip(human_ngrams, hash_ngrams):
            if hum not in ngram_identifiers:
                ngram_identifiers[hum] = {}
                ngram_identifiers[hum]['hash'] = hsh
                ngram_identifiers[hum]['freq'] = 0
            ngram_identifiers[hum]['freq'] += model.vocab[word].count
            hashes.add(hsh)

    print('Unique ngrams:', len(ngram_identifiers))
    print('Unique ngram hashes:', len(hashes))

    fin_ngram_identifiers = {
        n: ngram_identifiers[n]['hash']
        for n in ngram_identifiers
        if ngram_identifiers[n]['freq'] > freq_threshold
    }
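The snippet ends before anything is written out; a hedged continuation sketch follows. The word2vec-style text format and the use of vectors_ngrams are assumptions about the original script, not part of it.

    # Hypothetical continuation: dump the surviving ngram vectors to `filename`.
    with open(filename, 'w', encoding='utf-8') as out:
        out.write('%d %d\n' % (len(fin_ngram_identifiers), model.vector_size))
        for ngram, hsh in fin_ngram_identifiers.items():
            vector = model.vectors_ngrams[hsh]
            out.write(ngram + ' ' + ' '.join('%.6f' % x for x in vector) + '\n')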
Example #7
def train_average_np(
    model: BaseSentence2VecModel,
    indexed_sentences: List[tuple],
    target: ndarray,
    memory: ndarray,
) -> [int, int]:
    """Training on a sequence of sentences and update the target ndarray.

    Called internally from :meth:`~fse.models.average.Average._do_train_job`.

    Warnings
    --------
    This is the non-optimized, pure Python version. If you have a C compiler,
    fse will use an optimized code path from :mod:`fse.models.average_inner` instead.

    Parameters
    ----------
    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
        The BaseSentence2VecModel model instance.
    indexed_sentences : iterable of tuple
        The sentences used to train the model.
    target : ndarray
        The target ndarray. We use the index from indexed_sentences
        to write into the corresponding row of target.
    memory : ndarray
        Private memory for each working thread

    Returns
    -------
    int, int
        Number of effective sentences (non-zero) and effective words in the vocabulary
        used while training the sentence embeddings.

    """
    size = model.wv.vector_size
    vocab = model.wv.vocab

    w_vectors = model.wv.vectors
    w_weights = model.word_weights

    s_vectors = target

    is_ft = model.is_ft

    mem = memory[0]

    if is_ft:
        # NOTE: For Fasttext: Use wv.vectors_vocab
        # Using the wv.vectors from fasttext had horrible effects on the sts results
        # I suspect this is because the wv.vectors are based on the averages of
        # wv.vectors_vocab + wv.vectors_ngrams, which will all point into very
        # similar directions.
        max_ngrams = model.batch_ngrams
        w_vectors = model.wv.vectors_vocab
        ngram_vectors = model.wv.vectors_ngrams
        min_n = model.wv.min_n
        max_n = model.wv.max_n
        bucket = model.wv.bucket
        oov_weight = np_max(w_weights)

    eff_sentences, eff_words = 0, 0

    if not is_ft:
        for obj in indexed_sentences:
            mem.fill(0.0)
            sent = obj[0]
            sent_adr = obj[1]

            word_indices = [
                vocab[word].index for word in sent if word in vocab
            ]
            eff_sentences += 1
            if not len(word_indices):
                continue
            eff_words += len(word_indices)

            mem += np_sum(
                np_mult(w_vectors[word_indices],
                        w_weights[word_indices][:, None]),
                axis=0,
            )
            mem *= 1 / len(word_indices)
            s_vectors[sent_adr] = mem.astype(REAL)
    else:
        for obj in indexed_sentences:
            mem.fill(0.0)
            sent = obj[0]
            sent_adr = obj[1]

            if not len(sent):
                continue
            mem = zeros(size, dtype=REAL)

            eff_sentences += 1
            eff_words += len(sent)  # Counts everything in the sentence

            for word in sent:
                if word in vocab:
                    word_index = vocab[word].index
                    mem += w_vectors[word_index] * w_weights[word_index]
                else:
                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket,
                                                   True)[:max_ngrams]
                    if len(ngram_hashes) == 0:
                        continue
                    mem += oov_weight * (np_sum(ngram_vectors[ngram_hashes],
                                                axis=0) / len(ngram_hashes))
                # Implicit addition of zero if oov does not contain any ngrams
            s_vectors[sent_adr] = mem / len(sent)

    return eff_sentences, eff_words
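The FastText branch above interleaves fse bookkeeping with the actual arithmetic; the condensed restatement below is a sketch only, with wv and word_weights standing in for model.wv and model.word_weights and an illustrative max_ngrams cap.

import numpy as np
from gensim.models.utils_any2vec import ft_ngram_hashes  # gensim 3.x home of this helper

def sentence_average(wv, sentence, word_weights, oov_weight, max_ngrams=40):
    mem = np.zeros(wv.vector_size, dtype=np.float32)
    for word in sentence:
        if word in wv.vocab:
            idx = wv.vocab[word].index
            mem += wv.vectors_vocab[idx] * word_weights[idx]
        else:
            hashes = ft_ngram_hashes(word, wv.min_n, wv.max_n, wv.bucket, True)[:max_ngrams]
            if hashes:
                mem += oov_weight * np.sum(wv.vectors_ngrams[hashes], axis=0) / len(hashes)
    return mem / max(len(sentence), 1)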