Example #1
def read_embeddings(embeddings_filename, max_rank_emb):
    """Reads .vector or .bin file, modifies it to include <OOV> and <PADDING> and <SENTROOT>"""
    if embeddings_filename.endswith(".bin"):
        binary = True
    else:
        binary = False
    gensim_vectors = KeyedVectors.load_word2vec_format(
        embeddings_filename,
        binary=binary,
        limit=max_rank_emb,
        unicode_errors='replace')
    gensim_vectors.vocab["<OOV>"] = Vocab(index=1)
    gensim_vectors.vocab["<PADDING>"] = Vocab(index=0)
    gensim_vectors.vocab["<SENTROOT>"] = Vocab(index=2)
    for word_record in gensim_vectors.vocab.values():
        word_record.index += 3
    two_random_rows = numpy.random.uniform(
        low=-0.01, high=0.01, size=(3, gensim_vectors.vectors.shape[1]))
    # stack the two rows, and the embedding matrix on top of each other
    gensim_vectors.vectors = numpy.vstack(
        [two_random_rows, gensim_vectors.vectors])
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors,
                                                   axis=0)
    gensim_vectors.vectors = keras.utils.normalize(gensim_vectors.vectors)
    return gensim_vectors
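A minimal usage sketch for the loader above; the file path and limit are hypothetical, and numpy, keras and gensim 3.x are assumed to be imported as in the snippet.

vectors = read_embeddings("embeddings/glove.100d.vector", max_rank_emb=100000)  # hypothetical path
print(vectors.vocab["<PADDING>"].index)  # the three special tokens occupy rows 0-2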
Example #2
 def add_word(word, weights):
     word_id = len(result.vocab)
     if word in result.vocab:
         return
     result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
     result.syn0[word_id] = weights
     result.index2word.append(word)
Example #3
    def create_keyedvector_from_matrix(self, embedding_matrix, word2id):
        """
        Imports the necessary attributes for the Embedding object from an
        embedding matrix and a word2id vocabulary. Can be used for custom
        pre-trained embeddings.
        Parameters
        ----------
        embedding_matrix: numpy.ndarray
            Embedding matrix as a numpy object
        word2id: dict
            Word vocabulary (key: word, value: word_index)
        """

        vocab = {
            word: word2id[word]
            for word in sorted(word2id, key=word2id.__getitem__, reverse=False)
        }
        vector_size = embedding_matrix.shape[1]

        kv = KeyedVectors(vector_size)
        kv.vector_size = vector_size
        kv.vectors = embedding_matrix

        kv.index2word = list(vocab.keys())

        kv.vocab = {
            word: Vocab(index=word_id, count=0)
            for word, word_id in vocab.items()
        }

        self.embedding = kv
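A minimal sketch of how this method might be called; `embedder` is a hypothetical instance of the enclosing class, and gensim 3.x (where KeyedVectors still exposes .vocab and .index2word) is assumed.

import numpy as np

matrix = np.random.rand(3, 50).astype(np.float32)  # 3 words, 50 dimensions
word2id = {"the": 0, "cat": 1, "sat": 2}
embedder.create_keyedvector_from_matrix(matrix, word2id)
print(embedder.embedding.most_similar("cat"))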
Example #4
    def __init__(self,
                 model_path,
                 model_params_path,
                 dim,
                 dtype="float32",
                 dtype_size=4,
                 header_size=32):
        self.header_size_bites = header_size * dtype_size
        self.dtype_size = dtype_size
        self.dtype = dtype

        self.dim = dim

        with open(model_params_path) as fd:
            params = json.load(fd)

        self.model = gensim.models.keyedvectors.FastTextKeyedVectors(
            vector_size=dim,
            min_n=params['hash_params']['minn'],
            max_n=params['hash_params']['maxn'],
            bucket=params['hash_params']['num_buckets'],
            compatible_hash=params['hash_params']['fb_compatible'])

        self.model.vectors_vocab = self.load_matrix(
            f'{model_path}.vectors_vocab.npy')
        self.model.vectors = self.load_matrix(f'{model_path}.vectors.npy')
        self.model.vectors_ngrams = self.load_matrix(
            f'{model_path}.vectors_ngrams.npy')
        self.model.vocab = dict((word, Vocab(index=idx, count=1))
                                for word, idx in params['vocab'].items())
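The constructor above reads its hyperparameters and vocabulary from a JSON file; a sketch of the structure it appears to expect (all values below are illustrative assumptions, not defaults of any library):

import json

params_example = {
    "hash_params": {"minn": 3, "maxn": 6, "num_buckets": 2000000, "fb_compatible": True},
    "vocab": {"the": 0, "cat": 1, "sat": 2},
}
with open("model_params.json", "w") as fd:  # hypothetical path
    json.dump(params_example, fd)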
Example #5
 def _load_tree(self, tree: dict) -> None:
     self.__dict__.update(tree)
     self.tokens = split_strings(self.tokens)
     self.frequencies = {
         w: self.frequencies["vals"][i]
         for i, w in enumerate(split_strings(self.frequencies["keys"]))}
     self.checker = SymSpell(max_dictionary_edit_distance=self.max_distance)
     self.checker.__dict__.update(tree["checker"])
     deletes = {}
     words = split_strings(self.checker._deletes["strings"])
     lengths = self.checker._deletes["lengths"]
     data = self.checker._deletes["data"]
     offset = 0
     for i, delindex in enumerate(self.checker._deletes["indexes"]):
         length = lengths[i]
         deletes[delindex] = [words[j] for j in data[offset:offset + length]]
         offset += length
     self.checker._deletes = deletes
     self.checker._words = {w: self.checker._words[i] for i, w in enumerate(words)}
     vectors = self.wv["vectors"]
     wv = FastTextKeyedVectors(vectors.shape[1], self.wv["min_n"], self.wv["max_n"],
                               self.wv["bucket"], True)
     wv.vectors = numpy.array(vectors)
     vocab = split_strings(self.wv["vocab"]["strings"])
     wv.vocab = {
         s: Vocab(index=i, count=self.wv["vocab"]["counts"][i])
         for i, s in enumerate(vocab)}
     wv.bucket = self.wv["bucket"]
     wv.index2word = wv.index2entity = vocab
     wv.num_ngram_vectors = self.wv["num_ngram_vectors"]
     wv.vectors_ngrams = numpy.array(self.wv["vectors_ngrams"])
     wv.hash2index = {k: v for v, k in enumerate(self.wv["hash2index"])}
     self.wv = wv
Example #6
def load_vocab(wv, vocab_file, use_glove_format, restrict_vocab):
    # Read vocab.
    vocab_size = 0
    with open(vocab_file, "r") as f:
        wv.index2freq = []
        all_lines = f.readlines()[:restrict_vocab] if restrict_vocab > 0 else f.readlines()
        for index, line in enumerate(all_lines):
            if use_glove_format:
                word, count = line.strip().split(" ")  # vocab is indexed from 0; for co-occ we use 1-based indexing
            else:
                index, word, count = line.strip().split("\t")
                index = int(index) - 1  # indexing starts at 1 in the file; for co-occ we use 0-based indexing
            wv.index2word.append(word)
            wv.vocab[word] = Vocab(index=index, count=int(count))
            wv.index2freq.append(count)
            vocab_size += 1

    wv.index2freq = array(wv.index2freq).astype(uint32)

    # Unused members from VanillaWordEmbeddingsKeyedVectors.
    wv.vectors_norm = None

    print("Loaded vocabulary with {} words".format(vocab_size))
    return vocab_size
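A minimal usage sketch with a toy GloVe-style vocab file; the paths are hypothetical, and gensim 3.x (mutable .vocab/.index2word on KeyedVectors) plus "from numpy import array, uint32" at module level are assumed.

from gensim.models.keyedvectors import KeyedVectors

with open("toy_vocab.txt", "w") as f:  # "word count" per line
    f.write("the 120\ncat 42\nsat 17\n")

wv = KeyedVectors(50)
n_words = load_vocab(wv, "toy_vocab.txt", use_glove_format=True, restrict_vocab=0)
print(n_words, wv.vocab["cat"].count)  # 3 42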
Example #7
def convert_graph_embedding_to_gensim(
        model_path: Union[str, Path],
        dataset_path: Union[str, Path],
        embedding_path: Union[str, Path] = None) -> Path:
    """
    Example usage
    convert_graph_embedding_to_gensim(
        model_path=(
                DEFAULT_OUTPUT_PATH / 'reviews_Cell_Phones_and_Accessories-50000-docs' / 'our' / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.model').as_posix(),
        dataset_path=(
                DEFAULT_OUTPUT_PATH / 'reviews_Cell_Phones_and_Accessories-50000-docs' / 'our' / 'aspect_2_aspect_graph-en_core_web_lg.en_core_web_lg.dataset').as_posix()
    )
    """
    if embedding_path is None:
        embedding_path = Path(model_path).with_suffix('.word2vec_format.bin')

    model = torch.load(model_path)
    dataset = torch.load(dataset_path)

    _save_word2vec_format(
        embedding_path,
        vocab={
            word.replace(' ', '_'): Vocab(index=index, count=1)
            for word, index in dataset[0].nodes_mapping[0].items()
        },
        vectors=model.embedding.weight.detach().numpy(),
        binary=True)

    return embedding_path
Example #8
 def add_word(word, weights):
     word_id = len(result.vocab)
     if word in result.vocab:
         logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
         return
     if counts is None:
         # most common scenario: no vocab file given. just make up some bogus counts, in descending order
         result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
     elif word in counts:
         # use count from the vocab file
         result.vocab[word] = Vocab(index=word_id, count=counts[word])
     else:
         # vocab file given, but word is missing -- set count to None (TODO: or raise?)
         logger.warning("vocabulary file is incomplete: '%s' is missing", word)
         result.vocab[word] = Vocab(index=word_id, count=None)
     result.vectors[word_id] = weights
     result.index2word.append(word)
Example #9
def generate_vocab(walks):
    index2word = []
    raw_vocab = defaultdict(int)

    for walk in walks:
        for word in walk:
            raw_vocab[word] += 1

    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
    for i, word in enumerate(index2word):
        vocab[word].index = i

    return vocab, index2word
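A toy invocation of the helper above; walks are just sequences of node ids, as produced by random walks over a graph.

walks = [["a", "b", "a"], ["b", "c"]]
vocab, index2word = generate_vocab(walks)
print(index2word)                          # nodes ordered by frequency, e.g. ['a', 'b', 'c']
print(vocab["a"].count, vocab["a"].index)  # 2 0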
Example #10
def add_embeddings(keyed_vectors, *words, init=None):
    import numpy as np
    from gensim.models.keyedvectors import Vocab
    if init is None:
        init = np.zeros
    syn0 = keyed_vectors.syn0
    first_new_index = syn0.shape[0]
    for word in words:
        # grow the local copy so several words can be added in one call
        keyed_vectors.vocab[word] = Vocab(count=0, index=syn0.shape[0])
        syn0 = np.concatenate([syn0, init((1, syn0.shape[1]))])
        keyed_vectors.index2word.append(word)
    keyed_vectors.syn0 = syn0
    return first_new_index
Example #11
def save_gensim_model(words, word_reprs, output_file, binary=True):
    """Save word representations in w2v format. Word order is not preserved"""
    vocab = dict()
    for word in words:
        vocab[word] = Vocab(index=len(vocab))

    model = Word2VecKeyedVectors(word_reprs.shape[1])
    model.vocab = vocab
    model.vectors = word_reprs
    model.save_word2vec_format(fname=output_file, binary=binary)
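A minimal round-trip sketch for the function above; the output file name is hypothetical, and gensim 3.x is assumed.

import numpy as np
from gensim.models import KeyedVectors

words = ["the", "cat", "sat"]
word_reprs = np.random.rand(3, 100).astype(np.float32)
save_gensim_model(words, word_reprs, "toy_vectors.bin", binary=True)
reloaded = KeyedVectors.load_word2vec_format("toy_vectors.bin", binary=True)
print(reloaded["cat"].shape)  # (100,)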
Example #12
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)

    for walks in all_walks:  # iterate over the walks of each layer
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1  # count how many times each word (node) appears

    vocab = {}
    for word, v in iteritems(raw_vocab):  # build the vocabulary: give each word an index, re-sorted by count below
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    index2word.sort(key=lambda word: vocab[word].count,
                    reverse=True)  # sort by count, from largest to smallest
    for i, word in enumerate(index2word):
        vocab[word].index = i  # reassign indices to match the sorted order

    return vocab, index2word
Example #13
def load(root_model):
    with open(root_model + '.txt', encoding='utf-8') as f:
        index_vocab = [t.strip() for t in f]
    vocab = {w: Vocab(index=i) for (i, w) in enumerate(index_vocab)}
    vectors = np.load(root_model + '.npy')
    m = Word2Vec()
    m.wv.index2word = index_vocab
    m.wv.vectors = vectors

    m.wv.vocab = vocab
    return m
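The loader above expects a <root>.txt file with one token per line next to a <root>.npy matrix; a sketch of writing such a pair (hypothetical paths) and reading it back:

import numpy as np

index_vocab = ["the", "cat", "sat"]
vectors = np.random.rand(3, 100).astype(np.float32)
with open("toy_model.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(index_vocab) + "\n")
np.save("toy_model.npy", vectors)

m = load("toy_model")  # root name without extension
print(m.wv.vocab["cat"].index)  # 1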
Example #14
def add_embeddings(keyed_vectors, *words, init=None):
    import numpy as np
    from gensim.models.keyedvectors import Vocab
    if init is None:
        init = np.zeros
    vectors = keyed_vectors.vectors
    first_new_index = vectors.shape[0]
    for word in words:
        # grow the local copy so several words can be added in one call
        keyed_vectors.vocab[word] = Vocab(count=0, index=vectors.shape[0])
        vectors = np.concatenate(
            [vectors, init((1, vectors.shape[1]))])
        keyed_vectors.index2word.append(word)
    keyed_vectors.vectors = vectors
    return first_new_index
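A minimal sketch: grow a small KeyedVectors with special tokens (names are hypothetical; assumes gensim 3.x, where KeyedVectors exposes .vocab, .vectors and .index2word).

import numpy as np
from gensim.models.keyedvectors import KeyedVectors

kv = KeyedVectors(4)
kv.vectors = np.zeros((0, 4), dtype=np.float32)  # start explicitly from an empty matrix
first_new_index = add_embeddings(kv, "<pad>", "<unk>", init=np.ones)
print(first_new_index, kv.vectors.shape)  # 0 (2, 4)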
Example #15
    def add(self, wwc: Collection[Tuple[str, np.ndarray, int]]) -> Iterator[Vocab]:

        """`wwc` is a collection of (word, weights, count) tuples"""

        vocab_size, vector_size = self.vectors.shape
        self.vectors.resize((vocab_size + len(wwc), vector_size))
        for i, (word, weights, count) in enumerate(wwc):
            word_id = vocab_size + i
            voc = Vocab(index=word_id, count=count)
            self.vocab[word] = voc
            self.vectors[word_id] = weights
            self.index2word.append(word)
            yield voc
Example #16
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(int)

    for walks in all_walks:
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1

    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(count=v, index=len(index2word))
        index2word.append(word)

    # sort words in vocab by frequency, from largest to smallest
    index2word.sort(key=lambda word: vocab[word].count, reverse=True)
    for i, word in enumerate(index2word):
        vocab[word].index = i

    # vocab is the word table, sorted by how often each node appears in the walks (largest first)
    # index2word is the list of nodes
    return vocab, index2word
Example #17
    def _set_keyedvector(self, attrname, keys, dim, vec=None):
        keyed_vec = KeyedVectors(dim)
        dummy_max_count = len(keys) + 1
        for i, key in enumerate(keys):
            key = str(key)
            keyed_vec.vocab[key] = Vocab(index=i, count=dummy_max_count - i) # dummy count
            keyed_vec.index2word.append(key)

        if vec is not None:
            keyed_vec.vectors = vec
            keyed_vec.init_sims()

        setattr(self, attrname, keyed_vec)
Example #18
def load_emb(fp):
    n_vocab, dim = map(int, fp.readline().split())

    emb = KeyedVectors(dim)
    emb.vectors = np.empty((n_vocab, dim), dtype=np.float32)

    for i, line in tqdm(enumerate(fp), total=n_vocab, unit='word'):
        word, vec_str = line.split(' ', 1)
        emb.vectors[i] = np.fromstring(vec_str, sep=' ')
        emb.vocab[word] = Vocab(index=i)
        emb.index2word.append(word)

    return emb
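A minimal sketch: parse a tiny word2vec-format text file with the reader above (hypothetical path; numpy, tqdm, KeyedVectors and Vocab are assumed to be imported as in the snippet).

with open("toy_vectors.txt", "w") as out:
    out.write("2 3\n")            # header: vocab size, dimensionality
    out.write("cat 0.1 0.2 0.3\n")
    out.write("dog 0.4 0.5 0.6\n")

with open("toy_vectors.txt") as fp:
    emb = load_emb(fp)
print(emb.vectors.shape, emb.vocab["dog"].index)  # (2, 3) 1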
Example #19
 def _create_pv(self):
     try:
         vectors = KeyedVectors()
     except TypeError:
         # Newer versions of gensim require a constructor argument.
         vectors = KeyedVectors(self._wv.shape[1])
     phrases = []
     for name in self.terminology.iter_names():
         # This iterates over unique names.
         vectors.vocab[name] = Vocab(index=len(vectors.vocab), count=None)
         vectors.index2word.append(name)
         phrases.append(self._phrase_vector(name))
     vectors.syn0 = vectors.syn0norm = np.array(phrases)
     return vectors
Example #20
def generate_vocab(all_walks):
    index2word = []
    raw_vocab = defaultdict(
        int)  # stores the count of a word appearing in the walk

    for walks in all_walks:
        for walk in walks:
            for word in walk:
                raw_vocab[word] += 1

    # compute count, then to sort based on count, at last set index after sort
    vocab = {}
    for word, v in iteritems(raw_vocab):
        vocab[word] = Vocab(
            count=v, index=len(index2word))  # vocab[word] = (count, index)
        index2word.append(word)  # index2word[index] == word

    index2word.sort(key=lambda word: vocab[word].count,
                    reverse=True)  # descending order
    for i, word in enumerate(index2word):
        vocab[word].index = i  # word2index

    return vocab, index2word  # vocab: word -> Vocab(count, index)
Example #21
 def update(self):
     wv = self.word_vectors_file.get_word_vectors()
     voc = self.vocabs_file.get_vocabs()['word']
     words_in_vocab = [
         k for k, _ in sorted(voc.items(), key=lambda i: i[1][0])
     ]
     word_embs = wv[words_in_vocab[1:]]
     unk_emb = np.mean(word_embs, 0, keepdims=True)
     embs = np.concatenate((unk_emb, word_embs), 0)
     kv = KeyedVectors(embs.shape[1])
     kv.syn0 = embs
     kv.vocab = dict(
         (k, Vocab(index=v[0], count=v[1])) for k, v in voc.items())
     kv.index2word = words_in_vocab
     kv.save(self.path)
Example #22
def __create_keyed_vector(matrix, orig_vocab):
    vocab = dict()
    index_to_word = []
    for word, word_id in sorted(orig_vocab.token2id.items(),
                                key=itemgetter(1)):
        index_to_word.append(word)
        vocab[word] = Vocab(index=word_id, count=orig_vocab.word_freq[word_id])
    vector_size = matrix.shape[1]

    keyed_vector = KeyedVectors(vector_size)
    keyed_vector.vector_size = vector_size
    keyed_vector.vocab = vocab
    keyed_vector.index2word = index_to_word
    keyed_vector.vectors = matrix
    assert (len(vocab), vector_size) == keyed_vector.vectors.shape
    return keyed_vector
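A minimal sketch with a hypothetical stand-in vocabulary object; the function only relies on orig_vocab exposing .token2id (word -> id) and .word_freq (id -> count).

import numpy as np

class ToyVocab:
    token2id = {"apple": 0, "banana": 1}
    word_freq = {0: 10, 1: 7}

kv = __create_keyed_vector(np.random.rand(2, 5), ToyVocab())
print(kv.index2word, kv.vocab["apple"].count)  # ['apple', 'banana'] 10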
Example #23
    def _load_dict(self, file_handle, encoding='utf8'):
        """Load a previously saved dictionary from disk, stored in Facebook's native fasttext format.

        Parameters
        ----------
        file_handle : file-like object
            The opened file handle to the persisted dictionary.
        encoding : str
            Specifies the encoding.

        """
        vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        if nlabels > 0:
            raise NotImplementedError("Supervised fastText models are not supported")
        logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size
            )

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')
Example #24
    def load_dict(self, file_handle, encoding='utf8'):
        vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        logger.info("loading %s words for fastText model from %s", vocab_size,
                    self.file_name)

        self.struct_unpack(file_handle, '@1q')  # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(vocab_size):
            word_bytes = b''
            char_byte = file_handle.read(1)
            # Read vocab word
            while char_byte != b'\x00':
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@qb')

            if i == nwords and i < vocab_size:
                # To handle the error in pretrained vector wiki.fr (French).
                # For more info : https://github.com/facebookresearch/fastText/issues/218

                assert word == "__label__", (
                    'mismatched vocab_size ({}) and nwords ({}), extra word "{}"'
                    .format(vocab_size, nwords, word))
                continue  # don't add word to vocab

            self.wv.vocab[word] = Vocab(index=i, count=count)
            self.wv.index2word.append(word)

        assert len(self.wv.vocab) == nwords, (
            'mismatch between final vocab size ({} words), '
            'and expected number of words ({} words)'.format(
                len(self.wv.vocab), nwords))
        if len(self.wv.vocab) != vocab_size:
            # expecting to log this warning only for pretrained french vector, wiki.fr
            logger.warning(
                "mismatch between final vocab size (%s words), and expected vocab size (%s words)",
                len(self.wv.vocab), vocab_size)

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')
Example #25
    def load_a_format(cls,
                      fin,
                      vocab_size,
                      vector_size,
                      datatype=np.float32,
                      discard=None):
        # type: (Iterable[str], int, int, Any, Optional[Callable[[str], bool]]) -> KeyedVectorsOriginal

        discard_fn = discard or (lambda word: False)  # default: keep every word

        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = np.zeros((vocab_size, vector_size), dtype=datatype)

        for line in fin:
            word, vect = line.rstrip().split(" ", 1)

            if discard_fn(word):
                continue

            weights = np.fromstring(vect, sep=" ", dtype=np.float32)
            # raise ValueError("invalid vector on line %s; is vector_size incorrect or file otherwise damaged?" % (i+1,))

            if word in result.vocab:
                raise DuplicateEntry(word)

            word_id = len(result.index2word)
            result.vocab[word] = Vocab(index=word_id,
                                       count=vocab_size - word_id)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        real_size = len(result.index2word)

        if real_size != vocab_size:
            if discard is None:
                raise EOFError(
                    "unexpected end of input; is vocab_size incorrect or file otherwise damaged?"
                )
            else:
                result.vectors.resize(
                    (real_size, vector_size))  # this should be no-copy

        return result
Example #26
def prep_embeddings_fast(ft, emb_func, limit_vocab_size=30000):
    w2v = Word2Vec()
    emb = None
    if type(emb_func) is np.ndarray:
        size = min(limit_vocab_size, emb_func.shape[0])
        emb = emb_func[:size]
    else:
        emb = np.zeros((limit_vocab_size, ft.D))
        for i in range(limit_vocab_size):
            emb[i], _ = emb_func(ft.id2word[i])
    size = emb.shape[0]
    w2v.index2word = ft.id2word[:size]
    w2v.vector_size = ft.D
    w2v.syn0 = emb
    dvocab = {}
    for word_id, word in enumerate(w2v.index2word):
        dvocab[word] = Vocab(index=word_id, count=ft.nwords - word_id)
    w2v.vocab = dvocab
    return w2v
Example #27
def main():
    """Entry point."""
    parser = argparse.ArgumentParser("AWD-LSTM Embeddings to Word Vectors")
    parser.add_argument("--model", required=True)
    parser.add_argument("--dictionary", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    dictionary = torch.load(args.dictionary)
    model = torch.load(args.model, map_location='cpu')
    embeddings = model[0].encoder.weight.data.cpu().numpy()

    kv = KeyedVectors(embeddings.shape[1])
    kv.syn0 = embeddings
    kv.vocab = {
        w: Vocab(index=i)
        for i, w in enumerate(dictionary.dictionary.idx2word)
    }
    kv.index2word = dictionary.dictionary.idx2word

    kv.save(args.output)
Example #28
    def prepare_vocab(self, hs, negative, wv, update=False,
                      keep_raw_vocab=False, trim_rule=None,
                      min_count=None, sample=None, dry_run=False):
        min_count = min_count or self.min_count
        sample = sample or self.sample
        drop_total = drop_unique = 0

        if not update:
            raise Exception('Nonce2Vec can only update a pre-existing '
                            'vocabulary')
        logger.info('Updating model with new vocabulary')
        new_total = pre_exist_total = 0
        # New words and pre-existing words are two separate lists
        new_words = []
        pre_exist_words = []
        if self.nonce is not None:
        # if self.nonce is not None and self.nonce in wv.vocab:
            if self.nonce in wv.vocab:
                gold_nonce = '{}_true'.format(self.nonce)
                nonce_index = wv.vocab[self.nonce].index
                wv.vocab[gold_nonce] = wv.vocab[self.nonce]
                wv.index2word[nonce_index] = gold_nonce
                # del wv.index2word[wv.vocab[self.nonce].index]
                del wv.vocab[self.nonce]
            for word, v in iteritems(self.raw_vocab):
                # Update count of all words already in vocab
                if word in wv.vocab:
                    pre_exist_words.append(word)
                    pre_exist_total += v
                    if not dry_run:
                        wv.vocab[word].count += v
                else:
                    # For new words, keep the ones above the min count
                    # AND the nonce (regardless of count)
                    if keep_vocab_item(word, v, min_count,
                                       trim_rule=trim_rule) or word == self.nonce:
                        new_words.append(word)
                        new_total += v
                        if not dry_run:
                            wv.vocab[word] = Vocab(count=v,
                                                   index=len(wv.index2word))
                            wv.index2word.append(word)
                    else:
                        drop_unique += 1
                        drop_total += v
            original_unique_total = len(pre_exist_words) \
                + len(new_words) + drop_unique
            pre_exist_unique_pct = len(pre_exist_words) \
                * 100 / max(original_unique_total, 1)
            new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1)
            logger.info('New added %i unique words (%i%% of original %i) '
                        'and increased the count of %i pre-existing words '
                        '(%i%% of original %i)', len(new_words),
                        new_unique_pct, original_unique_total,
                        len(pre_exist_words), pre_exist_unique_pct,
                        original_unique_total)
            retain_words = new_words + pre_exist_words
            retain_total = new_total + pre_exist_total

        # Precalculate each vocabulary item's threshold for sampling
        if not sample:
            # no words downsampled
            threshold_count = retain_total
        # Only retaining one subsampling notion from original gensim implementation
        else:
            threshold_count = sample * retain_total

        downsample_total, downsample_unique = 0, 0
        for w in retain_words:
            v = wv.vocab[w].count
            word_probability = (np.sqrt(v / threshold_count) + 1) \
                * (threshold_count / v)
            if word_probability < 1.0:
                downsample_unique += 1
                downsample_total += word_probability * v
            else:
                word_probability = 1.0
                downsample_total += v
            if not dry_run:
                wv.vocab[w].sample_int = int(round(word_probability * 2**32))

        if not dry_run and not keep_raw_vocab:
            logger.info('deleting the raw counts dictionary of %i items',
                        len(self.raw_vocab))
            self.raw_vocab = defaultdict(int)

        logger.info('sample=%g downsamples %i most-common words', sample,
                    downsample_unique)
        logger.info('downsampling leaves estimated %i word corpus '
                    '(%.1f%% of prior %i)', downsample_total,
                    downsample_total * 100.0 / max(retain_total, 1),
                    retain_total)

        # return from each step: words-affected, resulting-corpus-size,
        # extra memory estimates
        report_values = {
            'drop_unique': drop_unique, 'retain_total': retain_total,
            'downsample_unique': downsample_unique,
            'downsample_total': int(downsample_total),
            'num_retained_words': len(retain_words)
        }

        if self.null_word:
            # create null pseudo-word for padding when using concatenative
            # L1 (run-of-words)
            # this word is only ever input – never predicted – so count,
            # huffman-point, etc doesn't matter
            self.add_null_word(wv)

        if self.sorted_vocab and not update:
            self.sort_vocab(wv)
        if hs:
            # add info about each word's Huffman encoding
            self.create_binary_tree(wv)
        if negative:
            # build the table for drawing random words (for negative sampling)
            self.make_cum_table(wv)

        return report_values, pre_exist_words