Exemple #1
    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        `fname` receives a "rows cols" header line followed by one line per word.
        If `fvocab` is given, a "word count" listing is additionally written there.
        If `binary` is set, each vector is written via `row.tostring()`; otherwise
        it is written as space-separated `%f`-formatted values.

        """
        # most frequent words at the top; computed once and shared by both outputs
        by_frequency = sorted(iteritems(self.vocab),
                              key=lambda item: -item[1].count)
        if fvocab is not None:
            logger.info("Storing vocabulary in %s" % (fvocab))
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in by_frequency:
                    vout.write("%s %s\n" % (word, vocab.count))
        logger.info("storing %sx%s projection weights into %s" %
                    (len(self.vocab), self.layer1_size, fname))
        assert (len(self.vocab), self.layer1_size) == self.syn0.shape
        # NOTE(review): str is written to a file opened with 'wb' -- presumably
        # this relies on Python 2 string semantics; confirm before running on Python 3.
        with utils.smart_open(fname, 'wb') as fout:
            fout.write("%s %s\n" % self.syn0.shape)
            for word, vocab in by_frequency:
                word = utils.to_utf8(word)  # always store in utf8
                row = self.syn0[vocab.index]
                if binary:
                    fout.write("%s %s\n" % (word, row.tostring()))
                else:
                    fout.write("%s %s\n" % (word, ' '.join("%f" % val
                                                           for val in row)))
Exemple #2
    def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.

        The returned Dictionary reuses `self.id2word` as its `id2token`, gets the
        inverted mapping as `token2id`, copies `num_docs`/`num_nnz` from this
        corpus, and has `dfs`/`num_pos` accumulated by one pass over the corpus.

        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1  # one more document containing this entry
                dictionary.num_pos += count  # running total of all counts seen

        return dictionary
Exemple #4
    def build_vocab(self, sentences):
        """
        Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        Words seen fewer than `self.min_count` times are dropped; the remaining
        words are stored in `self.vocab` (word -> Vocab) and `self.index2word`
        (index -> word).

        """
        logger.info("collecting all words and their counts")
        sentence_no, vocab = -1, {}  # -1 so that an empty input reports 0 sentences
        total_words = 0
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                    (sentence_no, total_words, len(vocab)))
            for word in sentence:
                total_words += 1
                if word in vocab:
                    vocab[word].count += 1
                else:
                    vocab[word] = Vocab(count=1)
        logger.info("collected %i word types from a corpus of %i words and %i sentences" %
            (len(vocab), total_words, sentence_no + 1))

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []
        for word, v in iteritems(vocab):
            if v.count >= self.min_count:
                v.index = len(self.vocab)
                self.index2word.append(word)  # keep index -> word in step with v.index
                self.vocab[word] = v
        logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count))

        # add info about each word's Huffman encoding
        # NOTE(review): the step announced by the comment above is not present in
        # this excerpt -- restore it from the full source.
Exemple #5
def revdict(d):
    """
    Reverse a dictionary mapping.

    When two keys map to the same value, only one of them will be kept in the
    result (which one is kept is arbitrary).

    """
    return dict((v, k) for (k, v) in iteritems(d))
Exemple #7
    def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
        """
        Remove document frequency statistics for tokens that appear in

        1. less than `no_below` documents (absolute number) or
        2. more than `no_above` documents (fraction of total corpus size, *not*
           absolute number).
        3. after (1) and (2), keep only the first `keep_n` most frequent tokens (or
           keep all if `None`).

        **Note:** since HashDictionary's id range is fixed and doesn't depend on
        the number of tokens seen, this doesn't really "remove" anything. It only
        clears some supplementary statistics, for easier debugging and a smaller RAM
        footprint.

        """
        no_above_abs = int(
            no_above * self.num_docs
        )  # convert fractional threshold to absolute threshold
        ok = [
            item for item in iteritems(self.dfs_debug)
            if no_below <= item[1] <= no_above_abs
        ]
        # of the surviving words, keep the `keep_n` with the highest document frequency
        ok = frozenset(
            word
            for word, freq in sorted(ok, key=lambda item: -item[1])[:keep_n])

        self.dfs_debug = dict((word, freq)
                              for word, freq in iteritems(self.dfs_debug)
                              if word in ok)
        self.token2id = dict((token, tokenid)
                             for token, tokenid in iteritems(self.token2id)
                             if token in self.dfs_debug)
        self.id2token = dict(
            (tokenid, set(token for token in tokens
                          if token in self.dfs_debug))
            for tokenid, tokens in iteritems(self.id2token))
        # drop ids whose token set became empty in the step above
        self.dfs = dict((tokenid, freq)
                        for tokenid, freq in iteritems(self.dfs)
                        if self.id2token.get(tokenid, set()))

        # for word->document frequency
        logger.info(
            "kept statistics for which were in no less than %i and no more than %i (=%.1f%%) documents"
            % (no_below, no_above_abs, 100.0 * no_above))
Exemple #8
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` is set, then also update dictionary in the process: create
        ids for new words. At the same time, update document frequencies -- for
        each word appearing in this document, increase its document frequency (`self.dfs`)
        by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.

        If `return_missing` is set, return a `(bow, missing)` pair, where `missing`
        maps each word that had no id on entry to its count in `document`.

        Raises TypeError if `document` is a single string instead of a token sequence.

        """
        result = {}
        missing = {}
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of utf8 tokens on input, not a string"
            )
        document = sorted(utils.to_utf8(token) for token in document)
        # construct (word, frequency) mapping. in python3 this is done simply
        # using Counter(), but here i use itertools.groupby() for the job
        for word_norm, group in itertools.groupby(document):
            frequency = len(
                list(group)
            )  # how many times does this word appear in the input document
            tokenid = self.token2id.get(word_norm, None)
            if tokenid is None:
                # first time we see this token (~normalized form)
                if return_missing:
                    missing[word_norm] = frequency
                if not allow_update:  # if we aren't allowed to create new tokens, continue with the next unique token
                    continue
                tokenid = len(self.token2id)
                self.token2id[
                    word_norm] = tokenid  # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!

            # update how many times a token appeared in the document
            result[tokenid] = frequency

        if allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
Exemple #9
    def save_as_text(self, fname):
        """
        Save this Dictionary to a text file, in format:
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.

        Lines are written in sorted token order; a token without an entry in
        `self.dfs` is written with document frequency 0.

        Note: use `save`/`load` to store in binary format instead (pickle).

        """
        logger.info("saving dictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for token, tokenid in sorted(iteritems(self.token2id)):
                fout.write("%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)))
Exemple #12
def cossim(vec1, vec2):
    """
    Return the cosine similarity of two sparse vectors.

    Each argument is anything `dict()` accepts as (index, value) pairs.
    Returns 0.0 when either vector is empty. Asserts that both vectors have a
    non-zero length.

    """
    first, second = dict(vec1), dict(vec2)
    if not (first and second):
        return 0.0

    norm1 = math.sqrt(sum(weight * weight for weight in itervalues(first)))
    norm2 = math.sqrt(sum(weight * weight for weight in itervalues(second)))
    assert vec1len_ok(norm1, norm2) if False else (norm1 > 0.0 and norm2 > 0.0), "sparse documents must not contain any explicit zero entries"

    # walk the shorter vector and look each index up in the longer one
    if len(second) < len(first):
        shorter, longer = second, first
    else:
        shorter, longer = first, second
    dot = sum(weight * longer.get(idx, 0.0) for idx, weight in iteritems(shorter))
    return dot / (norm1 * norm2)  # rescale by vector lengths
Exemple #14
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` is set, then also update dictionary in the process: create
        ids for new words. At the same time, update document frequencies -- for
        each word appearing in this document, increase its document frequency (`self.dfs`)
        by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.

        If `return_missing` is set, return a `(bow, missing)` pair, where `missing`
        maps each word that had no id on entry to its count in `document`.

        Raises TypeError if `document` is a single string instead of a token sequence.

        """
        result = {}
        missing = {}
        if isinstance(document, string_types):
            raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
        document = sorted(utils.to_utf8(token) for token in document)
        # construct (word, frequency) mapping. in python3 this is done simply
        # using Counter(), but here i use itertools.groupby() for the job
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group)) # how many times does this word appear in the input document
            tokenid = self.token2id.get(word_norm, None)
            if tokenid is None:
                # first time we see this token (~normalized form)
                if return_missing:
                    missing[word_norm] = frequency
                if not allow_update: # if we aren't allowed to create new tokens, continue with the next unique token
                    continue
                tokenid = len(self.token2id)
                self.token2id[word_norm] = tokenid # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!

            # update how many times a token appeared in the document
            result[tokenid] = frequency

        if allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
Exemple #15
    def filter_tokens(self, bad_ids=None, good_ids=None):
        """
        Remove the selected `bad_ids` tokens from all dictionary mappings, or, keep
        selected `good_ids` in the mapping and remove the rest.

        `bad_ids` is a collection of word ids to be removed; `good_ids` is a
        collection of word ids to be kept. If both are given, both filters are
        applied, `bad_ids` first.

        """
        if bad_ids is not None:
            bad_ids = set(bad_ids)  # set => O(1) membership tests below
            self.token2id = dict((token, tokenid)
                                 for token, tokenid in iteritems(self.token2id)
                                 if tokenid not in bad_ids)
            self.dfs = dict((tokenid, freq)
                            for tokenid, freq in iteritems(self.dfs)
                            if tokenid not in bad_ids)
        if good_ids is not None:
            good_ids = set(good_ids)
            self.token2id = dict((token, tokenid)
                                 for token, tokenid in iteritems(self.token2id)
                                 if tokenid in good_ids)
            self.dfs = dict((tokenid, freq)
                            for tokenid, freq in iteritems(self.dfs)
                            if tokenid in good_ids)
Exemple #19
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` or `self.allow_update` is set, then also update dictionary
        in the process: update overall corpus statistics and document frequencies.
        For each id appearing in this document, increase its document frequency
        (`self.dfs`) by one.

        If `return_missing` is set, return a `(bow, missing)` pair; `missing` is
        always an empty dict here, because every word is assigned an id by
        `self.restricted_hash`.

        """
        result = {}
        missing = {}
        document = sorted(
            document)  # convert the input to plain list (needed below)
        for word_norm, group in itertools.groupby(document):
            frequency = len(
                list(group)
            )  # how many times does this word appear in the input document
            tokenid = self.restricted_hash(word_norm)
            # accumulate: several different words may share one tokenid
            result[tokenid] = result.get(tokenid, 0) + frequency
            if self.debug:
                # increment document count for each unique token that appeared in the document
                self.dfs_debug[word_norm] = self.dfs_debug.get(word_norm,
                                                               0) + 1

        if allow_update or self.allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            if self.debug:
                # increment document count for each unique tokenid that appeared in the document
                # done here, because several words may map to the same tokenid
                for tokenid in iterkeys(result):
                    self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
Exemple #20
    def merge_with(self, other):
        """
        Merge another dictionary into this dictionary, mapping same tokens to the
        same ids and new tokens to new ids. The purpose is to merge two corpora
        created using two different dictionaries, one from `self` and one from `other`.

        `other` can be any id=>word mapping (a dict, a Dictionary object, ...).

        Return a transformation object which, when accessed as `result[doc_from_other_corpus]`,
        will convert documents from a corpus built using the `other` dictionary
        into a document using the new, merged dictionary (see :class:`gensim.interfaces.TransformationABC`).

        Example:

        >>> dict1 = Dictionary(some_documents)
        >>> dict2 = Dictionary(other_documents)  # ids not compatible with dict1!
        >>> dict2_to_dict1 = dict1.merge_with(dict2)
        >>> # now we can merge corpora from the two incompatible dictionaries into one
        >>> merged_corpus = itertools.chain(some_corpus_from_dict1, dict2_to_dict1[some_corpus_from_dict2])

        """
        old2new = {}
        for other_id, other_token in iteritems(other):
            if other_token in self.token2id:
                new_id = self.token2id[other_token]
            else:
                new_id = len(self.token2id)
                self.token2id[other_token] = new_id
                self.dfs[new_id] = 0
            old2new[other_id] = new_id
            try:
                self.dfs[new_id] += other.dfs[other_id]
            except Exception:
                # `other` isn't a Dictionary (probably just a dict) => ignore dfs, keep going
                pass
        try:
            self.num_docs += other.num_docs
            self.num_nnz += other.num_nnz
            self.num_pos += other.num_pos
        except Exception:
            # same best-effort as above: `other` carries no corpus statistics
            pass

        import gensim.models
        return gensim.models.VocabTransform(old2new)
Exemple #23
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, wordCnt in doc)
            all_terms = sorted(
                all_terms
            )  # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(
                all_terms))  # build a mapping of word id(int) -> word (string)
        else:
            logger.info("using provided word mapping (%i ids)" % len(id2word))
            self.id2word = id2word
        self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info("loaded corpus with %i documents and %i terms from %s" %
                    (self.num_docs, self.num_terms, fname))
Exemple #25
    def save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset()):
        """
        Save the object to file (also see `load`).

        If `separately` is None, automatically detect large numpy/scipy.sparse arrays
        in the object being stored, and store them into separate files. This avoids
        pickle memory errors and allows mmap'ing large arrays back on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute
        names to be stored in separate files. The automatic check is not performed in this case.

        `ignore` is a set of attribute names to *not* serialize (file handles, caches etc). On
        subsequent load() these attributes will be set to None.

        """
        logger.info("saving %s object under %s, separately %s" % (self.__class__.__name__, fname, separately))
        subname = lambda suffix: fname + '.' + suffix + '.npy'
        tmp = {}
        if separately is None:
            separately = []
            for attrib, val in iteritems(self.__dict__):
                if isinstance(val, numpy.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and val.nnz >= sep_limit:
                    separately.append(attrib)

        # whatever's in `separately` or `ignore` at this point won't get pickled anymore
        for attrib in separately + list(ignore):
            if hasattr(self, attrib):
                tmp[attrib] = getattr(self, attrib)
                delattr(self, attrib)

        try:
            numpys, scipys, ignoreds = [], [], []
            for attrib, val in iteritems(tmp):
                if isinstance(val, numpy.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing numpy array '%s' to %s" % (attrib, subname(attrib)))
                    numpy.save(subname(attrib), numpy.ascontiguousarray(val))
                elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s" % (attrib, subname(attrib)))
                    numpy.save(subname(attrib) + '.data.npy', val.data)
                    numpy.save(subname(attrib) + '.indptr.npy', val.indptr)
                    numpy.save(subname(attrib) + '.indices.npy', val.indices)
                    # temporarily detach the arrays so only the matrix shell is pickled
                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None
                    try:
                        pickle(val, subname(attrib)) # store array-less object
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s" % (attrib))
                    ignoreds.append(attrib)
            # record what was stored where, alongside the pickled object
            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            pickle(self, fname)
        finally:
            # restore the attributes
            for attrib, val in iteritems(tmp):
                setattr(self, attrib, val)
 def __getitem__(self, tokenid):
     """
     Return the token stored under `tokenid`.

     The id->token mapping is rebuilt from `self.token2id` whenever the two
     mappings differ in size. Raises KeyError for ids that are not present.

     """
     mapping_is_stale = len(self.id2token) != len(self.token2id)
     if mapping_is_stale:
         # the word->id mapping has changed (presumably via add_documents);
         # recompute id->word accordingly
         self.id2token = dict(
             (known_id, token) for token, known_id in iteritems(self.token2id))
     return self.id2token[tokenid]  # will throw for non-existent ids
Exemple #28
def precompute_idfs(wglobal, dfs, total_docs):
    """
    Precompute the inverse document frequency mapping for all terms.

    Returns a dict mapping each term id in `dfs` to
    `wglobal(document_frequency, total_docs)`.

    """
    # not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
    # this method is here just to speed things up a little.
    idfs = {}
    for term_id, doc_freq in iteritems(dfs):
        idfs[term_id] = wglobal(doc_freq, total_docs)
    return idfs
Exemple #30
    def accuracy(self, questions, restrict_vocab=30000):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word whose frequency
        is not in the top-N most frequent words (default top 30,000).

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        Raises ValueError if a question line appears before any section header.

        """
        ok_vocab = dict(sorted(iteritems(self.vocab),
                               key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in itervalues(ok_vocab))

        def log_accuracy(section):
            correct, incorrect = section['correct'], section['incorrect']
            if correct + incorrect > 0:
                logger.info("%s: %.1f%% (%i/%i)" %
                    (section['section'], 100.0 * correct / (correct + incorrect),
                    correct, correct + incorrect))

        sections, section = [], None
        # `with` makes sure the questions file is closed even if an error is raised
        with open(questions) as fin:
            for line_no, line in enumerate(fin):
                # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
                if line.startswith(': '):
                    # a new section starts => store the old section
                    if section:
                        sections.append(section)
                        log_accuracy(section)
                    section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
                else:
                    if not section:
                        raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                    try:
                        a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
                    except ValueError:
                        # the line did not split into exactly four words
                        logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                        continue
                    if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                        logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                        continue

                    ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                    predicted = None
                    # find the most likely prediction, ignoring OOV words and input words
                    for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                        if index in ok_index and index not in ignore:
                            predicted = self.index2word[index]
                            if predicted != expected:
                                logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                            break  # only the best admissible candidate counts
                    section['correct' if predicted == expected else 'incorrect'] += 1
        if section:
            # store the last section, too
            sections.append(section)
            log_accuracy(section)

        total = {'section': 'total', 'correct': sum(s['correct'] for s in sections), 'incorrect': sum(s['incorrect'] for s in sections)}
        log_accuracy(total)
        sections.append(total)
        return sections
Exemple #31
    def save(self,
             fname,
             separately=None,
             sep_limit=10 * 1024**2,
             ignore=frozenset()):
        """
        Save the object to file (also see `load`).

        If `separately` is None, automatically detect large numpy/scipy.sparse arrays
        in the object being stored, and store them into separate files. This avoids
        pickle memory errors and allows mmap'ing large arrays back on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute
        names to be stored in separate files. The automatic check is not performed in this case.

        `ignore` is a set of attribute names to *not* serialize (file handles, caches etc). On
        subsequent load() these attributes will be set to None.

        """
        logger.info("saving %s object under %s, separately %s" %
                    (self.__class__.__name__, fname, separately))
        subname = lambda suffix: fname + '.' + suffix + '.npy'
        tmp = {}
        if separately is None:
            separately = []
            for attrib, val in iteritems(self.__dict__):
                if isinstance(val, numpy.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(
                        val,
                        (scipy.sparse.csr_matrix,
                         scipy.sparse.csc_matrix)) and val.nnz >= sep_limit:
                    separately.append(attrib)

        # whatever's in `separately` or `ignore` at this point won't get pickled anymore
        for attrib in separately + list(ignore):
            if hasattr(self, attrib):
                tmp[attrib] = getattr(self, attrib)
                delattr(self, attrib)

        try:
            numpys, scipys, ignoreds = [], [], []
            for attrib, val in iteritems(tmp):
                if isinstance(val, numpy.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing numpy array '%s' to %s" %
                                (attrib, subname(attrib)))
                    numpy.save(subname(attrib), numpy.ascontiguousarray(val))
                elif isinstance(
                        val,
                        (scipy.sparse.csr_matrix,
                         scipy.sparse.csc_matrix)) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s" %
                                (attrib, subname(attrib)))
                    numpy.save(subname(attrib) + '.data.npy', val.data)
                    numpy.save(subname(attrib) + '.indptr.npy', val.indptr)
                    numpy.save(subname(attrib) + '.indices.npy', val.indices)
                    # temporarily detach the arrays so only the matrix shell is pickled
                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None
                    try:
                        pickle(val, subname(attrib))  # store array-less object
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s" % (attrib))
                    ignoreds.append(attrib)
            # record what was stored where, alongside the pickled object
            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            pickle(self, fname)
        finally:
            # restore the attributes
            for attrib, val in iteritems(tmp):
                setattr(self, attrib, val)