Example #1
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus documents. This batch query is more
        efficient than computing the similarities one document after another.
        """
        self.close_shard() # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only num_best most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            result = numpy.hstack(shard_results)
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
            convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim)
                                             for doc_index, sim in doc]
            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
                result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(doc, shard_no) for doc in result]
                    results.append(shard_result)
                result = []
                for parts in izip(*results):
                    merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result
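This method reads like gensim's sharded similarities.Similarity.__getitem__. Below is a minimal usage sketch under that assumption; the toy texts and the '/tmp/sim_shard' prefix are made up for illustration.

from gensim import corpora, similarities

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

# build a disk-backed, sharded index over the corpus
index = similarities.Similarity('/tmp/sim_shard', bow_corpus,
                                num_features=len(dictionary))

index.num_best = 2              # ask for only the 2 most similar documents
single = index[bow_corpus[0]]   # single-document query -> [(doc_id, similarity), ...]
batch = index[bow_corpus]       # batch query -> one such list per query document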
Example #2
    def compactify(self):
        """
        Assign new word ids to all words.

        This is done to make the ids more compact, e.g. after some tokens have
        been removed via :func:`filter_tokens` and there are gaps in the id series.
        Calling this method will remove the gaps.
        """
        logger.debug("rebuilding dictionary, shrinking gaps")

        # build mapping from old id -> new id
        idmap = dict(
            izip(itervalues(self.token2id), xrange(len(self.token2id))))

        # reassign mappings to new ids
        self.token2id = dict((token, idmap[tokenid])
                             for token, tokenid in iteritems(self.token2id))
        self.id2token = {}
        self.dfs = dict(
            (idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
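The compactify step only matters once the id sequence has gaps. A minimal sketch, assuming this is gensim's corpora.Dictionary; depending on the gensim version, filter_tokens may already compactify for you, in which case the explicit call simply reassigns contiguous ids again.

from gensim.corpora import Dictionary

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)

# remove one token by id, which can leave a gap in the id sequence ...
dictionary.filter_tokens(bad_ids=[dictionary.token2id["graph"]])
# ... then close the gap so ids run 0 .. len(dictionary)-1 again
dictionary.compactify()

print(dictionary.token2id)   # contiguous ids, "graph" gone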
Example #3
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, wordCnt in doc)
            all_terms = sorted(
                all_terms
            )  # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(izip(
                xrange(len(all_terms)),
                all_terms))  # build a mapping of word id(int) -> word (string)
        else:
            logger.info("using provided word mapping (%i ids)" % len(id2word))
            self.id2word = id2word
        self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info("loaded corpus with %i documents and %i terms from %s" %
                    (self.num_docs, self.num_terms, fname))
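The constructor matches gensim's corpora.LowCorpus (GibbsLDA++ "List-of-Words" format: the first line holds the document count, then one whitespace-separated document per line). A minimal sketch under that assumption; the file path is illustrative.

from gensim.corpora import LowCorpus

# write a tiny corpus in List-of-Words format
with open("/tmp/docs.low", "w") as fout:
    fout.write("2\n"
               "human interface computer\n"
               "graph trees minors\n")

corpus = LowCorpus("/tmp/docs.low")         # no id2word given -> vocabulary built from the file
print(corpus.num_docs, corpus.num_terms)    # 2 documents, 6 distinct terms
for bow in corpus:                          # each document as (word_id, count) pairs
    print(bow)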
Example #4
    def __iter__(self):
        # each consecutive pair of values in `indptr` delimits the nonzero entries
        # of one document in the underlying scipy sparse matrix
        for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
            yield zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])
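This iterator walks the indptr array of a scipy sparse matrix, yielding one document per pointer pair as (index, value) tuples. A minimal sketch, assuming this is gensim's matutils.Sparse2Corpus with documents stored as columns.

import scipy.sparse
from gensim.matutils import Sparse2Corpus

# 3 terms x 2 documents; documents are the columns
mat = scipy.sparse.csc_matrix([[1, 0],
                               [0, 2],
                               [3, 0]])
corpus = Sparse2Corpus(mat, documents_columns=True)

for doc in corpus:
    # first doc: term 0 with weight 1, term 2 with weight 3; second doc: term 1 with weight 2
    print(list(doc))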