def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of similarities
    of all query documents vs. all corpus documents. This batch query is more
    efficient than computing the similarities one document after another.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    # reset num_best and normalize parameters, in case they were changed dynamically
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.normalize

    # there are 4 distinct code paths, depending on whether input `query` is
    # a corpus (or numpy/scipy matrix) or a single document, and whether the
    # similarity result should be a full array or only the num_best most similar
    # documents.
    pool, shard_results = self.query_shards(query)
    if self.num_best is None:
        # user asked for all documents => just stack the sub-results into a single matrix
        # (works for both corpus / single doc query)
        result = numpy.hstack(shard_results)
    else:
        # the following uses a lot of lazy evaluation and (optionally) parallel
        # processing, to improve query latency and minimize memory footprint.
        offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
        convert = lambda doc, shard_no: [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

        is_corpus, query = utils.is_corpus(query)
        is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
        if not is_corpus:
            # user asked for num_best most similar and query is a single doc
            results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
            result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
        else:
            # the trickiest combination: returning num_best results when query was a corpus
            results = []
            for shard_no, result in enumerate(shard_results):
                shard_result = [convert(doc, shard_no) for doc in result]
                results.append(shard_result)
            result = []
            for parts in izip(*results):
                merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                result.append(merged)
    if pool:
        # gc doesn't seem to collect the Pools, eventually leading to
        # "IOError 24: too many open files". so let's terminate it manually.
        pool.terminate()

    return result
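A minimal usage sketch of the query paths described above, assuming this is gensim's `similarities.Similarity` index; the toy corpus and the "/tmp/sim_index" prefix are illustrative:

from gensim import corpora, similarities

documents = [["human", "computer", "interaction"],
             ["survey", "user", "computer", "system"],
             ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

index = similarities.Similarity("/tmp/sim_index", corpus, num_features=len(dictionary))

sims = index[corpus[0]]    # single-document query: one array of similarities
all_sims = index[corpus]   # batch query: one row of similarities per query document

index.num_best = 2         # now each result is a list of (document_id, similarity) pairs
top2 = index[corpus[0]]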
def compactify(self):
    """
    Assign new word ids to all words.

    This is done to make the ids more compact, e.g. after some tokens have
    been removed via :func:`filter_tokens` and there are gaps in the id series.
    Calling this method will remove the gaps.
    """
    logger.debug("rebuilding dictionary, shrinking gaps")

    # build mapping from old id -> new id
    idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
    self.id2token = {}
    self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
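A small sketch of the intended call pattern, assuming a gensim-style `corpora.Dictionary`; the toy documents are illustrative. Removing tokens leaves gaps in the id sequence, and `compactify()` renumbers the ids back to a contiguous range:

from gensim import corpora

dictionary = corpora.Dictionary([["cat", "dog", "fish"], ["cat", "bird"]])
dictionary.filter_tokens(bad_ids=[dictionary.token2id["dog"]])  # can leave a gap at the removed id
dictionary.compactify()  # renumber the remaining ids so they are contiguous again
print(dictionary.token2id)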
def __init__(self, fname, id2word=None, line2words=split_on_space):
    """
    Initialize the corpus from a file.

    `id2word` and `line2words` are optional parameters.

    If provided, `id2word` is a dictionary mapping between word_ids (integers)
    and words (strings). If not provided, the mapping is constructed from
    the documents.

    `line2words` is a function which converts lines into tokens. Defaults to
    simple splitting on spaces.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    self.fname = fname  # input file, see class doc for format
    self.line2words = line2words  # how to translate lines into words (simply split on space by default)
    self.num_docs = self._calculate_num_docs()

    if not id2word:
        # build a list of all word types in the corpus (distinct words)
        logger.info("extracting vocabulary from the corpus")
        all_terms = set()
        self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
        for doc in self:
            all_terms.update(word for word, wordCnt in doc)
        all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
        self.id2word = dict(izip(xrange(len(all_terms)), all_terms))  # build a mapping of word id (int) -> word (string)
    else:
        logger.info("using provided word mapping (%i ids)" % len(id2word))
        self.id2word = id2word

    self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
    self.num_terms = len(self.word2id)
    self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

    logger.info("loaded corpus with %i documents and %i terms from %s" %
                (self.num_docs, self.num_terms, fname))
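If this constructor belongs to gensim's `corpora.LowCorpus` (the GibbsLDA++ "list of words" format: the first line holds the document count, each following line is one space-separated document), a hedged usage sketch might look like this; the file path is illustrative:

from gensim.corpora import LowCorpus

with open("/tmp/docs.low", "w") as fout:
    fout.write("2\n")
    fout.write("human computer interaction\n")
    fout.write("graph minors trees\n")

corpus = LowCorpus("/tmp/docs.low")   # vocabulary is extracted from the file itself
for doc in corpus:                    # documents come back as (word_id, count) 2-tuples
    print(doc)

# passing an explicit id2word mapping skips the vocabulary-extraction pass
corpus2 = LowCorpus("/tmp/docs.low", id2word=corpus.id2word)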
def __iter__(self):
    # walk consecutive indptr offsets of the underlying scipy sparse matrix;
    # each slice is one document, yielded as (id, value) 2-tuples
    for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
        yield zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])
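A hedged illustration of this iteration, assuming a wrapper along the lines of gensim's `matutils.Sparse2Corpus`, where each CSC column is one document; the toy matrix is illustrative:

import scipy.sparse
from gensim.matutils import Sparse2Corpus

dense = [[1, 0, 2],
         [0, 3, 0],
         [4, 0, 5]]
mat = scipy.sparse.csc_matrix(dense)   # 3 terms x 3 documents

corpus = Sparse2Corpus(mat, documents_columns=True)
for doc in corpus:                     # each document is a sequence of (term_id, value) pairs
    print(list(doc))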