class SegmentCorpus(CorpusABC):
    """Implementation of the gensim corpus interface that makes it possible
    to use segments as a read-only gensim corpus.

    Each segment's value is converted to ngrams and then to a bag-of-words
    vector via the supplied gensim dictionary.
    """

    def __init__(self, segment_name, dictionary, segmentstorage, limit=None):
        """Create a corpus over the segments stored under `segment_name`.

        :param segment_name: name of the segment collection to iterate.
        :param dictionary: gensim Dictionary whose tokens are ngrams of a
            single fixed size.
        :param segmentstorage: storage object providing `load_iterator` and
            `count`.
        :param limit: optional cap on the number of segments yielded.
        """
        self._segment_name = segment_name
        self._dictionary = dictionary
        self._segstorage = segmentstorage
        self._limit = limit
        # Infer the ngram size from an arbitrary dictionary token (all tokens
        # are ngrams of the same length).
        # BUGFIX: the original used ``dictionary.keys().__iter__().next()``,
        # a Python-2-only idiom that raises AttributeError on Python 3;
        # ``next(iter(...))`` is the portable equivalent.
        n = len(dictionary[next(iter(dictionary.keys()))])
        self._transformer = NgramTransformer(n)

    def __iter__(self):
        """Yield each segment as a gensim bag-of-words vector.

        Honors ``self._limit`` when it is not None.
        """
        # Single loop with an early-break guard replaces the original's two
        # duplicated branches (limited / unlimited).
        for idx, segment in enumerate(
                self._segstorage.load_iterator(name=self._segment_name)):
            if self._limit is not None and idx >= self._limit:
                break
            ngrams = self._transformer.transform([segment.value])[0]
            yield self._dictionary.doc2bow(ngrams)

    def __len__(self):
        """Return the number of segments this corpus will yield."""
        count = self._segstorage.count(self._segment_name)
        if self._limit is not None:
            return min(count, self._limit)
        return count

    def save(self, fname):
        """Unsupported: this corpus is a read-only view, not a serializable one."""
        raise NotImplementedError('This corpus does not support saving as it is merely a read-only interface to a combined MongoDb-backed database.')
class DictionaryLearner(object):
    """Build a gensim dictionary over the ngrams of all stored documents."""

    def __init__(self, n=4):
        """Set up the learner with an ngram vocabulary of size `n`."""
        self._ngram = NgramTransformer(n)
        self._dictionary = Dictionary()

    def fit(self, documentstorage, filter_extremes=True):
        """Accumulate dictionary statistics from every document in the storage.

        :param documentstorage: storage providing `load_iterator`.
        :param filter_extremes: when True, prune very rare and very common
            tokens after all documents have been added.
        """
        for doc in documentstorage.load_iterator(u''):
            self._dictionary.add_documents(self._ngram.transform([doc.text]))
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        """Return the learned gensim dictionary."""
        return self._dictionary