class SegmentCorpus(CorpusABC):
    '''Read-only gensim corpus view over the segments stored under one name.

    Each segment loaded from the segment storage is converted to n-grams by an
    `NgramTransformer` and then to a bag-of-words with the dictionary's
    ``doc2bow`` method.
    '''

    def __init__(self, segment_name, dictionary, segmentstorage, limit=None):
        '''Create a corpus over the segments called `segment_name`.

        Parameters:
            segment_name: name handed to the storage when loading and counting
                segments.
            dictionary: object offering ``keys()``, item access and
                ``doc2bow``; it must contain at least one entry.
            segmentstorage: object offering ``load_iterator(name=...)`` and
                ``count(name)``.
            limit: optional maximum number of segments exposed by the corpus;
                `None` means no limit.

        Raises:
            ValueError: if `dictionary` has no entries, because the n-gram
                size cannot be inferred then.
        '''
        self._segment_name = segment_name
        self._dictionary = dictionary
        self._segstorage = segmentstorage
        self._limit = limit

        # The n-gram size is not passed in, so it is inferred from the length
        # of one dictionary entry (assumes all entries share that length).
        # next(iter(...)) works on Python 2 and 3, unlike the Python 2-only
        # iterator method `.next()`.
        try:
            sample_key = next(iter(dictionary.keys()))
        except StopIteration:
            raise ValueError('Cannot infer the n-gram size from an empty dictionary.')
        self._transformer = NgramTransformer(len(dictionary[sample_key]))

    def _segment_to_bow(self, segment):
        '''Convert one segment's `value` to the dictionary's bag-of-words form.'''
        ngrams = self._transformer.transform([segment.value])[0]
        return self._dictionary.doc2bow(ngrams)

    def __iter__(self):
        '''Yield the bag-of-words of each stored segment.

        Iteration stops after `limit` segments when a limit was given.
        '''
        segments = self._segstorage.load_iterator(name=self._segment_name)
        for idx, segment in enumerate(segments):
            if self._limit is not None and idx >= self._limit:
                break
            yield self._segment_to_bow(segment)

    def __len__(self):
        '''Return the stored segment count, capped at `limit` when one is set.'''
        count = self._segstorage.count(self._segment_name)
        if self._limit is not None:
            return min(count, self._limit)
        return count

    def save(self, fname):
        '''Always raise `NotImplementedError`; this corpus cannot be saved.'''
        raise NotImplementedError('This corpus does not support saving as it is merely a read-only interface to a combined MongoDb-backed database.')
 def __init__(self, segment_name, dictionary, segmentstorage, limit=None):
     '''Create a corpus over the segments called `segment_name`.

     `dictionary` must contain at least one entry: the n-gram size is
     inferred from the length of one of its entries. `limit`, when not
     `None`, is stored as the maximum number of segments to expose.

     Raises:
         ValueError: if `dictionary` has no entries.
     '''
     self._segment_name = segment_name
     self._dictionary = dictionary
     self._segstorage = segmentstorage
     self._limit = limit

     # next(iter(...)) works on Python 2 and 3, unlike the Python 2-only
     # iterator method `.next()`.
     try:
         sample_key = next(iter(dictionary.keys()))
     except StopIteration:
         raise ValueError('Cannot infer the n-gram size from an empty dictionary.')
     self._transformer = NgramTransformer(len(dictionary[sample_key]))
class DictionaryLearner(object):
    '''Builds a gensim dictionary from the n-grams of stored documents.'''

    def __init__(self, n=4):
        '''Set up an n-gram transformer of size `n` and an empty dictionary.'''
        transformer = NgramTransformer(n)
        self._ngram = transformer
        self._dictionary = Dictionary()

    def fit(self, documentstorage, filter_extremes=True):
        '''Add the n-grams of every document in `documentstorage` to the dictionary.

        Documents are obtained from ``documentstorage.load_iterator`` called
        with an empty string; each document's `text` is transformed to
        n-grams and added on its own. When `filter_extremes` is true, the
        dictionary's ``filter_extremes()`` is called once after all documents
        have been added.
        '''
        texts = (doc.text for doc in documentstorage.load_iterator(u''))
        for text in texts:
            self._dictionary.add_documents(self._ngram.transform([text]))
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        '''Return the dictionary built so far.'''
        return self._dictionary
 def __init__(self, n=4):
     '''Set up an n-gram transformer of size `n` and an empty dictionary.'''
     transformer = NgramTransformer(n)
     self._ngram = transformer
     self._dictionary = Dictionary()