# NOTE: `Dictionary` and `TfidfModel` here are assumed to come from a local
# wrapper module rather than from gensim itself -- the snake_case attribute
# names used below (`token_to_id`, `doc_freqs`, `doc_to_bag_of_words`) do not
# match gensim's own `token2id`, `dfs` and `doc2bow`.


class Corpus(object):
    """
    Wrap a document generator as a dense tf-idf corpus.

    Iterating over the corpus yields one `(document_id, word_id, weight)`
    triple per (document, word) pair, including explicit zero weights for
    words absent from a document.
    """
    def __init__(self, document_generator, stop_words):
        self.document_generator = document_generator
        self.stop_list = stop_words
        self.dictionary = Dictionary(document_generator())
        self.tfidf_model = TfidfModel(self.dictionary)

        # Drop stopwords and hapax legomena (tokens appearing in only one document).
        stop_ids = [self.dictionary.token_to_id[stop_word]
                    for stop_word in self.stop_list
                    if stop_word in self.dictionary.token_to_id]
        once_ids = [token_id
                    for token_id, doc_freq in self.dictionary.doc_freqs.iteritems()
                    if doc_freq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids)

    def add_documents(self, documents):
        self.dictionary.add_documents(documents)

    def __iter__(self):
        for document in self.document_generator():
            # Convert to bag-of-words, re-weight with tf-idf, then densify:
            # emit a weight for every word id, with 0 for absent words.
            converted_document = self.dictionary.doc_to_bag_of_words(document)
            # dict() guards against the model returning (id, weight) pairs
            # instead of a mapping; membership tests below need a dict.
            converted_document = dict(self.tfidf_model[converted_document])
            word_count = len(self.dictionary.items())
            for word_id in xrange(word_count):
                yield document.id, word_id, converted_document.get(word_id, 0)
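# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of driving the Corpus class above. `Document` and
# `make_documents` are hypothetical stand-ins: the only contract the class
# relies on is that the generator can be called repeatedly to re-yield token
# sequences carrying an `.id` attribute, so that both
# `dictionary.doc_to_bag_of_words(document)` and `document.id` work.

class Document(list):
    """A token list carrying the `.id` attribute that Corpus.__iter__ expects."""
    def __init__(self, doc_id, tokens):
        super(Document, self).__init__(tokens)
        self.id = doc_id


def make_documents():
    raw = [(0, ["human", "machine", "interface"]),
           (1, ["graph", "of", "trees"]),
           (2, ["graph", "minors", "survey"])]
    for doc_id, tokens in raw:
        yield Document(doc_id, tokens)


corpus = Corpus(make_documents, stop_words=["of", "the"])
for doc_id, word_id, weight in corpus:
    # One dense (document, word, tf-idf) triple per pair, zeros included.
    print doc_id, word_id, weight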
import logging

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary

logger = logging.getLogger('gensim.corpora.textcorpus')


def getstream(input):
    """
    Module-level helper referenced by `TextCorpus.getstream()` below: if
    `input` is a filename (string), open it as a text file; if it is an
    already-open file-like object, rewind it to the beginning.
    """
    assert input is not None
    if isinstance(input, basestring):
        result = open(input)
    else:
        result = input
        result.seek(0)
    return result


class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors
    (= a gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to
    match your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus
    object will be automatically initialized with a dictionary in
    `self.dictionary` and will support the `iter` corpus method. You only
    need to provide a correct `get_texts` implementation.
    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each
        document.
        """
        for text in self.get_texts():
            if self.metadata:
                # With metadata enabled, `get_texts()` yields (tokens, metadata) pairs.
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return getstream(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A
        document is a sequence of words (strings) that can be fed into
        `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no
        further preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length

    def __len__(self):
        return self.length  # will throw if corpus not initialized
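# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of the two ways to use TextCorpus: directly on a
# one-document-per-line text file via the sample get_texts() above, or by
# subclassing to plug in custom preprocessing. The filename 'mycorpus.txt'
# and the class MyLowercaseCorpus are hypothetical names.

corpus = TextCorpus('mycorpus.txt')  # builds corpus.dictionary on construction
for bow in corpus:                   # streams one sparse bag-of-words per line
    print bow


class MyLowercaseCorpus(TextCorpus):
    """Same pipeline as TextCorpus, but drops very short tokens while parsing."""
    def get_texts(self):
        for line in self.getstream():
            yield [token for token in utils.tokenize(line, lowercase=True)
                   if len(token) > 2]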