Example #1
import logging

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary

logger = logging.getLogger(__name__)


def getstream(input):
    """Return a file-like stream over `input`: open it if it is a filename,
    otherwise rewind the already-open file-like object. (This mirrors the
    module-level helper the class below expects to find.)"""
    if isinstance(input, str):
        return open(input)
    input.seek(0)
    return input


class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus
    object is automatically initialized with a dictionary in `self.dictionary`
    and supports the `__iter__` corpus protocol. You only need to provide a
    correct `get_texts` implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")


    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            yield self.dictionary.doc2bow(text, allow_update=False)


    def getstream(self):
        return getstream(self.input)


    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a stream of lowercase tokens, via `utils.tokenize`.
        length = 0
        for line in getstream(self.input):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length


    def __len__(self):
        return self.length  # raises AttributeError until the corpus has been fully iterated once
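
As the docstring says, the intended use is to subclass and override `get_texts()`. Below is a minimal sketch, assuming a plain text file with one document per line; the class name `LineCorpus` and the file name 'corpus.txt' are illustrative, not part of the original example:

class LineCorpus(TextCorpus):
    # Hypothetical subclass: one document per line, whitespace-tokenized.
    def get_texts(self):
        for line in self.getstream():
            yield line.lower().split()

corpus = LineCorpus('corpus.txt')   # the dictionary is built here, in __init__
print(len(corpus.dictionary))       # vocabulary size collected at construction
for bow in corpus:                  # sparse (word_id, count) vectors
    print(bow)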
Example #2
class Corpus(object):
    """
    Wrap a callable that regenerates the document stream, building a
    dictionary and a tf-idf model over it, then filtering out stop words
    and tokens that occur in only one document.

    Note: `Dictionary` and `TfidfModel` here appear to be project-local
    wrappers rather than gensim's classes (the attribute names
    `token_to_id`, `doc_freqs` and `doc_to_bag_of_words` differ from
    gensim's `token2id`, `dfs` and `doc2bow`).
    """
    def __init__(self, document_generator, stop_words):
        self.document_generator = document_generator
        self.stop_list = stop_words
        self.dictionary = Dictionary(document_generator())
        self.tfidf_model = TfidfModel(self.dictionary)
        stop_ids = [self.dictionary.token_to_id[stop_word] for stop_word in self.stop_list
                    if stop_word in self.dictionary.token_to_id]
        once_ids = [token_id for token_id, doc_freq in self.dictionary.doc_freqs.items() if doc_freq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids)

    def add_documents(self, documents):
        self.dictionary.add_documents(documents)

    def __iter__(self):
        for document in self.document_generator():
            # Convert each document to a bag-of-words, then to tf-idf weights.
            # The tf-idf transformation yields (word_id, weight) pairs; turn
            # them into a dict so the lookup below works correctly (the
            # original membership test against a list of pairs never matched).
            converted_document = dict(
                self.tfidf_model[self.dictionary.doc_to_bag_of_words(document)])

            # Emit a dense (document_id, word_id, weight) triple for every
            # word in the vocabulary, with weight 0 for absent words.
            word_count = len(self.dictionary.items())
            for word_id in range(word_count):
                yield document.id, word_id, converted_document.get(word_id, 0)
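
Neither the document type nor the generator is shown, so here is a purely hypothetical driver for the class above, assuming each document is a token sequence that also carries an `id` attribute. `Doc`, `make_docs` and the stop-word list are invented for illustration, and the sketch still depends on the project-local `Dictionary` and `TfidfModel` classes:

class Doc(list):
    # Hypothetical document type: a token list that also carries an id.
    def __init__(self, doc_id, tokens):
        super(Doc, self).__init__(tokens)
        self.id = doc_id

def make_docs():
    yield Doc(0, ['human', 'machine', 'interface'])
    yield Doc(1, ['graph', 'minors', 'survey'])

corpus = Corpus(make_docs, stop_words=['the', 'of'])
for doc_id, word_id, weight in corpus:   # dense (doc, word, weight) triples
    print(doc_id, word_id, weight)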
Example #3
import logging

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary

logger = logging.getLogger(__name__)


def getstream(input):
    """Return a file-like stream over `input`: open it if it is a filename,
    otherwise rewind the already-open file-like object. (This mirrors the
    module-level helper the class below expects to find.)"""
    if isinstance(input, str):
        return open(input)
    input.seek(0)
    return input


class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in the constructor, the corpus
    object is automatically initialized with a dictionary in `self.dictionary`
    and supports the `__iter__` corpus protocol. You only need to provide a
    correct `get_texts` implementation.

    """
    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False  # set to True if get_texts() yields (tokens, metadata) pairs
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0],
                                               allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return getstream(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no further
        preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a stream of lowercase tokens, via `utils.tokenize`.
        # Note this default matches `self.metadata == False`: it yields plain
        # token streams, not (tokens, metadata) pairs.
        length = 0
        for line in getstream(self.input):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length

    def __len__(self):
        return self.length  # raises AttributeError until the corpus has been fully iterated once
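
When `self.metadata` is True, `get_texts()` is expected to yield (tokens, metadata) pairs, and `__iter__` passes the metadata through next to each bag-of-words vector. A minimal sketch of such a subclass, using the line number as metadata; the class name `LineMetadataCorpus` and the file name 'corpus.txt' are illustrative:

class LineMetadataCorpus(TextCorpus):
    # Hypothetical subclass that tags each document with its line number.
    def __init__(self, input=None):
        super(LineMetadataCorpus, self).__init__(input)
        self.metadata = True  # base __init__ already built the dictionary

    def get_texts(self):
        for lineno, line in enumerate(self.getstream()):
            if self.metadata:
                yield list(utils.tokenize(line, lowercase=True)), lineno
            else:
                # during base-class __init__, metadata is still False, so
                # the dictionary is fed plain token streams
                yield utils.tokenize(line, lowercase=True)

corpus = LineMetadataCorpus('corpus.txt')
for bow, lineno in corpus:   # (sparse vector, line number) pairs
    print(lineno, bow)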