Python CorpusPreprocessor Examples

Programming Language: Python

Namespace/Package Name: citemachine.text_process

Examples at hotexamples.com: 2

Python CorpusPreprocessor - 2 examples found. These are the top-rated real-world Python examples of citemachine.text_process.CorpusPreprocessor, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

text_to_number_encoding(1)

Example #1

Show file

File: recommender.py Project: michalwols/citemachine

    def __init__(self, corpus, corpus_preprocessor=None, num_topics=100,
                 train_at_init=False):
        """Store the corpus and preprocessor, optionally training right away.

        Args:
            corpus: an instance of a citation corpus class
            corpus_preprocessor: preprocessor to reuse; when None, a new
                    CorpusPreprocessor is built from `corpus`
            num_topics: number of topics to train the LDA with
            train_at_init: if True, trains a new LDA model at initialization,
                    otherwise need to call '_train' method to train the model
        """
        self.corpus = corpus
        # Compare against None explicitly: a supplied preprocessor that
        # happens to be falsy (e.g. it defines __len__ and is empty) must be
        # kept rather than silently replaced by a freshly built one.
        if corpus_preprocessor is not None:
            self.preprocessor = corpus_preprocessor
        else:
            self.preprocessor = CorpusPreprocessor(self.corpus)

        self.num_topics = num_topics

        # Start in the untrained state; '_train' is what this method calls
        # to produce the model when train_at_init is set.
        self.LDA = None
        self.topics = None
        if train_at_init:
            self._train(num_topics)

Example #2

Show file

File: recommender.py Project: michalwols/citemachine

class LDARecommender(object):
    """Scores documents of a citation corpus using LDA topic vectors.

    Instance attributes (all assigned in this class):
        corpus: the corpus object passed to the constructor
        preprocessor: the preprocessor supplied by the caller, or a
            CorpusPreprocessor built from `corpus`
        num_topics: number of topics used when training
        LDA: the trained LdaModel, or None until '_train' has run
        topics: result of topic_model.build_topics_dict, or None until
            '_train' has run
    """

    def __init__(self, corpus, corpus_preprocessor=None, num_topics=100,
                 train_at_init=False):
        """Store the corpus and preprocessor, optionally training right away.

        Args:
            corpus: an instance of a citation corpus class
            corpus_preprocessor: preprocessor to reuse; when None, a new
                    CorpusPreprocessor is built from `corpus`
            num_topics: number of topics to train the LDA with
            train_at_init: if True, trains a new LDA model at initialization,
                    otherwise need to call '_train' method to train the model
        """
        self.corpus = corpus
        # Compare against None explicitly: a supplied preprocessor that
        # happens to be falsy (e.g. it defines __len__ and is empty) must be
        # kept rather than silently replaced by a freshly built one.
        if corpus_preprocessor is not None:
            self.preprocessor = corpus_preprocessor
        else:
            self.preprocessor = CorpusPreprocessor(self.corpus)

        self.num_topics = num_topics

        # Start in the untrained state; _train() overwrites both attributes.
        self.LDA = None
        self.topics = None
        if train_at_init:
            self._train(num_topics)

    @classmethod
    def init_from_pickle(cls, pickle_path):
        """Used to instantiate a new class by loading a pretrained model from
        an old pickled instance, which might not provide newly implemented
        methods.

        Args:
            pickle_path: path of the file holding the pickled instance

        Returns:
            A new instance of `cls` carrying over the corpus, preprocessor,
            num_topics, LDA and topics of the unpickled object.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        with open(pickle_path, 'rb') as pkl:
            lda_recom = cPickle.load(pkl)

        # train_at_init keeps its default (False), so no retraining happens;
        # the already trained model is copied over below.
        instance = cls(corpus=lda_recom.corpus,
                       corpus_preprocessor=lda_recom.preprocessor,
                       num_topics=lda_recom.num_topics)
        instance.LDA = lda_recom.LDA
        instance.topics = lda_recom.topics

        return instance

    def _train(self, num_topics=None):
        """Train a new LdaModel and rebuild `self.topics`.

        Args:
            num_topics: when truthy, replaces `self.num_topics` before
                training; otherwise the stored value is used
        """
        if num_topics:
            self.num_topics = num_topics

        self.LDA = LdaModel(self.preprocessor.number_encodings.values(),
                            num_topics=self.num_topics,
                            id2word=self.preprocessor.id_to_word_map)

        self.topics = topic_model.build_topics_dict(
            self.LDA, self.preprocessor.number_encodings)

    def top_scoring_for_topics(self, topic_vector,
                               publication_year=None,
                               num_results=None):
        """Score the stored topics against `topic_vector` and filter them.

        Args:
            topic_vector: topic vector handed to topic_model.score_topics
            publication_year: year passed to topic_model.filter_scores;
                defaults to the current year when None
            num_results: when not None, only the first `num_results`
                entries are returned

        Returns:
            The value produced by topic_model.filter_scores, sliced to
            `num_results` entries when that argument is given.
        """
        if publication_year is None:
            publication_year = date.today().year

        scores = topic_model.score_topics(topic_vector, self.topics)
        valid_scores = topic_model.filter_scores(scores,
                                                 publication_year,
                                                 self.corpus.citation_counts,
                                                 self.corpus.years)
        if num_results is None:
            return valid_scores
        return valid_scores[0:num_results]

    def top_scoring_for_doc(self, doc_id, num_results=None):
        """Return the top scores for a document already in the corpus.

        The topic vector is looked up in `self.topics` and the publication
        year in `self.corpus.years`, both indexed by `doc_id`.
        """
        topic_vector = self.topics[doc_id]
        publication_year = self.corpus.years[doc_id]
        return self.top_scoring_for_topics(topic_vector,
                                           publication_year,
                                           num_results)

    def top_scoring_for_text(self, text, publication_year=None,
                             num_results=None):
        """Return the top scores for arbitrary text.

        The text is first converted with 'text_to_topic_vector'; the other
        arguments are forwarded to 'top_scoring_for_topics'.
        """
        topic_vector = self.text_to_topic_vector(text)
        return self.top_scoring_for_topics(topic_vector,
                                           publication_year,
                                           num_results)

    def text_to_topic_vector(self, text):
        """Encode `text` with the preprocessor and index `self.LDA` with it.

        Requires a trained model: while `self.LDA` is still None the
        indexing fails.
        """
        num_encoded_text = self.preprocessor.text_to_number_encoding(text)
        topic_vector = self.LDA[num_encoded_text]
        return topic_vector