Example #1
 def stream_reduced_corpus(self):
     corpus = JsonLinesCorpus(self.file_corpus_plain)
     if corpus.has_plain_tokens():
         logger.info("building a reduced version of corpus '{}'".format(
             self.file_corpus_plain))
         dictionary = self.load_dictionary()
         corpus.convert_tokens_to_ids(self.file_corpus,
                                      id2word=dictionary.id2token)
     else:
         # corpus is already in reduced format. continue...
         self.file_corpus = self.file_corpus_plain
Example #2
 def load_corpus_for_topic(self, topic: Topic) -> JsonLinesCorpus:
     corpus_path = self._get_corpus_path(topic)
     if os.path.isfile(corpus_path):
         # load the corpus for this topic (if available)
         return JsonLinesCorpus(corpus_path)
     else:
         if topic.parent:
             # ok, try again with this topic's parent
             return self.load_corpus_for_topic(topic.parent)
         else:
             # no parent left? then use the root corpus
             return JsonLinesCorpus(self.file_corpus)
Example #3
 def store_gensim_dict(
         self,
         corpus: JsonLinesCorpus) -> Tuple[Set[str], corpora.Dictionary]:
     """
     process token stream to build dictionary in memory, then serialize the entire dictionary.
     also stores document IDs in a separate file.
     """
     logger.info("building the dictionary...")
     dictionary = corpora.Dictionary()
     doc_ids = set()
     for i, doc in enumerate(corpus.iter_all()):
         doc_id = doc['id']
         doc_ids.add(doc_id)
         token_counts = doc['tokens']  # type: Dict[str, int]
         # unfortunately, dictionary.doc2bow() does not accept (token, count) tuples,
         # so we expand the counts back into a flat token list (wasteful, but required)
         tokens = util.flatten([token] * count
                               for token, count in token_counts.items())
         dictionary.doc2bow(tokens, allow_update=True)
         if (i + 1) % 50000 == 0:
             logger.info("{} documents have been read so far".format(i + 1))
     # store the document IDs
     util.json_write(sorted(doc_ids), self.file_ids)
     # store the dictionary
     dictionary.filter_extremes(no_below=self.token_min_count,
                                no_above=0.2,
                                keep_n=self.dict_size_limit)
     dictionary.compactify()
     dictionary.save(self.file_dict, pickle_protocol=4)
     return doc_ids, dictionary
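
The expansion trick above can be reproduced with gensim's public API alone. A minimal, self-contained sketch, assuming the per-document token counts are plain {token: count} dicts (the JsonLinesCorpus plumbing is left out):

from gensim import corpora

# toy stand-in for doc['tokens'] from the corpus above
docs_token_counts = [
    {"topic": 3, "model": 1},
    {"model": 2, "corpus": 1},
]

dictionary = corpora.Dictionary()
for token_counts in docs_token_counts:
    # doc2bow() wants a flat token sequence, so expand {token: count} back
    # into a repeated-token list before updating the dictionary
    tokens = [t for t, c in token_counts.items() for _ in range(c)]
    dictionary.doc2bow(tokens, allow_update=True)

dictionary.compactify()
print(dictionary.token2id)
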
Example #4
    def corpus2corpus(self, corpus: JsonLinesCorpus,
                      documents: Dict[str, Document],
                      topic: Topic) -> JsonLinesCorpus:
        """
        Get a subset of a corpus that contains all documents assigned to the
        specified topic. The reduced corpus is written to a new file whose name
        is derived from the topic.
        :param corpus: the source corpus
        :param documents: the document definition (contains document topics)
        :param topic: filter all documents in the corpus by this topic
        :return: a new corpus containing only the filtered documents
        """
        logger.info("creating a subset of corpus '{}' for topic '{}'".format(
            corpus.fname, topic.topic_id))

        # specify the filter function
        def doc_filter(doc_dict: Dict[str, Any]) -> bool:
            """
            :return: True, iff this document has the specified topic
            """
            doc = documents[doc_dict['id']]
            return doc.topics and topic in doc.topics

        # build the new corpus
        corpus_path = self._get_corpus_path(topic)
        return corpus.subset(corpus_path, doc_filter)
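
JsonLinesCorpus.subset() belongs to this project, not gensim; as a rough, hedged illustration of the same filter-and-rewrite idea over a raw JSON-lines file, here is a standalone sketch (the file layout and the "topics" field are assumptions):

import json
from typing import Any, Callable, Dict

def filter_jsonl(src_path: str, dst_path: str,
                 predicate: Callable[[Dict[str, Any]], bool]) -> int:
    """Copy only those JSON-lines records for which predicate(record) is true."""
    kept = 0
    with open(src_path, encoding="utf-8") as src, \
            open(dst_path, "w", encoding="utf-8") as dst:
        for line in src:
            if predicate(json.loads(line)):
                dst.write(line)
                kept += 1
    return kept

# e.g. keep only documents tagged with topic "0-3" (field name assumed):
# filter_jsonl("corpus.jsonl", "corpus-0-3.jsonl",
#              lambda rec: "0-3" in rec.get("topics", []))
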
Example #5
    def stream_classify_documents(self,
                                  parent_topic: Topic,
                                  corpus: JsonLinesCorpus,
                                  documents: Dict[str, Document],
                                  topic_limit=0) -> List[Topic]:
        # load the actual topic model
        model = self.load_model(
            self._get_model_path(parent_topic))  # type: HdpModel

        # build Topic objects from model
        topics = {}
        try:
            for i in itertools.count():
                topic_id = "{}-{}".format(parent_topic.topic_id, i)
                show_topic_kwargs = {}
                if self.model == "hdp":
                    show_topic_kwargs = {'num_words': 10, 'formatted': False}
                elif self.model == "lda":
                    show_topic_kwargs = {'topn': 10}
                topic_terms = [(term, round(score, 5))
                               for term, score in model.show_topic(
                                   i, **show_topic_kwargs)]
                topic = parent_topic.add_child(topic_id, topic_terms)
                topics[i] = topic
        except IndexError:
            pass  # show_topic() raises IndexError past the last topic; simplest way to stop when the number of topics is unknown

        # calculate the topics for each document
        logger.info(
            "classifying {} documents from topic '{}' into {} new categories".
            format(len(corpus), parent_topic.topic_id, len(topics)))
        t = time.time()
        for i, doc_dict in enumerate(corpus.iter_all()):
            if not doc_dict['id'] or doc_dict['id'] not in documents:
                logger.warning(
                    "Document '{}' at corpus index {} (topic: {}) was not found "
                    "in the document index and will be skipped".format(
                        doc_dict['id'], i, parent_topic.topic_id))
                continue
            doc_id = doc_dict['id']
            tokens = doc_dict['tokens']
            document = documents[doc_id]
            assert document.topics is None or parent_topic in document.topics, \
                "tried to classify a document which is not part of the current topic"
            doc_topics = sorted(model[tokens],
                                key=lambda x: x[1],
                                reverse=True)  # type: List[Tuple[int, float]]
            for topic_idx, score in (doc_topics[:topic_limit]
                                     if topic_limit else doc_topics):
                if score > 0.10:
                    document.add_topic(topics[topic_idx], round(score, 5))
            if (i + 1) % 10000 == 0:
                t1 = time.time()
                logger.info(
                    "{}/{} documents have been classified ({:.2f} doc/min)".
                    format(i + 1, len(corpus),
                           10000 * 60 / (t1 - t)))
                t = t1
        return list(topics.values())
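
The per-document scoring loop boils down to querying a gensim model with a bag-of-words and keeping the topics above a threshold. A toy sketch with a plain LdaModel (the 0.10 cutoff mirrors the code above; corpus and sizes are illustrative):

from gensim import corpora
from gensim.models import LdaModel

texts = [["cat", "dog", "dog"], ["dog", "fish"], ["car", "road", "car"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
model = LdaModel(bow_corpus, id2word=dictionary, num_topics=2, passes=5)

for bow in bow_corpus:
    # model[bow] yields (topic_id, probability) pairs; sort by probability
    doc_topics = sorted(model[bow], key=lambda x: x[1], reverse=True)
    # keep only topics that explain more than 10% of the document
    print([(idx, round(score, 5)) for idx, score in doc_topics if score > 0.10])
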
Example #6
    def test_model(self, fin_corpus: str, fin_model: str):
        model = self.load_model(fin_model)
        model.print_topics(num_topics=-1, num_words=10)

        corpus = JsonLinesCorpus(fin_corpus)
        for doc_dict in corpus.iter_all():
            tokens = doc_dict['tokens']
            topics = model[tokens]
            print("dominant topics in https://arxiv.org/abs/{}".format(
                doc_dict['id']))
            for topic, score in sorted(topics,
                                       key=lambda x: x[1],
                                       reverse=True):
                print("topic {} @ {:.3f}: {}".format(topic, score,
                                                     model.print_topic(topic)))
Example #7
    def stream_topic_model(self,
                           topic: Topic,
                           dictionary: corpora.Dictionary = None,
                           corpus: IndexedCorpus = None,
                           num_topics=20,
                           max_topics_per_doc=5):
        # load dictionary and corpus, if necessary
        if not dictionary:
            dictionary = self.load_dictionary()
            logger.warning(
                "the default dictionary was loaded from file. "
                "You should keep an instance in memory instead of calling this in a loop..."
            )
        if not corpus:
            corpus = JsonLinesCorpus(self.file_corpus)
            logger.warning(
                "the default corpus was loaded from file. You should provide a "
                "reduced corpus to increase performance (see corpus2corpus)")
        # build the model
        logger.info(
            "building a topic model with {} topics for {} documents in topic '{}'"
            .format(num_topics, len(corpus), topic.topic_id))
        t0 = time.time()
        if self.model == "lda":
            model = LdaMulticore(corpus,
                                 id2word=dictionary.id2token,
                                 num_topics=num_topics,
                                 passes=2,
                                 iterations=50,
                                 chunksize=2000,
                                 workers=self.n_threads)
        elif self.model == "hdp":
            # T = overall topic limit, K = max topics per document
            model = HdpModel(corpus,
                             id2word=dictionary.id2token,
                             T=num_topics,
                             K=max_topics_per_doc)
        else:
            raise ValueError("Unknown model identifier '{}'".format(
                self.model))
        t1 = time.time()

        # serialize
        logger.info(
            "building the model took {:.1f} s. Serializing model...".format(
                t1 - t0))
        output_path = self._get_model_path(topic)
        with util.open_by_ext(output_path, 'wb') as fp:
            pickle.dump(model, fp, protocol=4)
            logger.info(
                "model dump finished, took {:.1f} s".format(time.time() - t1))
Example #8
 def stream_token_dict(self):
     """
     make a single run over the file containing all documents as plaintext.
     Parse all documents using spacy, store the token counts for each document
     and build the global token dict
     """
     if self.file_corpus_input:
         logger.info("reading corpus from '{}'".format(
             self.file_corpus_input))
         corpus = JsonLinesCorpus(self.file_corpus_input)
         return self.store_gensim_dict(corpus)
     else:
         if self.abstracts_only:
             logger.info("reading abstracts from '{}'".format(
                 self.file_metadata))
             documents = util.json_read_lines(self.file_metadata,
                                              self.get_title_and_abstract)
         else:
             logger.info("reading documents from '{}'".format(
                 self.file_pdf_text))
             documents = util.json_read_lines(self.file_pdf_text,
                                              self.combine_pages)
         # limit document count (if configured)
         # islice stops cleanly if the stream is shorter than the limit
         documents_limited = (itertools.islice(documents, self.document_limit)
                              if self.document_limit else documents)
         # filter by document language (if configured)
         documents_filtered = self.filter_by_lang(
             documents_limited, self.language_filter
         ) if self.language_filter else documents_limited
         # parse documents using spacy
         documents_tokens = self.spacy_parse(documents_filtered,
                                             batch_size=self.batch_size,
                                             n_threads=self.n_threads)
         # stream intermediate results to disk (in case the data does not fit
         # in RAM, which it won't if you're serious about this stuff)
         return self.store_tokens_and_gensim_dict(documents_tokens)
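
The document-limit step is easiest to express with itertools.islice (as used above), which simply stops when the stream is shorter than the limit; a minimal sketch:

import itertools

def limit(stream, n=0):
    """Yield at most n items from stream; n=0 means no limit."""
    return itertools.islice(stream, n) if n else stream

print(list(limit(iter(range(10)), 3)))  # [0, 1, 2]
print(list(limit(iter(range(2)), 5)))   # [0, 1] -- shorter stream, no error
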
Example #9
def build_corpus(tmp_dir, fname='corpus.json',
                 data=test_data_dicts) -> JsonLinesCorpus:
    fname = os.path.join(tmp_dir, fname)
    JsonLinesCorpus.serialize(fname, data)
    return JsonLinesCorpus(fname)
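
A hedged sketch of how this test helper might be called from pytest; the record shape (with 'id' and 'tokens' fields) and the sample_records name are assumptions, and tmp_path is pytest's built-in temporary-directory fixture:

sample_records = [
    {"id": "doc-1", "tokens": {"alpha": 2, "beta": 1}},
    {"id": "doc-2", "tokens": {"beta": 3}},
]

def test_corpus_round_trip(tmp_path):
    corpus = build_corpus(str(tmp_path), data=sample_records)
    assert len(corpus) == len(sample_records)
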