def stream_reduced_corpus(self):
    corpus = JsonLinesCorpus(self.file_corpus_plain)
    if corpus.has_plain_tokens():
        logger.info("building a reduced version of corpus '{}'".format(
            self.file_corpus_plain))
        dictionary = self.load_dictionary()
        corpus.convert_tokens_to_ids(self.file_corpus,
                                     id2word=dictionary.id2token)
    else:
        # the corpus is already in reduced format, continue with it directly
        self.file_corpus = self.file_corpus_plain
def load_corpus_for_topic(self, topic: Topic) -> JsonLinesCorpus:
    corpus_path = self._get_corpus_path(topic)
    if os.path.isfile(corpus_path):
        # load the corpus for this topic (if available)
        return JsonLinesCorpus(corpus_path)
    elif topic.parent:
        # not available, so try again with this topic's parent
        return self.load_corpus_for_topic(topic.parent)
    else:
        # no parent left? then use the root corpus
        return JsonLinesCorpus(self.file_corpus)
def store_gensim_dict(
        self, corpus: JsonLinesCorpus) -> Tuple[Set[str], corpora.Dictionary]:
    """
    Process the token stream to build the dictionary in memory, then serialize
    the entire dictionary. Also stores the document IDs in a separate file.
    """
    logger.info("building the dictionary...")
    dictionary = corpora.Dictionary()
    doc_ids = set()
    for i, doc in enumerate(corpus.iter_all()):
        doc_id = doc['id']
        doc_ids.add(doc_id)
        token_counts = doc['tokens']  # type: Dict[str, int]
        # unfortunately, dictionary.doc2bow() does not accept (token, count) tuples,
        # therefore we expand the counts back into a flat token list...
        tokens = util.flatten([token] * count
                              for token, count in token_counts.items())
        dictionary.doc2bow(tokens, allow_update=True)
        if (i + 1) % 50000 == 0:
            logger.info("{} documents have been read so far".format(i + 1))

    # store the document IDs
    util.json_write(sorted(doc_ids), self.file_ids)

    # store the dictionary
    dictionary.filter_extremes(no_below=self.token_min_count,
                               no_above=0.2,
                               keep_n=self.dict_size_limit)
    dictionary.compactify()
    dictionary.save(self.file_dict, pickle_protocol=4)
    return doc_ids, dictionary
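# Hedged usage sketch (not part of the pipeline above): a self-contained example of
# the doc2bow() expansion that store_gensim_dict() performs, using plain gensim and
# itertools.chain instead of util.flatten. The token counts below are made up.
def _example_doc2bow_from_counts():
    from itertools import chain

    from gensim import corpora

    token_counts = {'neural': 3, 'network': 2, 'training': 1}
    # doc2bow() wants a flat token list, so repeat each token `count` times
    tokens = list(chain.from_iterable(
        [token] * count for token, count in token_counts.items()))
    dictionary = corpora.Dictionary()
    bow = dictionary.doc2bow(tokens, allow_update=True)
    return bow  # e.g. [(0, 3), (1, 2), (2, 1)], with the dictionary updated in place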
def corpus2corpus(self, corpus: JsonLinesCorpus,
                  documents: Dict[str, Document],
                  topic: Topic) -> JsonLinesCorpus:
    """
    Get a subset of a corpus that includes all documents which contain the
    specified topic. Writes the reduced corpus to a new file whose name is
    derived from the topic ID.

    :param corpus: the source corpus
    :param documents: the document definitions (contain the document topics)
    :param topic: filter all documents in the corpus by this topic
    :return: a new corpus containing only the filtered documents
    """
    logger.info("creating a subset of corpus '{}' for topic '{}'".format(
        corpus.fname, topic.topic_id))

    # the filter function: keep only documents tagged with the given topic
    def doc_filter(doc_dict: Dict[str, Any]) -> bool:
        """ :return: True, iff this document has the specified topic """
        doc = documents[doc_dict['id']]
        return doc.topics and topic in doc.topics

    # build the new corpus
    corpus_path = self._get_corpus_path(topic)
    return corpus.subset(corpus_path, doc_filter)
def stream_classify_documents(self, parent_topic: Topic,
                              corpus: JsonLinesCorpus,
                              documents: Dict[str, Document],
                              topic_limit=0) -> List[Topic]:
    # load the actual topic model
    model = self.load_model(
        self._get_model_path(parent_topic))  # type: HdpModel

    # build Topic objects from the model
    topics = {}
    try:
        for i in itertools.count():
            topic_id = "{}-{}".format(parent_topic.topic_id, i)
            show_topic_kwargs = {}
            if self.model == "hdp":
                show_topic_kwargs = {'num_words': 10, 'formatted': False}
            elif self.model == "lda":
                show_topic_kwargs = {'topn': 10}
            topic_terms = [(term, round(score, 5))
                           for term, score in model.show_topic(
                               i, **show_topic_kwargs)]
            topic = parent_topic.add_child(topic_id, topic_terms)
            topics[i] = topic
    except IndexError:
        # the number of topics is not known in advance, so iterate
        # until show_topic() raises an IndexError
        pass

    # calculate the topics for each document
    logger.info(
        "classifying {} documents from topic '{}' into {} new categories".format(
            len(corpus), parent_topic.topic_id, len(topics)))
    t = time.time()
    for i, doc_dict in enumerate(corpus.iter_all()):
        if not doc_dict['id'] or doc_dict['id'] not in documents:
            logger.warning(
                "Document '{}' at corpus index {} (topic: {}) was not found "
                "in the document index and will be skipped".format(
                    doc_dict['id'], i, parent_topic.topic_id))
            continue
        doc_id = doc_dict['id']
        tokens = doc_dict['tokens']
        document = documents[doc_id]
        assert document.topics is None or parent_topic in document.topics, \
            "tried to classify a document which is not part of the current topic"

        doc_topics = sorted(model[tokens], key=lambda x: x[1],
                            reverse=True)  # type: List[Tuple[int, float]]
        for topic_idx, score in (doc_topics[:topic_limit]
                                 if topic_limit else doc_topics):
            if score > 0.10:
                document.add_topic(topics[topic_idx], round(score, 5))
        if (i + 1) % 10000 == 0:
            t1 = time.time()
            logger.info(
                "{}/{} documents have been classified ({:.2f} doc/min)".format(
                    i + 1, len(corpus), self.batch_size * 60 / (t1 - t)))
            t = t1
    return list(topics.values())
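# Hedged sketch (an assumption, not code from the source): one way the methods in
# this module could be chained to split a topic recursively. `pipeline` stands for
# an instance of the class these methods belong to, `documents` is the id -> Document
# index used throughout, `dictionary` should be loaded once via load_dictionary()
# and passed down; _example_split_topic itself is hypothetical.
def _example_split_topic(pipeline, parent_topic, documents, dictionary, depth=2):
    if depth == 0:
        return
    corpus = pipeline.load_corpus_for_topic(parent_topic)
    pipeline.stream_topic_model(parent_topic, dictionary=dictionary, corpus=corpus)
    child_topics = pipeline.stream_classify_documents(parent_topic, corpus, documents)
    for child in child_topics:
        # write a reduced per-topic corpus so the next level reads a smaller file
        pipeline.corpus2corpus(corpus, documents, child)
        _example_split_topic(pipeline, child, documents, dictionary, depth - 1)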
def test_model(self, fin_corpus: str, fin_model: str):
    model = self.load_model(fin_model)
    model.print_topics(num_topics=-1, num_words=10)

    corpus = JsonLinesCorpus(fin_corpus)
    for tokens in corpus:
        topics = model[tokens]
        print("dominant topics in https://arxiv.org/abs/{}".format(tokens))
        for topic, score in sorted(topics, key=lambda x: x[1], reverse=True):
            print("topic {} @ {:.3f}: {}".format(topic, score,
                                                 model.print_topic(topic)))
def stream_topic_model(self, topic: Topic,
                       dictionary: corpora.Dictionary = None,
                       corpus: IndexedCorpus = None,
                       num_topics=20, max_topics_per_doc=5):
    # load the dictionary and corpus, if necessary
    if not dictionary:
        dictionary = self.load_dictionary()
        logger.warning(
            "the default dictionary was loaded from file. "
            "You should keep an instance in memory instead of calling this in a loop...")
    if not corpus:
        corpus = JsonLinesCorpus(self.file_corpus)
        logger.warning(
            "the default corpus was loaded from file. You should provide a "
            "reduced corpus to increase performance (see corpus2corpus)")

    # build the model
    logger.info(
        "building a topic model with {} topics for {} documents in topic '{}'"
        .format(num_topics, len(corpus), topic.topic_id))
    t0 = time.time()
    if self.model == "lda":
        model = LdaMulticore(corpus,
                             id2word=dictionary.id2token,
                             num_topics=num_topics,
                             passes=2,
                             iterations=50,
                             chunksize=2000,
                             workers=self.n_threads)
    elif self.model == "hdp":
        # T = overall topic limit, K = max topics per document
        model = HdpModel(corpus,
                         id2word=dictionary.id2token,
                         T=num_topics,
                         K=max_topics_per_doc)
    else:
        raise ValueError("Unknown model identifier '{}'".format(self.model))
    t1 = time.time()

    # serialize the model
    logger.info(
        "building the model took {:.1f} s. Serializing model...".format(t1 - t0))
    output_path = self._get_model_path(topic)
    with util.open_by_ext(output_path, 'wb') as fp:
        pickle.dump(model, fp, protocol=4)
    logger.info("model dump finished, took {:.1f} s".format(time.time() - t1))
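# Hedged sketch: a counterpart to the pickle.dump() call above. load_model() is
# referenced elsewhere in this module but not shown here; a minimal version could
# look like this, assuming util.open_by_ext() also supports read mode.
def _example_load_pickled_model(path):
    import pickle

    with util.open_by_ext(path, 'rb') as fp:
        return pickle.load(fp)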
def stream_token_dict(self):
    """
    Make a single pass over the file containing all documents as plain text.
    Parse all documents using spaCy, store the token counts for each document
    and build the global token dictionary.
    """
    if self.file_corpus_input:
        logger.info("reading corpus from '{}'".format(self.file_corpus_input))
        corpus = JsonLinesCorpus(self.file_corpus_input)
        return self.store_gensim_dict(corpus)
    else:
        if self.abstracts_only:
            logger.info("reading abstracts from '{}'".format(
                self.file_metadata))
            documents = util.json_read_lines(self.file_metadata,
                                             self.get_title_and_abstract)
        else:
            logger.info("reading documents from '{}'".format(
                self.file_pdf_text))
            documents = util.json_read_lines(self.file_pdf_text,
                                             self.combine_pages)

        # limit the document count (if configured)
        documents_limited = (next(documents)
                             for i in range(self.document_limit)
                             ) if self.document_limit else documents
        # filter by document language (if configured)
        documents_filtered = self.filter_by_lang(
            documents_limited,
            self.language_filter) if self.language_filter else documents_limited
        # parse the documents using spaCy
        documents_tokens = self.spacy_parse(documents_filtered,
                                            batch_size=self.batch_size,
                                            n_threads=self.n_threads)
        # stream the intermediate result to disk (the full data set will
        # usually not fit in RAM)
        return self.store_tokens_and_gensim_dict(documents_tokens)
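# Hedged note (an aside, not a change to the code above): the
# `(next(documents) for i in range(self.document_limit))` expression stops after
# `document_limit` items, but if the input runs short, the StopIteration raised by
# next() inside a generator expression becomes a RuntimeError under PEP 479
# (Python 3.7+). itertools.islice is an equivalent, safer way to cap the stream:
def _example_limit_documents(documents, document_limit):
    from itertools import islice

    return islice(documents, document_limit) if document_limit else documents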
def build_corpus(tmp_dir, fname='corpus.json',
                 data=test_data_dicts) -> JsonLinesCorpus:
    fname = os.path.join(tmp_dir, fname)
    JsonLinesCorpus.serialize(fname, data)
    return JsonLinesCorpus(fname)
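# Hedged usage sketch for the helper above, assuming the tests use pytest's
# tmp_path fixture and that test_data_dicts holds dicts with an 'id' key (as the
# corpus code above expects). The test name and assertions are illustrative only.
def test_build_corpus_roundtrip(tmp_path):
    corpus = build_corpus(str(tmp_path))
    assert len(corpus) == len(test_data_dicts)
    assert {doc['id'] for doc in corpus.iter_all()} == \
        {doc['id'] for doc in test_data_dicts}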