def test_get_sentences(self):
    """get_sentences must split the sample text into its five sentences.

    The sample contains periods that do not end a sentence (the decimal
    number "2.0" and the abbreviation in "H. pylori"); the expected list
    keeps those inside a single sentence.
    """
    sample = (
        "First sentence. Second sentence. "
        "Sentence with 2.0 number. Sentence with H. pylori. "
        "Good bye."
    )
    wanted = [
        "First sentence.",
        "Second sentence.",
        "Sentence with 2.0 number.",
        "Sentence with H. pylori.",
        "Good bye.",
    ]
    self.assertListEqual(wanted, get_sentences(sample))
def main(article_data_sources, writers, sentence_finder, data_sources_to_skip=0, sentences_to_skip=0):
    """Run every sentence of every data source through the finder and writers.

    For each data source (starting at index ``data_sources_to_skip``) the
    articles are fetched, split into sentences with ``get_sentences`` and
    each sentence is passed to ``sentence_finder.get_sentence``. Truthy
    results are handed to every writer.

    Args:
        article_data_sources: Indexable, sized collection of objects that
            provide ``get_articles()``; each article must expose ``.text``.
        writers: Iterable of objects with a ``write(sentence)`` method. It is
            iterated once per accepted sentence, so it must be re-iterable.
        sentence_finder: Object with ``get_sentence(sentence_text, article)``.
        data_sources_to_skip: Index of the first data source to process.
        sentences_to_skip: Number of leading sentences to drop from the first
            processed data source only; later sources start from the top.

    Errors raised by ``sentence_finder.get_sentence`` are logged and the
    offending sentence is skipped; they do not abort the run.
    """
    data_source_names = [str(source) for source in article_data_sources]
    constants.logger.info("data sources: %s" % data_source_names)

    total_sentence_number = 0
    for i in range(data_sources_to_skip, len(article_data_sources)):
        article_data_source = article_data_sources[i]
        articles = article_data_source.get_articles()  # todo: sort to be able to continue
        # Lazy generator: sentences are produced one at a time.
        sentences_articles_tuples = (
            (sentence, article)
            for article in articles
            for sentence in get_sentences(article.text)
        )

        constants.logger.info("start looping sentences with data source №%i %s" % (i + 1, str(article_data_source)))

        # Fast-forward past already handled sentences. A default is passed to
        # next() so that a source with fewer sentences than requested does not
        # raise StopIteration and kill the whole run.
        skipped = 0
        for _ in range(sentences_to_skip):
            if next(sentences_articles_tuples, None) is None:
                constants.logger.info(
                    "data source exhausted after skipping %i of %i sentences" % (skipped, sentences_to_skip)
                )
                break
            skipped += 1
        sentence_number = skipped
        # The skip offset applies to the first processed data source only.
        sentences_to_skip = 0

        for sentence_text, article in sentences_articles_tuples:
            try:
                sentence = sentence_finder.get_sentence(sentence_text, article)
            except Exception:
                # Best effort: one bad sentence must not stop the batch.
                constants.logger.info(format_exc())
                constants.logger.info("sentence with error: %s" % sentence_text)
                constants.logger.info("got error in sentence loop; continue")
                continue

            if not sentence:
                continue

            for writer in writers:
                writer.write(sentence)

            # NOTE(review): the counter advances only for written sentences,
            # while the skip above consumes raw sentences; confirm that the
            # logged number is the intended resume offset.
            sentence_number += 1
            constants.logger.info("memory usage: %f" % memory_usage_psutil())
            constants.logger.info("sentence № %i, data source № %i\n%s" % (sentence_number, i + 1, sentence_text))
            constants.logger.info("=" * 80)

        total_sentence_number += sentence_number
        constants.logger.info("finish looping sentences with %s\n" % str(article_data_source))

    constants.pattern_logger.info('total number sentences: %i' % total_sentence_number)