def get_raw_point(topic_id: int) -> (list, list):
    """Download and parse one topic, returning its text and labels.

    The page is fetched with ``TopicDownloader.download_topic`` and handed
    to ``parser.parse``.

    :param topic_id: identifier forwarded to the downloader.
    :return: a ``(text, labels)`` pair, where ``labels`` is the parsed
        topic's ``hubs`` followed by its ``tags``.
    """
    page_source = TopicDownloader.download_topic(topic_id)
    topic = parser.parse(page_source)
    # Hubs and tags are merged into one label collection, hubs first.
    combined_labels = topic.hubs + topic.tags
    return topic.text, combined_labels
def get_raw_point(self, topic_id: int) -> (list, list, str):
    """Download and parse one topic, returning its text, labels and name.

    The page is fetched with ``TopicDownloader.download_topic`` and parsed
    by this instance's ``parser``.

    :param topic_id: identifier forwarded to the downloader.
    :return: a ``(text, labels, name)`` triple, where ``labels`` is the
        parsed topic's ``hubs`` followed by its ``tags``.
    """
    page_source = TopicDownloader.download_topic(topic_id)
    topic = self.parser.parse(page_source)
    # Hubs and tags are merged into one label collection, hubs first.
    combined_labels = topic.hubs + topic.tags
    body_text = topic.text
    return body_text, combined_labels, topic.name
def json_dumper(objs):
    """Yield every object serialized as JSON and terminated by a newline.

    :param objs: iterable of JSON-serializable objects.
    :return: generator of strings, one per input object, in input order.
    """
    for obj in objs:
        yield json.dumps(obj) + "\n"


def progress_indicator(objs):
    """Pass items through unchanged, printing the count every 100 items.

    The running count (100, 200, ...) is printed just before the item that
    reaches it is yielded.

    :param objs: any iterable.
    :return: generator yielding the items of ``objs`` in order.
    """
    # enumerate(start=1) replaces the hand-maintained counter.
    for count, obj in enumerate(objs, start=1):
        if count % 100 == 0:
            print(count)
        yield obj


if __name__ == "__main__":
    # Manual check: download a single topic and parse it. The result is not
    # used further.
    html = TopicDownloader.download_topic(269995)
    parsed = TopicParser().parse(html)

    # Disabled pipeline kept for reference:
    # with open('data/raw_data.json', 'r') as in_file, open('data/clean_data.json', 'w+') as out_file:
    #     pipe_1 = Pipeline(in_file)
    #     pipe_1.add_processors(
    #         [json_parser,
    #          dict_to_topic,
    #          cleaner,
    #          set_ext,
    #          clean_topic_to_dict,
    #          json_dumper,
    #          progress_indicator])
    #     out_file.writelines(pipe_1.process())
    #
    # with open('data/word_count.json', 'w+') as fp:
def json_dumper(objs):
    """Yield every object serialized as JSON and terminated by a newline.

    :param objs: iterable of JSON-serializable objects.
    :return: generator of strings, one per input object, in input order.
    """
    for obj in objs:
        yield json.dumps(obj) + '\n'


def progress_indicator(objs):
    """Pass items through unchanged, printing the count every 100 items.

    The running count (100, 200, ...) is printed just before the item that
    reaches it is yielded.

    :param objs: any iterable.
    :return: generator yielding the items of ``objs`` in order.
    """
    # enumerate(start=1) replaces the hand-maintained counter.
    for count, obj in enumerate(objs, start=1):
        if count % 100 == 0:
            print(count)
        yield obj


if __name__ == '__main__':
    # Manual check: download a single topic and parse it. The result is not
    # used further.
    html = TopicDownloader.download_topic(269995)
    parsed = TopicParser().parse(html)

    # Disabled pipeline kept for reference:
    # with open('data/raw_data.json', 'r') as in_file, open('data/clean_data.json', 'w+') as out_file:
    #     pipe_1 = Pipeline(in_file)
    #     pipe_1.add_processors(
    #         [json_parser,
    #          dict_to_topic,
    #          cleaner,
    #          set_ext,
    #          clean_topic_to_dict,
    #          json_dumper,
    #          progress_indicator])
    #     out_file.writelines(pipe_1.process())
    #
    # with open('data/word_count.json', 'w+') as fp:
def test_is_error_page(self):
    """Pages fetched from the two post URLs must be flagged as error pages.

    NOTE(review): the URLs presumably point at non-existent posts; the test
    needs live network access to both sites.
    """
    post_urls = (
        'http://geektimes.ru/post/10000/',
        'http://habrahabr.ru/post/951000/',
    )
    # Fetch every page up front, then check them in the same order.
    pages = [request('GET', post_url).text for post_url in post_urls]
    for page in pages:
        self.assertTrue(TopicDownloader.is_error_page(page))
def test_download_html(self):
    """``download_html`` must return the same text as a direct GET request."""
    page_url = 'http://habrahabr.ru/interesting/'
    downloaded = TopicDownloader.download_html(page_url)
    # Reference value: the same URL fetched directly.
    expected = request('GET', page_url).text
    self.assertEqual(expected, downloaded)