예제 #1
0
def get_raw_point(topic_id: int) -> (list, list):
    """Download and parse a single topic, returning its text and labels.

    The topic page is fetched with ``TopicDownloader.download_topic`` and
    handed to the module-level ``parser``. The labels are the parsed
    topic's ``hubs`` followed by its ``tags``.

    :param topic_id: identifier passed to ``TopicDownloader.download_topic``.
    :return: ``(raw_text, labels)`` taken from the parsed topic.
    """
    topic = parser.parse(TopicDownloader.download_topic(topic_id))
    combined_labels = topic.hubs + topic.tags
    return topic.text, combined_labels
예제 #2
0
    def get_raw_point(self, topic_id: int) -> (list, list, str):
        """Download and parse a single topic into text, labels and name.

        The page is fetched with ``TopicDownloader.download_topic`` and
        parsed by ``self.parser``. The labels are the parsed topic's
        ``hubs`` followed by its ``tags``.

        :param topic_id: identifier passed to ``TopicDownloader.download_topic``.
        :return: ``(raw_text, labels, name)`` taken from the parsed topic.
        """
        topic = self.parser.parse(TopicDownloader.download_topic(topic_id))
        combined_labels = topic.hubs + topic.tags
        return topic.text, combined_labels, topic.name
예제 #3
0
def json_dumper(objs):
    """Lazily serialize each object to a newline-terminated JSON string.

    :param objs: iterable of JSON-serializable objects.
    :return: generator yielding one ``json.dumps`` line per input object.
    """
    yield from (json.dumps(item) + "\n" for item in objs)


def progress_indicator(objs):
    """Pass items through unchanged while reporting progress.

    The running item count is printed each time it reaches a multiple
    of 100, just before that item is yielded.

    :param objs: any iterable.
    :return: generator yielding the items of ``objs`` in order.
    """
    for seen, item in enumerate(objs, start=1):
        if not seen % 100:
            print(seen)
        yield item


# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Download topic 269995 and parse it; `parsed` is not used afterwards.
    html = TopicDownloader.download_topic(269995)
    parsed = TopicParser().parse(html)
    # No-op statement (presumably a convenient line for a debugger breakpoint).
    pass
    # Disabled pipeline, kept for reference: it read 'data/raw_data.json',
    # ran the listed processors in order, and wrote 'data/clean_data.json'.
    # with open('data/raw_data.json', 'r') as in_file, open('data/clean_data.json', 'w+') as out_file:
    #     pipe_1 = Pipeline(in_file)
    #     pipe_1.add_processors(
    #         [json_parser,
    #          dict_to_topic,
    #          cleaner,
    #          set_ext,
    #          clean_topic_to_dict,
    #          json_dumper,
    #          progress_indicator])
    #     out_file.writelines(pipe_1.process())
    #
    # with open('data/word_count.json', 'w+') as fp:
예제 #4
0
def json_dumper(objs):
    """Turn an iterable of objects into newline-terminated JSON lines.

    Items are serialized lazily, one per iteration step.

    :param objs: iterable of JSON-serializable objects.
    :return: generator of ``json.dumps`` output, each followed by a newline.
    """
    for encoded in map(json.dumps, objs):
        yield encoded + '\n'


def progress_indicator(objs):
    """Yield every item of ``objs`` unchanged, printing a counter every 100 items.

    The count is printed before the item that completes each block of
    100 is yielded.

    :param objs: any iterable.
    :return: generator over the same items, in the same order.
    """
    for position, element in enumerate(objs, 1):
        at_checkpoint = position % 100 == 0
        if at_checkpoint:
            print(position)
        yield element


# Script entry point: runs only when this file is executed directly.
if __name__ == '__main__':
    # Download topic 269995 and parse it; `parsed` is not used afterwards.
    html = TopicDownloader.download_topic(269995)
    parsed = TopicParser().parse(html)
    # No-op statement (presumably a convenient line for a debugger breakpoint).
    pass
    # Disabled pipeline, kept for reference: it read 'data/raw_data.json',
    # ran the listed processors in order, and wrote 'data/clean_data.json'.
    # with open('data/raw_data.json', 'r') as in_file, open('data/clean_data.json', 'w+') as out_file:
    #     pipe_1 = Pipeline(in_file)
    #     pipe_1.add_processors(
    #         [json_parser,
    #          dict_to_topic,
    #          cleaner,
    #          set_ext,
    #          clean_topic_to_dict,
    #          json_dumper,
    #          progress_indicator])
    #     out_file.writelines(pipe_1.process())
    #
    # with open('data/word_count.json', 'w+') as fp:
예제 #5
0
    def test_is_error_page(self):
        """Both fetched pages must be recognised by ``is_error_page``.

        Both URLs are fetched first; the assertions run afterwards.
        """
        urls = (
            'http://geektimes.ru/post/10000/',
            'http://habrahabr.ru/post/951000/',
        )
        pages = [request('GET', url).text for url in urls]

        for page in pages:
            self.assertTrue(TopicDownloader.is_error_page(page))
예제 #6
0
    def test_download_html(self):
        """``download_html`` must return the same text as a plain GET request."""
        target = 'http://habrahabr.ru/interesting/'

        downloaded = TopicDownloader.download_html(target)
        expected = request('GET', target).text

        self.assertEqual(expected, downloaded)