Example #1
def count_words_in_article(url):
    soup = get_soup_of_page(url)
    p_tags = get_all_body_p_tags_bbc(soup)
    word_counter = Counter()
    for p_tag in p_tags:
        contents = str(p_tag.contents[0])

        # Only count plain text paragraphs; skip ones whose first child
        # is a link or a span (navigation, captions, etc.).
        if 'href' not in contents and 'span' not in contents:
            word_counter.update(split_into_words(contents))
    return word_counter
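
Example #1 relies on several helpers that are not shown in the snippet: get_soup_of_page, get_all_body_p_tags_bbc, and split_into_words. The following is a minimal sketch of how they could be implemented with requests and BeautifulSoup; the names match the calls above, but the bodies are assumptions rather than the original code.

# Hypothetical helpers, consistent with how count_words_in_article uses them.
import string

import requests
from bs4 import BeautifulSoup

def get_soup_of_page(url):
    # Fetch the page and parse it into a BeautifulSoup tree (assumed implementation).
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

def get_all_body_p_tags_bbc(soup):
    # Assumption: the BBC article text lives in <p> tags inside the article body.
    body = soup.find('article') or soup
    return body.find_all('p')

def split_into_words(text):
    # Lower-case the text, strip punctuation, and split on whitespace.
    return [word.strip(string.punctuation).lower() for word in text.split()]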
Example #2
def test_precision_recall(url, category, max_num_of_articles):
    links = deque()
    links.append(url)
    count = 0

    # URLs already used for training and URLs already tested, so the same
    # article is never scored twice.
    articles_in_training_set = read_object_from('visited_pages_set.p', set)
    articles_in_testing = read_object_from('tested_articles_url.p', set)
    print(articles_in_testing)
    while links and count < max_num_of_articles:
        try:
            next_url = links.popleft()
            soup = get_soup_of_page(next_url)
            links.extend(collect_links(soup))

            if next_url in articles_in_training_set or next_url in articles_in_testing:
                continue

            time.sleep(1)

            article_category = determine_category_file(next_url)
            if article_category != category:
                continue

            word_counter_new_article = count_words_in_article(next_url)
            category_guess = get_most_likely_category(word_counter_new_article)

            print('Currently going through ', next_url, ':')
            articles_in_testing.add(next_url)
            count += 1
            print('     Your guess is', category_guess, '. The actual category is', article_category)

        except AttributeError:
            print('Something went wrong at', next_url, '- moving on to the next link')
            continue

        except Exception as e:
            print('An unexpected error occurred, moving on to the next link:', e)
            continue

    print('I have looked at', count, 'articles')
    with open('tested_articles_url.p', 'wb') as testing_file:
        pickle.dump(articles_in_testing, testing_file)
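
Examples #2 and #3 both load previously pickled state through read_object_from(filename, type). A plausible sketch of that helper, assuming it falls back to an empty instance of the given type when the file does not exist yet:

# Hypothetical read_object_from, matching the calls
# read_object_from('visited_pages_set.p', set) and
# read_object_from(article_category + '.p', Counter) above.
import os
import pickle

def read_object_from(filename, default_type):
    # Return the unpickled object, or an empty default_type() if the
    # file has not been written yet.
    if not os.path.exists(filename):
        return default_type()
    with open(filename, 'rb') as f:
        return pickle.load(f)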
Example #3
def write_body_to_file(url, links):
    # Used to build the training data
    article_category = determine_category_file(url)

    if article_category == 'ignore':
        print('This url was ignored:', url)
        return

    print('Currently going through ', url, ':')
    f = open(article_category + '.csv', 'a')

    try:
        soup = get_soup_of_page(url)
        links.extend(collect_links(soup))

        p_tags = get_all_body_p_tags_bbc(soup)

        word_counter = read_object_from(article_category + '.p', Counter)
        for p_tag in p_tags:
            contents = str(p_tag.contents[0])

            # Write the text to the CSV file so it can be inspected later,
            # and update the pickled word counter for this category.
            if 'href' not in contents and 'span' not in contents:
                f.write(contents + '\n')
                word_counter.update(word.strip(string.punctuation).lower() for word in contents.split())
        with open(article_category + '.p', 'wb') as counter_file:
            pickle.dump(word_counter, counter_file)

    except AttributeError:
        print('     This page does not have a body article: ', url)

    except Exception as e:
        print('There was a problem parsing this page:', url, e)
        traceback.print_exc()

    else:
        print('     successfully written to file', article_category)

    finally:
        f.close()
        return article_category
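
Example #3 is meant to be driven from a crawl loop like the one in Example #2: the caller passes in a deque of links, and write_body_to_file appends any links it discovers while filing the article text under its category. A possible driver, assuming a single seed URL and the helpers sketched above (build_training_set is a hypothetical name, not part of the original code):

# Hypothetical training-data crawler built around write_body_to_file.
import time
from collections import deque

def build_training_set(seed_url, max_pages=100):
    links = deque([seed_url])
    visited = set()

    while links and len(visited) < max_pages:
        url = links.popleft()
        if url in visited:
            continue
        visited.add(url)

        # write_body_to_file collects newly found links into the shared deque
        # and returns the category the article was filed under (or None for ignored pages).
        category = write_body_to_file(url, links)
        if category:
            print('Filed', url, 'under', category)

        time.sleep(1)  # be polite to the server between requests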