def count_words_in_article(url):
    """Build a Counter of the words found in the article body at *url*.

    Paragraph tags whose first content chunk contains 'href' or 'span'
    (markup rather than plain text) are skipped.
    """
    soup = get_soup_of_page(url)
    counter = Counter()
    for tag in get_all_body_p_tags_bbc(soup):
        text = str(tag.contents[0])
        # skip paragraphs that are links/markup rather than body text
        if 'href' in text or 'span' in text:
            continue
        counter.update(split_into_words(text))
    return counter
def test_precision_recall(url, category, max_num_of_articles):
    """Crawl outward from *url*, classify up to *max_num_of_articles* articles
    of the given *category*, and print the guessed vs. actual category.

    Articles already seen in training ('visited_pages_set.p') or in earlier
    test runs ('tested_articles_url.p') are skipped.  The updated set of
    tested URLs is pickled back to 'tested_articles_url.p' at the end.
    """
    links = deque()
    links.append(url)
    count = 0
    articles_in_training_set = read_object_from('visited_pages_set.p', set)
    articles_in_testing = read_object_from('tested_articles_url.p', set)
    print(articles_in_testing)
    while links and count < max_num_of_articles:
        try:
            next_url = links.popleft()
            soup = get_soup_of_page(next_url)
            links.extend(collect_links(soup))
            # skip anything used for training or already tested
            if next_url in articles_in_training_set or next_url in articles_in_testing:
                continue
            time.sleep(1)  # be polite to the server between fetches
            article_category = determine_category_file(next_url)
            if article_category != category:
                continue
            word_counter_new_article = count_words_in_article(next_url)
            category_guess = get_most_likely_category(word_counter_new_article)
            print('Currently going through ', next_url, ':')
            articles_in_testing.add(next_url)
            count += 1
            print(' Your guess is', category_guess, '. The actual category is', article_category)
        except AttributeError:
            # page had no parseable body; move on (best-effort crawl)
            print('something went wrong, here', next_url, 'we will look at the next link')
            continue
        except Exception as e:
            print('an unexpected error occurred, we will look at the next link: ', e)
            continue
    print('I have looked at', count, 'articles')
    # FIX: use a context manager so the pickle file is closed/flushed
    # (the original passed an anonymous open() handle to pickle.dump,
    # leaking the file descriptor)
    with open('tested_articles_url.p', 'wb') as out_file:
        pickle.dump(articles_in_testing, out_file)
def write_body_to_file(url, links):
    """Append the body paragraphs of *url* to its category CSV and update
    the pickled word Counter for that category (used to build training data).

    Side effects: extends *links* with links collected from the page,
    appends paragraph text to '<category>.csv', and rewrites '<category>.p'.
    Returns the category name, or None when the URL's category is 'ignore'.
    """
    article_category = determine_category_file(url)
    if article_category == 'ignore':
        print('This url was ignored:', url)
        return
    print('Currently going through ', url, ':')
    # FIX: context manager replaces manual open()/finally: close();
    # the pickle write below also gets a managed handle (the original
    # passed an anonymous open() to pickle.dump and leaked it)
    with open(article_category + '.csv', 'a') as f:
        try:
            soup = get_soup_of_page(url)
            links.extend(collect_links(soup))
            p_tags = get_all_body_p_tags_bbc(soup)
            word_counter = read_object_from(article_category + '.p', Counter)
            for pTag in p_tags:
                contents = str(pTag.contents[0])
                # write the paragraph to the CSV (for later inspection) and
                # fold its words into the pickled Counter
                if 'href' not in contents and 'span' not in contents:
                    f.write(contents + '\n')
                    word_counter.update(word.strip(string.punctuation).lower() for word in contents.split())
            with open(article_category + '.p', 'wb') as counter_file:
                pickle.dump(word_counter, counter_file)
        except AttributeError:
            print(' This page does not have a body article: ', url)
        except Exception as e:
            print('Had some problem parsing through this page: ', url, e)
            traceback.print_exc()
        else:
            print(' successfully written to file', article_category)
    return article_category