def begin_crawling_from_this_page(url, max_num_of_articles=20):
    """Breadth-first crawl starting at *url*, saving article bodies to disk.

    max_num_of_articles limits how many ARTICLES are collected, not how many
    pages are visited.  Every visited page is printed to the console, so the
    output can be quite verbose.

    Crawl state (visited pages, per-category article counts) is persisted in
    pickle files so repeated runs do not re-read the same pages.
    """
    links = deque()
    links.append(url)
    count = 0
    pages_visited = read_object_from('visited_pages_set.p', set)
    articles_read_for_category = read_object_from('articles_read_counter.p', Counter)
    print('pages visited before: ', pages_visited)
    print('articles read before: ', articles_read_for_category)
    while links and count < max_num_of_articles:
        url = links.popleft()
        if url in pages_visited:
            continue
        pages_visited.add(url)
        time.sleep(1)  # throttle requests to be polite to the server
        written_to_file = write_body_to_file(url, links)
        if written_to_file:
            # .update takes a file (category) name and increments the counter for it
            articles_read_for_category.update([written_to_file])
            count += 1
    # Use context managers so the pickle files are closed deterministically;
    # the original pickle.dump(obj, open(...)) calls leaked file handles.
    with open('articles_read_counter.p', 'wb') as counter_file:
        pickle.dump(articles_read_for_category, counter_file)
    with open('visited_pages_set.p', 'wb') as visited_file:
        pickle.dump(pages_visited, visited_file)
    return
def whats_in_my_pickle():
    """Print a summary of the persisted crawl state: how many articles have
    been read per category and how many links have been visited."""
    print("\nWhat's in my pickle objects? ")
    article_counts = read_object_from('articles_read_counter.p', Counter)
    visited_links = read_object_from('visited_pages_set.p', set)
    print(' Total number of articles:', sum(article_counts.values()))
    print(' Total number of links:', len(visited_links))
    print(article_counts)
def get_most_likely_category(word_count_new_article):
    """Return the category with the highest naive-Bayes probability for an
    article represented by *word_count_new_article* (a word Counter).

    Refreshes each category's word-probability pickle before scoring.
    """
    probability_dict = dict()
    categories = ['business', 'asia', 'technology', 'uk', 'europe']
    for category in categories:
        update_probabilities(category)
        category_word_probabilities = read_object_from(category + '_probability.p', defaultdict)
        probability_dict[category] = get_total_probability(
            category_word_probabilities, word_count_new_article, category)
    # Single-pass max(key=...) replaces the original max-over-values plus
    # list-comprehension; ties still resolve to the first category in
    # insertion order, exactly as before.
    return max(probability_dict, key=probability_dict.get)
def test_precision_recall(url, category, max_num_of_articles):
    """Evaluate the classifier: crawl from *url*, and for every previously
    unseen article whose true category is *category*, print the classifier's
    guess next to the actual label.

    Skips articles already used for training or already tested; stops after
    max_num_of_articles articles.  Tested URLs are persisted so repeated runs
    evaluate fresh articles.
    """
    links = deque()
    links.append(url)
    count = 0
    articles_in_training_set = read_object_from('visited_pages_set.p', set)
    articles_in_testing = read_object_from('tested_articles_url.p', set)
    print(articles_in_testing)
    while links and count < max_num_of_articles:
        try:
            next_url = links.popleft()
            soup = get_soup_of_page(next_url)
            links.extend(collect_links(soup))
            # Don't evaluate on pages the model was trained on or re-test pages.
            if next_url in articles_in_training_set or next_url in articles_in_testing:
                continue
            time.sleep(1)  # throttle requests to be polite to the server
            article_category = determine_category_file(next_url)
            if article_category != category:
                continue
            word_counter_new_article = count_words_in_article(next_url)
            category_guess = get_most_likely_category(word_counter_new_article)
            print('Currently going through ', next_url, ':')
            articles_in_testing.add(next_url)
            count += 1
            print(' Your guess is', category_guess, '. The actual category is', article_category)
        except AttributeError:
            # Page has no parseable article body — best-effort: move on.
            print('something went wrong, here', next_url, 'we will look at the next link')
            continue
        except Exception as e:
            # Deliberate best-effort crawl: log and keep going.
            print('an unexpected error occurred, we will look at the next link: ', e)
            continue
    print('I have looked at', count, 'articles')
    # Context manager closes the pickle file (the original open() call leaked it).
    with open('tested_articles_url.p', 'wb') as tested_file:
        pickle.dump(articles_in_testing, tested_file)
def update_probabilities(category):
    """Recompute smoothed word probabilities for *category* from its pickled
    word counts and persist them to <category>_probability.p.

    Returns the probability defaultdict.  The special key 'num_of_words'
    stores the total (unsmoothed) word count for the category.
    """
    word_count = read_object_from(category + '.p', Counter)
    total_num_words = sum(word_count.values())
    word_probabilities = defaultdict(int)
    word_probabilities['num_of_words'] = total_num_words
    for word, count in word_count.items():
        # Add 1 to every count (add-one smoothing so unseen words never get
        # exactly zero probability).
        word_probabilities[word] = (count + 1) / (total_num_words + 2)
    # Context manager closes the pickle file (the original open() call leaked it).
    with open(category + '_probability.p', 'wb') as prob_file:
        pickle.dump(word_probabilities, prob_file)
    return word_probabilities
def write_body_to_file(url, links):
    """Fetch *url*, append its article paragraphs to <category>.csv and update
    the category's pickled word counter.  Used to build the training data.

    Extends *links* in place with the links found on the page.  Returns the
    category name (callers treat a truthy return as "article written"), or
    None when the url's category is 'ignore'.
    """
    article_category = determine_category_file(url)
    if article_category == 'ignore':
        print('This url was ignored:', url)
        return
    print('Currently going through ', url, ':')
    # 'with' closes the csv even when parsing raises (replaces try/finally close).
    with open(article_category + '.csv', 'a') as f:
        try:
            soup = get_soup_of_page(url)
            links.extend(collect_links(soup))
            p_tags = get_all_body_p_tags_bbc(soup)
            word_counter = read_object_from(article_category + '.p', Counter)
            for pTag in p_tags:
                contents = str(pTag.contents[0])
                # Write the paragraph to the CSV so it can be inspected later,
                # and update the word counter for the pickle as well.
                if 'href' not in contents and 'span' not in contents:
                    f.write(contents + '\n')
                    word_counter.update(
                        word.strip(string.punctuation).lower() for word in contents.split())
            # Persist once after processing all paragraphs; the context manager
            # closes the pickle file (the original open() call leaked it).
            with open(article_category + '.p', 'wb') as counter_file:
                pickle.dump(word_counter, counter_file)
        except AttributeError:
            print(' This page does not have a body article: ', url)
        except Exception as e:
            # Best-effort: report and fall through so the crawl continues.
            print('Had some problem parsing through this page: ', url, e)
            traceback.print_exc()
        else:
            print(' successfully written to file', article_category)
    return article_category
def get_probability_of_category(category):
    """Return the prior probability of *category*: articles read in that
    category divided by total articles read.

    Returns 0.0 when no articles have been read yet — the original raised
    ZeroDivisionError on an empty counter.
    """
    articles_read = read_object_from('articles_read_counter.p', Counter)
    total_num_articles = sum(articles_read.values())
    if total_num_articles == 0:
        return 0.0
    return articles_read[category] / total_num_articles