import pickle
import string
import traceback
from collections import Counter


def count_words_in_article(url):
    soup = get_soup_of_page(url)
    p_tags = get_all_body_p_tags_bbc(soup)
    word_counter = Counter()
    for p_tag in p_tags:
        contents = str(p_tag.contents[0])
        # Only count plain text paragraphs; skip ones containing links or spans
        if 'href' not in contents and 'span' not in contents:
            word_counter.update(split_into_words(contents))
    return word_counter
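# split_into_words is called above but not defined in this section. A minimal
# sketch, assuming it mirrors the normalisation used in write_body_to_file
# below (split on whitespace, strip surrounding punctuation, lowercase):
def split_into_words(text):
    # Hypothetical helper: normalise each whitespace-separated token,
    # e.g. split_into_words('Hello, world!') -> ['hello', 'world']
    return [word.strip(string.punctuation).lower() for word in text.split()]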
def write_body_to_file(url, links):
    # Used to build the training data
    article_category = determine_category_file(url)
    if article_category == 'ignore':
        print('This url was ignored:', url)
        return
    print('Currently going through ', url, ':')
    f = open(article_category + '.csv', 'a')
    try:
        soup = get_soup_of_page(url)
        links.extend(collect_links(soup))
        p_tags = get_all_body_p_tags_bbc(soup)
        word_counter = read_object_from(article_category + '.p', Counter)
        for p_tag in p_tags:
            contents = str(p_tag.contents[0])
            # Write the paragraph to the CSV file so it can be inspected
            # later, and update the word counter kept in the pickle as well
            if 'href' not in contents and 'span' not in contents:
                f.write(contents + '\n')
                word_counter.update(word.strip(string.punctuation).lower()
                                    for word in contents.split())
        with open(article_category + '.p', 'wb') as pickle_file:
            pickle.dump(word_counter, pickle_file)
    except AttributeError:
        print('  This page does not have a body article: ', url)
    except Exception as e:
        print('Had some problem parsing through this page: ', url, e)
        traceback.print_exc()
    else:
        print('  successfully written to file', article_category)
    finally:
        f.close()
    return article_category
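# read_object_from is called above but not shown in this section. A minimal
# sketch, assuming it loads a previously pickled object from the given path
# and falls back to a fresh instance of the given class (here, Counter) when
# no file exists yet:
import os


def read_object_from(path, cls):
    # Hypothetical helper: resume the saved counter, or start a new one
    if os.path.exists(path):
        with open(path, 'rb') as pickle_file:
            return pickle.load(pickle_file)
    return cls()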