Example #1
0
def count_words_in_article(url):
    """Tally word frequencies across the body paragraphs of a BBC article.

    Fetches the page at *url*, extracts its body ``<p>`` tags, and counts
    words from each paragraph's first child node, skipping paragraphs whose
    first child appears to be markup (contains 'href' or 'span').

    :param url: address of the article page to fetch
    :return: ``collections.Counter`` mapping word -> occurrence count
    """
    page = get_soup_of_page(url)
    counter = Counter()
    for paragraph in get_all_body_p_tags_bbc(page):
        text = str(paragraph.contents[0])
        # Ignore paragraphs whose first child is a link/span element rather
        # than plain article text.
        if 'href' in text or 'span' in text:
            continue
        counter.update(split_into_words(text))
    return counter
Example #2
0
def write_body_to_file(url, links):
    """Append an article's body text to its category CSV and update the
    pickled word counter for that category.

    Used to build training data. Determines the article's category from
    *url*; if the category is 'ignore' the page is skipped. Otherwise the
    page is fetched, its links are appended to *links* (mutated in place),
    each plain-text body paragraph is written to ``<category>.csv``, and the
    per-category word ``Counter`` is updated and re-pickled to
    ``<category>.p``.

    :param url: address of the article page to process
    :param links: list of URLs; newly collected links are appended to it
    :return: the article category string, or ``None`` when the url is ignored

    Fixes over the previous version:
    - ``return`` lived inside ``finally``, which silently swallowed ANY
      propagating exception (including KeyboardInterrupt/SystemExit).
    - ``pickle.dump(..., open(..., 'wb'))`` leaked the file handle.
    - the CSV file was opened without a context manager or explicit encoding.
    """
    article_category = determine_category_file(url)

    if article_category == 'ignore':
        print('This url was ignored:', url)
        return

    print('Currently going through ', url, ':')

    # utf-8 chosen explicitly: article text may contain non-ASCII characters.
    with open(article_category + '.csv', 'a', encoding='utf-8') as f:
        try:
            soup = get_soup_of_page(url)
            links.extend(collect_links(soup))

            p_tags = get_all_body_p_tags_bbc(soup)

            word_counter = read_object_from(article_category + '.p', Counter)
            for p_tag in p_tags:
                contents = str(p_tag.contents[0])

                # Write plain paragraphs to the CSV and fold their words into
                # the counter; paragraphs whose first child looks like markup
                # (link/span) are skipped.
                if 'href' not in contents and 'span' not in contents:
                    f.write(contents + '\n')
                    word_counter.update(
                        word.strip(string.punctuation).lower()
                        for word in contents.split()
                    )

            with open(article_category + '.p', 'wb') as pickle_file:
                pickle.dump(word_counter, pickle_file)

        except AttributeError:
            # get_all_body_p_tags_bbc / soup navigation failed: no article body.
            print('     This page does not have a body article: ', url)

        except Exception as e:
            # Best-effort scraping: log and continue with the next page.
            print('Had some problem parsing through this page: ', url, e)
            traceback.print_exc()

        else:
            print('     successfully written to file', article_category)

    return article_category