Example #1
from datetime import datetime
from urllib2 import HTTPError

import bs4

# the project-specific helpers used below (fetch_html_content, extract_title,
# tagging, rossel_utils, constants, ArticleData, ...) are assumed to be
# imported from the surrounding csxj crawler codebase

def extract_article_data(source):
    # 'source' is either a file-like object or a URL string
    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            # a missing (404) or forbidden (403) page simply yields no article
            if e.code in (404, 403):
                return None, None
            raise

    soup = bs4.BeautifulSoup(html_data, 'html.parser')  # explicit parser avoids bs4's default-parser warning

    # paywalled articles are flagged with a 'locked' icon in the main headline
    main_content = soup.find(attrs={"id": "main-content"})
    if main_content.h2 and main_content.h2.find(attrs={'class': 'ir locked'}):
        title = extract_title(soup)
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME, datetime.today(),
                            [], [constants.NO_CATEGORY_NAME], None, None,
                            constants.PAYWALLED_CONTENT),
                html_data)

    else:
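        # regular article: extract each content field, then collect every
        # link (intro, body, sidebar, tags, embedded media) for tagging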
        title = extract_title(soup)
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)
        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article
        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()

        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                updated_tagged_urls,
                category, author_name,
                intro, text),
                html_data)
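A minimal usage sketch, assuming this function is importable from its lesoir datasource module; the file name below is illustrative, and ArticleData is assumed to expose its fields as attributes:

# hypothetical usage: parse a locally saved article page
with open('sample_lesoir_article.html') as f:
    article_data, raw_html = extract_article_data(f)
if article_data is not None:
    print(article_data.title)  # attribute access on ArticleData is an assumption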
Example #2
import urllib2
from datetime import datetime

# HtmlXPathSelector is the old (pre-1.0) scrapy selector API used here
from scrapy.selector import HtmlXPathSelector

def extract_article_data(source):
    """
    Extract the article data from a sudinfo.be page.

    'source' is either a file-like object or a URL string. Returns an
    (ArticleData, html_content) pair, or (None, html_content) when the
    page does not exist.
    """

    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        source = convert_utf8_url_to_ascii(source)
        try:
            html_content = fetch_html_content(source)
        except urllib2.HTTPError as err:
            if err.code == 404:
                # return a stub page so callers still receive some html
                return None, "<html><head><title>404</title></head><body></body></html>"
            raise  # bare raise preserves the original traceback

    hxs = HtmlXPathSelector(text=html_content)

    if is_page_error_404(hxs):
        return None, html_content
    else:
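        # the 'fil_ariane' (breadcrumb trail) links give the category path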
        category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract()
        # old version:
        title = hxs.select("//div[@id='article']/h1/text()").extract()[0]
        # new version:
        # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0]

        pub_date, pub_time = extract_date(hxs)
        author = hxs.select("//p[@class='auteur']/text()").extract()[0]
        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(hxs)

        content, content_links = extract_content_and_links(hxs)
        associated_links = extract_associated_links(hxs)
        all_links = intro_links + content_links + associated_links
        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        # import os
        # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls,
                            category, author,
                            intro, content),
                html_content)
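A similar usage sketch for this variant, called with a URL string; the address below is a placeholder, not a real article:

# hypothetical usage: fetch and parse one sudinfo article by URL
article_data, html_content = extract_article_data('http://www.sudinfo.be/some-article')
if article_data is None:
    print('page not found (404)')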
Example #3
import itertools as it

# fetch_content_from_url and the other helpers below are assumed to be
# imported from the surrounding module

def get_frontpage_toc():
    BASE_URL = u'http://www.sudinfo.be/'
    html_content = fetch_content_from_url(BASE_URL)
    hxs = HtmlXPathSelector(text=html_content)

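    # gather headline links from the three frontpage zones: the lead column,
    # the 'octetFun' column and the 'buzz' box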
    column1_headlines = hxs.select("//div[starts-with(@class, 'first-content')]//div[starts-with(@class, 'article')]//h2/a")
    column3_headlines = hxs.select("//div[@class='octetFun']/a/child::h3/..")
    buzz_headlines = hxs.select("//div[@class='buzz exergue clearfix']//h2/a")
    buzz_headlines.extend(hxs.select("//div[@class='buzz exergue clearfix']//li/a"))

    all_link_selectors = it.chain(column1_headlines, column3_headlines, buzz_headlines)
    headlines = [extract_title_url_from_hxs_a(link_selector) for link_selector in all_link_selectors]

    regional_headlines = get_regional_toc()
    headlines.extend(regional_headlines)

    news, blogposts = separate_blogposts_and_news(headlines)
    # blog posts are returned separately; the third element is always an empty list here
    return [(title, convert_utf8_url_to_ascii(url)) for title, url in make_full_url(BASE_URL, news)], blogposts, []
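A short usage sketch, assuming the module-level helpers are available:

# hypothetical usage: print today's frontpage headlines
frontpage_links, blogposts, _ = get_frontpage_toc()
for title, url in frontpage_links:
    print(u'{0} -> {1}'.format(title, url))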