# Assumed imports for this function: bs4, datetime.datetime, urllib2's HTTPError,
# plus the project-local constants, tagging and rossel_utils modules.
def extract_article_data(source):
    """Extract an ArticleData instance (and the raw html) from a Le Soir page.

    `source` is either a file-like object or a URL string.
    """
    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            # Removed (404) or forbidden (403) articles are silently skipped.
            if e.code in (404, 403):
                return None, None
            else:
                raise

    soup = bs4.BeautifulSoup(html_data)
    main_content = soup.find(attrs={"id": "main-content"})
    title = extract_title(soup)

    # This is how we detect paywalled articles: their <h2> contains an
    # element carrying the 'ir locked' class.
    if main_content.h2 and main_content.h2.find(attrs={'class': 'ir locked'}):
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME,
                            datetime.today(), [], [constants.NO_CATEGORY_NAME],
                            None, None, constants.PAYWALLED_CONTENT),
                html_data)
    else:
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)
        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article
        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()
        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')
        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls, category, author_name, intro, text),
                html_data)
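
# For illustration, a minimal, self-contained sketch of the paywall test used
# above, run against hand-written HTML fragments. `is_paywalled` is a
# hypothetical helper (not part of this module) that mirrors the
# #main-content / h2 / 'ir locked' check.
import bs4

def is_paywalled(soup):
    """Return True if the page carries the 'ir locked' marker checked above."""
    main_content = soup.find(attrs={"id": "main-content"})
    if main_content is None or main_content.h2 is None:
        return False
    return main_content.h2.find(attrs={'class': 'ir locked'}) is not None

locked = bs4.BeautifulSoup(
    '<div id="main-content"><h2><span class="ir locked"></span>Title</h2></div>',
    'html.parser')
open_article = bs4.BeautifulSoup(
    '<div id="main-content"><h2>Title</h2></div>', 'html.parser')

print(is_paywalled(locked))        # True
print(is_paywalled(open_article))  # False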
# Assumed imports for this function: urllib2, datetime.datetime, Scrapy's
# legacy HtmlXPathSelector, plus the project-local rossel_utils module.
def extract_article_data(source):
    """Extract an ArticleData instance (and the raw html) from a sudinfo page.

    `source` is either a file-like object or a URL string.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        source = convert_utf8_url_to_ascii(source)
        try:
            html_content = fetch_html_content(source)
        except urllib2.HTTPError as err:
            if err.code == 404:
                # Removed articles are reported as (no data, minimal 404 page).
                return None, "<html><head><title>404</title></head><body></body></html>"
            else:
                raise

    hxs = HtmlXPathSelector(text=html_content)

    if is_page_error_404(hxs):
        return None, html_content
    else:
        category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract()
        # old version:
        title = hxs.select("//div[@id='article']/h1/text()").extract()[0]
        # new version:
        # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0]
        pub_date, pub_time = extract_date(hxs)
        author = hxs.select("//p[@class='auteur']/text()").extract()[0]
        fetched_datetime = datetime.today()
        intro, intro_links = extract_intro_and_links(hxs)
        content, content_links = extract_content_and_links(hxs)
        associated_links = extract_associated_links(hxs)
        all_links = intro_links + content_links + associated_links
        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        # import os
        # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls, category, author, intro, content),
                html_content)
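
# A minimal sketch of how the legacy Scrapy HtmlXPathSelector calls above
# behave, run against a hand-written fragment. The markup here is invented
# for illustration; only the XPath expressions are taken from the function.
from scrapy.selector import HtmlXPathSelector

sample = (u"<html><body>"
          u"<p class='fil_ariane'><a>Sports</a><a>Football</a></p>"
          u"<div id='article'><h1>Match report</h1></div>"
          u"<p class='auteur'>J. Doe</p>"
          u"</body></html>")

hxs = HtmlXPathSelector(text=sample)
print(hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract())
# [u'Sports', u'Football']
print(hxs.select("//div[@id='article']/h1/text()").extract()[0])  # u'Match report'
print(hxs.select("//p[@class='auteur']/text()").extract()[0])     # u'J. Doe'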
def get_frontpage_toc():
    BASE_URL = u'http://www.sudinfo.be/'
    html_content = fetch_content_from_url(BASE_URL)
    hxs = HtmlXPathSelector(text=html_content)

    # Headline links are spread over three page areas: the first column,
    # the 'octetFun' third column, and the 'buzz' box.
    column1_headlines = hxs.select("//div[starts-with(@class, 'first-content')]//div[starts-with(@class, 'article')]//h2/a")
    column3_headlines = hxs.select("//div[@class='octetFun']/a/child::h3/..")
    buzz_headlines = hxs.select("//div[@class='buzz exergue clearfix']//h2/a")
    buzz_headlines.extend(hxs.select("//div[@class='buzz exergue clearfix']//li/a"))

    all_link_selectors = it.chain(column1_headlines, column3_headlines, buzz_headlines)
    headlines = [extract_title_url_from_hxs_a(link_selector) for link_selector in all_link_selectors]

    regional_headlines = get_regional_toc()
    headlines.extend(regional_headlines)

    news, blogposts = separate_blogposts_and_news(headlines)

    # Resolve relative URLs against the front page (see the sketch below);
    # the third element of the returned tuple is always empty for this source.
    return ([(title, convert_utf8_url_to_ascii(url)) for title, url in make_full_url(BASE_URL, news)],
            blogposts,
            [])
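
# make_full_url() is defined elsewhere in the package; here is a plausible
# minimal sketch, assuming it only resolves relative headline URLs against
# the front page. The behavior shown is an assumption for illustration, not
# the actual implementation.
import urlparse  # urllib.parse on Python 3

def make_full_url(base_url, headlines):
    # Resolve relative links like '/123/foo.html' against base_url;
    # absolute links pass through unchanged.
    return [(title, urlparse.urljoin(base_url, url)) for title, url in headlines]

print(make_full_url(u'http://www.sudinfo.be/',
                    [(u'Headline', u'/123/foo.html')]))
# [(u'Headline', u'http://www.sudinfo.be/123/foo.html')]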