def extract_intro(soup):
    """
    Extract the intro of an article together with the links it contains.

    The intro is looked up in the first element whose class is "intro".
    When no such element exists, an empty string and an empty list are
    returned. Otherwise:
      * the <b> fragments of that element are cleaned up into the intro,
      * every <a> element found in it yields a tagged url marked 'in intro',
      * every url detected in the intro text itself yields a tagged url
        marked 'in intro' and 'plaintext' (the url doubles as its title).

    Returns an (intro, tagged_urls) tuple.
    """
    intro_box = soup.find(attrs={"class": "intro"})
    if not intro_box:
        return "", []

    intro = utils.remove_text_formatting_markup_from_fragments(intro_box.find_all('b'))
    link_details = [extract_title_and_url_from_bslink(link) for link in intro_box.find_all("a")]
    text_without_links = remove_text_formatting_and_links_from_fragments(intro)
    plaintext_urls = utils.extract_plaintext_urls_from_text(text_without_links)

    tagged_urls = []

    # links wrapped in an <a> element
    for title, url, base_tags in link_details:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(base_tags, ['in intro'])
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    # urls written out as plain text
    for url in plaintext_urls:
        tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        tags.update(['in intro', 'plaintext'])
        tagged_urls.append(tagging.make_tagged_url(url, url, tags))

    return intro, tagged_urls
def extract_links_from_text_hxs(hxs):
    """
    Collect the tagged urls found in an article text selector.

    Three sources are inspected, in this order:
      * every <a> element below `hxs` (tagged 'in text'),
      * urls written as plain text in the text nodes of <p> elements
        (tagged 'plaintext' and 'in text', the url is used as title),
      * the `src` of every <iframe> (tagged 'in text', 'embedded', 'iframe',
        the url is used as title).

    Returns the list of tagged urls.
    """
    tagged_urls = []

    # in-text urls: every <a> below the selector.
    # NOTE(review): the original comment says links inside a rendered tweet
    # are excluded, but nothing here filters them out -- confirm intent.
    for anchor_hxs in hxs.select(".//a"):
        title, url = extract_title_and_url(anchor_hxs)
        tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        tags |= set(['in text'])
        tagged_urls.append(make_tagged_url(url, title, tags))

    # plaintext urls
    for paragraph in hxs.select(".//p/text()").extract():
        stripped_paragraph = remove_text_formatting_and_links_from_fragments(paragraph)
        for url in extract_plaintext_urls_from_text(stripped_paragraph):
            tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            tags |= set(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(url, url, tags))

    # embedded objects
    for url in hxs.select(".//iframe/@src").extract():
        base_tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        # builds a new set, the one returned by classify_and_tag is left untouched
        tags = base_tags | set(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(url, url, tags))

    return tagged_urls
def extract_links_from_sidebar_box(soup):
    """
    Extract the tagged urls found in the sidebar box of an article.

    The sidebar box is the first element whose class attribute is
    "teas_article_306 mar10 clear clearfix relatedcomponents". Two kinds of
    links are extracted from it:
      * links to articles, taken from the first "clearfix" element of the
        box and tagged 'sidebar box',
      * links to thematic tags, taken from every "bt_meer_over clearfix"
        element of the box and tagged 'keyword' and 'sidebar box'.

    Returns the list of tagged urls; the list is empty when there is no
    sidebar box, and the article links are skipped (instead of raising an
    IndexError) when the box has no "clearfix" element.
    """
    tagged_urls = []
    sidebar_box = soup.find(attrs={"class": "teas_article_306 mar10 clear clearfix relatedcomponents"})
    if not sidebar_box:
        return tagged_urls

    # there are links to articles
    articles = sidebar_box.find_all(attrs={"class": "clearfix"})
    if articles:
        for link in articles[0].find_all("a"):
            title, url, base_tags = extract_title_and_url_from_bslink(link)
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    # and also links to thematic tags
    keyword_boxes = sidebar_box.find_all(attrs={"class": "bt_meer_over clearfix"})
    for keyword_box in keyword_boxes:
        for link in keyword_box.find_all("a"):
            title, url, base_tags = extract_title_and_url_from_bslink(link)
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('keyword')
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return tagged_urls
def extract_tagged_url_from_embedded_script(script, site_netloc, site_internal_sites):
    """
    Build a tagged url for an embedded <script> element.

    Twitter widget scripts are described by `twitter_utils.get_widget_type`,
    fed with the script's own content or, when the script is empty, with the
    content of the next <script> sibling. For any other script, the url and
    title come from the first <a> of the <noscript> element following it.
    In both cases the result is additionally tagged 'script' and 'embedded'.

    Raises ValueError when:
      * the script has no `src` attribute,
      * a non-twitter script is not followed by a <noscript> sibling,
      * that <noscript> sibling does not contain any link.
    """
    script_url = script.get('src')
    if not script_url:
        raise ValueError("Embedded script of unknown type was detected. Update the parser.")

    if twitter_utils.is_twitter_widget_url(script_url):
        if script.contents:
            widget_source = script.contents[0]
        else:
            # sometimes the TWTR.Widget code is in the next <script> container. Whee.
            widget_source = script.findNextSibling('script').contents[0]
        title, url, tags = twitter_utils.get_widget_type(widget_source)
        tags |= classify_and_tag(url, site_netloc, site_internal_sites)
        tags |= set(['script', 'embedded'])
        return make_tagged_url(url, title, tags)

    # any other script: fall back on the link provided in the <noscript> section
    noscript = script.findNextSibling('noscript')
    if not noscript:
        raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")

    link = noscript.find('a')
    if not link:
        raise ValueError("No link was found in the <noscript> section. Update the parser.")

    url = link.get('href')
    title = remove_text_formatting_markup_from_fragments(link.contents)
    all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
    all_tags |= set(['script', 'embedded'])
    return make_tagged_url(url, title, all_tags)
def extract_text_content_and_links(soup) :
    """
    Extract the text of an article and the links found in it.

    The article body is the first element with class "article-body". Its
    fragments are its <p> elements plus the <h2> elements carrying the
    inline style used for in-text headings.

    Returns a (text, tagged_urls) tuple where:
      * text is a list holding the cleaned-up text of each <p> element, or
        the empty unicode string when the body has no fragment at all,
      * tagged_urls holds first the plaintext urls detected in the <p>
        elements (tagged 'plaintext' and 'in text'), then the <a> links of
        every fragment (tagged 'in text').
    """
    article_body = soup.find(attrs={"class": "article-body"})
    paragraphs = article_body.find_all("p")
    inline_headings = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    fragments = paragraphs + inline_headings

    tagged_urls = []

    if fragments:
        text = []
        # only the <p> elements contribute to the text and plaintext urls
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            stripped_paragraph = remove_text_formatting_and_links_from_fragments(paragraph)
            for url in extract_plaintext_urls_from_text(stripped_paragraph):
                tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                tags |= set(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(url, url, tags))
    else:
        text = u""

    # <a> links are collected from the paragraphs and the inline headings
    inline_links = [link for fragment in fragments for link in fragment.find_all("a")]
    link_details = [extract_title_and_url_from_bslink(link) for link in inline_links]
    for title, url, base_tags in link_details:
        tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        tags.update(base_tags, ['in text'])
        tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return text, tagged_urls
def extract_links_from_embedded_content(story):
    """
    Extract the tagged urls of the objects embedded in a story.

    Three kinds of embedded content are handled:
      * every <iframe>: its `src` is tagged 'embedded' and 'iframe',
      * every <script> whose `src` is hosted on storify.com: the url, minus
        its ".js" extension, is tagged 'embedded' and 'storify',
      * the kplayer flash video found in the 'containerKplayer' div: its url
        is rebuilt from the <object> `data` attribute and the `flashVars`
        parameter, and tagged 'embedded' and 'kplayer'.

    The url is used as title in all three cases.

    Raises ValueError when one of the two parts of the kplayer url is None.
    """
    tagged_urls = []

    # generic iframes
    for iframe in story.findAll("iframe", recursive=True):
        url = iframe.get('src')
        all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'iframe'])))

    # extract embedded storify
    js_extension = ".js"
    for script in story.findAll('script', recursive=True):
        url = script.get('src')
        if not url:
            continue
        if urlparse.urlparse(url).netloc != "storify.com":
            continue
        # Drop the extension as a suffix. str.rstrip(".js") would instead
        # strip every trailing '.', 'j' and 's' character and eat into the
        # story name (".../news.js" would become ".../new").
        if url.endswith(js_extension):
            url = url[:-len(js_extension)]
        all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))

    # TODO: reconstruct the kplayer URL
    kplayer = story.find('div', {'class': 'containerKplayer'})
    if kplayer:
        kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
        url_part1 = kplayer_flash.object['data']
        url_part2 = kplayer_flash.object.find('param', {'name': 'flashVars'})['value']
        if url_part1 is not None and url_part2 is not None:
            url = "%s?%s" % (url_part1, url_part2)
            all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'kplayer'])))
        else:
            raise ValueError("We couldn't find an URL in the flash player. Update the parser.")

    return tagged_urls
def extract_associated_links(hxs):
    """
    Extract the tagged urls of the 'picture' section of an article.

    Two areas of the div with id 'picture' are inspected:
      * the <a> elements of the 'bloc-01' div: each one is tagged
        'sidebar box', plus the tags LINK_TYPE_TO_TAG associates with the
        first extracted value of the link's class attribute, if any. A link
        without text gets the title 'No Title' and the ghost link tag; a
        link without target gets an empty url and the 'no target' tag.
      * the direct <div> children of the 'wrappAllMedia' div: youtube and
        kewego players are tagged 'embedded' and 'video' (plus 'youtube' or
        'kewego'); images and empty divs are ignored.

    Returns the list of tagged urls.

    Raises ValueError when a media div holds something else than an image,
    a youtube player, a kewego player or nothing.
    """
    def extract_url_and_title(link_hxs):
        hrefs = link_hxs.select('@href').extract()
        # a link without any href attribute is handled like an empty target
        url = hrefs[0] if hrefs else u''
        title = u"".join(link_hxs.select("text()").extract())
        tags = set()
        if not title:
            title = u'No Title'
            tags.add(constants.GHOST_LINK_TAG)
        if not url:
            url = u''
            tags.add('no target')
        return url, title, tags

    all_tagged_urls = []

    links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01']//a")
    for item in links:
        url, title, tags = extract_url_and_title(item)
        tags.update(classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES))

        # The class attribute has to be extracted to be compared with the
        # keys of LINK_TYPE_TO_TAG, and the lookup must use the same value
        # as the membership test (not the whole list of values).
        link_types = item.select('@class').extract()
        if link_types and link_types[0] in LINK_TYPE_TO_TAG:
            tags.update(LINK_TYPE_TO_TAG[link_types[0]])

        tags.add("sidebar box")
        all_tagged_urls.append(make_tagged_url(url, title, tags))

    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='wrappAllMedia']/div")
    for item in media_links:
        if item.select('./img'):
            continue  # images are lame

        youtube_div = item.select(".//div[starts-with(@id, 'media-youtube')]")
        if youtube_div:
            youtube_object = youtube_div.select("./object")
            url = hxs_media_utils.extract_url_from_youtube_object(youtube_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['youtube', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
            continue

        kplayer_div = item.select(".//div[contains(@class, 'emvideo-kewego')]")
        if kplayer_div:
            kplayer_object = kplayer_div.select("./object")
            url = hxs_media_utils.extract_url_from_kplayer_object(kplayer_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['kewego', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
            continue

        # empty divs are lame, anything else is unexpected
        if item.select("./div/text()"):
            raise ValueError("The media box contains something other than an image or a youtube video. Update your parser")

    return all_tagged_urls
def test_links_embedded_kewego_gallery(self):
    """sudinfo parser extracts the gallery link and the kewego video of the article media gallery."""
    expected_links = [
        make_tagged_url(
            "http://portfolio.sudpresse.be/main.php?g2_itemId=992521",
            u"""Une belle après-midi à Bleid""",
            set(['internal', 'sidebar box', 'gallery']),
        ),
        make_tagged_url(
            "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=7b7e2d7a9682&skinKey=a07930e183e6&sig=054c411daa8s&autostart=0&advertise=true",
            u"""__NO_TITLE__""",
            set(['kewego', 'video', 'external', 'embedded']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_embedded_kewego_gallery.html")
    with open(fixture_path) as fixture:
        article, _ = sudinfo.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_kplayer_without_title(self):
    """lesoir_new parser extracts an untitled kplayer video next to the in-text and sidebar links."""
    expected_links = [
        make_tagged_url(
            "http://soirmag.lesoir.be/search/node/gandolfi",
            u"""Dans toutes ses interviews""",
            set(['internal', 'internal site', 'in text']),
        ),
        make_tagged_url(
            "http://soirmag.lesoir.be/search/node/gandolfi",
            u"""Les articles sur Barbara Gandolfi sur SoirMag""",
            set(['internal', 'sidebar box', 'internal site']),
        ),
        make_tagged_url(
            "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=d09800d9f8as&autostart=false&advertise=true",
            u"""__NO_TITLE__""",
            set(['kplayer', 'external', 'embedded', 'top box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "kplayer_without_title.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_embedded_scribble_live(self):
    """lesoir_new parser extracts and tags an embedded scribble live iframe."""
    expected_links = [
        make_tagged_url(
            "http://football.lesoir.be/jupiler-pro-league/resultats",
            u"""Tous les résultats et classements""",
            set(['internal', 'sidebar box', 'internal site']),
        ),
        make_tagged_url(
            "http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346",
            u"""http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346""",
            set(['iframe', 'external', 'embedded']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "embedded_scribble_live.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_intro_type2(self):
    """lesoir_new parser extracts the expected links from an article using the second kind of intro."""
    expected_links = [
        make_tagged_url(
            "http://www.lesoir.be/191377/article/culture/cinema/2013-02-16/berlinale-%C2%ABthe-broken-circle-breakdown%C2%BB-remporte-prix-du-public",
            u"""Berlinale: «The Broken Circle Breakdown» remporte le prix du Public""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            "http://www.youtube.com/watch?v=ZtoCo9pJ2yU",
            u"""http://www.youtube.com/watch?v=ZtoCo9pJ2yU""",
            set(['video', 'external', 'embedded', 'top box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "intro_type2.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_intro_type1(self):
    """lesoir_new parser extracts the expected links from an article using the first kind of intro."""
    expected_links = [
        make_tagged_url(
            "http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle",
            u"""dans un entretien donné aux journaux SudPresse""",
            set(['same owner', 'external', 'in text']),
        ),
        make_tagged_url(
            "http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle",
            u"""Didier Reynders veut mettre nos imams sous contrôle (SudPresse)""",
            set(['sidebar box', 'external', 'same owner']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "intro_type1.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_same_owner_tagging(self):
    """sudpresse parser tags links to sites of the same owner as 'same owner'."""
    expected_links = [
        make_tagged_url(
            "http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml",
            u"""Nos confrères de Nord Eclair France """,
            set(['same owner', 'external', 'in text']),
        ),
        make_tagged_url(
            "http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml",
            u"""Voir sur le site nordeclair.fr""",
            set(['external', 'same owner', 'sidebar box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "same_owner_tagging.html")
    with open(fixture_path) as fixture:
        article, _ = sudpresse.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_links_new_ignore_animated_gifs_in_video_div(self):
    """lavenir [new template] parser ignores <img> elements, animated gifs included, located where a video should have been."""
    expected_links = [
        make_tagged_url(
            "http://www.lavenir.net/filinfo/sports",
            u"""Sports""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "/sports/football/premierleague",
            u"""Premier League""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_animated_gifs_in_video_div.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_new_links_ignore_photosets(self):
    """lavenir [new template] parser ignores photosets and only reports the keyword links."""
    expected_links = [
        make_tagged_url(
            "/sports/cyclisme",
            u"""Cyclisme""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/filinfo/sports",
            u"""Sports""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "new_links_ignore_photosets.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_new_links_special_in_text(self):
    """lavenir [new template] parser extracts in-text links even when they look like they sit in a bolded paragraph."""
    expected_links = [
        make_tagged_url(
            "http://tech.lavenir.net/ge/visite_touristique.kmz",
            u"""Pour découvrir ce parcours en mode 3D avec photographies, cliquez sur ce lien""",
            set(['internal', 'internal site', 'in text']),
        ),
        make_tagged_url(
            "/sports/jogging",
            u"""Jogging""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "new_links_special_in_text.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_intext_links_tagging(self):
    """sudpresse parser tags the links found in the article text as 'in text'."""
    expected_links = [
        make_tagged_url(
            "http://www.lameuse.be/vervietois2011",
            u"""www.lameuse.be/vervietois2011""",
            set(['same owner', 'external', 'in text']),
        ),
        make_tagged_url(
            "http://verviers.lameuse.be",
            u"""http://verviers.lameuse.be""",
            set(['same owner', 'external', 'in text']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "intext_links_tagging.html")
    with open(fixture_path) as fixture:
        article, _ = sudpresse.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_embedded_ustream(self):
    """lesoir_new parser reports an 'unfinished' url-less video next to the in-text link and the kplayer video."""
    expected_links = [
        make_tagged_url(
            "http://www.lesoir.be/187345/article/economie/2013-02-11/gaz-schiste-menace-pour-belgique",
            u"""notre dossier sur le gaz de schiste""",
            set(['internal', 'in text']),
        ),
        make_tagged_url(
            "http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=ed3b67b4053s&autostart=false&advertise=true",
            u"""__NO_TITLE__""",
            set(['kplayer', 'video', 'external', 'embedded', 'top box']),
        ),
        make_tagged_url(
            "__NO_URL__",
            u"""__NO_TITLE__""",
            set(['video', u'unfinished', 'embedded']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "embedded_ustream.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_embedded_video_extraction(self):
    """sudinfo parser extracts and tags the video embedded at the bottom of an article."""
    expected_links = [
        make_tagged_url(
            u"http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds",
            u"""http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds""",
            set(['video', 'external', 'embedded', 'bottom']),
        ),
        make_tagged_url(
            u"/338194/article/regions/tournai/2012-02-29/prostitution-“dodo-la-saumure”-va-demander-l’acquittement-sur-tout-jeudi-devant",
            u"""Prostitution: “Dodo la Saumure” va demander l’acquittement sur tout jeudi devant la justice""",
            set(['internal', 'sidebar box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "embedded_video_extraction.html")
    with open(fixture_path) as fixture:
        article, _ = sudinfo.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_sidebar_box_tagging(self):
    """sudinfo parser extracts the sidebar links of an article and tags them 'sidebar box'."""
    expected_links = [
        make_tagged_url(
            u"/338420/article/sports/foot-belge/2012-02-29/grece-belgique-1-1-les-diables-tiennent-le-nul-a-10-contre-11",
            u"""Grèce - Belgique (1-1): les Diables tiennent le nul à 10 contre 11""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            u"/338862/article/sports/foot-etranger/2012-02-29/foot-amicaux-la-france-surprend-l’italie-decoit-l’argentine-dit-merci-a-messi",
            u"""Foot (amicaux): la France surprend, l’Italie déçoit, l’Argentine dit "merci" à Messi""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            u"/338806/article/sports/foot-belge/2012-02-29/angleterre-belgique-4-0-les-diablotins-encaissent-un-but-d’anthologie-video",
            u"""Angleterre - Belgique (4-0): les Diablotins encaissent un but d’anthologie (vidéo)""",
            set(['internal', 'sidebar box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "sidebar_box_tagging.html")
    with open(fixture_path) as fixture:
        article, _ = sudinfo.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_new_links_soccer_video_from_hungary(self):
    """lavenir [new template] parser extracts an embedded video hosted on videa.hu along with the keyword links."""
    expected_links = [
        make_tagged_url(
            "http://videa.hu/videok/sport/as-roma-3-1-genoa-alessio-romagnoli-francesco-totti-a7Mhqa5118CHtLlG",
            u"""szólj hozzá: AS Roma 3-1 Genoa MATCH HIGHLIGHTS""",
            set(['video', 'external', 'embedded']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/filinfo/sports",
            u"""Sports""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "/sports/football/serie-a",
            u"""Serie A""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "new_links_soccer_video_from_hungary.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_ignore_embedded_images(self):
    """lesoir parser ignores links pointing to an embedded image and only reports the sidebar links."""
    expected_links = [
        make_tagged_url(
            "http://www.lesoir.be/sports/basket/2012-05-11/championnat-ostende-rafle-la-pole-juste-avant-les-playoffs-915188.php",
            u"""Championnat : Ostende rafle la pole juste avant les playoffs""",
            set(['internal', 'sidebar box', 'internal site', 'recent']),
        ),
        make_tagged_url(
            "http://www.lesoir.be/sports/football/2012-05-11/kompany-elu-joueur-de-la-saison-en-angleterre-915181.php",
            u"""Kompany élu Joueur de la saison en Angleterre""",
            set(['internal', 'sidebar box', 'internal site', 'recent']),
        ),
        make_tagged_url(
            "http://www.lesoir.be/actualite/belgique/2012-05-11/collision-de-godinne-les-trains-n-etaient-pas-equipes-du-systeme-europeen-de-securite-915177.php",
            u"""Collision de Godinne : les trains nétaient pas équipés du système européen de sécurité""",
            set(['internal', 'sidebar box', 'internal site', 'recent']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_ignore_embedded_images.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_in_text_same_owner(self):
    """sudinfo parser extracts and tags in-text, sidebar and plaintext links, same owner sites included."""
    expected_links = [
        make_tagged_url(
            "http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php",
            u"""Le Soir.be""",
            set(['same owner', 'external', 'in text']),
        ),
        make_tagged_url(
            "http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php",
            u"""En combien de temps, Eden Hazard gagne votre salaire?""",
            set(['sidebar box', 'external', 'same owner']),
        ),
        make_tagged_url(
            "slate.fr",
            u"""slate.fr""",
            set(['in text', 'plaintext', 'external']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "in_text_same_owner.html")
    with open(fixture_path) as fixture:
        article, _ = sudinfo.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_links_new_ignore_images_in_video_div(self):
    """lavenir [new template] parser ignores <img> elements located where a video should have been."""
    expected_links = [
        make_tagged_url(
            "http://www.lavenir.net/buzz/insolite",
            u"""Insolite""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/filinfo/sports",
            u"""Sports""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "/sports/football",
            u"""Football""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_images_in_video_div.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_links_highlighted_youtube_and_ghost_links(self):
    """lavenir parser extracts the video of the 'highlighted' top section and copes with ghost links."""
    expected_links = [
        make_tagged_url(
            "http://rainn.org/",
            u"""RAINN,""",
            set(['external', 'in text']),
        ),
        make_tagged_url(
            "http://www.youtube.com/embed/KtzqvqzBdUQ",
            u"""http://www.youtube.com/embed/KtzqvqzBdUQ""",
            set(['video', 'external', 'embedded']),
        ),
        make_tagged_url(
            "http://",
            u"""__GHOST_LINK__""",
            set([u'ghost link', 'sidebar box']),
        ),
        make_tagged_url(
            "http://",
            u"""__GHOST_LINK__""",
            set(['bottom box', u'ghost link']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_highlighted_youtube_and_ghost_links.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_embedded_dailymotion_video(self):
    """lesoir_new parser extracts an embedded dailymotion video along with the sidebar and keyword links."""
    expected_links = [
        make_tagged_url(
            u"/156026/article/actualite/monde/2013-01-11/l-etat-d-urgence-été-décrété-au-mali",
            u"""L'Etat d'urgence a été décrété au Mali""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            u"/159828/article/actualite/monde/2013-01-12/raid-raté-en-somalie-confusion-autour-l-otage-français",
            u"""Raid raté en Somalie : confusion autour de l'otage français""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            u"/159960/article/actualite/france/2013-01-12/mali-l’intégralité-du-discours-françois-hollande",
            u"""Mali : l’intégralité du discours de François Hollande""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            u"/tag/mali",
            u"""Mali""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            u"http://www.dailymotion.com/embed/video/xwpmlt",
            u"""http://www.dailymotion.com/embed/video/xwpmlt""",
            set(['video', 'external', 'embedded', 'top box']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "embedded_dailymotion_video.html")
    with open(fixture_path) as fixture:
        article, _ = lesoir_new.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_links_embedded_youtube(self):
    """lavenir parser extracts the links of the youtube videos embedded in the text."""
    expected_links = [
        make_tagged_url(
            "http://www.youtube.com/embed/gS9o1FAszdk",
            u"""http://www.youtube.com/embed/gS9o1FAszdk""",
            set(['iframe', 'external', 'embedded', 'in text']),
        ),
        make_tagged_url(
            "http://www.youtube.com/embed/qMxX-QOV9tI",
            u"""http://www.youtube.com/embed/qMxX-QOV9tI""",
            set(['iframe', 'external', 'embedded', 'in text']),
        ),
        make_tagged_url(
            "http://www.youtube.com/embed/6KUJE2xs-RE",
            u"""http://www.youtube.com/embed/6KUJE2xs-RE""",
            set(['iframe', 'external', 'embedded', 'in text']),
        ),
        make_tagged_url(
            "http://www.youtube.com/embed/uSD4vsh1zDA",
            u"""http://www.youtube.com/embed/uSD4vsh1zDA""",
            set(['iframe', 'external', 'embedded', 'in text']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_embedded_youtube.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_links_new_thinglink(self):
    """lavenir parser extracts an embedded thinglink annotated image along with the sidebar and bottom box links."""
    expected_links = [
        make_tagged_url(
            "//www.thinglink.com/jse/embed.js#288963391949635586",
            u"""__THINGLINK_ANNOTATED_IMAGE__""",
            set(['external', 'embedded', 'annoted image']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189384",
            u"""06/08/12 « Il y a de l’envie sur le terrain »""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189287",
            u"""06/08/12 Albert Gonthier a tout vu""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189384",
            u"""« Il y a de l’envie sur le terrain »""",
            set(['bottom box', 'internal']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189287",
            u"""Albert Gonthier a tout vu""",
            set(['bottom box', 'internal']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "links_new_thinglink.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_new_links_bottom_links(self):
    """lavenir [new template] parser extracts the links to related articles."""
    expected_links = [
        make_tagged_url(
            "/sports/cnt/DMF20130303_00276326",
            u"""Euro indoor de Göteborg: pas de médaille pour Tia Hellebaut, qui ne passe pas 1m92""",
            set(['bottom box', 'internal', 'related']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/diaporamas",
            u"""Diaporamas""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "/sports/athletisme",
            u"""Athlétisme""",
            set(['internal', 'keyword']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/channel/index.aspx?channelid=497",
            u"""Tout sur l'Euro indoor d'athlétisme""",
            set(['internal', 'keyword']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "new_links_bottom_links.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)
def test_in_text_links(self):
    """lavenir parser extracts and tags in-text links without mistaking the end of a sentence for a plaintext link."""
    expected_links = [
        make_tagged_url(
            "http://www.lavenir.net/article/detail.aspx?articleid=DMF20130223_00273017",
            u"""Oscar Pistorius""",
            set(['internal', 'in text']),
        ),
        make_tagged_url(
            "http://www.lavenir.net/sports/cnt/DMF20130222_00272411",
            u"""est sorti libre vendredi après-midi""",
            set(['internal', 'in text']),
        ),
        make_tagged_url(
            "/channel/index.aspx?channelid=490",
            u"""L'athlète Pistorius tue sa compagne: toutes nos infos""",
            set(['internal', 'sidebar box']),
        ),
        make_tagged_url(
            "/channel/index.aspx?channelid=490",
            u"""L'athlète Pistorius tue sa compagne: toutes nos infos""",
            set(['bottom box', 'internal']),
        ),
    ]

    fixture_path = os.path.join(DATA_ROOT, "in_text_links.html")
    with open(fixture_path) as fixture:
        article, _ = lavenir.extract_article_data(fixture)
        assert_taggedURLs_equals(expected_links, article.links)