Example #1
0
def extract_intro(soup):
    """Extract the intro of an article together with the links it contains.

    The intro is looked up as the element whose class is "intro". Its text is
    whatever `utils.remove_text_formatting_markup_from_fragments` produces
    from the <b> fragments of that element.

    Returns a pair `(intro, tagged_urls)`. When there is no intro element,
    the intro is an empty string and the list of tagged urls is empty.
    """
    intro_box = soup.find(attrs={"class": "intro"})
    if not intro_box:
        return "", []

    bold_fragments = intro_box.find_all('b')
    intro = utils.remove_text_formatting_markup_from_fragments(bold_fragments)
    anchors = intro_box.find_all("a")
    titles_and_urls = [extract_title_and_url_from_bslink(anchor) for anchor in anchors]
    stripped_intro = remove_text_formatting_and_links_from_fragments(intro)
    plaintext_urls = utils.extract_plaintext_urls_from_text(stripped_intro)

    tagged_urls = []

    # regular <a> links come first
    for title, url, base_tags in titles_and_urls:
        link_tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        link_tags.update(base_tags)
        link_tags.add('in intro')
        tagged_urls.append(tagging.make_tagged_url(url, title, link_tags))

    # then the urls written as plain text; they double as their own title
    for plain_url in plaintext_urls:
        link_tags = tagging.classify_and_tag(plain_url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
        link_tags.add('in intro')
        link_tags.add('plaintext')
        tagged_urls.append(tagging.make_tagged_url(plain_url, plain_url, link_tags))

    return intro, tagged_urls
Example #2
0
def extract_links_from_text_hxs(hxs):
    """Collect and tag every link found under the given selector.

    Three sources are visited, in this order:
      * every <a> element (tagged 'in text'),
      * urls written as plain text inside <p> text nodes
        (tagged 'plaintext' and 'in text'),
      * the `src` of every <iframe> (tagged 'in text', 'embedded', 'iframe').

    Returns the list of tagged urls.
    """
    tagged_urls = []

    # in-text urls: every <a> below the selector is taken
    for anchor_hxs in hxs.select(".//a"):
        title, url = extract_title_and_url(anchor_hxs)
        link_tags = classify_and_tag(url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        link_tags.add('in text')
        tagged_urls.append(make_tagged_url(url, title, link_tags))

    # plaintext urls; a falsy result simply means there is nothing to scan
    paragraphs = hxs.select(".//p/text()").extract()
    for paragraph in paragraphs or ():
        stripped = remove_text_formatting_and_links_from_fragments(paragraph)
        for plain_url in extract_plaintext_urls_from_text(stripped):
            link_tags = classify_and_tag(plain_url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
            link_tags.update(['plaintext', 'in text'])
            tagged_urls.append(make_tagged_url(plain_url, plain_url, link_tags))

    # embedded objects
    for iframe_url in hxs.select(".//iframe/@src").extract():
        base_tags = classify_and_tag(iframe_url, LAVENIR_NETLOC, LAVENIR_INTERNAL_BLOGS)
        iframe_tags = base_tags.union(['in text', 'embedded', 'iframe'])
        tagged_urls.append(make_tagged_url(iframe_url, iframe_url, iframe_tags))

    return tagged_urls
Example #3
0
def extract_links_from_sidebar_box(soup):
    """Extract and tag the links found in the related-components sidebar box.

    The box is the element whose class is
    "teas_article_306 mar10 clear clearfix relatedcomponents". Two kinds of
    links are collected from it:
      * links to articles, taken from the first "clearfix" container of the
        box (tagged 'sidebar box'),
      * links to thematic tags, taken from every "bt_meer_over clearfix"
        container (tagged 'keyword' and 'sidebar box').

    Returns the list of tagged urls; the list is empty when the box is
    missing or contains none of those containers.
    """
    tagged_urls = []
    sidebar_box = soup.find(attrs={"class": "teas_article_306 mar10 clear clearfix relatedcomponents"})
    if not sidebar_box:
        return tagged_urls

    # there are links to articles; a box without any "clearfix" container
    # contributes no article links instead of raising IndexError
    articles = sidebar_box.find_all(attrs={"class": "clearfix"})
    if articles:
        for link in articles[0].find_all("a"):
            title, url, base_tags = extract_title_and_url_from_bslink(link)
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    # and also links to thematic tags
    keyword_boxes = sidebar_box.find_all(attrs={"class": "bt_meer_over clearfix"})
    for keyword_box in keyword_boxes:
        for link in keyword_box.find_all("a"):
            title, url, base_tags = extract_title_and_url_from_bslink(link)
            tags = tagging.classify_and_tag(url, SEPTSURSEPT_NETLOC, SEPTSURSEPT_INTERNAL_SITES)
            tags.update(base_tags)
            tags.add('keyword')
            tags.add('sidebar box')
            tagged_urls.append(tagging.make_tagged_url(url, title, tags))

    return tagged_urls
Example #4
0
def extract_tagged_url_from_embedded_script(script, site_netloc, site_internal_sites):
    """Build a tagged url from an embedded <script> element.

    Twitter widget scripts are resolved through
    `twitter_utils.get_widget_type`; any other script must be followed by a
    <noscript> sibling containing an <a>, whose href and contents provide
    the url and title. In both cases the result is tagged 'script' and
    'embedded' on top of the classification tags.

    Raises ValueError when the script has no `src`, when there is no
    <noscript> sibling, or when that sibling holds no link.
    """
    script_url = script.get('src')
    if not script_url:
        raise ValueError("Embedded script of unknown type was detected. Update the parser.")

    if twitter_utils.is_twitter_widget_url(script_url):
        # sometimes the TWTR.Widget code is in the next <script> container. Whee.
        widget_holder = script if script.contents else script.findNextSibling('script')
        title, url, tags = twitter_utils.get_widget_type(widget_holder.contents[0])
        tags |= classify_and_tag(url, site_netloc, site_internal_sites)
        tags |= set(['script', 'embedded'])
        return make_tagged_url(url, title, tags)

    noscript = script.findNextSibling('noscript')
    if not noscript:
        raise ValueError("Could not extract fallback noscript url for this embedded javascript object. Update the parser.")

    fallback_link = noscript.find('a')
    if not fallback_link:
        raise ValueError("No link was found in the <noscript> section. Update the parser.")

    url = fallback_link.get('href')
    title = remove_text_formatting_markup_from_fragments(fallback_link.contents)
    all_tags = classify_and_tag(url, site_netloc, site_internal_sites)
    all_tags |= set(['script', 'embedded'])
    return make_tagged_url(url, title, all_tags)
Example #5
0
def extract_text_content_and_links(soup):
    """Extract the text of the article body and the links found in it.

    The body is the element whose class is "article-body". Text and
    plaintext urls are taken from its <p> elements only; <a> links are taken
    from the <p> elements and from the inline-styled <h2> elements.

    Returns a pair `(text, tagged_urls)`. `text` is a list with one entry per
    <p>, or an empty unicode string when the body holds neither <p> nor
    matching <h2> elements. Plaintext urls come before <a> links in
    `tagged_urls`.
    """
    article_body = soup.find(attrs={"class": "article-body"})
    paragraphs = article_body.find_all("p")
    inline_headings = article_body.find_all("h2", {"style": "display: inline; font-size: 1em; padding: 0px; margin: 0px;"})
    all_fragments = paragraphs + inline_headings

    tagged_urls = []

    if not all_fragments:
        text = u""
    else:
        text = []
        for paragraph in paragraphs:
            text.append(u"".join(remove_text_formatting_markup_from_fragments(paragraph)))
            stripped = remove_text_formatting_and_links_from_fragments(paragraph)
            for plain_url in extract_plaintext_urls_from_text(stripped):
                plain_tags = tagging.classify_and_tag(plain_url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
                plain_tags.update(['plaintext', 'in text'])
                tagged_urls.append(tagging.make_tagged_url(plain_url, plain_url, plain_tags))

    # <a> elements are gathered from paragraphs and headings alike
    inline_links = [anchor for fragment in all_fragments for anchor in fragment.find_all("a")]
    titles_and_urls = [extract_title_and_url_from_bslink(anchor) for anchor in inline_links]

    for title, url, base_tags in titles_and_urls:
        link_tags = tagging.classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_SITES)
        link_tags.update(base_tags)
        link_tags.add('in text')
        tagged_urls.append(tagging.make_tagged_url(url, title, link_tags))

    return text, tagged_urls
Example #6
0
def extract_links_from_embedded_content(story):
    """Extract and tag the links of the objects embedded in a story.

    Three kinds of embedded content are handled:
      * every <iframe>: its `src` is tagged 'embedded' and 'iframe',
      * every <script> served from storify.com: its `src`, minus a trailing
        ".js" extension, is tagged 'embedded' and 'storify',
      * the kplayer found in the "containerKplayer" div: its url is rebuilt
        from the flash object's `data` attribute and `flashVars` parameter,
        and tagged 'embedded' and 'kplayer'.

    Returns the list of tagged urls.

    Raises ValueError when the kplayer is present but one of the two parts
    of its url is None.
    """
    tagged_urls = []

    # generic iframes
    for iframe in story.findAll("iframe", recursive=True):
        url = iframe.get('src')
        all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
        tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'iframe'])))

    # extract embedded storify
    for script in story.findAll('script', recursive=True):
        url = script.get('src')
        if not url:
            continue
        netloc = urlparse.urlparse(url).netloc
        if netloc == "storify.com":
            # Remove the ".js" extension only. str.rstrip(".js") would strip
            # every trailing '.', 'j' or 's' character and eat into the url
            # itself (e.g. ".../news.js" would become ".../new").
            if url.endswith(".js"):
                url = url[:-len(".js")]
            all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'storify'])))

    # TODO: reconstruct kplayer URL
    kplayer = story.find('div', {'class': 'containerKplayer'})
    if kplayer:
        kplayer_flash = kplayer.find('div', {'class': 'flash_kplayer'})
        url_part1 = kplayer_flash.object['data']
        url_part2 = kplayer_flash.object.find('param', {'name': 'flashVars'})['value']
        if url_part1 is not None and url_part2 is not None:
            url = "%s?%s" % (url_part1, url_part2)
            all_tags = classify_and_tag(url, LESOIR_NETLOC, LESOIR_INTERNAL_BLOGS)
            tagged_urls.append(make_tagged_url(url, url, all_tags | set(['embedded', 'kplayer'])))
        else:
            raise ValueError("We couldn't find an URL in the flash player. Update the parser.")

    return tagged_urls
Example #7
0
def extract_associated_links(hxs):
    """Extract and tag the links and media associated with an article.

    Two areas of the 'picture' div are visited:
      * the <a> elements of the 'bloc-01' divs, tagged 'sidebar box' plus
        whatever LINK_TYPE_TO_TAG maps their class attribute to,
      * the children of the 'wrappAllMedia' div: youtube and kewego videos
        are tagged 'embedded' and 'video'; images and empty divs are skipped.

    Returns the list of tagged urls.

    Raises ValueError when the media box holds something that is neither an
    image, a youtube video, a kewego video nor an empty div.
    """
    def extract_url_and_title(link_hxs):
        # An <a> without href yields an empty result list: treat it as a
        # link without target instead of failing on the [0] lookup.
        hrefs = link_hxs.select('@href').extract()
        url = hrefs[0] if hrefs else u''
        title = u"".join(link_hxs.select("text()").extract())

        tags = set()
        if not title:
            title = u'No Title'
            tags.add(constants.GHOST_LINK_TAG)
        if not url:
            url = u''
            tags.add('no target')
        return url, title, tags

    all_tagged_urls = []

    links = hxs.select("//div[@id='picture']/descendant::div[@class='bloc-01']//a")
    for item in links:
        url, title, tags = extract_url_and_title(item)
        tags.update(classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES))
        # The class attribute has to be extracted before it can be looked up:
        # a selector object never matches a mapping key, and the lookup must
        # use the same value as the membership test.
        link_types = item.select('@class').extract()
        if link_types and link_types[0] in LINK_TYPE_TO_TAG:
            tags.update(LINK_TYPE_TO_TAG[link_types[0]])

        tags.add("sidebar box")

        all_tagged_urls.append(make_tagged_url(url, title, tags))

    media_links = hxs.select("//div[@id='picture']/descendant::div[@class='wrappAllMedia']/div")

    for item in media_links:
        if item.select('./img'):
            continue  # images are ignored

        youtube_div = item.select(".//div[starts-with(@id, 'media-youtube')]")
        if youtube_div:
            youtube_object = youtube_div.select("./object")
            url = hxs_media_utils.extract_url_from_youtube_object(youtube_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['youtube', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
            continue

        kplayer_div = item.select(".//div[contains(@class, 'emvideo-kewego')]")
        if kplayer_div:
            kplayer_object = kplayer_div.select("./object")
            url = hxs_media_utils.extract_url_from_kplayer_object(kplayer_object)
            tags = classify_and_tag(url, SUDINFO_OWN_NETLOC, SUDINFO_INTERNAL_SITES)
            tags |= set(['kewego', 'embedded', 'video'])
            title = parser_constants.NO_TITLE
            all_tagged_urls.append(make_tagged_url(url, title, tags))
            continue

        # empty divs are ignored; anything else is unknown content
        if item.select("./div/text()"):
            raise ValueError("The media box contains something other than an image or a youtube video. Update your parser")

    return all_tagged_urls
Example #8
0
 def test_links_embedded_kewego_gallery(self):
     """ sudinfo parser extracts kewego videos from the article media gallery"""
     expected_links = [
         make_tagged_url("http://portfolio.sudpresse.be/main.php?g2_itemId=992521", u"""Une belle après-midi à Bleid""", {'internal', 'sidebar box', 'gallery'}),
         make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=7b7e2d7a9682&skinKey=a07930e183e6&sig=054c411daa8s&autostart=0&advertise=true", u"""__NO_TITLE__""", {'kewego', 'video', 'external', 'embedded'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_embedded_kewego_gallery.html")
     with open(fixture_path) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #9
0
 def test_kplayer_without_title(self):
     """ lesoir_new parser gives an embedded kplayer the __NO_TITLE__ placeholder title"""
     expected_links = [
         make_tagged_url("http://soirmag.lesoir.be/search/node/gandolfi", u"""Dans toutes ses interviews""", {'internal', 'internal site', 'in text'}),
         make_tagged_url("http://soirmag.lesoir.be/search/node/gandolfi", u"""Les articles sur Barbara Gandolfi sur SoirMag""", {'internal', 'sidebar box', 'internal site'}),
         make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=d09800d9f8as&autostart=false&advertise=true", u"""__NO_TITLE__""", {'kplayer', 'external', 'embedded', 'top box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "kplayer_without_title.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #10
0
 def test_embedded_scribble_live(self):
     """ lesoir_new parser extracts and tags an embedded scribble live """
     expected_links = [
         make_tagged_url("http://football.lesoir.be/jupiler-pro-league/resultats", u"""Tous les résultats et classements""", {'internal', 'sidebar box', 'internal site'}),
         make_tagged_url("http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346", u"""http://embed.scribblelive.com/Embed/v5.aspx?Id=86477&ThemeId=7346""", {'iframe', 'external', 'embedded'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "embedded_scribble_live.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #11
0
 def test_intro_type2(self):
     """ lesoir_new parser handles the second type of intro"""
     expected_links = [
         make_tagged_url("http://www.lesoir.be/191377/article/culture/cinema/2013-02-16/berlinale-%C2%ABthe-broken-circle-breakdown%C2%BB-remporte-prix-du-public", u"""Berlinale: «The Broken Circle Breakdown» remporte le prix du Public""", {'internal', 'sidebar box'}),
         make_tagged_url("http://www.youtube.com/watch?v=ZtoCo9pJ2yU", u"""http://www.youtube.com/watch?v=ZtoCo9pJ2yU""", {'video', 'external', 'embedded', 'top box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "intro_type2.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #12
0
 def test_intro_type1(self):
     """ lesoir_new parser handles the first type of intro"""
     expected_links = [
         make_tagged_url("http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle", u"""dans un entretien donné aux journaux SudPresse""", {'same owner', 'external', 'in text'}),
         make_tagged_url("http://www.lacapitale.be/674531/article/actualite/politique/2013-03-01/didier-reynders-veut-mettre-nos-imams-sous-controle", u"""Didier Reynders veut mettre nos imams sous contrôle (SudPresse)""", {'sidebar box', 'external', 'same owner'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "intro_type1.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #13
0
 def test_same_owner_tagging(self):
     """ sudpresse parser tags 'same owner' links """
     expected_links = [
         make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Nos confrères de Nord Eclair France """, {'same owner', 'external', 'in text'}),
         make_tagged_url("http://www.nordeclair.fr/Actualite/Depeches/2012/02/13/dujardin-a-lille.shtml", u"""Voir sur le site nordeclair.fr""", {'external', 'same owner', 'sidebar box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "same_owner_tagging.html")
     with open(fixture_path) as html_file:
         article, _ = sudpresse.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #14
0
 def test_links_new_ignore_animated_gifs_in_video_div(self):
     """ lavenir [new template] parser ignores <img> elements located where a video should have been, animated gif files included."""
     expected_links = [
         make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
         make_tagged_url("/sports/football/premierleague", u"""Premier League""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_animated_gifs_in_video_div.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #15
0
 def test_new_links_ignore_photosets(self):
     """ lavenir [new template] parser skips photosets"""
     expected_links = [
         make_tagged_url("/sports/cyclisme", u"""Cyclisme""", {'internal', 'keyword'}),
         make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "new_links_ignore_photosets.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #16
0
 def test_new_links_special_in_text(self):
     """ lavenir [new template] parser extracts in-text links, even when they look like they are located in a bolded paragraph"""
     expected_links = [
         make_tagged_url("http://tech.lavenir.net/ge/visite_touristique.kmz", u"""Pour découvrir ce parcours en mode 3D avec photographies, cliquez sur ce lien""", {'internal', 'internal site', 'in text'}),
         make_tagged_url("/sports/jogging", u"""Jogging""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "new_links_special_in_text.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #17
0
 def test_intext_links_tagging(self):
     """ sudpresse parser tags 'in text' links."""
     expected_links = [
         make_tagged_url("http://www.lameuse.be/vervietois2011", u"""www.lameuse.be/vervietois2011""", {'same owner', 'external', 'in text'}),
         make_tagged_url("http://verviers.lameuse.be", u"""http://verviers.lameuse.be""", {'same owner', 'external', 'in text'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "intext_links_tagging.html")
     with open(fixture_path) as html_file:
         article, _ = sudpresse.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #18
0
 def test_embedded_ustream(self):
     """ check the links lesoir_new extracts from the embedded_ustream fixture"""
     expected_links = [
         make_tagged_url("http://www.lesoir.be/187345/article/economie/2013-02-11/gaz-schiste-menace-pour-belgique", u"""notre dossier sur le gaz de schiste""", {'internal', 'in text'}),
         make_tagged_url("http://sll.kewego.com/swf/p3/epix.swf?language_code=fr&playerKey=5ff3260def2a&skinKey=6624e00d250s&sig=ed3b67b4053s&autostart=false&advertise=true", u"""__NO_TITLE__""", {'kplayer', 'video', 'external', 'embedded', 'top box'}),
         make_tagged_url("__NO_URL__", u"""__NO_TITLE__""", {'video', u'unfinished', 'embedded'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "embedded_ustream.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #19
0
 def test_embedded_video_extraction(self):
     """ sudinfo parser extracts and tags the embedded video at the bottom of an article. """
     expected_links = [
         make_tagged_url(u"http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds", u"""http://api.kewego.com/video/getHTML5Thumbnail/?playerKey=7b7e2d7a9682&sig=5a5a3d9f57ds""", {'video', 'external', 'embedded', 'bottom'}),
         make_tagged_url(u"/338194/article/regions/tournai/2012-02-29/prostitution-“dodo-la-saumure”-va-demander-l’acquittement-sur-tout-jeudi-devant", u"""Prostitution: “Dodo la Saumure” va demander l’acquittement sur tout jeudi devant la justice""", {'internal', 'sidebar box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "embedded_video_extraction.html")
     with open(fixture_path) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #20
0
 def test_sidebar_box_tagging(self):
     """ sudinfo parser extracts and tags the sidebar links of an article. """
     expected_links = [
         make_tagged_url(u"/338420/article/sports/foot-belge/2012-02-29/grece-belgique-1-1-les-diables-tiennent-le-nul-a-10-contre-11", u"""Grèce - Belgique (1-1): les Diables tiennent le nul à 10 contre 11""", {'internal', 'sidebar box'}),
         make_tagged_url(u"/338862/article/sports/foot-etranger/2012-02-29/foot-amicaux-la-france-surprend-l’italie-decoit-l’argentine-dit-merci-a-messi", u"""Foot (amicaux): la France surprend, l’Italie déçoit, l’Argentine dit "merci" à Messi""", {'internal', 'sidebar box'}),
         make_tagged_url(u"/338806/article/sports/foot-belge/2012-02-29/angleterre-belgique-4-0-les-diablotins-encaissent-un-but-d’anthologie-video", u"""Angleterre - Belgique (4-0): les Diablotins encaissent un but d’anthologie (vidéo)""", {'internal', 'sidebar box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "sidebar_box_tagging.html")
     with open(fixture_path) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #21
0
 def test_new_links_soccer_video_from_hungary(self):
     """ lavenir [new template] parser copes with hungarian soccer videos"""
     expected_links = [
         make_tagged_url("http://videa.hu/videok/sport/as-roma-3-1-genoa-alessio-romagnoli-francesco-totti-a7Mhqa5118CHtLlG", u"""szólj hozzá: AS Roma 3-1 Genoa MATCH HIGHLIGHTS""", {'video', 'external', 'embedded'}),
         make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
         make_tagged_url("/sports/football/serie-a", u"""Serie A""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "new_links_soccer_video_from_hungary.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #22
0
 def test_ignore_embedded_images(self):
     """ lesoir parser skips links that point to an embedded image."""
     expected_links = [
         make_tagged_url("http://www.lesoir.be/sports/basket/2012-05-11/championnat-ostende-rafle-la-pole-juste-avant-les-playoffs-915188.php", u"""Championnat : Ostende rafle la pole juste avant les playoffs""", {'internal', 'sidebar box', 'internal site', 'recent'}),
         make_tagged_url("http://www.lesoir.be/sports/football/2012-05-11/kompany-elu-joueur-de-la-saison-en-angleterre-915181.php", u"""Kompany élu Joueur de la saison en Angleterre""", {'internal', 'sidebar box', 'internal site', 'recent'}),
         make_tagged_url("http://www.lesoir.be/actualite/belgique/2012-05-11/collision-de-godinne-les-trains-n-etaient-pas-equipes-du-systeme-europeen-de-securite-915177.php", u"""Collision de Godinne : les trains n’étaient pas équipés du système européen de sécurité""", {'internal', 'sidebar box', 'internal site', 'recent'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_ignore_embedded_images.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #23
0
 def test_in_text_same_owner(self):
     """ sudinfo parser extracts and tags in-text and sidebar links to same-owner sites."""
     expected_links = [
         make_tagged_url("http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php", u"""Le Soir.be""", {'same owner', 'external', 'in text'}),
         make_tagged_url("http://www.lesoir.be/sports/football/2012-05-31/en-combien-de-temps-hazard-gagne-t-il-votre-salaire-918967.php", u"""En combien de temps, Eden Hazard gagne votre salaire?""", {'sidebar box', 'external', 'same owner'}),
         make_tagged_url("slate.fr", u"""slate.fr""", {'in text', 'plaintext', 'external'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "in_text_same_owner.html")
     with open(fixture_path) as html_file:
         article, _ = sudinfo.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #24
0
 def test_links_new_ignore_images_in_video_div(self):
     """ lavenir [new template] parser ignores <img> elements located where a video should have been"""
     expected_links = [
         make_tagged_url("http://www.lavenir.net/buzz/insolite", u"""Insolite""", {'internal', 'keyword'}),
         make_tagged_url("http://www.lavenir.net/filinfo/sports", u"""Sports""", {'internal', 'keyword'}),
         make_tagged_url("/sports/football", u"""Football""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_new_ignore_images_in_video_div.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #25
0
 def test_links_highlighted_youtube_and_ghost_links(self):
     """ lavenir parser extracts links to videos in the 'highlighted' top section, and handles ghost links."""
     expected_links = [
         make_tagged_url("http://rainn.org/", u"""RAINN,""", {'external', 'in text'}),
         make_tagged_url("http://www.youtube.com/embed/KtzqvqzBdUQ", u"""http://www.youtube.com/embed/KtzqvqzBdUQ""", {'video', 'external', 'embedded'}),
         make_tagged_url("http://", u"""__GHOST_LINK__""", {u'ghost link', 'sidebar box'}),
         make_tagged_url("http://", u"""__GHOST_LINK__""", {'bottom box', u'ghost link'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_highlighted_youtube_and_ghost_links.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #26
0
 def test_embedded_dailymotion_video(self):
     """ lesoir_new parser extracts an embedded dailymotion video along with sidebar and keyword links"""
     expected_links = [
         make_tagged_url(u"/156026/article/actualite/monde/2013-01-11/l-etat-d-urgence-été-décrété-au-mali", u"""L'Etat d'urgence a été décrété au Mali""", {'internal', 'sidebar box'}),
         make_tagged_url(u"/159828/article/actualite/monde/2013-01-12/raid-raté-en-somalie-confusion-autour-l-otage-français", u"""Raid raté en Somalie : confusion autour de l'otage français""", {'internal', 'sidebar box'}),
         make_tagged_url(u"/159960/article/actualite/france/2013-01-12/mali-l’intégralité-du-discours-françois-hollande", u"""Mali : l’intégralité du discours de François Hollande""", {'internal', 'sidebar box'}),
         make_tagged_url(u"/tag/mali", u"""Mali""", {'internal', 'keyword'}),
         make_tagged_url(u"http://www.dailymotion.com/embed/video/xwpmlt", u"""http://www.dailymotion.com/embed/video/xwpmlt""", {'video', 'external', 'embedded', 'top box'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "embedded_dailymotion_video.html")
     with open(fixture_path) as html_file:
         article, _ = lesoir_new.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #27
0
 def test_links_embedded_youtube(self):
     """ lavenir parser extracts the links of embedded youtube videos"""
     expected_links = [
         make_tagged_url("http://www.youtube.com/embed/gS9o1FAszdk", u"""http://www.youtube.com/embed/gS9o1FAszdk""", {'iframe', 'external', 'embedded', 'in text'}),
         make_tagged_url("http://www.youtube.com/embed/qMxX-QOV9tI", u"""http://www.youtube.com/embed/qMxX-QOV9tI""", {'iframe', 'external', 'embedded', 'in text'}),
         make_tagged_url("http://www.youtube.com/embed/6KUJE2xs-RE", u"""http://www.youtube.com/embed/6KUJE2xs-RE""", {'iframe', 'external', 'embedded', 'in text'}),
         make_tagged_url("http://www.youtube.com/embed/uSD4vsh1zDA", u"""http://www.youtube.com/embed/uSD4vsh1zDA""", {'iframe', 'external', 'embedded', 'in text'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_embedded_youtube.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #28
0
 def test_links_new_thinglink(self):
     """ lavenir parser extracts an embedded thinglink annotated image together with sidebar and bottom box links"""
     expected_links = [
         make_tagged_url("//www.thinglink.com/jse/embed.js#288963391949635586", u"""__THINGLINK_ANNOTATED_IMAGE__""", {'external', 'embedded', 'annoted image'}),
         make_tagged_url("http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189384", u"""06/08/12 « Il y a de l’envie sur le terrain »""", {'internal', 'sidebar box'}),
         make_tagged_url("http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189287", u"""06/08/12 Albert Gonthier a tout vu""", {'internal', 'sidebar box'}),
         make_tagged_url("http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189384", u"""« Il y a de l’envie sur le terrain »""", {'bottom box', 'internal'}),
         make_tagged_url("http://www.lavenir.net/article/detail.aspx?articleid=DMF20120806_00189287", u"""Albert Gonthier a tout vu""", {'bottom box', 'internal'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "links_new_thinglink.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #29
0
 def test_new_links_bottom_links(self):
     """ lavenir [new template] parser extracts the links to related articles"""
     expected_links = [
         make_tagged_url("/sports/cnt/DMF20130303_00276326", u"""Euro indoor de Göteborg: pas de médaille pour Tia Hellebaut, qui ne passe pas 1m92""", {'bottom box', 'internal', 'related'}),
         make_tagged_url("http://www.lavenir.net/diaporamas", u"""Diaporamas""", {'internal', 'keyword'}),
         make_tagged_url("/sports/athletisme", u"""Athlétisme""", {'internal', 'keyword'}),
         make_tagged_url("http://www.lavenir.net/channel/index.aspx?channelid=497", u"""Tout sur l'Euro indoor d'athlétisme""", {'internal', 'keyword'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "new_links_bottom_links.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)
Example #30
0
 def test_in_text_links(self):
     """ lavenir parser extracts and tags in-text links and does not mistakenly extract the end of a sentence as a plaintext link"""
     expected_links = [
         make_tagged_url("http://www.lavenir.net/article/detail.aspx?articleid=DMF20130223_00273017", u"""Oscar Pistorius""", {'internal', 'in text'}),
         make_tagged_url("http://www.lavenir.net/sports/cnt/DMF20130222_00272411", u"""est sorti libre vendredi après-midi""", {'internal', 'in text'}),
         make_tagged_url("/channel/index.aspx?channelid=490", u"""L'athlète Pistorius tue sa compagne: toutes nos infos""", {'internal', 'sidebar box'}),
         make_tagged_url("/channel/index.aspx?channelid=490", u"""L'athlète Pistorius tue sa compagne: toutes nos infos""", {'bottom box', 'internal'}),
     ]
     fixture_path = os.path.join(DATA_ROOT, "in_text_links.html")
     with open(fixture_path) as html_file:
         article, _ = lavenir.extract_article_data(html_file)
         assert_taggedURLs_equals(expected_links, article.links)