Example #1
    def test_tidy_url(self):

        data = [
                ("http://menmedia.co.uk/asiannews/news/crime/s/1420665_man-wanted-in-connection-with-robbery-and-assault?rss=yes",
                    "http://menmedia.co.uk/asiannews/news/crime/s/1420665_man-wanted-in-connection-with-robbery-and-assault"),
                ("http://www.belfasttelegraph.co.uk/news/health/diabetes-lsquocan-be-reversed-through-low-calorie-dietrsquo-16015584.html?r=RSS",
                    "http://www.belfasttelegraph.co.uk/news/health/diabetes-lsquocan-be-reversed-through-low-calorie-dietrsquo-16015584.html"),
                ("http://nocruft.com/wibble-pibble","http://nocruft.com/wibble-pibble"),
            ]

        for url,tidied in data:
            self.assertEqual(ScraperUtils.tidy_url(url), tidied)
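
These cases pin down tidy_url's contract: tracking cruft like ?rss=yes and ?r=RSS is stripped, while already-clean urls pass through untouched. A minimal sketch of a helper that satisfies them, assuming Python 2's urlparse module (the real ScraperUtils.tidy_url may be more selective about what it strips):

    import urlparse

    def tidy_url(url):
        """Strip params, query and fragment from a url (sketch)."""
        o = urlparse.urlparse(url)
        return urlparse.urlunparse((o.scheme, o.netloc, o.path, '', '', ''))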
Example #2
    def testCanonicalURLs(self):
        # html, base_url, expected
        snippets = [ ('<head><link rel="canonical" href="http://example.com/products" /></head>', "", "http://example.com/products" ),
            ('<head><link href="http://example.com/products" rel="canonical" /></head>', "", "http://example.com/products"),
            ("""<HEAD><LINK foo="wibble"
                HRef ="http://example.com/products" class="pibble"
                REL = "canonical" /   ></HEAD>""", "", "http://example.com/products"),
            ('<head><meta property="og:url" content="http://www.imdb.com/title/tt0117500/" /></head>', "", 'http://www.imdb.com/title/tt0117500/'),
            # test relative url
            ('<head><meta property="og:url" content="/title/tt0117500/" /></head>', "http://www.imdb.com/title/tt0117500/", 'http://www.imdb.com/title/tt0117500/'),
            # a live BBC example:
#            (urllib2.urlopen('http://www.bbc.co.uk/news/world-africa-13058694').read(), 'http://www.bbc.co.uk/news/world-africa-13058694'),
            # and one from the mirror:
#          (urllib2.urlopen('http://www.mirror.co.uk/news/top-stories/2011/05/11/william-and-kate-to-get-around-on-old-bikes-during-their-luxury-honeymoon-115875-23121689/').read(),
#                "http://www.mirror.co.uk/news/royal-wedding/2011/05/11/royal-honeymoon-prince-william-and-kate-middleton-to-get-around-seychelles-island-on-rickety-old-bikes-115875-23121689/" ),
            ]
        for html,base_url,expected in snippets:
            got = ScraperUtils.extract_canonical_url(html,base_url)
            self.assertEqual(got,expected)
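
A minimal sketch of an extract_canonical_url that would satisfy these cases, assuming lxml is available (the real ScraperUtils version is likely more defensive): lxml's HTML parser lowercases tag and attribute names, which is what lets the shouty <LINK ... REL="canonical"> case match, and urljoin handles the relative og:url case.

    import urlparse
    import lxml.html

    def extract_canonical_url(html, base_url):
        """Return the canonical url declared in html, or None (sketch)."""
        doc = lxml.html.fromstring(html)
        # prefer an explicit <link rel="canonical">, fall back to og:url
        hrefs = doc.xpath('//link[@rel="canonical"]/@href')
        if not hrefs:
            hrefs = doc.xpath('//meta[@property="og:url"]/@content')
        if not hrefs:
            return None
        # resolve relative urls against the page's own url
        return urlparse.urljoin(base_url, hrefs[0])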
Example #3
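    # itemprop~= matches one token in a whitespace-separated list, so this
    # finds elements marked up with schema.org articleBody or reviewBody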
    body_div = article.cssselect('[itemprop~="articleBody"], [itemprop~="reviewBody"]')[0]

    # cruft removal
    for cruft in body_div.cssselect('.inline-pipes-list, #gigya-share-btns-2'):
        cruft.drop_tree()

    art['content'] = ukmedia.SanitiseHTML(unicode(lxml.html.tostring(body_div)))
    art['description'] = ukmedia.FirstPara( art['content'] )
    art['srcorgname'] = u'independent'


    return art




def ContextFromURL( url ):
    """Build up an article scrape context from a bare url."""
#    url = TidyURL(url)
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcorgname'] = u'independent'
    context['lastseen'] = datetime.now()
    return context

if __name__ == "__main__":
    ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=150 )


Example #4
    m = art_idpat.search( url )
    if m:
        return 'ft_' + m.group(1)
    m = blog_idpat.search( url )
    if m:
        return 'ftblog_' + m.group(1)

    return None
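
(The art_idpat and blog_idpat regexes are defined elsewhere in the FT scraper. Purely to illustrate the shape CalcSrcID relies on - a compiled pattern whose group(1) captures the unique id - a hypothetical pair might look like:)

    import re

    # hypothetical patterns - not the scraper's real ones
    art_idpat = re.compile(r'ft\.com/.*/([0-9a-f-]+)\.html')
    blog_idpat = re.compile(r'blogs\.ft\.com/.*?/(\d+)/?$')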


def ContextFromURL( url ):
    """Build up an article scrape context from a bare url."""
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcid'] = CalcSrcID( url )
    context['srcorgname'] = u'ft'
    context['lastseen'] = datetime.now()

    # to clean the url...
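    # ScrubFunc takes (context, rss_entry); passing None for the entry is
    # fine here, and a None return means the article should be suppressed
    # (see the skynews ScrubFunc fragment in the next example)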
    context = ScrubFunc( context, None )
    return context





if __name__ == "__main__":
    ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=50, prep=Prep )

Example #5
    srcid = CalcSrcID( context['srcurl'] )
    if not srcid:
        return None # suppress it
    context['srcid'] = srcid
    return context


def FindArticles(sesh):
    """ get a set of articles to scrape from the rss feeds """

    articles = ScraperUtils.FindArticlesFromRSS( blog_feeds, u'skynews', ScrubFunc )
    return articles
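
blog_feeds itself is defined elsewhere in the scraper; from the call above it is presumably an iterable of rss feed urls. A purely hypothetical stand-in, just to make the shape concrete:

    # hypothetical - the real feed list is defined elsewhere
    blog_feeds = [
        'http://news.sky.com/feeds/rss/home.xml',
    ]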


def ContextFromURL( url ):
    """Build up an article scrape context from a bare url."""
    # NOTE: urls from the rss feed have a couple of extra components which
    # we _could_ strip out here...
    context = {}
    context['permalink'] = url
    context['srcurl'] = url
    context['srcid'] = CalcSrcID( url )
#    context['srcorgname'] = u'skynews'
    context['lastseen'] = datetime.now()
    return context


if __name__ == "__main__":
    ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract )

Example #6
    return art


def TidyURL( url ):
    """ Tidy up URL - trim off params, query, fragment... """
    o = urlparse.urlparse( url )
    url = urlparse.urlunparse( (o[0],o[1],o[2],'','','') )
    return url
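
For example (illustrative url, not necessarily a live Times link):

    >>> TidyURL('http://www.thetimes.co.uk/tto/news/article123.ece?shareToken=abc#comments')
    'http://www.thetimes.co.uk/tto/news/article123.ece'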

def ContextFromURL( url ):
    """Build up an article scrape context from a bare url."""
    url = TidyURL(url)
    context = {}
    context['srcurl'] = url
    context['permalink'] = url
    context['srcorgname'] = u'times'
    context['lastseen'] = datetime.now()
    return context



if __name__ == "__main__":
    # create a url opener which remembers cookies (as well as throttling and all the other uber-opener stuff)
    cj = cookielib.LWPCookieJar()
    opener = ScraperUtils.build_uber_opener(cookiejar=cj)

    # large maxerrors to handle video-only pages
    ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=200, prep=Prep, sesh=opener )
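

Every example above funnels into ScraperUtils.scraper_main, whose implementation isn't shown in this listing. The skeleton below is only a rough sketch of the contract its call sites imply - find_articles() yields scrape contexts, context_from_url() builds one from a bare url, extract() turns fetched html plus a context into an article - with the command-line handling, prep signature, default max_errors and store() helper all assumptions rather than the real code:

    import sys
    import urllib2

    def scraper_main(find_articles, context_from_url, extract,
                     max_errors=10, prep=None, sesh=None):
        """Sketch of the driver loop implied by the call sites above."""
        opener = sesh if sesh is not None else urllib2.build_opener()
        if prep is not None:
            prep(opener)  # signature assumed
        if len(sys.argv) > 1:
            # bare urls on the command line -> build contexts directly
            contexts = [context_from_url(u) for u in sys.argv[1:]]
        else:
            contexts = find_articles(opener)
        errors = 0
        for context in contexts:
            if context is None:
                continue  # scrubbed/suppressed article
            try:
                html = opener.open(context['srcurl']).read()
                store(extract(html, context))  # store() is hypothetical
            except Exception:
                errors += 1
                if errors > max_errors:
                    raise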