body_div = article.cssselect('[itemprop~="articleBody"], [itemprop~="reviewBody"]')[0] # cruft removal for cruft in body_div.cssselect('.inline-pipes-list, #gigya-share-btns-2'): cruft.drop_tree() art['content'] = ukmedia.SanitiseHTML(unicode(lxml.html.tostring(body_div))) art['description'] = ukmedia.FirstPara( art['content'] ) art['srcorgname'] = u'independent' return art
# NOTE(review): the line above is the whitespace-collapsed tail of a function
# whose header lies outside this view; it is carried over untouched.


def ContextFromURL(url):
    """Create the starting scrape context for one article url.

    Returns a dict holding:
      'srcurl', 'permalink' - the url exactly as it was passed in
      'srcorgname'          - always u'independent'
      'lastseen'            - datetime.now() at the time of the call
    """
    # The url is not tidied here: a `url = TidyURL(url)` step exists only
    # as commented-out code in this scraper.
    return {
        'srcurl': url,
        'permalink': url,
        'srcorgname': u'independent',
        'lastseen': datetime.now(),
    }


if __name__ == "__main__":
    # Script entry point: pass the scraper's three callbacks to the
    # shared ScraperUtils driver.
    ScraperUtils.scraper_main(
        FindArticles,
        ContextFromURL,
        Extract,
        max_errors=150,
    )
srcid = CalcSrcID( context['srcurl'] ) if not srcid: return None # suppress it context['srcid'] = srcid return context def FindArticles(sesh): """ get a set of articles to scrape from the rss feeds """ articles = ScraperUtils.FindArticlesFromRSS( blog_feeds, u'skynews', ScrubFunc ) return articles def ContextFromURL( url ): """Build up an article scrape context from a bare url.""" # NOTE: urls from the rss feed have a couple of extra components which # we _could_ strip out here... context = {} context['permalink'] = url context['srcurl'] = url context['srcid'] = CalcSrcID( url ) # context['srcorgname'] = u'skynews' context['lastseen'] = datetime.now() return context if __name__ == "__main__": ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract )
m = art_idpat.search( url ) if m: return 'ft_' + m.group(1) m = blog_idpat.search( url ) if m: return 'ftblog_' + m.group(1) return None def ContextFromURL( url ): """Build up an article scrape context from a bare url.""" context = {} context['srcurl'] = url context['permalink'] = url context['srcid'] = CalcSrcID( url ) context['srcorgname'] = u'ft' context['lastseen'] = datetime.now() # to clean the url... context = ScrubFunc( context, None ) return context if __name__ == "__main__": ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=50, prep=Prep )
return art def TidyURL( url ): """ Tidy up URL - trim off params, query, fragment... """ o = urlparse.urlparse( url ) url = urlparse.urlunparse( (o[0],o[1],o[2],'','','') ); return url def ContextFromURL( url ): """Build up an article scrape context from a bare url.""" url = TidyURL(url) context = {} context['srcurl'] = url context['permalink'] = url context['srcorgname'] = u'times' context['lastseen'] = datetime.now() return context if __name__ == "__main__": # create a url opener which remembers cookies (as well as throttling and all the other uber-opener stuff) cj = cookielib.LWPCookieJar() opener = ScraperUtils.build_uber_opener(cookiejar=cj) # large maxerrors to handle video-only pages ScraperUtils.scraper_main( FindArticles, ContextFromURL, Extract, max_errors=200, prep=Prep, sesh=opener )