import random
import re
import sys
import traceback
import urllib2
import urlparse
from datetime import datetime

# project-local modules (defined elsewhere in this codebase)
import ArticleDB
import Byline
import DB
import Journo
import Misc
import Publication
import ukmedia


def scrape_articles(found, extract, opts):
    """Scrape a list of articles, returning error counts.

    found -- list of article contexts to scrape
    extract -- extract function
    opts -- options object; attributes used here include:
        max_errors -- number of errors tolerated before bailing out
        test -- dry run: roll back DB changes rather than committing
        force_rescrape -- rescrape articles we already have
        etc...
    """
    extralogging = False
    max_errors = getattr(opts, 'max_errors', 0)
    expected_journo = getattr(opts, 'expected_journo', None)

    # remove dupes (eg often articles appear in more than one RSS feed)
    found = unique_articles(found)

    # randomise the order of articles, so that if the scraper does abort
    # due to too many errors, successive runs should be able to pick up
    # all the scrapable articles.
    random.shuffle(found)

    #assert(len(found) > 0)

    ukmedia.DBUG2("%d articles to scrape\n" % (len(found)))

    if opts.test:
        ukmedia.DBUG("DRY RUN\n")

    store = ArticleDB.ArticleDB()

    failcount = 0
    abortcount = 0
    newcount = 0
    had_count = 0
    rescrape_count = 0

    for context in found:
        try:
            known_urls = set((context['srcurl'], context['permalink']))
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue  # skip it - we've already got it
                else:
                    assert(len(got) == 1)

            #ukmedia.DBUG2(u"fetching %s\n" % (context['srcurl']))
            resp = urllib2.urlopen(context['srcurl'])

            # is the server sending a charset encoding?
            kwargs = {}
            content_type = resp.info().getheader('Content-Type', '')
            m = re.compile(r';\s*charset\s*=\s*([^;]*)', re.I).search(content_type)
            if m:
                kwargs['encoding'] = m.group(1)

            # grab the content
            html = resp.read()

            # add any URLs we were redirected via...
            # (NB: a vanilla urllib2 response has no .redirects attribute -
            # this relies on a redirect-recording opener set up elsewhere)
            for code, url in resp.redirects:
                known_urls.add(url)
                if code == 301:  # permanent redirect
                    context['permalink'] = url

            # check html for a rel="canonical" link:
            canonical_url = extract_canonical_url(html, context['permalink'])
            if canonical_url is not None:
                known_urls.add(canonical_url)
                context['permalink'] = canonical_url

            # strip off "?rss=yes" etc from permalink
            tidied_url = tidy_url(context['permalink'])
            if tidied_url != context['permalink']:
                context['permalink'] = tidied_url
                known_urls.add(tidied_url)

            context['urls'] = known_urls

            # check that all urls are OK (eg express.co.uk have a habit of
            # publishing borked ones for blogs)
            for url in known_urls:
                url.encode('utf-8')  # will raise an exception if dud

            # repeat the url-based existence check with the urls we now have
            # TODO: if so, add any new urls... maybe rescrape and update article?
            article_id = None
            got = store.find_article(known_urls)
            if len(got) > 0:
                if extralogging:
                    for article_id in got:
                        ukmedia.DBUG(u"already got %s [a%s] (attributed to: %s)\n" % (
                            context['srcurl'], article_id, GetAttrLogStr(article_id)))
                if not opts.force_rescrape:
                    had_count += 1
                    continue  # skip it - we've already got it
                else:
                    assert(len(got) == 1)
                    article_id = got[0]

            # some extra, last-minute context :-)
            context['lastscraped'] = datetime.now()

            art = extract(html, context, **kwargs)
            if art:
                # set the srcorg id for the article
                if 'srcorgname' in art and art['srcorgname'] is not None:
                    srcorg = Misc.GetOrgID(art['srcorgname'])
                else:
                    # no publication specified - look up using domain name
                    o = urlparse.urlparse(art['permalink'])
                    domain = o[1].lower()
                    srcorg = Publication.find_or_create(domain)
                art['srcorg'] = srcorg

                # resolve bylined authors to journo ids
                authors = Byline.CrackByline(art['byline'])
                attributed = []
                for author in authors:
                    attributed.append(Journo.find_or_create(author, art, expected_journo))
                art['journos'] = attributed

                if opts.test:
                    ukmedia.PrettyDump(art)

                if article_id:
                    # rescraping an existing article
                    art['id'] = article_id
                    article_id = store.upsert(art)
                    rescrape_count += 1
                else:
                    # a new article
                    article_id = store.upsert(art)
                    newcount += 1

                if opts.test:
                    DB.conn().rollback()
                else:
                    DB.conn().commit()

        except Exception, err:
            DB.conn().rollback()

            # always just bail out upon ctrl-c
            if isinstance(err, KeyboardInterrupt):
                raise

            failcount += 1
            # TODO: phase out NonFatal! just get the scraper to print out
            # a warning message instead
            if isinstance(err, ukmedia.NonFatal):
                continue

            report = traceback.format_exc()
            if 'title' in context:
                msg = u"FAILED (%s): '%s' (%s)" % (err, context['title'], context['srcurl'])
            else:
                msg = u"FAILED (%s): (%s)" % (err, context['srcurl'])
            ukmedia.DBUG(msg + "\n")
            ukmedia.DBUG2(report + "\n")
            ukmedia.DBUG2('-' * 60 + "\n")
            abortcount += 1
            if abortcount > max_errors:
                print >>sys.stderr, "Too many errors - ABORTING"
                raise
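

# ---------------------------------------------------------------------------
# The helpers below (extract_canonical_url, tidy_url, unique_articles) are
# used by scrape_articles() but defined elsewhere in the codebase. The
# sketches that follow are illustrative reconstructions only - minimal
# versions of what such helpers might look like, not the real
# implementations, which may well differ.

def extract_canonical_url(html, page_url):
    """Return the rel="canonical" URL declared in html, or None. (sketch)

    Assumes the canonical link appears as a <link rel="canonical"> tag;
    the href is resolved against page_url in case it is relative.
    """
    for pat in (
            r'<link[^>]+rel\s*=\s*["\']canonical["\'][^>]*href\s*=\s*["\']([^"\']+)["\']',
            r'<link[^>]+href\s*=\s*["\']([^"\']+)["\'][^>]*rel\s*=\s*["\']canonical["\']'):
        m = re.search(pat, html, re.I)
        if m:
            return urlparse.urljoin(page_url, m.group(1))
    return None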
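
def tidy_url(url):
    """Strip noise query params (eg "?rss=yes") from url. (sketch)

    The NOISE_PARAMS list here is an assumption for illustration; the
    real helper may filter a different set of parameters.
    """
    NOISE_PARAMS = ('rss', 'utm_source', 'utm_medium', 'utm_campaign')
    parts = urlparse.urlsplit(url)
    query = '&'.join(p for p in parts.query.split('&')
                     if p and p.split('=')[0] not in NOISE_PARAMS)
    # also drop any #fragment - it never identifies a distinct article
    return urlparse.urlunsplit((parts.scheme, parts.netloc, parts.path, query, ''))


def unique_articles(found):
    """Remove duplicate article contexts, keyed on permalink. (sketch)"""
    seen = set()
    uniq = []
    for context in found:
        if context['permalink'] not in seen:
            seen.add(context['permalink'])
            uniq.append(context)
    return uniq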
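
# Example driver, showing how a per-site scraper script might invoke
# scrape_articles(). FindArticles and Extract are hypothetical stand-ins
# for the site-specific discovery and extraction functions; the option
# names match the attributes scrape_articles() actually reads from opts.
if __name__ == '__main__':
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-t', '--test', action='store_true', default=False,
                      help="dry run: scrape, but roll back all DB changes")
    parser.add_option('-f', '--force_rescrape', action='store_true', default=False,
                      help="rescrape articles already in the DB")
    parser.add_option('-m', '--max_errors', type='int', default=0,
                      help="number of errors tolerated before aborting")
    (opts, args) = parser.parse_args()

    found = FindArticles()  # hypothetical: build the list of article contexts
    scrape_articles(found, Extract, opts)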