def retrigger_urls(self, url_list):
	"""
	Force the URLs in ``url_list`` to be re-fetched.

	Each URL is upserted as a fresh "new"-state link row (default distance,
	medium priority), then the corresponding ``RawWebPages`` row is queried
	back and printed so the operator can confirm the state/epoch change.

	Args:
		url_list: iterable of absolute URL strings to retrigger.

	Returns:
		None. Side effects only (DB writes + console output).
	"""
	self.log.info("Retrigging %s urls", len(url_list))
	# Long override timeout (15 minutes) since a large batch of upserts can
	# hold the session open well past the default statement timeout.
	with self.db.session_context(override_timeout_ms=1000 * 60 * 15) as sess:
		for url in url_list:
			netloc = urllib.parse.urlsplit(url).netloc
			linksd = [{
				'url'      : url,
				'starturl' : url,
				'netloc'   : netloc,
				'distance' : dbm.DB_DEFAULT_DIST,
				'priority' : dbm.DB_MED_PRIORITY,
				'state'    : "new",
				'addtime'  : datetime.datetime.now(),
				# Don't retrigger unless the ignore time has elapsed.
				'epoch'    : raw_misc.get_epoch_for_url(url, netloc),
			}]
			RawArchiver.RawUrlUpserter.do_link_batch_update_sess(self.log, sess, linksd)

			# Read the row back for operator feedback.
			row = sess.query(self.db.RawWebPages) \
				.filter(self.db.RawWebPages.url == url) \
				.scalar()
			print(row, row.state, row.epoch)
def links_to_dicts(links_in, starturl, distance, priority):
	"""
	Convert extracted link URLs into row-dicts for a DB link upsert.

	Links already present in the module-level ``SEEN_CACHE`` are skipped;
	every link that is emitted is also recorded in that cache (side effect),
	so repeated calls across a crawl do not produce duplicate rows.

	Args:
		links_in: iterable of absolute ``http(s)`` URL strings.
		starturl: root URL of the crawl these links were found under.
		distance: crawl distance to assign to every emitted row.
		priority: fetch priority to assign to every emitted row.

	Returns:
		list of dicts matching the link-table column layout.

	Raises:
		AssertionError: if a link is not HTTP(S) or has no netloc.
	"""
	ret = []
	for link in links_in:
		# Cross-call dedupe via the shared module-level cache.
		if link in SEEN_CACHE:
			continue
		SEEN_CACHE[link] = True

		netloc = urllib.parse.urlsplit(link).netloc
		assert link.startswith("http"), "Link %s doesn't seem to be HTTP content?" % link
		assert netloc

		ret.append({
			'url'      : link,
			'starturl' : starturl,
			'netloc'   : netloc,
			'distance' : distance,
			'priority' : priority,
			'state'    : "new",
			'addtime'  : datetime.datetime.now(),
			# Don't retrigger unless the ignore time has elapsed.
			'epoch'    : raw_misc.get_epoch_for_url(link, netloc),
		})
	return ret
def process_job(self, jobid, ctnt, fname, mimetype, joburl=None):
	"""
	Post-process a fetched job: run the per-site postfetch hook, extract
	links, save the content to disk, and mark the DB job row complete.

	Args:
		jobid: primary key of the job row being finalized.
		ctnt: fetched content (str or bytes; str is UTF-8 encoded before save).
		fname: filename associated with the fetched content.
		mimetype: mimetype of the fetched content.
		joburl: URL of the job; looked up from the job row when not given.
	"""
	if not joburl:
		# URL wasn't passed in — pull it from the job row.
		with self.job_context(jobid) as (sess, job):
			joburl = job.url
	# Let the URL-specific module massage the fetched result before saving.
	module = RawArchiver.misc.getModuleForUrl(joburl)
	fname, ctnt, mimetype = module.check_postfetch(joburl, self.wg_proxy, fname, ctnt, mimetype)
	links = self.extractLinks(ctnt, mimetype, joburl)
	if isinstance(ctnt, str):
		ctnt = ctnt.encode("utf-8")
	print("Saving....")
	saved_to = saveFile(ctnt, joburl, fname)
	print("Saved!")
	self.log.info("Saved file to path: %s", saved_to)
	with self.job_context(jobid) as (sess, job):
		starturl = job.starturl
		distance = job.distance
		priority = job.priority
		# Retry loop: ensure a history entry exists for this URL before the
		# job row is overwritten below. Loops until the history check passes.
		while True:
			have_history = self.checkHaveHistory(sess, job.url)
			if have_history:
				break
			try:
				self.log.info("Need to push content into history table.")
				# Touching mimetype/fetchtime dirties the row; presumably a
				# versioning trigger then copies the old content into the
				# history table — TODO confirm against the DB schema.
				job.mimetype = (job.mimetype + " ") if job.mimetype else " "
				job.fetchtime = datetime.datetime.now() - datetime.timedelta(days=7)
				sess.commit()
				self.log.info("Pushing old job content into history table!")
				break
			except (sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.OperationalError, sqlalchemy.exc.IntegrityError):
				# Transient DB conflict — roll back and retry.
				sess.rollback()
		# Retry loop: mark the job complete; retried on transient DB errors.
		while True:
			try:
				job.state = 'complete'
				job.fetchtime = datetime.datetime.now()
				job.fspath = saved_to
				job.mimetype = mimetype
				job.epoch = raw_misc.get_epoch_for_url(job.url)
				sess.commit()
				self.log.info("Marked plain job with id %s, url %s as complete!", job.id, job.url)
				break
			except (sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.OperationalError, sqlalchemy.exc.IntegrityError):
				sess.rollback()
		# Queue any extracted links for fetching.
		if links:
			self.upsertResponseLinks(job, links, starturl, distance, priority)
def do_remote_job(self, response):
	"""
	Handle a response message from a remote fetch worker.

	On success, validates the message envelope and hands the payload to
	``process_job``. On failure, marks the job row as errored, maps any
	``FetchFailureError`` HTTP status found in the remote traceback to an
	errno and an epoch offset, and logs the remote traceback.

	Args:
		response: dict message from the remote worker; must contain 'jobid',
			and on success 'ret', 'success', 'module' and 'call'.
	"""
	jobid = response['jobid']
	if 'ret' in response and 'success' in response and response['success'] is True:
		# Sanity-check the envelope before trusting the payload.
		assert 'module' in response, "No module in response message? Response: %s" % response
		assert 'call' in response, "No call in response message? Response: %s" % response
		assert response['module'] == 'SmartWebRequest', "Incorrect module? Module: '%s'" % response['module']
		assert response['call'] == 'smartGetItem', "Incorrect call? Call: '%s'" % response['call']
		content, fileN, mType = response['ret']
		self.process_job(jobid, content, fileN, mType)
	else:
		# Remote fetch failed — record the error on the job row.
		with self.job_context(jobid) as (_, job):
			# Default error state; epoch offsets appear to defer the next
			# retry (larger offset == longer back-off) — TODO confirm
			# against get_epoch_for_url()'s consumers.
			job.epoch = raw_misc.get_epoch_for_url(job.url) + 1
			job.state = 'error'
			job.errno = -4
			content = "DOWNLOAD FAILED"
			content += "<br>"
			if 'traceback' in response:
				content += "<pre>"
				content += "<br>".join(response['traceback'])
				content += "</pre>"
				log_func = self.log.error
				# Map the HTTP failure embedded in the traceback text to an
				# errno + back-off. 410/404 (gone/missing) get the longest
				# deferral and are logged as warnings rather than errors.
				if '<FetchFailureError 410 -> ' in content:
					job.epoch = raw_misc.get_epoch_for_url(job.url) + 10
					log_func = self.log.warning
					job.errno = 410
				elif '<FetchFailureError 404 -> ' in content:
					job.epoch = raw_misc.get_epoch_for_url(job.url) + 10
					log_func = self.log.warning
					job.errno = 404
				elif '<FetchFailureError 403 -> ' in content:
					job.epoch = raw_misc.get_epoch_for_url(job.url) + 2
					job.errno = 403
				elif '<FetchFailureError 500 -> ' in content:
					job.epoch = raw_misc.get_epoch_for_url(job.url) + 2
					job.errno = 500
				else:
					# Unrecognized failure type.
					job.epoch = raw_misc.get_epoch_for_url(job.url) + 2
					job.errno = -1
				# Log the remote traceback, truncating very long lines.
				max_len_trunc = 450
				for line in response['traceback']:
					if len(line) > max_len_trunc:
						log_func("Remote traceback: %s [...snip...]", line[:max_len_trunc])
					else:
						log_func("Remote traceback: %s", line)
			else:
				self.log.error("No traceback in response?")
				self.log.error("Response: %s", response)
			self.log.error("Error in remote fetch.")