Example #1
    def retrigger_urls(self, url_list):
        self.log.info("Retrigging %s urls", len(url_list))

        with self.db.session_context(override_timeout_ms=1000 * 60 * 15) as sess:
            for url in url_list:
                nl = urllib.parse.urlsplit(url).netloc

                # Compute the epoch once; it is reused in the link dict below.
                epoch = raw_misc.get_epoch_for_url(url, nl)

                linksd = [{
                    'url': url,
                    'starturl': url,
                    'netloc': nl,
                    'distance': dbm.DB_DEFAULT_DIST,
                    'priority': dbm.DB_MED_PRIORITY,
                    'state': "new",
                    'addtime': datetime.datetime.now(),

                    # Don't retrigger unless the ignore time has elapsed.
                    'epoch': epoch,
                }]

                RawArchiver.RawUrlUpserter.do_link_batch_update_sess(
                    self.log, sess, linksd)

                row = (sess.query(self.db.RawWebPages)
                           .filter(self.db.RawWebPages.url == url)
                           .scalar())

                self.log.info("Row: %s, state: %s, epoch: %s", row, row.state, row.epoch)
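
A minimal invocation sketch; the archiver instance and the URLs below are illustrative assumptions, not part of the original source:

# Hypothetical caller -- any object exposing retrigger_urls() will do.
archiver.retrigger_urls([
    "https://example.com/page-1",
    "https://example.com/page-2",
])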
Example #2
def links_to_dicts(links_in, starturl, distance, priority):
	ret = []

	for link in links_in:

		if link in SEEN_CACHE:
			continue

		SEEN_CACHE[link] = True

		# print("Doing insert", commit_each, link)
		netloc = urllib.parse.urlsplit(link).netloc

		assert link.startswith("http"), "Link %s doesn't seem to be HTTP content?" % link
		assert netloc

		data = {
			'url'             : link,
			'starturl'        : starturl,
			'netloc'          : netloc,
			'distance'        : distance,
			'priority'        : priority,
			'state'           : "new",
			'addtime'         : datetime.datetime.now(),

			# Don't retrigger unless the ignore time has elapsed.
			'epoch'           : raw_misc.get_epoch_for_url(link, netloc),
		}

		ret.append(data)
	return ret
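
A hedged usage sketch for links_to_dicts(): SEEN_CACHE must exist at module level (the function consults it), and the dbm constants are borrowed from Example #1 as assumptions:

SEEN_CACHE = {}   # module-level dedup cache read and written by links_to_dicts()

rows = links_to_dicts(
	links_in = ["https://example.com/a", "https://example.com/b"],   # illustrative
	starturl = "https://example.com/",
	distance = dbm.DB_DEFAULT_DIST,
	priority = dbm.DB_MED_PRIORITY,
)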
Example #3
	def process_job(self, jobid, ctnt, fname, mimetype, joburl=None):
		if not joburl:
			with self.job_context(jobid) as (sess, job):
				joburl = job.url

		module = RawArchiver.misc.getModuleForUrl(joburl)
		fname, ctnt, mimetype = module.check_postfetch(joburl, self.wg_proxy, fname, ctnt, mimetype)
		links = self.extractLinks(ctnt, mimetype, joburl)

		if isinstance(ctnt, str):
			ctnt = ctnt.encode("utf-8")

		print("Saving....")
		saved_to = saveFile(ctnt, joburl, fname)
		print("Saved!")

		self.log.info("Saved file to path: %s", saved_to)

		with self.job_context(jobid) as (sess, job):

			starturl = job.starturl
			distance = job.distance
			priority = job.priority

			while True:
				have_history = self.checkHaveHistory(sess, job.url)
				if have_history:
					break
				try:
					self.log.info("Need to push content into history table.")

					# Touch the mimetype (append a space) and back-date the fetchtime,
					# presumably to dirty the row so a versioned copy lands in the
					# history table on commit. The original carries no comment here,
					# so treat this reading as an inference.
					job.mimetype  = (job.mimetype + " ") if job.mimetype else " "
					job.fetchtime = datetime.datetime.now() - datetime.timedelta(days=7)

					sess.commit()
					self.log.info("Pushed old job content into history table!")
					break
				except (sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.OperationalError, sqlalchemy.exc.IntegrityError):
					sess.rollback()

			while True:
				try:
					job.state           = 'complete'
					job.fetchtime       = datetime.datetime.now()
					job.fspath          = saved_to
					job.mimetype        = mimetype
					job.epoch           = raw_misc.get_epoch_for_url(job.url)

					sess.commit()
					self.log.info("Marked plain job with id %s, url %s as complete!", job.id, job.url)
					break

				except (sqlalchemy.exc.InvalidRequestError, sqlalchemy.exc.OperationalError, sqlalchemy.exc.IntegrityError):
					sess.rollback()

		if links:
			# Note: job originates from the (now-closed) job_context block above;
			# upsertResponseLinks presumably only reads attributes already loaded.
			self.upsertResponseLinks(job, links, starturl, distance, priority)
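
process_job() leans on self.job_context() yielding a (session, job) pair. A sketch of what such a context manager could look like, assuming SQLAlchemy and the RawWebPages model from Example #1 (the lookup-by-id column is inferred from usage, not taken from the original):

	# Sketch only -- not the original implementation. Requires: import contextlib
	@contextlib.contextmanager
	def job_context(self, jobid):
		# Open a session and fetch the job row by primary key (column name assumed).
		with self.db.session_context() as sess:
			job = sess.query(self.db.RawWebPages) \
				.filter(self.db.RawWebPages.id == jobid) \
				.scalar()
			yield sess, job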
Example #4
	def do_remote_job(self, response):
		jobid = response['jobid']

		if 'ret' in response and 'success' in response and response['success'] is True:
			assert 'module' in response, "No module in response message? Response: %s" % response
			assert 'call' in response, "No call in response message? Response: %s" % response

			assert response['module'] == 'SmartWebRequest', "Incorrect module? Module: '%s'" % response['module']
			assert response['call'] == 'smartGetItem', "Incorrect call? Call: '%s'" % response['call']
			content, fileN, mType = response['ret']
			self.process_job(jobid, content, fileN, mType)

		else:
			with self.job_context(jobid) as (_, job):
				# Bumping the epoch appears to defer the next fetch attempt; the
				# larger offsets below back permanent failures off further.
				job.epoch = raw_misc.get_epoch_for_url(job.url) + 1
				job.state = 'error'
				job.errno = -4

				content = "DOWNLOAD FAILED"
				content += "<br>"
				if 'traceback' in response:
					content += "<pre>"
					content += "<br>".join(response['traceback'])
					content += "</pre>"

					log_func = self.log.error

					if '<FetchFailureError 410 -> ' in content:
						job.epoch           = raw_misc.get_epoch_for_url(job.url) + 10
						log_func = self.log.warning
						job.errno = 410
					elif '<FetchFailureError 404 -> ' in content:
						job.epoch           = raw_misc.get_epoch_for_url(job.url) + 10
						log_func = self.log.warning
						job.errno = 404
					elif '<FetchFailureError 403 -> ' in content:
						job.epoch           = raw_misc.get_epoch_for_url(job.url) + 2
						job.errno = 403
					elif '<FetchFailureError 500 -> ' in content:
						job.epoch           = raw_misc.get_epoch_for_url(job.url) + 2
						job.errno = 500
					else:
						job.epoch           = raw_misc.get_epoch_for_url(job.url) + 2
						job.errno = -1

					max_len_trunc = 450

					for line in response['traceback']:
						if len(line) > max_len_trunc:
							log_func("Remote traceback: %s [...snip...]", line[:max_len_trunc])
						else:
							log_func("Remote traceback: %s", line)
				else:
					self.log.error("No traceback in response?")
					self.log.error("Response: %s", response)

			self.log.error("Error in remote fetch.")