def run(self):
    """Launch this task's crawl job in Heritrix3 and mark it as started.

    Reads the crawl targets from input 1 and the never-crawl list from
    input 2 (both JSON), builds a ``W3actJob`` for ``self.job.name``, starts
    it (optionally from the latest checkpoint, per
    ``self.from_latest_checkpoint``) and finally hands the job and its
    launch id to ``mark_job_as(..., 'started')``.

    :return: None
    """
    # Set up connection to H3:
    h3_cfg = h3()
    h = hapyx.HapyX("https://%s:%s" % (h3_cfg.host, h3_cfg.port),
                    username=h3_cfg.username, password=h3_cfg.password)

    logger.info("Starting %s", self.job.name)

    # Read both inputs under ``with`` so the underlying handles are closed
    # as soon as the JSON has been parsed, instead of being leaked.
    inputs = self.input()
    with inputs[1].open('r') as targets_file:
        targets = json.load(targets_file)
    with inputs[2].open('r') as nevercrawl_file:
        nevercrawl = json.load(nevercrawl_file)
    logger.debug("Found %s Targets in date range.", len(targets))

    job = W3actJob(targets, self.job.name, heritrix=h,
                   heritrix_job_dir=h3_cfg.local_job_folder,
                   nevercrawl=nevercrawl)

    # The status is only reported here; it does not gate the launch below.
    status = h.status(self.job.name)
    logger.info("Got current job status: %s", status)

    logger.info("Starting job %s (from checkpoint = %s)...",
                job.name, self.from_latest_checkpoint)
    job.start(from_latest_checkpoint=self.from_latest_checkpoint)
    launch_id = h.get_launch_id(self.job.name)

    logger.info("Launched job %s/%s with %s seeds.",
                job.name, launch_id, len(job.seeds))
    #with self.output().open('w') as f:
    #    f.write('{}\n'.format(launch_id))

    # Record an output file that can be used as a Target by a different task:
    mark_job_as(job, launch_id, 'started')
def restart_job(frequency, start=None):
    """Restarts the job for a particular frequency.

    Fetches the W3ACT export for ``frequency``, keeps the targets whose crawl
    window contains ``start``, stops the existing Heritrix job of that name
    when it reports a non-empty status, and then starts a fresh job. When
    ``args.test`` is set the job is built but not started; its seeds are
    logged instead.

    :param frequency: crawl frequency; also used as the Heritrix job name.
    :param start: reference time for the crawl-window filter. Defaults to
        the current UTC time at the moment of the call.
    :return: None. Exceptions raised while restarting are logged, not
        re-raised; ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    # Resolve the default per call: a ``datetime.utcnow()`` default in the
    # signature is evaluated once, when the function is defined, and would
    # go stale in any long-running process.
    if start is None:
        start = datetime.utcnow()

    def in_crawl_window(target):
        # A bound that is None leaves the window open on that side.
        # NOTE(review): assumes the parsed dates are comparable with `start`
        # (naive vs. timezone-aware) -- confirm against the export format.
        return (
            (target["crawlStartDateISO"] is None or
             dateutil.parser.parse(target["crawlStartDateISO"]) < start)
            and
            (target["crawlEndDateISO"] is None or
             dateutil.parser.parse(target["crawlEndDateISO"]) > start)
        )

    logger.info("Restarting %s at %s", frequency, start)
    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)

        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export.", len(export))
        targets = [t for t in export if in_crawl_window(t)]
        logger.debug("Found %s Targets in date range.", len(targets))

        h = hapy.Hapy("https://%s:%s" % (args.host, args.port),
                      username=args.user, password=args.password)
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
            #TODO: Automated QA

        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds.", job.name, len(job.seeds))
            job.start()
        else:
            # Dry run: report what would have been crawled.
            logger.debug("Would start job %s with %s seeds.", job.name, len(job.seeds))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s", surl)
    except Exception:
        # Best-effort: one failing frequency is logged and swallowed, but
        # Ctrl-C / interpreter exit are no longer caught (was a bare except).
        logger.error("%s: %s", frequency, str(sys.exc_info()))
        logger.error("%s: %s", frequency, traceback.format_exc())
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Stops the job for a particular frequency if it is running and,
    optionally, starts it again.

    :param frequency: crawl frequency; also used as the Heritrix job name.
    :param start: timestamp reported in the first log line. Defaults to the
        current UTC time at the moment of the call.
    :param restart: when true, a new job is launched after any running one
        has been stopped; when false the job is only stopped.
    :return: a human-readable summary of what happened.
    :raises: whatever ``self.retry(countdown=10, exc=...)`` raises, on any
        failure (presumably a bound Celery task -- confirm against the
        decorator, which is not visible here).
    """
    # Resolve the default per call: a ``datetime.utcnow()`` default in the
    # signature is evaluated once at definition time and then goes stale.
    if start is None:
        start = datetime.utcnow()

    try:
        logger.info("Stopping/starting %s at %s", frequency, start)

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'),
                        password=cfg.get('h3', 'password'))

        # Stop job if currently running:
        job = None
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stop the running job, clean up its action files and report
            # the STOPPED status.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s", frequency, launch_id)
            assemble_job_output.delay(frequency, launch_id)

        # Stop-only request: report what (if anything) was stopped.
        if not restart:
            if job:
                logger.info("Stopped job %s/%s without restarting...", job.name, launch_id)
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            logger.warning("No running '%s' job to stop!", frequency)
            return "No running '%s' job to stop!" % frequency

        # Start job if requested. The export is used as-is; date-range
        # filtering of the targets is not applied here.
        targets = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in date range.", len(targets))
        job = W3actJob(w, targets, frequency, heritrix=h)
        logger.info("Starting job %s...", job.name)
        job.start()
        launch_id = h.get_launch_id(frequency)
        crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
        logger.info("Launched job %s/%s with %s seeds.", job.name, launch_id, len(job.seeds))
        return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))

    # NOTE(review): BaseException also retries on KeyboardInterrupt and
    # SystemExit -- confirm this is intended rather than ``Exception``.
    except BaseException as e:
        logger.exception(e)
        # The keyword is ``exc`` (it was misspelled ``exe``), so the original
        # exception is attached to the retry request.
        raise self.retry(countdown=10, exc=e)