def get_h3_status(self, job, server):
    """Query Heritrix for the state of a job and classify it for display.

    :param job: H3 job name to look up.
    :param server: dict with 'url', 'user' and 'pass' keys for the H3 endpoint.
    :return: dict with 'status', 'status-class' and 'details' keys, plus
             'rate' (docs/sec, RUNNING jobs) or 'error' (unreachable server).
    """
    # Set up connection to H3:
    h = hapyx.HapyX(server['url'], username=server['user'], password=server['pass'], timeout=5)
    state = {}
    try:
        logger.info("Getting status for job %s on %s" % (job, server))
        info = h.get_job_info(job)
        state['details'] = info
        # 'in' replaces dict.has_key(), which was removed in Python 3:
        if 'job' in info:
            state['status'] = info['job'].get("crawlControllerState", None)
            if not state['status']:
                state['status'] = info['job'].get("statusDescription", None)
            # May raise AttributeError if both fields are missing; the
            # enclosing try then reports the job as DOWN.
            state['status'] = state['status'].upper()
    except Exception as e:
        state['status'] = "DOWN"
        state['error'] = "Could not reach Heritrix! %s" % e
        # app.logger.exception(e)
    # Classify. Use .get() so a response without a 'job' key (status never
    # set above) falls through to the warning class instead of KeyError:
    if state.get('status') == "DOWN":
        state['status-class'] = "status-oos"
    elif state.get('status') == "RUNNING":
        # Replacing RUNNING with docs/second rate. The rate can be None just
        # after launch, so default to 0.0 and convert BEFORE comparing --
        # previously a raw (possibly None) value was compared to 1.0:
        rate = float(state['details']['job']['rateReport']['currentDocsPerSecond'] or 0.0)
        state['rate'] = "%.1f" % rate
        if rate < 1.0:
            state['status-class'] = "status-warning"
        else:
            state['status-class'] = "status-good"
    else:
        state['status-class'] = "status-warning"
    return state
def run(self):
    """Launch this crawl job on Heritrix (optionally from its latest checkpoint)."""
    # Build the H3 control-API client:
    endpoint = "https://%s:%s" % (h3().host, h3().port)
    hx = hapyx.HapyX(endpoint, username=h3().username, password=h3().password)
    logger.info("Starting %s" % (self.job.name))
    # Load the target and never-crawl lists produced by upstream tasks:
    targets = json.load(self.input()[1].open('r'))
    nevercrawl = json.load(self.input()[2].open('r'))
    logger.debug("Found %s Targets in date range." % len(targets))
    w3act_job = W3actJob(targets, self.job.name, heritrix=hx,
                         heritrix_job_dir=h3().local_job_folder,
                         nevercrawl=nevercrawl)
    current_status = hx.status(self.job.name)
    logger.info("Got current job status: %s" % current_status)
    logger.info("Starting job %s (from checkpoint = %s)..."
                % (w3act_job.name, self.from_latest_checkpoint))
    w3act_job.start(from_latest_checkpoint=self.from_latest_checkpoint)
    launched = hx.get_launch_id(self.job.name)
    logger.info("Launched job %s/%s with %s seeds."
                % (w3act_job.name, launched, len(w3act_job.seeds)))
    # Record an output file that can be used as a Target by a different task:
    mark_job_as(w3act_job, launched, 'started')
    return
def unpause_dc():
    """Unpause all four 2016 domain-crawl jobs, then redirect to the status page."""
    servers = json.load(systems().servers)
    services = json.load(systems().services)
    for job_id in ('dc0-2016', 'dc1-2016', 'dc2-2016', 'dc3-2016'):
        job_cfg = services['jobs'][job_id]
        server = servers[job_cfg['server']]
        hx = hapyx.HapyX(server['url'], username=server['user'], password=server['pass'])
        hx.unpause_job(job_cfg['name'])
    return redirect(url_for('status'))
def complete(self):
    """Report completion: True when the job is unknown to H3 or is not running."""
    # Set up connection to H3:
    h = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                    username=h3().username, password=h3().password)
    # Jobs H3 has never heard of count as complete:
    if self.job.name not in h.list_jobs():
        return True
    # Known job: complete only when it reports an empty (not-running) status.
    return h.status(self.job.name) == ""
def setup_heritrix(self, api=None, host=None, port=None, user="******", passwd="bl_uk"):
    """Attach a Heritrix API client (injected or freshly built) and register the job dir.

    :param api: pre-built HapyX client; when None, one is constructed from
                host/port/user/passwd.
    """
    if api is None:
        api = hapyx.HapyX(host="https://%s:%s/engine" % (host, port),
                          user=user, passwd=passwd,
                          verbose=False, verify=False)
    self.heritrix = api
    self.heritrix.add_job_directory(self.job_dir)
def run(self):
    """Wait (via an external task) for the job to stop, then record it as stopped.

    NOTE: this is a generator task -- the `yield` hands a dynamic dependency
    back to the scheduler.
    """
    # Build the H3 control-API client:
    hx = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                     username=h3().username, password=h3().password)
    # A non-empty status means the job is still running:
    if hx.status(self.job.name) != "":
        # Only block on OUR launch; a newer launch means ours already ended.
        if hx.get_launch_id(self.job.name) == self.launch_id:
            # Declare that we are awaiting an external process to stop this job:
            yield StopJobExternalTask(self.job, self.launch_id)
    # Not running, so mark as stopped:
    with self.output().open('w') as f:
        f.write('{} {}\n'.format(self.job.name, self.launch_id))
def run(self):
    """Stop the named Heritrix job (if running), clean up, and mark it stopped."""
    # Build the H3 control-API client:
    hx = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                     username=h3().username, password=h3().password)
    logger.info("I'm stopping %s" % (self.job.name))
    # Guard: nothing to do unless the job exists AND reports a running status.
    if self.job.name not in hx.list_jobs() or hx.status(self.job.name) == "":
        logger.warning("No {} job to be stopped!".format(self.job.name))
        return
    # Stops a running job, cleans up the directory, initiates job assembly.
    launch_id = hx.get_launch_id(self.job.name)
    job = W3actJob.from_directory(
        "%s/%s" % (h3().local_job_folder, self.job.name), heritrix=hx)
    job.stop()
    remove_action_files(self.job.name, HERITRIX_JOBS=h3().local_job_folder)
    # Record an output file that can be use as a Target by a different task:
    mark_job_as(job, launch_id, 'stopped')
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Restarts the job for a particular frequency.

    :param frequency: H3 job name / crawl frequency to stop (and optionally restart).
    :param start: launch timestamp; defaults to the current UTC time *at call
        time*. (The old ``start=datetime.utcnow()`` default was evaluated once
        at import time -- a classic mutable/call-at-def default bug -- so
        long-lived workers silently reused a stale date.)
    :param restart: when True, launch a fresh job after stopping the old one.
    :return: human-readable summary of what was done.
    """
    if start is None:
        start = datetime.utcnow()
    try:
        logger.info("Stopping/starting %s at %s" % (frequency, start))
        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'), cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'), password=cfg.get('h3', 'password'))
        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "STOPPED")
            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None
        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            # logger.info("Found %s Targets in export." % len(export))
            # targets = [t for t in export if (t["startDate"] is None or t["startDate"] < start) and (t["endDateISO"] is None or t["crawlEndDateISO"] > start)]
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                # launch_id is always bound here: job is only non-None when
                # the stop branch above ran and set it.
                logger.info("Stopped job %s/%s without restarting..."
                            % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # Celery's Task.retry takes the exception via `exc` -- the original
        # passed `exe`, which retry silently ignored.
        raise self.retry(countdown=10, exc=e)
def get_hapy_for_job(job):
    """Build a HapyX client for the per-job Heritrix hostname ("<host>-<job>")."""
    endpoint = "https://%s-%s:%s" % (h3().host, job.name, h3().port)
    return hapyx.HapyX(endpoint, username=h3().username, password=h3().password)