@inlineCallbacks
def depile(self):
    if self.queue is None:
        yield self.init_queue()
    if not len(self.queue):
        returnD(None)

    # Never schedule a new job while ScrapyD still has pending ones
    status = yield self.get_scrapyd_status()
    if status["pending"] > 0:
        returnD(None)

    # Add some random wait to allow possible concurrent Hyphe instances
    # to compete for ScrapyD's empty slots
    yield deferredSleep(1. / randint(4, 20))

    # Order jobs by corpus with fewer currently running crawls, then by age
    ordered = sorted(self.queue.items(), key=lambda x:
        float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
    job_id, job = ordered[0]

    res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
    ts = now_ts()
    if is_error(res):
        logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
        self.queue[job_id]['timestamp'] = ts  # let it retry a bit later
    else:
        yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
        yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
        del self.queue[job_id]
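# The generic helpers used by depile() are defined elsewhere in the
# application and not shown in this extract. Below is a minimal sketch of
# what they are assumed to look like, based on standard Twisted idioms
# (deferredSleep, returnD and now_ts are assumed names; is_error, logger
# and the db/ScrapyD accessors belong to the surrounding code and are not
# sketched):

from time import time
from random import randint
from twisted.internet import defer, reactor
from twisted.internet.defer import inlineCallbacks

# Assumed alias for Twisted's returnValue, used to exit an
# inlineCallbacks generator early with a result
returnD = defer.returnValue

def deferredSleep(seconds=1):
    # Non-blocking sleep: fire a Deferred after `seconds` via the reactor,
    # so other queued calls keep running while we wait
    d = defer.Deferred()
    reactor.callLater(seconds, d.callback, None)
    return d

def now_ts():
    # Assumed millisecond-precision timestamp helper
    return int(time() * 1000)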
@inlineCallbacks
def stop(self, now=False):
    if self.monitor.running:
        self.monitor.stop()
    if self.stopping():
        returnD(None)
    self.status = "error" if self.error else "stopping"
    # Unless stopping immediately, wait for any in-flight Traph call to finish
    while not now and self.call_running:
        yield deferredSleep(0.1)
    if self.transport:
        self.protocol.stop()
        self.transport = None
        self.log("Traph stopped")
    if not self.error:
        self.status = "stopped"
    self.checkAndRemovePID()
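# checkAndRemovePID() is not shown in this extract. A hypothetical sketch,
# assuming the child process records its PID in a file whose path is kept
# on self.pidfile (both the attribute name and the behavior are assumptions):

import os

def checkAndRemovePID(self):
    # Remove a stale PID file so it does not block the next start
    if self.pidfile and os.path.exists(self.pidfile):
        os.remove(self.pidfile)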