Example #1
    def __init__(self, config):
        self.db = MongoDB(config)
        self.scrapyd = 'http://%s:%s/' % (environ.get('HYPHE_CRAWLER_HOST', config['host']), int(environ.get('HYPHE_CRAWLER_PORT', config['scrapy_port'])))
        self.db_name = config["db_name"]
        self.queue = None
        self.depiler = LoopingCall(self.depile)
        self.depiler.start(0.2, True)
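This constructor (like the variants in the examples below) relies on Twisted's LoopingCall: it wraps a callable and start(interval, now) runs it immediately when now is True, then again every interval seconds until stop() is called. A minimal standalone sketch of that pattern, where tick() is only an illustrative stand-in for depile:

from twisted.internet import reactor
from twisted.internet.task import LoopingCall

def tick():
    pass  # stand-in for JobsQueue.depile, which would pop one crawl job here

loop = LoopingCall(tick)
loop.start(0.2, True)            # call tick() now, then every 0.2 seconds
reactor.callLater(2, loop.stop)  # illustrative shutdown after 2 seconds
reactor.callLater(2, reactor.stop)
reactor.run()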
Example #2
    def __init__(self, config):
        self.db = MongoDB(config)
        self.scrapyd = 'http://%s:%s/' % (environ.get('HYPHE_CRAWLER_HOST', config['host']), int(environ.get('HYPHE_CRAWLER_PORT', config['scrapy_port'])))
        self.queue = None
        self.depiler = LoopingCall(self.depile)
        self.depiler.start(1, True)
Example #3
# Imports inferred from the code below; the hyphe_backend module paths and the
# "logger" alias are assumptions based on Hyphe's project layout, not verified.
from os import environ
from random import randint
from json import loads as loadjson
from urllib import urlencode
from twisted.internet.task import LoopingCall
from twisted.internet.defer import inlineCallbacks, DeferredList, returnValue as returnD
from twisted.internet.error import ConnectionRefusedError
from twisted.web.client import getPage
from twisted.python import log as logger
from hyphe_backend.lib.mongo import MongoDB
from hyphe_backend.lib.utils import now_ts, deferredSleep, format_error, is_error

class JobsQueue(object):

    def __init__(self, config):
        self.db = MongoDB(config)
        self.scrapyd = 'http://%s:%s/' % (environ.get('HYPHE_CRAWLER_HOST', config['host']), int(environ.get('HYPHE_CRAWLER_PORT', config['scrapy_port'])))
        self.queue = None
        self.depiler = LoopingCall(self.depile)
        self.depiler.start(1, True)

    @inlineCallbacks
    def init_queue(self):
        self.queue = {}
        corpora = yield self.db.list_corpus(fields=[])
        dl = [self.db.get_waiting_jobs(corpus["_id"]) for corpus in corpora]
        alljobs = yield DeferredList(dl, consumeErrors=True)
        for bl, res in alljobs:
            if not bl:
                print "ERROR collecting old crawljobs for a corpus", res
                continue
            corpus, jobs = res
            for job in jobs:
                self.queue[job["_id"]] = {
                  "corpus": corpus,
                  "timestamp": job["created_at"],
                  "crawl_arguments": job["crawl_arguments"]
                }

    def stop(self):
        if self.depiler.running:
            self.depiler.stop()

    # Let's scrape ScrapyD's internal jobs webpage since the API
    # does not provide global information on all spiders...
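    # The parsed result maps "pending" to the total number of pending jobs and
    # each corpus name to its number of currently running crawls,
    # e.g. {"pending": 0, "corpus_a": 2, "corpus_b": 1} (illustrative values).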
    @inlineCallbacks
    def get_scrapyd_status(self):
        url = "%sjobs" % self.scrapyd
        jobs = yield getPage(url)
        status = {"pending": 0}
        read = None
        for line in jobs.split("><tr"):
            if ">Pending<" in line:
                read = "pending"
            elif ">Running<" in line:
                read = "running"
            elif ">Finished<" in line:
                read = None
            elif read == "running":
                corpus = line[line.find(".") + 1 : line.find("<", 2)]
                if corpus not in status:
                    status[corpus] = 0
                status[corpus] += 1
            elif read:
                status[read] += 1
        returnD(status)

    @inlineCallbacks
    def send_scrapy_query(self, action, arguments=None):
        url = "%s%s.json" % (self.scrapyd, action)
        method = "POST"
        headers = None
        if action.startswith('list'):
            method = "GET"
            if arguments:
                args = [str(k)+'='+str(v) for (k, v) in arguments.iteritems()]
                url += '?' + '&'.join(args)
                arguments = None
        elif arguments:
            arguments = urlencode(arguments)
            headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        try:
            res = yield getPage(url, method=method, postdata=arguments, \
              headers=headers, timeout=30)
            result = loadjson(res)
            returnD(result)
        except ConnectionRefusedError:
            returnD(format_error("Could not contact scrapyd server, " + \
              "maybe it's not started..."))
        except Exception as e:
            returnD(format_error(e))

    @inlineCallbacks
    def add_job(self, args, corpus, webentity_id):
        ts = now_ts()
        job_id = yield self.db.add_job(corpus, webentity_id, args, ts)
        self.queue[job_id] = {
          "corpus": corpus,
          "timestamp": ts,
          "crawl_arguments": args
        }
        yield self.db.add_log(corpus, job_id, "CRAWL_ADDED", ts)
        returnD(job_id)

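    # Called by the LoopingCall: whenever ScrapyD has no pending job, send it
    # at most one waiting job per iteration, favoring corpora with the fewest
    # crawls already running, then the oldest jobs.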
    @inlineCallbacks
    def depile(self):
        if self.queue is None:
            yield self.init_queue()
        if not len(self.queue):
            returnD(None)

        status = yield self.get_scrapyd_status()
        if status["pending"] > 0:
            returnD(None)
        # Add a random wait so that possible concurrent Hyphe instances
        # can compete for ScrapyD's empty slots
        yield deferredSleep(1./randint(4,20))

        # Order jobs by corpus with the fewest currently running crawls, then by age
        ordered = sorted(self.queue.items(), key=lambda x: \
          float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
        job_id, job = ordered[0]
        res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
        ts = now_ts()
        if is_error(res):
            logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
            self.queue[job_id]['timestamp'] = ts    # let it retry a bit later
        else:
            yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
            yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
            del(self.queue[job_id])

    def cancel_corpus_jobs(self, corpus):
        for _id, job in self.queue.items():
            if job["corpus"] == corpus:
                del(self.queue[_id])

    def count_waiting_jobs(self, corpus):
        return len([0 for j in self.queue.values() if j["corpus"] == corpus])
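For context, a minimal sketch of how such a queue could be plugged into a Twisted process; the config values below are guesses derived from the constructor above (6800 being ScrapyD's default port), not a documented Hyphe configuration:

from twisted.internet import reactor

config = {
    "host": "localhost",    # ScrapyD host, overridable via HYPHE_CRAWLER_HOST
    "scrapy_port": 6800,    # ScrapyD port, overridable via HYPHE_CRAWLER_PORT
    # ... plus whatever MongoDB(config) expects (mongo host, port, db_name, ...)
}

queue = JobsQueue(config)  # the LoopingCall starts depiling right away
reactor.addSystemEventTrigger("before", "shutdown", queue.stop)
reactor.run()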
Example #4
    def __init__(self, config):
        self.db = MongoDB(config)
        self.scrapyd = 'http://%s:%s/' % (config['host'], config['scrapy_port'])
        self.queue = None
        self.depiler = LoopingCall(self.depile)
        self.depiler.start(1, True)