def new_WE(self, weid, prefixes, name=None, status="DISCOVERED", startpages=[], tags={}):
    timestamp = now_ts()
    if not name:
        # Derive the name from the first prefix that name_lru accepts,
        # falling back on the raw first prefix otherwise
        for p in prefixes:
            try:
                name = name_lru(p)
                break
            except ValueError:
                pass
        else:
            name = prefixes[0]
    return {
        "_id": weid,
        "prefixes": prefixes,
        "name": name,
        "status": status,
        "tags": tags,
        "homepage": None,
        "startpages": startpages,
        "crawled": False,
        "creationDate": timestamp,
        "lastModificationDate": timestamp
    }
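# Hedged usage sketch (not part of the original source): new_WE only builds
# the web-entity document; persisting it would go through something like
# upsert_WE further below. The handler instance name and prefix values are
# made up for illustration.
we = mongo_handler.new_WE(
    "we-42",
    ["s:http|h:com|h:example|", "s:https|h:com|h:example|"],
    status="IN"
)
# we["name"] comes from the first prefix name_lru() accepts, and both
# creationDate and lastModificationDate are set to the same now_ts() value.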
def add_corpus(self, corpus, name, password, options, tlds=None):
    now = now_ts()
    yield self.db["corpus"].insert({
        "_id": corpus,
        "name": name,
        "password": salt(password),
        "options": options,
        "total_webentities": 0,
        "webentities_in": 0,
        "webentities_in_untagged": 0,
        "webentities_in_uncrawled": 0,
        "webentities_out": 0,
        "webentities_undecided": 0,
        "webentities_discovered": 0,
        "total_crawls": 0,
        "total_pages": 0,
        "total_pages_crawled": 0,
        "created_at": now,
        "last_activity": now,
        "recent_changes": False,
        "last_index_loop": now,
        "links_duration": 1,
        "last_links_loop": 0,
        "tlds": tlds
    }, safe=True)
    yield self.init_corpus_indexes(corpus)
def add_corpus(self, corpus, name, password, options, tlds=None):
    now = now_ts()
    yield self.db()["corpus"].insert_one({
        "_id": corpus,
        "name": name,
        "password": salt(password),
        "options": options,
        "total_webentities": 0,
        "webentities_in": 0,
        "webentities_in_untagged": 0,
        "webentities_in_uncrawled": 0,
        "webentities_out": 0,
        "webentities_undecided": 0,
        "webentities_discovered": 0,
        "total_crawls": 0,
        "crawls_pending": 0,
        "crawls_running": 0,
        "total_pages": 0,
        "total_pages_crawled": 0,
        "total_pages_queued": 0,
        "total_links_found": 0,
        "recent_changes": False,
        "last_index_loop": now,
        "links_duration": 1,
        "last_links_loop": 0,
        "tags": Binary(msgpack.packb({})),
        "webentities_links": Binary(msgpack.packb({})),
        "created_at": now,
        "last_activity": now,
        "tlds": tlds
    })
    yield self.init_corpus_indexes(corpus)
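# Hedged sketch (not part of the original source): the "tags" and
# "webentities_links" fields above are stored as msgpack-packed BSON Binary
# blobs, so reading them back requires unpacking. The helper name and its
# arguments here are hypothetical.
import msgpack

def read_corpus_tags(corpus_collection, corpus_id):
    # Fetch the corpus document and decode the packed "tags" field
    doc = corpus_collection.find_one({"_id": corpus_id})
    if not doc:
        return {}
    return msgpack.unpackb(bytes(doc["tags"]), raw=False)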
def depile(self):
    if self.queue is None:
        yield self.init_queue()
    if not len(self.queue):
        returnD(None)
    status = yield self.get_scrapyd_status()
    if status["pending"] > 0:
        returnD(None)
    # Add some random wait to allow a possible concurrent Hyphe instance
    # to compete for ScrapyD's empty slots
    yield deferredSleep(1./randint(4, 20))
    # Order jobs by corpus with the fewest currently running crawls, then by age
    ordered = sorted(self.queue.items(), key=lambda x:
        float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
    job_id, job = ordered[0]
    res = yield self.send_scrapy_query('schedule', job["crawl_arguments"])
    ts = now_ts()
    if is_error(res):
        logger.msg("WARNING: error sending job %s to ScrapyD: %s" % (job, res))
        self.queue[job_id]['timestamp'] = ts   # let it retry a bit later
    else:
        yield self.db.update_job(job["corpus"], job_id, res['jobid'], ts)
        yield self.db.add_log(job["corpus"], job_id, "CRAWL_SCHEDULED", ts)
        del(self.queue[job_id])
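# Hedged illustration (not from the original source): the sort key above packs
# "<running crawls for this corpus>.<job timestamp>" into a float, so jobs
# from corpora with fewer running crawls come first and, within a corpus,
# older jobs (smaller timestamps) win. All values below are made up.
status = {"pending": 0, "corpus_a": 2, "corpus_b": 0}
queue = {
    "job1": {"corpus": "corpus_a", "timestamp": 1500000000},
    "job2": {"corpus": "corpus_b", "timestamp": 1600000000},
    "job3": {"corpus": "corpus_b", "timestamp": 1400000000},
}
ordered = sorted(queue.items(), key=lambda x:
    float("%s.%s" % (status.get(x[1]["corpus"], 0), x[1]["timestamp"])))
# -> job3 first (corpus_b has 0 running crawls and the older timestamp),
#    then job2, then job1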
def update_job(self, corpus, job_id, crawl_id, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    yield self.jobs(corpus).update_one(
        {"_id": job_id},
        {"$set": {
            "crawljob_id": crawl_id,
            "scheduled_at": timestamp
        }})
def save_stats(self, corpus, corpus_metas):
    yield self.stats(corpus).insert({
        "timestamp": now_ts(),
        "total": corpus_metas["total_webentities"],
        "in": corpus_metas['webentities_in'],
        "out": corpus_metas['webentities_out'],
        "discovered": corpus_metas['webentities_discovered'],
        "undecided": corpus_metas['webentities_undecided']
    }, safe=True)
def add_log(self, corpus, job, msg, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    if type(job) != list:
        job = [job]
    yield self.logs(corpus).insert_many([{
        '_job': _id,
        'timestamp': timestamp,
        'log': msg
    } for _id in job])
def add_job(self, args, corpus, webentity_id):
    ts = now_ts()
    job_id = yield self.db.add_job(corpus, webentity_id, args, ts)
    self.queue[job_id] = {
        "corpus": corpus,
        "timestamp": ts,
        "crawl_arguments": args
    }
    yield self.db.add_log(corpus, job_id, "CRAWL_ADDED", ts)
    returnD(job_id)
def save_stats(self, corpus, corpus_metas):
    new = {
        "total": corpus_metas["total_webentities"],
        "in": corpus_metas['webentities_in'],
        "in_untagged": corpus_metas['webentities_in_untagged'],
        "in_uncrawled": corpus_metas['webentities_in_uncrawled'],
        "out": corpus_metas['webentities_out'],
        "discovered": corpus_metas['webentities_discovered'],
        "undecided": corpus_metas['webentities_undecided']
    }
    old = yield self.get_last_stats(corpus)
    if old:
        del(old["timestamp"], old["_id"])
    # Only record a new stats point when something actually changed
    if not old or old != new:
        new["timestamp"] = now_ts()
        yield self.stats(corpus).insert_one(new)
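# Hedged sketch (not part of the original source): get_last_stats is not
# shown in this listing; a plausible implementation, assuming the same
# stats() collection accessor, would fetch the most recent stats point so
# that identical consecutive snapshots are skipped by save_stats above.
def get_last_stats(self, corpus):
    return self.stats(corpus).find_one({}, sort=[("timestamp", -1)])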
def add_job(self, corpus, job_id, webentity_id, args, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    yield self.jobs(corpus).insert({
        "_id": job_id,
        "webentity_id": webentity_id,
        "nb_crawled_pages": 0,
        "nb_pages": 0,
        "nb_links": 0,
        "crawl_arguments": args,
        "crawling_status": crawling_statuses.PENDING,
        "indexing_status": indexing_statuses.PENDING,
        "created_at": timestamp,
        "started_at": None,
        "crawled_at": None,
        "finished_at": None
    }, safe=True)
def add_corpus(self, corpus, name, password, options):
    now = now_ts()
    yield self.db["corpus"].insert({
        "_id": corpus,
        "name": name,
        "password": salt(password),
        "options": options,
        "total_webentities": 0,
        "webentities_in": 0,
        "webentities_out": 0,
        "webentities_undecided": 0,
        "webentities_discovered": 0,
        "total_crawls": 0,
        "total_pages": 0,
        "total_pages_crawled": 0,
        "created_at": now,
        "last_activity": now,
        "last_index_loop": now,
        "last_links_loop": now
    }, safe=True)
    yield self.init_corpus_indexes(corpus)
def add_job(self, corpus, webentity_id, args, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    _id = str(uuid())
    yield self.jobs(corpus).insert_one({
        "_id": _id,
        "crawljob_id": None,
        "webentity_id": webentity_id,
        "nb_crawled_pages": 0,
        "nb_unindexed_pages": 0,
        "nb_pages": 0,
        "nb_links": 0,
        "crawl_arguments": args,
        "crawling_status": crawling_statuses.PENDING,
        "indexing_status": indexing_statuses.PENDING,
        "created_at": timestamp,
        "scheduled_at": None,
        "started_at": None,
        "crawled_at": None,
        "finished_at": None
    })
    returnD(_id)
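# Informal walk-through (comments only, not from the original source): the
# job documents created above tie the pieces of this section together.
#   1. This add_job() writes a PENDING job with a fresh uuid and
#      crawljob_id=None, then returns the new _id.
#   2. The crawler-side add_job() stores that _id in self.queue with the
#      corpus, timestamp and crawl arguments, and logs CRAWL_ADDED.
#   3. depile() later picks the queued job and submits it to ScrapyD;
#      update_job() then fills in crawljob_id and scheduled_at, a
#      CRAWL_SCHEDULED log entry is added, and the job leaves the queue.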
def upsert_WE(self, corpus, weid, metas, update_timestamp=True):
    if update_timestamp:
        metas["lastModificationDate"] = now_ts()
    yield self.WEs(corpus).update_one({"_id": weid}, {"$set": metas}, upsert=True)
def add_log(self, corpus, job, msg, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    if type(job) != list:
        job = [job]
    yield self.logs(corpus).insert([{
        '_job': _id,
        'timestamp': timestamp,
        'log': msg
    } for _id in job], multi=True, safe=True)
def update_job(self, corpus, job_id, crawl_id, timestamp=None):
    if not timestamp:
        timestamp = now_ts()
    yield self.jobs(corpus).update(
        {"_id": job_id},
        {"$set": {
            "crawljob_id": crawl_id,
            "scheduled_at": timestamp
        }}, safe=True)
def upsert_WE(self, corpus, weid, metas, updateTimestamp=True):
    if updateTimestamp:
        metas["lastModificationDate"] = now_ts()
    yield self.WEs(corpus).update_one({"_id": weid}, {"$set": metas}, upsert=True)