def __init__(self, host, port, db, queue_col, page_col, jobid):
    """Bind the Mongo-backed page and queue stores for one crawl job.

    Args:
        host, port: location of the MongoDB server.
        db: database name.
        queue_col: name of the crawl-queue collection.
        page_col: name of the crawled-pages collection.
        jobid: identifier of the crawl job this store serves.
    """
    store = MongoConnection(host, port)[db]
    self.jobid = jobid
    self.pageStore = store[page_col]
    self.queueStore = store[queue_col]
    # ensure_index was deprecated and removed in pymongo 3.x;
    # create_index is idempotent and provides the same guarantee.
    self.queueStore.create_index(mongosort(ASCENDING('_job')), background=True)
def init_corpus_indexes(self, corpus, retry=True):
    """Create every Mongo index the corpus' collections rely on.

    NOTE(review): presumably runs under Twisted @inlineCallbacks — each
    yielded create_index/drop_indexes call is awaited in order; confirm
    against the decorator on the original definition.

    If index creation fails (typically because of incompatible indexes
    left behind by an older pymongo), all indexes are dropped once and
    the whole build is retried; a second failure is re-raised.
    """
    try:
        index_specs = [
            (self.db()['corpus'], sortdesc('last_activity')),
            (self.WEs(corpus), sortasc('name')),
            (self.WEs(corpus), sortasc('status')),
            (self.WEs(corpus), sortasc('crawled')),
            (self.WEs(corpus), mongosort(textIndex("$**"))),
            (self.WECRs(corpus), sortasc('prefix')),
            (self.pages(corpus), sortasc('timestamp')),
            (self.pages(corpus), sortasc('_job')),
            (self.pages(corpus), sortasc('_job') + sortasc('forgotten')),
            (self.pages(corpus), sortasc('url')),
            (self.queue(corpus), sortasc('timestamp')),
            (self.queue(corpus), sortasc('_job') + sortdesc('timestamp')),
            (self.logs(corpus), sortasc('timestamp')),
            (self.jobs(corpus), sortasc('crawling_status')),
            (self.jobs(corpus), sortasc('indexing_status')),
            (self.jobs(corpus), sortasc('webentity_id')),
            (self.jobs(corpus), sortasc('webentity_id') + sortasc('created_at')),
            (self.jobs(corpus), sortasc('webentity_id') + sortdesc('created_at')),
            (self.jobs(corpus), sortasc('webentity_id') + sortasc("crawling_status") + sortasc("indexing_status") + sortasc('created_at')),
            (self.jobs(corpus), sortasc('crawling_status') + sortasc('indexing_status') + sortasc('created_at')),
            (self.stats(corpus), sortasc('timestamp')),
        ]
        for collection, keys in index_specs:
            yield collection.create_index(keys, background=True)
    except OperationFailure as e:
        # catch and destroy old indices built with older pymongo versions
        if not retry:
            raise e
        yield self.db()['corpus'].drop_indexes()
        for coll in ["pages", "queue", "logs", "jobs", "stats"]:
            yield self._get_coll(corpus, coll).drop_indexes()
        yield self.init_corpus_indexes(corpus, retry=False)
def init_corpus_indexes(self, corpus, retry=True):
    """Build all Mongo indexes used by the corpus' collections.

    NOTE(review): presumably a Twisted @inlineCallbacks generator — each
    yielded create_index/drop_indexes deferred completes before the next
    one starts; confirm against the decorator on the original definition.

    On OperationFailure (usually indexes created by an older pymongo),
    every index is dropped and one rebuild is attempted; if that rebuild
    fails too, the error is re-raised.
    """
    try:
        to_create = [
            (self.db()['corpus'], sortdesc('last_activity')),
            (self.WEs(corpus), sortasc('name')),
            (self.WEs(corpus), sortasc('status')),
            (self.WEs(corpus), sortasc('crawled')),
            (self.WEs(corpus), mongosort(textIndex("$**"))),
            (self.WECRs(corpus), sortasc('prefix')),
            (self.pages(corpus), sortasc('timestamp')),
            (self.pages(corpus), sortasc('_job')),
            (self.pages(corpus), sortasc('_job') + sortasc('forgotten')),
            (self.pages(corpus), sortasc('url')),
            (self.queue(corpus), sortasc('timestamp')),
            (self.queue(corpus), sortasc('_job')),
            (self.queue(corpus), sortasc('_job') + sortdesc('timestamp')),
            (self.logs(corpus), sortasc('timestamp')),
            (self.jobs(corpus), sortasc('created_at')),
            (self.jobs(corpus), sortasc('webentity_id')),
            (self.jobs(corpus), sortasc('webentity_id') + sortasc('created_at')),
            (self.jobs(corpus), sortasc('webentity_id') + sortdesc('created_at')),
            (self.jobs(corpus), sortasc('webentity_id') + sortasc("crawling_status") + sortasc("indexing_status") + sortasc('created_at')),
            (self.jobs(corpus), sortasc('previous_webentity_id')),
            (self.jobs(corpus), sortasc('crawljob_id')),
            (self.jobs(corpus), sortasc('crawljob_id') + sortasc('crawling_status')),
            (self.jobs(corpus), sortasc('crawljob_id') + sortasc('indexing_status')),
            (self.jobs(corpus), sortasc('crawljob_id') + sortasc('crawling_status') + sortasc('indexing_status')),
            (self.jobs(corpus), sortasc('crawling_status')),
            (self.jobs(corpus), sortasc('indexing_status')),
            (self.jobs(corpus), sortasc('crawling_status') + sortasc('indexing_status')),
            (self.jobs(corpus), sortasc('crawling_status') + sortasc('indexing_status') + sortasc('created_at')),
            (self.stats(corpus), sortasc('timestamp')),
            (self.stats(corpus), sortdesc('timestamp')),
        ]
        for collection, keys in to_create:
            yield collection.create_index(keys, background=True)
    except OperationFailure as e:
        # catch and destroy old indices built with older pymongo versions
        if not retry:
            raise e
        yield self.db()['corpus'].drop_indexes()
        for coll in ["pages", "queue", "logs", "jobs", "stats"]:
            yield self._get_coll(corpus, coll).drop_indexes()
        yield self.init_corpus_indexes(corpus, retry=False)
def sortdesc(field):
    """Return a Mongo sort/index specification ordering `field` descending."""
    spec = DESCENDING(field)
    return mongosort(spec)
def sortasc(field):
    """Return a Mongo sort/index specification ordering `field` ascending."""
    spec = ASCENDING(field)
    return mongosort(spec)
def __init__(self, host, port, db, queue_col, page_col, jobid):
    """Open the Mongo database and bind the page and queue collections.

    Args:
        host, port: location of the MongoDB server.
        db: database name.
        queue_col: name of the crawl-queue collection.
        page_col: name of the crawled-pages collection.
        jobid: identifier of the crawl job this store serves.
    """
    self.jobid = jobid
    database = MongoConnection(host, port)[db]
    self.pageStore = database[page_col]
    self.queueStore = database[queue_col]
    # Keep per-job lookups on the queue indexed.
    self.queueStore.create_index(mongosort(ASCENDING("_job")))