Example #1
0
 def __init__(self, host, port, db, queue_col, page_col, jobid):
     """Bind the crawl job's page and queue MongoDB collections.

     :param host: MongoDB host.
     :param port: MongoDB port.
     :param db: database name.
     :param queue_col: name of the crawl-queue collection.
     :param page_col: name of the crawled-pages collection.
     :param jobid: identifier of this crawl job, kept on the instance.
     """
     store = MongoConnection(host, port)[db]
     self.jobid = jobid
     self.pageStore = store[page_col]
     self.queueStore = store[queue_col]
     # ensure_index() was deprecated and removed in pymongo 3.x;
     # create_index() is idempotent and is what the rest of this
     # codebase already uses (see the sibling __init__ variants).
     self.queueStore.create_index(mongosort(ASCENDING('_job')),
                                  background=True)
Example #2
0
 def init_corpus_indexes(self, corpus, retry=True):
     """Create every MongoDB index the corpus' collections rely on.

     All indexes are built with background=True so the collections stay
     usable while indexing runs.

     :param corpus: corpus identifier handed to the per-corpus
         collection accessors (WEs, WECRs, pages, queue, logs, jobs,
         stats).
     :param retry: when True, an OperationFailure drops all existing
         indexes (presumed stale, built by an older pymongo) and retries
         once with retry=False.
     :raises OperationFailure: when index creation still fails after the
         single drop-and-retry pass (or immediately when retry=False).

     NOTE(review): the bare yields suggest this runs under a coroutine
     scheduler (e.g. Twisted inlineCallbacks) -- the decorator is
     outside this view, confirm.
     """
     try:
         # global corpus registry, browsed by recency
         yield self.db()['corpus'].create_index(sortdesc('last_activity'), background=True)
         # web entities: simple field lookups + full-text search over all fields
         yield self.WEs(corpus).create_index(sortasc('name'), background=True)
         yield self.WEs(corpus).create_index(sortasc('status'), background=True)
         yield self.WEs(corpus).create_index(sortasc('crawled'), background=True)
         yield self.WEs(corpus).create_index(mongosort(textIndex("$**")), background=True)
         # web-entity creation rules, keyed by URL prefix
         yield self.WECRs(corpus).create_index(sortasc('prefix'), background=True)
         # pages: per-job and per-url access paths
         yield self.pages(corpus).create_index(sortasc('timestamp'), background=True)
         yield self.pages(corpus).create_index(sortasc('_job'), background=True)
         yield self.pages(corpus).create_index(sortasc('_job') + sortasc('forgotten'), background=True)
         yield self.pages(corpus).create_index(sortasc('url'), background=True)
         # crawl queue: chronological scans and per-job recency
         yield self.queue(corpus).create_index(sortasc('timestamp'), background=True)
         yield self.queue(corpus).create_index(sortasc('_job') + sortdesc('timestamp'), background=True)
         yield self.logs(corpus).create_index(sortasc('timestamp'), background=True)
         # jobs: status filters and per-webentity history (asc & desc)
         yield self.jobs(corpus).create_index(sortasc('crawling_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('indexing_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortasc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortdesc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortasc("crawling_status") + sortasc("indexing_status") + sortasc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawling_status') + sortasc('indexing_status') + sortasc('created_at'), background=True)
         yield self.stats(corpus).create_index(sortasc('timestamp'), background=True)
     except OperationFailure as e:
         # catch and destroy old indices built with older pymongo versions
         if retry:
             yield self.db()['corpus'].drop_indexes()
             for coll in ["pages", "queue", "logs", "jobs", "stats"]:
                 yield self._get_coll(corpus, coll).drop_indexes()
             yield self.init_corpus_indexes(corpus, retry=False)
         else:
             raise e
Example #3
0
 def init_corpus_indexes(self, corpus, retry=True):
     """Build the full index set for one corpus' MongoDB collections.

     Indexes are created with background=True so reads/writes are not
     blocked while they build.

     :param corpus: corpus identifier passed to the per-corpus
         collection accessors (WEs, WECRs, pages, queue, logs, jobs,
         stats).
     :param retry: if True and any create_index raises OperationFailure,
         drop every index (assumed leftovers from an older pymongo) and
         retry the whole method once with retry=False.
     :raises OperationFailure: re-raised when the retry pass also fails
         (or immediately when called with retry=False).

     NOTE(review): yields imply a deferred/coroutine driver (likely
     Twisted inlineCallbacks); decorator not visible here -- confirm.
     """
     try:
         # corpus registry ordered by last activity
         yield self.db()['corpus'].create_index(sortdesc('last_activity'), background=True)
         # web entities: field lookups + wildcard full-text index
         yield self.WEs(corpus).create_index(sortasc('name'), background=True)
         yield self.WEs(corpus).create_index(sortasc('status'), background=True)
         yield self.WEs(corpus).create_index(sortasc('crawled'), background=True)
         yield self.WEs(corpus).create_index(mongosort(textIndex("$**")), background=True)
         # creation rules keyed by URL prefix
         yield self.WECRs(corpus).create_index(sortasc('prefix'), background=True)
         # pages: time, per-job (with forgotten flag) and url access paths
         yield self.pages(corpus).create_index(sortasc('timestamp'), background=True)
         yield self.pages(corpus).create_index(sortasc('_job'), background=True)
         yield self.pages(corpus).create_index(sortasc('_job') + sortasc('forgotten'), background=True)
         yield self.pages(corpus).create_index(sortasc('url'), background=True)
         # crawl queue: chronological and per-job scans
         yield self.queue(corpus).create_index(sortasc('timestamp'), background=True)
         yield self.queue(corpus).create_index(sortasc('_job'), background=True)
         yield self.queue(corpus).create_index(sortasc('_job') + sortdesc('timestamp'), background=True)
         yield self.logs(corpus).create_index(sortasc('timestamp'), background=True)
         # jobs: creation time, per-webentity history, per-crawljob and
         # per-status filters in every combination the app queries
         yield self.jobs(corpus).create_index(sortasc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortasc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortdesc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('webentity_id') + sortasc("crawling_status") + sortasc("indexing_status") + sortasc('created_at'), background=True)
         yield self.jobs(corpus).create_index(sortasc('previous_webentity_id'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawljob_id'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawljob_id') + sortasc('crawling_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawljob_id') + sortasc('indexing_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawljob_id') + sortasc('crawling_status') + sortasc('indexing_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawling_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('indexing_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawling_status') + sortasc('indexing_status'), background=True)
         yield self.jobs(corpus).create_index(sortasc('crawling_status') + sortasc('indexing_status') + sortasc('created_at'), background=True)
         # stats browsed both chronologically and most-recent-first
         yield self.stats(corpus).create_index(sortasc('timestamp'), background=True)
         yield self.stats(corpus).create_index(sortdesc('timestamp'), background=True)
     except OperationFailure as e:
         # catch and destroy old indices built with older pymongo versions
         if retry:
             yield self.db()['corpus'].drop_indexes()
             for coll in ["pages", "queue", "logs", "jobs", "stats"]:
                 yield self._get_coll(corpus, coll).drop_indexes()
             yield self.init_corpus_indexes(corpus, retry=False)
         else:
             raise e
Example #4
0
def sortdesc(field):
    """Build a descending mongo sort specification for *field*."""
    direction = DESCENDING(field)
    return mongosort(direction)
Example #5
0
def sortasc(field):
    """Build an ascending mongo sort specification for *field*."""
    direction = ASCENDING(field)
    return mongosort(direction)
Example #6
0
def sortdesc(field):
    """Shortcut: wrap a DESCENDING sort on ``field`` with mongosort."""
    spec = DESCENDING(field)
    wrapped = mongosort(spec)
    return wrapped
Example #7
0
def sortasc(field):
    """Shortcut: wrap an ASCENDING sort on ``field`` with mongosort."""
    spec = ASCENDING(field)
    wrapped = mongosort(spec)
    return wrapped
Example #8
0
 def __init__(self, host, port, db, queue_col, page_col, jobid):
     """Bind the crawl job's page and queue MongoDB collections.

     :param host: MongoDB host.
     :param port: MongoDB port.
     :param db: database name.
     :param queue_col: name of the crawl-queue collection.
     :param page_col: name of the crawled-pages collection.
     :param jobid: identifier of this crawl job, kept on the instance.
     """
     store = MongoConnection(host, port)[db]
     self.jobid = jobid
     self.pageStore = store[page_col]
     self.queueStore = store[queue_col]
     # ensure_index() was deprecated and removed in pymongo 3.x;
     # create_index() is idempotent and matches the variant of this
     # constructor elsewhere in the codebase.
     self.queueStore.create_index(mongosort(ASCENDING('_job')), background=True)
Example #9
0
 def __init__(self, host, port, db, queue_col, page_col, jobid):
     """Attach the pages and queue collections and index the queue on _job.

     :param host: MongoDB host.
     :param port: MongoDB port.
     :param db: database name.
     :param queue_col: name of the crawl-queue collection.
     :param page_col: name of the crawled-pages collection.
     :param jobid: identifier of this crawl job, kept on the instance.
     """
     self.jobid = jobid
     database = MongoConnection(host, port)[db]
     self.pageStore = database[page_col]
     self.queueStore = database[queue_col]
     self.queueStore.create_index(mongosort(ASCENDING("_job")))