def addPendingCrawlDocDict(url, level, parent_id, res_dict={}, text='', real_url='', docid=0):
    """Build the field dict describing a newly discovered pending-crawl document.

    Nothing is persisted here; the caller is responsible for saving the
    returned values.

    Args:
        url: The URL as requested/discovered; stored under 'request_url'.
        level: Level of the parent document; the stored 'level' is level + 1.
        parent_id: Stored under 'parent_docid'.
        res_dict: Reservation data; stored as its str() representation.
        text: Stored under 'outlink_text'.
        real_url: If non-empty, stored as 'url' verbatim; otherwise 'url' is
            urlutils.normalize(url).
        docid: If non-zero, stored as 'docid' verbatim; otherwise 'docid' is
            mmh3.hash() of the resolved 'url'.

    Returns:
        dict with keys request_url, url, docid, outlink_text, level,
        reservation_dict, detect_time, crawl_status, parent_docid and
        recrawl_times.
    """
    # NOTE: the mutable default for res_dict is kept for interface
    # compatibility; it is only read (stringified) here and never mutated.

    # Fill url and docid now so they are ready when the record is saved.
    resolved_url = urlutils.normalize(url) if real_url == '' else real_url
    resolved_docid = mmh3.hash(resolved_url) if docid == 0 else docid

    # The original carried three statements after the return that created and
    # saved a models.CrawlPending record; they could never execute and have
    # been removed.
    return {
        'request_url': url,
        'url': resolved_url,
        'docid': resolved_docid,
        'outlink_text': text,
        'level': level + 1,
        'reservation_dict': str(res_dict),
        # Timestamp from timeutils.utcnow_ts(), truncated to whole units.
        'detect_time': int(timeutils.utcnow_ts()),
        'crawl_status': 'fresh',
        'parent_docid': parent_id,
        'recrawl_times': 0,
    }
def _addRecrawlTime(pend_id, recrawl_time, crawlfail=False):
    """Save an incremented recrawl counter for the pending record `pend_id`.

    Args:
        pend_id: Value written to the record's 'id' field.
        recrawl_time: Current recrawl count; recrawl_time + 1 is stored.
        crawlfail: When truthy, models.CrawlFailPending is used instead of
            models.CrawlPending.
    """
    fields = {
        'id': pend_id,
        'recrawl_times': recrawl_time + 1,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # presumably save(update=True) updates the existing row matching 'id'
    # rather than inserting -- confirm against the models module.
    record.save(update=True)
def _updateScheduleTime(pend_id, crawlfail=False):
    """Stamp the pending record `pend_id` with the current schedule time.

    Args:
        pend_id: Value written to the record's 'id' field.
        crawlfail: When truthy, models.CrawlFailPending is used instead of
            models.CrawlPending.
    """
    # Take the timestamp first, before any model object is created.
    fields = {
        'id': pend_id,
        'schedule_time': int(timeutils.utcnow_ts()),
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # presumably save(update=True) updates the existing row matching 'id'
    # rather than inserting -- confirm against the models module.
    record.save(update=True)
def _updateCrawlStatus(pend_id, crawl_status, crawlfail=False):
    """Save a new crawl status for the pending record `pend_id`.

    Args:
        pend_id: Value written to the record's 'id' field.
        crawl_status: Value written to the record's 'crawl_status' field.
        crawlfail: When truthy, models.CrawlFailPending is used instead of
            models.CrawlPending.
    """
    fields = {
        'id': pend_id,
        'crawl_status': crawl_status,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # presumably save(update=True) updates the existing row matching 'id'
    # rather than inserting -- confirm against the models module.
    record.save(update=True)
def _updateScheduleDoc(pend_id, recrawl_time, crawl_status, crawlfail=False):
    """Save status, schedule time and recrawl count for `pend_id` in one write.

    Args:
        pend_id: Value written to the record's 'id' field.
        recrawl_time: Current recrawl count; recrawl_time + 1 is stored.
        crawl_status: Value written to the record's 'crawl_status' field.
        crawlfail: When truthy, models.CrawlFailPending is used instead of
            models.CrawlPending.
    """
    # Dict entries are evaluated top to bottom: the timestamp is read before
    # the recrawl count is incremented, and before any model is created.
    fields = {
        'id': pend_id,
        'crawl_status': crawl_status,
        'schedule_time': int(timeutils.utcnow_ts()),
        'recrawl_times': recrawl_time + 1,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # presumably save(update=True) updates the existing row matching 'id'
    # rather than inserting -- confirm against the models module.
    record.save(update=True)
def addPendingCrawlDoc(url, level, parent_id, res_dict={}, text=''):
    """Create and save a models.CrawlPending record for a discovered URL.

    The field values come from addPendingCrawlDocDict(), called with the
    same url, level, parent_id, res_dict and text.

    Args:
        url: URL of the document to queue.
        level: Level of the parent document.
        parent_id: Identifier of the parent document.
        res_dict: Reservation data, forwarded unchanged.
        text: Outlink text, forwarded unchanged.
    """
    record = models.CrawlPending()
    record.update(addPendingCrawlDocDict(url, level, parent_id, res_dict, text))
    record.save()