def _addRecrawlTime(pend_id, recrawl_time, crawlfail=False):
    """Store ``recrawl_time + 1`` as ``recrawl_times`` on a pending record.

    Args:
        pend_id: Value written to the ``id`` field of the model.
        recrawl_time: Number that is incremented by one before being stored.
        crawlfail: When truthy a ``models.CrawlFailPending`` is used,
            otherwise a ``models.CrawlPending``.
    """
    # Field values are computed before the model layer is touched.
    fields = {'id': pend_id, 'recrawl_times': recrawl_time + 1}
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # NOTE(review): save(update=True) presumably updates the existing row
    # identified by ``id`` instead of inserting -- confirm in models.
    record.save(update=True)
def _updateScheduleTime(pend_id, crawlfail=False):
    """Set ``schedule_time`` on a pending record to the current timestamp.

    The timestamp is ``int(timeutils.utcnow_ts())``.

    Args:
        pend_id: Value written to the ``id`` field of the model.
        crawlfail: When truthy a ``models.CrawlFailPending`` is used,
            otherwise a ``models.CrawlPending``.
    """
    # The timestamp is taken before the model object is created.
    fields = {'id': pend_id, 'schedule_time': int(timeutils.utcnow_ts())}
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # NOTE(review): save(update=True) presumably updates the existing row
    # identified by ``id`` instead of inserting -- confirm in models.
    record.save(update=True)
def _updateCrawlStatus(pend_id, crawl_status, crawlfail=False):
    """Write a new ``crawl_status`` value onto a pending record.

    Args:
        pend_id: Value written to the ``id`` field of the model.
        crawl_status: Status value stored under ``crawl_status``.
        crawlfail: When truthy a ``models.CrawlFailPending`` is used,
            otherwise a ``models.CrawlPending``.
    """
    fields = {'id': pend_id, 'crawl_status': crawl_status}
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # NOTE(review): save(update=True) presumably updates the existing row
    # identified by ``id`` instead of inserting -- confirm in models.
    record.save(update=True)
def _updateScheduleDoc(pend_id, recrawl_time, crawl_status, crawlfail=False):
    """Write status, schedule time and recrawl counter in a single save.

    Stores ``crawl_status`` as given, ``schedule_time`` as
    ``int(timeutils.utcnow_ts())`` and ``recrawl_times`` as
    ``recrawl_time + 1``.

    Args:
        pend_id: Value written to the ``id`` field of the model.
        recrawl_time: Number that is incremented by one before being stored.
        crawl_status: Status value stored under ``crawl_status``.
        crawlfail: When truthy a ``models.CrawlFailPending`` is used,
            otherwise a ``models.CrawlPending``.
    """
    # All field values are computed before the model object is created.
    fields = {
        'id': pend_id,
        'crawl_status': crawl_status,
        'schedule_time': int(timeutils.utcnow_ts()),
        'recrawl_times': recrawl_time + 1,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # NOTE(review): save(update=True) presumably updates the existing row
    # identified by ``id`` instead of inserting -- confirm in models.
    record.save(update=True)
def saveFailCrawlDoc(crawldoc):
    """Record a failed crawl and queue its URL in the fail-pending model.

    Steps, in order:

    1. Save the crawl document as a ``models.CrawlFailResult``.
    2. Mark the originating ``CrawlPending`` entry
       (``crawldoc.pending_id``) as ``'crawled'``.
    3. Save a new ``models.CrawlFailPending`` with status ``'fresh'`` and
       ``recrawl_times`` of 0.

    NOTE(review): step 3 always builds a new model and calls plain
    ``save()``; no "update existing entry" path is visible here.

    Args:
        crawldoc: Crawl document exposing ``convert``, ``reservation_dict``,
            ``pending_id``, ``request_url``, ``level``, ``detect_time`` and
            ``parent_docid``.
    """
    # Step 1: persist the failed document.
    # NOTE(review): ``convert`` is read as an attribute, not called, and the
    # mapping it yields is modified in place -- confirm it is a property
    # returning a dict that is safe to mutate.
    fail_record = crawldoc.convert
    fail_record['reservation_dict'] = str(crawldoc.reservation_dict)
    fail_record['created_at'] = timeutils.utcnow_ts()
    utils.convert_datetimes(
        fail_record, 'created_at', 'deleted_at', 'updated_at')

    result_row = models.CrawlFailResult()
    result_row.update(fail_record)
    result_row.save()

    # Step 2: flag the entry in the regular pending model as crawled.
    _updateCrawlStatus(crawldoc.pending_id, 'crawled', crawlfail=False)

    # Step 3: enqueue the URL in the fail-pending model with a reset counter.
    pending_fields = {
        'request_url': crawldoc.request_url,
        'reservation_dict': str(crawldoc.reservation_dict),
        'level': crawldoc.level,
        'detect_time': crawldoc.detect_time,
        'crawl_status': 'fresh',
        'parent_docid': crawldoc.parent_docid,
        'recrawl_times': 0,
    }
    pending_row = models.CrawlFailPending()
    pending_row.update(pending_fields)
    pending_row.save()