Example #1
0
def addPendingCrawlDocDict(url,
                           level,
                           parent_id,
                           res_dict={},
                           text='',
                           real_url='',
                           docid=0):
    """Build the field dict describing a freshly detected pending-crawl doc.

    This function only assembles and returns the dict; it does not create
    or save any model object. (Statements that used to follow the
    ``return`` were unreachable and have been removed.)

    Args:
        url: The URL as requested; stored under ``'request_url'``.
        level: Level of the referring document; stored as ``level + 1``.
        parent_id: Stored under ``'parent_docid'``.
        res_dict: Stored as ``str(res_dict)`` under ``'reservation_dict'``.
            The shared default dict is only read here, never mutated.
        text: Stored under ``'outlink_text'``.
        real_url: Stored under ``'url'``. When it equals ``''`` the value
            ``urlutils.normalize(url)`` is used instead.
        docid: Stored under ``'docid'``. When it equals ``0`` the value
            ``mmh3.hash(<stored url>)`` is used instead.

    Returns:
        A new dict with the keys ``request_url``, ``url``, ``docid``,
        ``outlink_text``, ``level``, ``reservation_dict``, ``detect_time``,
        ``crawl_status`` (always ``'fresh'``), ``parent_docid`` and
        ``recrawl_times`` (always ``0``).
    """
    # Fall back to derived values when the caller did not supply them.
    stored_url = urlutils.normalize(url) if real_url == '' else real_url
    stored_docid = mmh3.hash(stored_url) if docid == 0 else docid

    return {
        'request_url': url,
        'url': stored_url,
        'docid': stored_docid,
        'outlink_text': text,
        'level': level + 1,
        'reservation_dict': str(res_dict),
        # Timestamp taken when the dict is built.
        'detect_time': int(timeutils.utcnow_ts()),
        'crawl_status': 'fresh',
        'parent_docid': parent_id,
        'recrawl_times': 0,
    }
Example #2
0
def _addRecrawlTime(pend_id, recrawl_time, crawlfail=False):
    """Write an incremented recrawl counter for the record ``pend_id``.

    Args:
        pend_id: Value placed under the ``'id'`` key.
        recrawl_time: Current counter; ``recrawl_time + 1`` is written
            under ``'recrawl_times'``.
        crawlfail: When truthy a ``models.CrawlFailPending`` object is
            used, otherwise a ``models.CrawlPending`` object.
    """
    fields = {
        'id': pend_id,
        'recrawl_times': recrawl_time + 1,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # update=True presumably makes save() modify the existing row -- confirm
    # against the models implementation.
    record.save(update=True)
Example #3
0
def _updateScheduleTime(pend_id, crawlfail=False):
    """Write the current timestamp as ``schedule_time`` for ``pend_id``.

    Args:
        pend_id: Value placed under the ``'id'`` key.
        crawlfail: When truthy a ``models.CrawlFailPending`` object is
            used, otherwise a ``models.CrawlPending`` object.
    """
    fields = {
        'id': pend_id,
        'schedule_time': int(timeutils.utcnow_ts()),
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # update=True presumably makes save() modify the existing row -- confirm
    # against the models implementation.
    record.save(update=True)
Example #4
0
def _updateCrawlStatus(pend_id, crawl_status, crawlfail=False):
    """Write a new ``crawl_status`` value for the record ``pend_id``.

    Args:
        pend_id: Value placed under the ``'id'`` key.
        crawl_status: Value placed under the ``'crawl_status'`` key.
        crawlfail: When truthy a ``models.CrawlFailPending`` object is
            used, otherwise a ``models.CrawlPending`` object.
    """
    fields = {
        'id': pend_id,
        'crawl_status': crawl_status,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # update=True presumably makes save() modify the existing row -- confirm
    # against the models implementation.
    record.save(update=True)
Example #5
0
def _updateScheduleDoc(pend_id, recrawl_time, crawl_status, crawlfail=False):
    """Write status, schedule time and recrawl counter in a single save.

    Args:
        pend_id: Value placed under the ``'id'`` key.
        recrawl_time: Current counter; ``recrawl_time + 1`` is written
            under ``'recrawl_times'``.
        crawl_status: Value placed under the ``'crawl_status'`` key.
        crawlfail: When truthy a ``models.CrawlFailPending`` object is
            used, otherwise a ``models.CrawlPending`` object.
    """
    fields = {
        'id': pend_id,
        'crawl_status': crawl_status,
        'schedule_time': int(timeutils.utcnow_ts()),
        'recrawl_times': recrawl_time + 1,
    }
    model_cls = models.CrawlFailPending if crawlfail else models.CrawlPending
    record = model_cls()
    record.update(fields)
    # update=True presumably makes save() modify the existing row -- confirm
    # against the models implementation.
    record.save(update=True)
Example #6
0
def addPendingCrawlDoc(url, level, parent_id, res_dict={}, text=''):
    """Build the pending-crawl field dict for ``url`` and save it.

    The fields come from ``addPendingCrawlDocDict`` (called with its
    default ``real_url`` and ``docid``); they are applied to a new
    ``models.CrawlPending`` object, which is then saved.

    Args:
        url: Forwarded to ``addPendingCrawlDocDict``.
        level: Forwarded to ``addPendingCrawlDocDict``.
        parent_id: Forwarded to ``addPendingCrawlDocDict``.
        res_dict: Forwarded to ``addPendingCrawlDocDict``.
        text: Forwarded to ``addPendingCrawlDocDict``.
    """
    fields = addPendingCrawlDocDict(url, level, parent_id, res_dict, text)
    record = models.CrawlPending()
    record.update(fields)
    record.save()