Example #1
0
 def update_progress(self, rule_id):
     """Append ``rule_id`` to this task's ``progress`` column if it is missing.

     ``progress`` is stored as a ``|``-separated string. The current value is
     read for ``self.task_id``; when ``rule_id`` is not one of its entries,
     ``|<rule_id>`` is appended and the row is updated.

     Any exception is reported through ``ERROR`` and not re-raised, so the
     method always returns ``None``.
     """
     try:
         # Values are passed as query parameters rather than formatted into
         # the SQL text, so quotes in ``progress`` cannot break the statement.
         row = db.get("SELECT `progress` FROM task WHERE id=%s", self.task_id)
         progress = row.progress
         # Compare in the same textual form that gets stored; otherwise a
         # non-string rule_id (e.g. an int) never matches and is re-appended
         # on every call.
         entry = '%s' % rule_id
         if entry not in progress.split('|'):
             progress += '|%s' % entry
             db.execute(
                 "UPDATE task SET `progress`=%s WHERE id=%s",
                 progress, self.task_id)
     except Exception:
         ERROR("update_progress exception")
Example #2
0
def get_target(task_id):
    """Load the task row with the given ID and copy its state into ``conf``.

    When ``conf.url`` is not already set, ``conf.url``, ``conf.base`` and
    ``conf.count`` are taken from the row. The resume-related fields
    (``finished_progress``, ``robots_parsed``, ``sitemap_parsed``,
    ``spider_finish``) are taken from the row only when ``conf.goon`` is
    truthy; otherwise they are reset to empty/False.

    Args:
        task_id: value matched against the ``ID`` column of ``TASK_TABLE``.
    """
    # A table name cannot be a bound parameter, so only it is interpolated;
    # "%%s" leaves a "%s" placeholder for task_id, which is passed to db.get
    # separately instead of being formatted into the SQL text.
    sql = 'SELECT * FROM %s WHERE `ID`=%%s' % TASK_TABLE
    task = db.get(sql, task_id)

    if not conf.url:
        conf.url = task.start_url
        conf.base = task.base
        conf.count = task.url_count

    resuming = bool(conf.goon)
    conf.finished_progress = task.progress.split('|') if resuming else []
    conf.robots_parsed = task.robots_parsed if resuming else False
    conf.sitemap_parsed = task.sitemap_parsed if resuming else False
    # NOTE(review): spider_flag == 3 presumably means "spider finished";
    # confirm against the task schema.
    conf.spider_finish = resuming and task.spider_flag == 3
Example #3
0
def get_target(task_id):
    """Populate the global ``conf`` from the task row identified by ``task_id``.

    ``conf.url``, ``conf.base`` and ``conf.count`` are filled from the row
    only if ``conf.url`` is falsy. If ``conf.goon`` is truthy the saved
    progress list and the robots/sitemap/spider flags are restored from the
    row; otherwise they are reset (``[]`` / ``False``).

    Args:
        task_id: value matched against the ``ID`` column of ``TASK_TABLE``.
    """
    # Only the table name is formatted into the statement; the doubled "%%s"
    # becomes a "%s" placeholder and task_id is handed to db.get as a query
    # parameter rather than spliced into the SQL string.
    query = 'SELECT * FROM %s WHERE `ID`=%%s' % TASK_TABLE
    task = db.get(query, task_id)

    if not conf.url:
        conf.url = task.start_url
        conf.base = task.base
        conf.count = task.url_count

    if conf.goon:
        conf.finished_progress = task.progress.split('|')
        conf.robots_parsed = task.robots_parsed
        conf.sitemap_parsed = task.sitemap_parsed
        # NOTE(review): 3 is presumably the "spider done" flag value; confirm.
        conf.spider_finish = task.spider_flag == 3
    else:
        conf.finished_progress = []
        conf.robots_parsed = False
        conf.sitemap_parsed = False
        conf.spider_finish = False
Example #4
0
 def update_progress(self, rule_id):
     """Record ``rule_id`` in the ``progress`` column of this task's row.

     The column holds ``|``-separated entries. If ``rule_id`` is not already
     one of them, ``|<rule_id>`` is appended to the stored string and written
     back for ``self.task_id``.

     Exceptions are reported through ``ERROR`` and swallowed; nothing is
     returned.
     """
     try:
         select_sql = "SELECT `progress` FROM task WHERE id=%s"
         update_sql = "UPDATE task SET `progress`=%s WHERE id=%s"

         # task_id and progress are supplied as query parameters instead of
         # being interpolated into the statement text.
         progress = db.get(select_sql, self.task_id).progress

         # Normalise to the stored text form before the membership test:
         # split() yields strings, so an int rule_id would otherwise never
         # be found and would be appended again on each call.
         rule = '%s' % rule_id
         if rule not in progress.split('|'):
             progress += '|%s' % rule
             db.execute(update_sql, progress, self.task_id)
     except Exception:
         ERROR("update_progress exception")
Example #5
0
def pipeline(request):
    """Insert a crawled request into ``URL_TABLE`` unless it is already there.

    The ``url``, ``method``, ``params`` and ``referer`` attributes of
    ``request`` are UTF-8 encoded. A row is treated as already present when
    one exists with the same ``conf.taskid``, url, method and params (the
    referer is not part of the comparison).

    Returns:
        The result of ``db.execute`` when a row is inserted; ``None`` when
        the row already exists or when any exception occurred (the exception
        is reported through ``ERROR`` and not re-raised).
    """
    try:
        url, method, params, referer = [
            getattr(request, field).encode('utf-8')
            for field in ('url', 'method', 'params', 'referer')
        ]

        # Only the table name is formatted in; the values are passed to
        # db.get / db.execute as separate arguments.
        count_sql = "SELECT COUNT(1) as `c` FROM %s" % (URL_TABLE)
        count_sql += " WHERE `task_id`=%s and `url`=%s and `method`=%s and `params`=%s"
        found = db.get(count_sql, conf.taskid, url, method, params)
        if found.c > 0:
            return None

        insert_sql = "INSERT INTO %s" % (URL_TABLE)
        insert_sql += "(`task_id`,`url`,`method`,`params`,`referer`,`start_time`) VALUES(%s,%s,%s,%s,%s,%s)"
        start_time = datetime.now()
        return db.execute(
            insert_sql, conf.taskid, url, method, params, referer, start_time)
    except Exception:
        ERROR("Crawler.pipeline Exception")
Example #6
0
def pipeline(request):
    """Persist ``request`` to ``URL_TABLE`` if no matching row exists yet.

    Reads ``url``, ``method``, ``params`` and ``referer`` from ``request``
    (each UTF-8 encoded), counts rows matching ``conf.taskid`` plus the
    first three of those values, and inserts a new row stamped with
    ``datetime.now()`` only when that count is not greater than zero.

    Returns:
        Whatever ``db.execute`` returns for the insert, or ``None`` if a
        matching row was found or an exception was raised (in which case
        ``ERROR`` is called and the exception is suppressed).
    """
    try:
        fields = ('url', 'method', 'params', 'referer')
        values = [getattr(request, name).encode('utf-8') for name in fields]

        lookup = (
            "SELECT COUNT(1) as `c` FROM %s" % (URL_TABLE)
            + " WHERE `task_id`=%s and `url`=%s and `method`=%s and `params`=%s"
        )
        # The referer (values[3]) is deliberately left out of the lookup.
        already_stored = db.get(lookup, conf.taskid, *values[:3]).c > 0
        if not already_stored:
            statement = (
                "INSERT INTO %s" % (URL_TABLE)
                + "(`task_id`,`url`,`method`,`params`,`referer`,`start_time`) VALUES(%s,%s,%s,%s,%s,%s)"
            )
            values.append(datetime.now())
            return db.execute(statement, conf.taskid, *values)
    except Exception:
        ERROR("Crawler.pipeline Exception")