def update_progress(self, rule_id):
    """Record ``rule_id`` in the ``progress`` column of this task's row.

    The column holds a ``|``-separated list. ``rule_id`` is appended (and
    the row updated) only when it is not already listed.

    :param rule_id: identifier to record; it is compared and stored in its
        ``'%s'``-formatted text form.
    :returns: None. Any exception is reported through ``ERROR`` and is not
        re-raised.
    """
    try:
        # Values are passed as bound parameters so the driver does the
        # quoting, instead of splicing them into the SQL text.
        row = db.get("SELECT `progress` FROM task WHERE id=%s", self.task_id)
        progress = row.progress

        # Compare the same text that gets appended; a non-string id (e.g. an
        # int) would otherwise never match the split strings and would be
        # appended again on every call.
        rule = '%s' % rule_id
        if rule in progress.split('|'):
            return

        progress += '|' + rule
        db.execute("UPDATE task SET `progress`=%s WHERE id=%s",
                   progress, self.task_id)
    except Exception:
        ERROR("update_progress exception")
def get_target(task_id):
    """Load the task row with the given id and copy its fields onto ``conf``.

    ``conf.url`` is taken from the row only when it is not already set.
    ``conf.base`` and ``conf.count`` are always taken from the row. The
    remaining fields use the stored values only when ``conf.goon`` is
    truthy; otherwise they are reset to an empty list / ``False``.

    :param task_id: value matched against the ``ID`` column of TASK_TABLE.
    """
    # A table name cannot be a bound parameter, so it is formatted in; the
    # id is bound by the driver rather than interpolated into the SQL text.
    sql = 'SELECT * FROM %s WHERE `ID`=%%s' % (TASK_TABLE,)
    task = db.get(sql, task_id)

    # NOTE(review): only the URL assignment is treated as conditional here;
    # confirm this matches the intended layout.
    if not conf.url:
        conf.url = task.start_url
    conf.base = task.base
    conf.count = task.url_count

    conf.finished_progress = task.progress.split('|') if conf.goon else []
    conf.robots_parsed = task.robots_parsed if conf.goon else False
    conf.sitemap_parsed = task.sitemap_parsed if conf.goon else False
    # True only when conf.goon is truthy and the stored spider_flag equals 3.
    conf.spider_finish = bool(conf.goon and task.spider_flag == 3)
def get_target(task_id):
    """Populate ``conf`` from the task row whose ``ID`` equals ``task_id``.

    * ``conf.url`` is filled from ``start_url`` only if it is currently
      falsy.
    * ``conf.base`` and ``conf.count`` are always overwritten from the row.
    * ``finished_progress``, ``robots_parsed``, ``sitemap_parsed`` and
      ``spider_finish`` reflect the stored row only when ``conf.goon`` is
      truthy; otherwise they are set to ``[]`` / ``False``.

    :param task_id: value matched against the ``ID`` column of TASK_TABLE.
    """
    # Only the table name is formatted into the statement (it cannot be a
    # bound parameter); the id is handed to the driver as a parameter.
    sql = 'SELECT * FROM %s WHERE `ID`=%%s' % (TASK_TABLE,)
    task = db.get(sql, task_id)

    # NOTE(review): only the URL assignment is assumed to sit under this
    # guard; confirm against the intended layout.
    if not conf.url:
        conf.url = task.start_url
    conf.base = task.base
    conf.count = task.url_count

    conf.finished_progress = task.progress.split('|') if conf.goon else []
    conf.robots_parsed = task.robots_parsed if conf.goon else False
    conf.sitemap_parsed = task.sitemap_parsed if conf.goon else False
    # Collapses to a plain bool: goon is truthy and spider_flag equals 3.
    conf.spider_finish = bool(conf.goon and task.spider_flag == 3)
def update_progress(self, rule_id):
    """Append ``rule_id`` to this task's ``|``-separated ``progress`` column.

    Nothing is written when the id is already present in the list.

    :param rule_id: identifier to record; it is compared and stored in its
        ``'%s'``-formatted text form.
    :returns: None. Any exception is reported through ``ERROR`` and is not
        re-raised.
    """
    try:
        # Bound parameters: the driver quotes the values, so the stored
        # progress text is never spliced into the SQL statement itself.
        row = db.get("SELECT `progress` FROM task WHERE id=%s", self.task_id)
        progress = row.progress

        # Normalise to text before the membership test; comparing a
        # non-string id against the split strings would never match and the
        # id would be appended again on each call.
        rule = '%s' % rule_id
        if rule in progress.split('|'):
            return

        progress += '|' + rule
        db.execute("UPDATE task SET `progress`=%s WHERE id=%s",
                   progress, self.task_id)
    except Exception:
        ERROR("update_progress exception")
def pipeline(request):
    """Insert the request into URL_TABLE unless an equal row already exists.

    A row counts as existing when it has the same task id, url, method and
    params. The url, method, params and referer attributes of ``request``
    are UTF-8 encoded before being used as query parameters.

    :param request: object exposing ``url``, ``method``, ``params`` and
        ``referer`` attributes.
    :returns: the result of ``db.execute`` for the insert; None when a
        matching row exists or when an exception was reported via ``ERROR``.
    """
    try:
        url, method, params, referer = (
            getattr(request, name).encode('utf-8')
            for name in ('url', 'method', 'params', 'referer')
        )

        count_sql = "SELECT COUNT(1) as `c` FROM %s" % (URL_TABLE)
        count_sql += " WHERE `task_id`=%s and `url`=%s and `method`=%s and `params`=%s"
        existing = db.get(count_sql, conf.taskid, url, method, params)
        if existing.c > 0:
            return

        insert_sql = "INSERT INTO %s" % (URL_TABLE)
        insert_sql += "(`task_id`,`url`,`method`,`params`,`referer`,`start_time`) VALUES(%s,%s,%s,%s,%s,%s)"
        started = datetime.now()
        return db.execute(insert_sql, conf.taskid,
                          url, method, params, referer, started)
    except Exception:
        ERROR("Crawler.pipeline Exception")
def pipeline(request):
    """Store ``request`` in URL_TABLE if no equal row is present yet.

    The url, method, params and referer attributes are UTF-8 encoded. A row
    with the same task id, url, method and params makes the call a no-op.

    :param request: object exposing ``url``, ``method``, ``params`` and
        ``referer`` attributes.
    :returns: whatever ``db.execute`` returns for the insert; None when the
        row already exists or an exception was reported via ``ERROR``.
    """
    try:
        values = []
        for field in ('url', 'method', 'params', 'referer'):
            values.append(getattr(request, field).encode('utf-8'))

        # Existence check on (task_id, url, method, params).
        lookup = "SELECT COUNT(1) as `c` FROM %s" % (URL_TABLE)
        lookup = lookup + " WHERE `task_id`=%s and `url`=%s and `method`=%s and `params`=%s"
        if db.get(lookup, conf.taskid, values[0], values[1], values[2]).c > 0:
            return

        statement = (
            "INSERT INTO %s" % (URL_TABLE)
        ) + "(`task_id`,`url`,`method`,`params`,`referer`,`start_time`) VALUES(%s,%s,%s,%s,%s,%s)"
        # start_time is the sixth bound value, after the four encoded fields.
        values.append(datetime.now())
        return db.execute(statement, conf.taskid, *values)
    except Exception:
        ERROR("Crawler.pipeline Exception")