# ZhilianTaskSpider: pin the Host header for sou.zhaopin.com and disable failed-request cleanup
def __init__(self):
    self.HEADERS['Host'] = 'sou.zhaopin.com'
    self.headers = self.HEADERS
    self._db = CCrawlerDbHandle()
    self.name = 'ZhilianTaskSpider'
    self.clean_failed_req = False
class RowBuilder(object):
    def __init__(self):
        self._db = CCrawlerDbHandle()

    # Accept a new item and its storage configuration
    def update(self, item):
        self.whether_insert = False
        self.data = item['row']           # raw source data, read-only
        settings = item['settings']       # storage rule configuration
        self.item_format = settings['item_format']
        self.item_db = settings['item_db']
        self.item_table = settings['item_table']
        self.write_table_method = settings['write_table_method']
        self.data_res = {}

    # Convert the raw fields according to the configured format
    def build_row_from_item(self):
        for key in self.item_format.keys():
            field_type = self.item_format[key]['type']
            raw_field = self.data.get(key)
            type_convert = TYPE_CONVERT_MAP[field_type]
            self.data_res[key] = type_convert(raw_field)
            is_default = (TYPE_DEFAULT_VALUE_MAP[field_type] == self.data_res[key])
            is_required = self.item_format[key]['req']
            if is_default and is_required:
                return False  # failed to build the data row
        return True

    # Custom wrap-up hook
    def wind_up(self):
        pass  # raise NotImplementedError

    # Custom conversion
    def build_row_from_custom(self):
        self.data_res['Fcreate_time'] = time_now()
        self.data_res['Fmodify_time'] = time_now()

    # Write the row to the database
    def write_database(self):
        self._db.set_db_table(self.item_db, self.item_table)
        # Build the dedup/update condition
        condition_list = []
        update_data = copy.deepcopy(self.data_res)
        for key in self.item_format.keys():
            if self.item_format[key]['dup']:
                condition = key + "='" + str(self.data_res[key]) + "'"
                update_data.pop(key)  # drop fields that serve as the update condition
                condition_list.append(condition)
        where = ' and '.join(condition_list)
        print('UPDATE DATABASE CONDITION: %s' % (where))
        field_list = ['*']
        # Choose the write strategy
        if self.write_table_method == 'update':  # 1. update existing rows
            if self._db.query(field_list, where):
                update_data.pop('Fcreate_time')  # keep the original create time on update
                self._db.update(update_data, where)
                self._db.commit()
            else:
                self._db.insert(self.data_res)
                self.whether_insert = True
                self._db.commit()
        elif self.write_table_method == 'insert':  # 2. unconditional insert
            self._db.insert(self.data_res)
            self._db.commit()
            self.whether_insert = True
        elif self.write_table_method == 'conditional_insert':  # 3. insert only if absent
            if not self._db.query(field_list, where):
                self._db.insert(self.data_res)
                self._db.commit()
                self.whether_insert = True
        else:
            print("Please specify a DB writing method")

    # Entry point: convert one item and write it out
    def process(self, item):
        self.update(item)
        if self.build_row_from_item():
            self.build_row_from_custom()
            self.write_database()
            self.wind_up()
        else:
            print('Item is not qualified for writing to the database')
        return self.whether_insert
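# Hedged usage sketch (not part of the original source): feeding one scraped item
# to RowBuilder. The field names, the 'str' type key and the db/table names are
# illustrative assumptions; 'type', 'req' and 'dup' are the keys RowBuilder
# actually reads, and the type keys must exist in TYPE_CONVERT_MAP /
# TYPE_DEFAULT_VALUE_MAP.
def _example_write_row():
    builder = RowBuilder()
    item = {
        'row': {'Fjob_title': 'Python Engineer', 'Fcity': 'Shenzhen'},
        'settings': {
            'item_db': 'db_crawlers',          # assumed database name
            'item_table': 't_job_info',        # assumed table name
            'write_table_method': 'update',    # or 'insert' / 'conditional_insert'
            'item_format': {
                # 'dup' fields form the update/dedup condition,
                # 'req' fields must not convert to the type's default value.
                'Fjob_title': {'type': 'str', 'req': True, 'dup': True},
                'Fcity': {'type': 'str', 'req': False, 'dup': False},
            },
        },
    }
    inserted = builder.process(item)  # True only when a brand-new row was inserted
    print('new row inserted: %s' % inserted)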
class UniversalSpider(Spider):
    # Shared Scrapy settings
    custom_settings = {
        'DNSCACHE_ENABLED': True,
        'ROBOTSTXT_OBEY': False,
        'RETRY_ENABLED': True,
        'DOWNLOAD_TIMEOUT': 5,
        'DOWNLOAD_DELAY': 0.1,
        'CONCURRENT_REQUESTS': 32,
        'HTTPERROR_ALLOW_ALL': True,  # let every response status reach the middlewares; debugging aid, disable before release
        'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
        'CONCURRENT_REQUESTS_PER_IP': 32,
        'COOKIES_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'crawlers.middlewares.downloader.exception_response.ExceptionResponse': 100,
            #'middlewares.downloader.record_status_code.RecordStatusCodeMiddleware': 110,
        },
        'ITEM_PIPELINES': {
            'crawlers.universal_spider.UniversalPipeline': 100,
        },
    }

    # Shared request headers
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh,zh-HK;q=0.8,zh-CN;q=0.7,en-US;q=0.5,en;q=0.3,el;q=0.2',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        #'Host': 'neets.cc',
        #'Origin': 'http://neets.cc/',
        #'Referer': 'http://neets.cc/',
        'DNT': '1',
    }

    # Initialize the spider
    def __init__(self):
        self.headers = self.HEADERS
        self._db = CCrawlerDbHandle()

    # Re-queue requests that failed last run; fetch at most 10000 rows per callback
    def get_failed_req(self, parse_func_list, max_retry=3):
        request_list = []
        temp_db = CCrawlerDbHandle()
        temp_db.set_db_table('db_crawlers', 't_failed_task')
        field_list = ['*']
        for parse_func in parse_func_list:
            where = ("Fcrawler_name='%s' and Fcall_back='%s' and Fstate='%s' "
                     "and Fretry_times < %s order by Fmodify_time limit 10000"
                     % (self.name, parse_func, TASK_STATE['failure'], max_retry))
            request_list.extend(temp_db.query(field_list, where))
        temp_db.destroy()
        return request_list

    # Called when the spider is closed
    def closed(self, reason):
        self._db.destroy()
        print("CLOSE_REASON:%s" % reason)
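# Hedged usage sketch (not part of the original source): a concrete spider built
# on UniversalSpider. The class name, start URL and 'parse_list' callback are
# illustrative; 'Furl', 'Fcall_back' and 'Fmeta' follow the t_failed_task schema
# written by the ExceptionResponse middleware below, and REQ_FAIL_MARK is the
# project constant that middleware checks (its import path is not shown in these
# snippets).
import json

from scrapy import Request


class ExampleJobSpider(UniversalSpider):
    name = 'ExampleJobSpider'

    def start_requests(self):
        # Replay previously failed requests first, marking them so the
        # middleware can flip their state to success once they return 200.
        for row in self.get_failed_req(['parse_list']):
            meta = json.loads(row['Fmeta'])
            meta[REQ_FAIL_MARK] = True
            yield Request(row['Furl'], headers=self.headers,
                          callback=getattr(self, row['Fcall_back']), meta=meta)
        # Then issue fresh requests; 'parse' in meta is what inform_failure()
        # records as Fcall_back.
        yield Request('https://example.com/jobs?page=1', headers=self.headers,
                      callback=self.parse_list, meta={'parse': 'parse_list'})

    def parse_list(self, response):
        # Extract rows here and yield items shaped like the RowBuilder example above.
        pass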
class ExceptionResponse(object):
    """Downloader middleware that records failed requests in t_failed_task
    and marks replayed requests as successful once they return 200."""

    def __init__(self, debug=False):
        self.db = CCrawlerDbHandle()
        self.db.set_db_table('db_crawlers', 't_failed_task')
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        print("close ExceptionResponse")
        self.db.destroy()

    def process_response(self, request, response, spider):
        status_code = response.status
        if response.status == 200:
            if request.meta.get(REQ_FAIL_MARK, False):
                self.inform_success(request, spider)
            return response
        record_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("ERROR RESPONSE STATUS is: %s, url: %s, time: %s" % (status_code, response.url, record_time))
        try:
            # Report the failure: record it in the shared failed-task table
            retry_time = self.inform_failure(request, spider)
            # If the spider defines a custom failure handler, invoke it as well
            if REQ_FAIL_PROCFUN in dir(spider):
                getattr(spider, REQ_FAIL_PROCFUN)(retry_time, request, response)
        except Exception as e:
            print("MIDDLEWARE PROCESS RESPONSE:%s" % e)
        return response

    # Record a failed request in the shared table
    def inform_failure(self, request, spider):
        retry_time = 0
        # Sanity check
        if not spider.name:
            print('spider name not found for request %s' % (request.url))
            return retry_time
        # Update or insert the failed-request record
        field_list = ['*']
        where = "Fcrawler_name='%s' and Fcall_back='%s' and Furl='%s'" \
                % (spider.name, request.meta['parse'], request.url)
        datar = self.db.query(field_list, where)
        if datar:
            datar = datar[0]
            retry_time = datar['Fretry_times'] + 1
            datau = {
                'Fretry_times': retry_time,
                'Fstate': TASK_STATE['failure'],
                'Fmeta': json.dumps(request.meta),
                'Fmodify_time': time_now(),
            }
            self.db.update(datau, where)
        else:
            retry_time = 0
            datai = {
                'Fcrawler_name': spider.name,
                'Fcall_back': request.meta['parse'],
                'Furl': request.url,
                'Fstate': TASK_STATE['failure'],
                'Fmeta': json.dumps(request.meta),
                'Fmethod': request.method,
                'Fencoding': request.encoding,
                'Fretry_times': retry_time,
                'Fcreate_time': time_now(),
                'Fmodify_time': time_now(),
            }
            self.db.insert(datai)
        self.db.commit()
        return retry_time

    # Mark a previously failed request as successful
    def inform_success(self, request, spider):
        # Sanity check
        if not spider.name:
            print('spider name not found for request %s' % (request.url))
            return
        # Update the existing failed-request record
        field_list = ['*']
        where = "Fcrawler_name='%s' and Fcall_back='%s' and Furl='%s'" \
                % (spider.name, request.meta['parse'], request.url)
        datar = self.db.query(field_list, where)
        if datar:
            datar = datar[0]
            datau = {
                'Fretry_times': datar['Fretry_times'] + 1,
                'Fstate': TASK_STATE['success'],
                'Fmeta': json.dumps(request.meta),
                'Fmodify_time': time_now(),
            }
            self.db.update(datau, where)
            self.db.commit()
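# Hedged sketch (not part of the original source): a per-spider failure hook that
# ExceptionResponse invokes via getattr(spider, REQ_FAIL_PROCFUN)(retry_time,
# request, response). The method name 'on_request_failed' is an assumption; the
# real name is whatever string the project constant REQ_FAIL_PROCFUN holds.
class ExampleSpiderWithFailHook(UniversalSpider):
    name = 'ExampleSpiderWithFailHook'

    def on_request_failed(self, retry_time, request, response):
        # Called by the middleware for every non-200 response, after the failure
        # has been recorded in t_failed_task.
        if retry_time >= 3:
            print('giving up on %s after %s retries (status %s)'
                  % (request.url, retry_time, response.status))
        else:
            print('will replay %s later (attempt %s)' % (request.url, retry_time))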