def __init__(self):
    self.HEADERS['Host'] = 'sou.zhaopin.com'
    self.headers = self.HEADERS
    self._db = CCrawlerDbHandle()
    self.name = 'ZhilianTaskSpider'
    self.clean_failed_req = False
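
# ---------------------------------------------------------------------------
# Sketch (not part of the original snippets): a minimal in-memory stand-in
# that mirrors the CCrawlerDbHandle interface as it is used throughout this
# listing (set_db_table / query / insert / update / commit / destroy).  The
# real class is a database wrapper in this project; everything below is only
# inferred from the call sites and is useful for exercising the classes
# without a live database.
# ---------------------------------------------------------------------------
class FakeCrawlerDbHandle(object):
    def __init__(self):
        self.db_name = None
        self.table_name = None
        self.rows = []  # pretend table contents

    def set_db_table(self, db_name, table_name):
        self.db_name = db_name
        self.table_name = table_name

    def query(self, field_list, where):
        # The real handle presumably builds
        # "SELECT <fields> FROM <table> WHERE <where>"; the fake ignores the
        # condition and returns every stored row.
        return list(self.rows)

    def insert(self, data):
        self.rows.append(dict(data))

    def update(self, data, where):
        for row in self.rows:
            row.update(data)

    def commit(self):
        pass

    def destroy(self):
        self.rows = []
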
class RowBuilder(object):
    def __init__(self):
        self._db = CCrawlerDbHandle()

    # Accept a new item and its settings
    def update(self, item):
        self.whether_insert = False
        self.data = item['row']  # raw source data, read-only
        settings = item['settings']  # rules for writing to the database
        self.item_format = settings['item_format']
        self.item_db = settings['item_db']
        self.item_table = settings['item_table']
        self.write_table_method = settings['write_table_method']
        self.data_res = {}

    # Convert fields according to the format settings
    def build_row_from_item(self):
        for key in self.item_format.keys():
            field_type = self.item_format[key]['type']
            raw_field = self.data.get(key)
            type_convert = TYPE_CONVERT_MAP[field_type]
            self.data_res[key] = type_convert(raw_field)
            is_default = (TYPE_DEFAULT_VALUE_MAP[field_type]
                          == self.data_res[key])
            is_required = self.item_format[key]['req']
            if is_default and is_required:
                return False  # failed to build the data row
        return True

    # Custom wrap-up hook, intended to be overridden
    def wind_up(self):
        pass
        # raise NotImplementedError

    # Custom conversion
    def build_row_from_custom(self):
        self.data_res['Fcreate_time'] = time_now()
        self.data_res['Fmodify_time'] = time_now()

    # Write to the database
    def write_database(self):
        self._db.set_db_table(self.item_db, self.item_table)
        # Build the write condition
        condition_list = []
        update_data = copy.deepcopy(self.data_res)
        for key in self.item_format.keys():
            if self.item_format[key]['dup']:
                condition = key + '=' + '\'' + str(self.data_res[key]) + '\''
                update_data.pop(key)  # drop fields used as the update condition
                condition_list.append(condition)

        where = ' and '.join(condition_list)
        print('UPDATE DATABASE CONDITION: %s' % (where))
        field_list = ['*']

        # Write to the table according to the configured method
        if self.write_table_method == 'update':  # 1. update existing rows
            if self._db.query(field_list, where):
                update_data.pop('Fcreate_time', None)  # keep the original creation time
                self._db.update(update_data, where)
                self._db.commit()
            else:
                self._db.insert(self.data_res)
                self.whether_insert = True
                self._db.commit()

        elif self.write_table_method == 'insert':  # 2. unconditional insert
            self._db.insert(self.data_res)
            self._db.commit()
            self.whether_insert = True

        elif self.write_table_method == 'conditional_insert':  # 3. insert only when no matching row exists
            if not self._db.query(field_list, where):
                self._db.insert(self.data_res)
                self._db.commit()
                self.whether_insert = True
        else:
            print("Please specify a DB writing method")

    # Entry point: process one item
    def process(self, item):
        self.update(item)

        if self.build_row_from_item():
            self.build_row_from_custom()
            self.write_database()
            self.wind_up()
        else:
            print('Item is not qualified for writing to the database')
        return self.whether_insert
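
# ---------------------------------------------------------------------------
# Usage sketch for RowBuilder (not part of the original source).  The real
# TYPE_CONVERT_MAP / TYPE_DEFAULT_VALUE_MAP are not shown in this listing;
# the EXAMPLE_* maps below only illustrate the shape they appear to have
# (type name -> converter, type name -> default value).  The item layout
# ('row' + 'settings') mirrors what RowBuilder.update() reads; the field and
# table names are hypothetical.
# ---------------------------------------------------------------------------
EXAMPLE_TYPE_CONVERT_MAP = {
    'string': lambda value: '' if value is None else str(value),
    'int': lambda value: 0 if value in (None, '') else int(value),
}
EXAMPLE_TYPE_DEFAULT_VALUE_MAP = {'string': '', 'int': 0}

example_item = {
    'row': {'Fjob_name': 'crawler engineer', 'Fsalary': '15000'},
    'settings': {
        'item_format': {
            # type: key into TYPE_CONVERT_MAP; req: required, so a default
            # value fails the row; dup: field becomes part of the
            # duplicate-check / update condition in write_database()
            'Fjob_name': {'type': 'string', 'req': True, 'dup': True},
            'Fsalary': {'type': 'int', 'req': False, 'dup': False},
        },
        'item_db': 'db_crawlers',
        'item_table': 't_job_info',  # hypothetical table name
        'write_table_method': 'update',
    },
}

# builder = RowBuilder()                    # needs a reachable database
# inserted = builder.process(example_item)  # True if a new row was inserted
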
class UniversalSpider(Spider):
    # Common settings
    custom_settings = {
        'DNSCACHE_ENABLED': True,
        'ROBOTSTXT_OBEY': False,
        'RETRY_ENABLED': True,
        'DOWNLOAD_TIMEOUT': 5,
        'DOWNLOAD_DELAY': 0.1,
        'CONCURRENT_REQUESTS': 32,
        'HTTPERROR_ALLOW_ALL': True,  # let all responses reach the spider; for debugging, disable before release
        'CONCURRENT_REQUESTS_PER_DOMAIN': 32,
        'CONCURRENT_REQUESTS_PER_IP': 32,
        'COOKIES_ENABLED': True,
        'DOWNLOADER_MIDDLEWARES': {
            'crawlers.middlewares.downloader.exception_response.ExceptionResponse':
            100
            #'middlewares.downloader.record_status_code.RecordStatusCodeMiddleware': 110,
        },
        'ITEM_PIPELINES': {
            'crawlers.universal_spider.UniversalPipeline': 100
        }
    }

    # Common request headers
    HEADERS = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language':
        'zh,zh-HK;q=0.8,zh-CN;q=0.7,en-US;q=0.5,en;q=0.3,el;q=0.2',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        #'Host': 'neets.cc',
        #'Origin':'http://neets.cc/',
        #'Referer': 'http://neets.cc/',
        'DNT': '1',
    }

    # Initialize the spider
    def __init__(self):
        self.headers = self.HEADERS
        self._db = CCrawlerDbHandle()

    # Re-submit requests that failed last time; fetches at most 10000 records per callback
    def get_failed_req(self, parse_func_list, max_retry=3):
        request_list = []
        temp_db = CCrawlerDbHandle()
        temp_db.set_db_table('db_crawlers', 't_failed_task')
        field_list = ['*']

        for parse_func in parse_func_list:
            where = "Fcrawler_name='%s' and \
                    Fcall_back='%s' and \
                    Fstate='%s' and Fretry_times < %s order by Fmodify_time limit 10000"\
                    %(self.name, parse_func, TASK_STATE['failure'], max_retry)
            request_list.extend(temp_db.query(field_list, where))
        temp_db.destroy()
        return request_list

    # Called when the spider is closed
    def closed(self, reason):
        self._db.destroy()
        print("CLOSE_REASON:%s" % reason)
class ExceptionResponse(object):
    """This middleware enables working with sites that change the user-agent"""

    def __init__(self, debug=False):
        self.db = CCrawlerDbHandle()
        self.db.set_db_table('db_crawlers', 't_failed_task')
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        print("close ExceptionResponse")
        self.db.destroy()

    def process_response(self, request, response, spider):
        status_code = response.status
        if response.status == 200:
            if request.meta.get(REQ_FAIL_MARK, False):
                self.inform_success(request, spider)
            return response

        record_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("ERROR RESPONSE STATUS is: %s, url: %s, time: %s" % (status_code, response.url, record_time))
        try:
            retry_time = self.inform_failure(request, spider)       # report the failure into the common table
            if REQ_FAIL_PROCFUN in dir(spider):                     # run the spider's custom failure handler, if any
                getattr(spider, REQ_FAIL_PROCFUN)(retry_time, request, response)
            
        except Exception as e:
            print("MIDDLEWARE PROCESS RESPONSE:%s" % e)
        return response

    # Write the failed request back to the common table
    def inform_failure(self, request, spider):
        # Initialize variables
        retry_time = 0
        
        # Parameter check
        if not spider.name:
            print('spider name not found for request %s' % request.url)
            return retry_time
        
        # Update or insert the failed-request record
        field_list = ['*']
        where = "Fcrawler_name='%s' and Fcall_back='%s' and Furl='%s'"\
                    %(spider.name, request.meta['parse'],request.url)
        datar = self.db.query(field_list, where)
        if datar:
            datar = datar[0]
            retry_time = datar['Fretry_times'] + 1
            datau = {
                'Fretry_times':retry_time,
                'Fstate':TASK_STATE['failure'],
                'Fmeta':json.dumps(request.meta),
                'Fmodify_time':time_now(),
            }
            self.db.update(datau, where)
        else:
            retry_time = 0
            datai = {
                'Fcrawler_name':spider.name,
                'Fcall_back':request.meta['parse'],
                'Furl':request.url,
                'Fstate':TASK_STATE['failure'],
                'Fmeta':json.dumps(request.meta),
                'Fmethod':request.method,
                'Fencoding':request.encoding,
                'Fretry_times':retry_time,
                'Fcreate_time':time_now(),
                'Fmodify_time':time_now(),
            }
            self.db.insert(datai)
        self.db.commit()
        return retry_time
    
    # Write the successful request back to the common table
    def inform_success(self, request, spider):
        # Parameter check
        if not spider.name:
            print('spider name not found for request %s' % request.url)
            return 
        
        # Update the existing failed-request record, if any
        field_list = ['*']
        where = "Fcrawler_name='%s' and Fcall_back='%s' and Furl='%s'"\
                    %(spider.name, request.meta['parse'],request.url)
        datar = self.db.query(field_list, where)
        if datar:
            datar = datar[0]
            datau = {
                'Fretry_times':datar['Fretry_times'] + 1,
                'Fstate':TASK_STATE['success'],
                'Fmeta':json.dumps(request.meta),
                'Fmodify_time':time_now(),
            }
            self.db.update(datau, where)
            self.db.commit()
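
# ---------------------------------------------------------------------------
# Sketch (not part of the original source): the optional per-spider failure
# hook that ExceptionResponse.process_response() looks up via
# REQ_FAIL_PROCFUN.  The attribute name behind that constant is not shown in
# the listing, so 'handle_failed_request' is a placeholder that only matches
# the call signature (retry_time, request, response); the spider name is
# hypothetical as well.
# ---------------------------------------------------------------------------
class ExampleSpider(UniversalSpider):
    name = 'ExampleSpider'  # hypothetical

    # This method would only be invoked by the middleware if its name equals
    # the value of REQ_FAIL_PROCFUN.
    def handle_failed_request(self, retry_time, request, response):
        # e.g. stop retrying a URL once it has failed often enough
        if retry_time >= 3:
            print('GIVING UP url=%s status=%s' % (request.url, response.status))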