Code example #1
    def fetch_update_data(self, data_list=[], proxy=None, **kwargs):
        '''Fetch update data.

        @return
            Status codes recorded on the data_list entries, regardless of the request:
                0       empty (ignore)
                -401    error (needs retry; program bug, syntax error, or failure
                        caused by an unexpected deletion; check the program)
                -402    data error (needs retry; verify how the data was fetched)
                -400    proxy error (needs retry; can be ignored)
                -200    non-200 response, proxy or data error (needs retry; take
                        care that this case does not fall into an infinite loop)
                200     normal status (not the HTTP status code)
                404     product does not exist or has been deleted

        '''
        # Decide which site the URL belongs to, then call that site's crawler module.

        update_url = kwargs.get('update_url', '')
        if not update_url:
            return
        if '360' in update_url:
            return
        # split() never returns None, so guard against a missing host segment instead
        url_parts = update_url.split('.')
        if len(url_parts) < 2 or not url_parts[1]:
            return None
        supplier_name = url_parts[1]
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if hasattr(obj, 'fetch_update_data'):
                _fetch_update_data = getattr(obj, 'fetch_update_data')
            else:
                kwargs['status'] = -401
                data_list.append(kwargs)
                return None
        except Exception as e:
            config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
        try:
            kwargs['headers'] = headers
            kwargs['proxy'] = proxy
            data_list.append(_fetch_update_data(**kwargs))
        except Exception as e:
            kwargs['status'] = -402
            if 'headers' in kwargs:
                del kwargs['headers']
            if 'proxy' in kwargs:
                del kwargs['proxy']
            data_list.append(kwargs)
            config.LOG.exception('STATUS: -402, ID: %(id)s error: %(e)s',
                                 {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})
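
Both examples resolve the crawler module at runtime from a plain name string via `__import__` and `sys.modules`. The fragment below is a minimal sketch of that dispatch in isolation, using `importlib`; only the `supplier` package and the `fetch_update_data` attribute name come from the example above, and `resolve_fetcher` is a hypothetical helper, not part of the original code.

    # Minimal sketch of the lazy-import dispatch used above (hypothetical helper).
    import importlib

    def resolve_fetcher(supplier_name, func_name='fetch_update_data'):
        """Import supplier.<supplier_name> lazily and return its fetcher, or None."""
        module_name = 'supplier.{0}'.format(supplier_name)
        try:
            # import_module caches the module in sys.modules after the first call
            module = importlib.import_module(module_name)
        except ImportError:
            return None
        return getattr(module, func_name, None)
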
Code example #2
    def fetch_search_data(self, data_list=[], err_list=[], proxy=None, supp=None, **kwargs):
        """
        Fetch product data for a search keyword (results may be URLs or detailed records).

        """
        if not supp or 'keyword' not in kwargs:
            return None
        headers = {
            'user-agent': random.choice(config.USER_AGENT_LIST),
        }
        keyword = util.u2b(kwargs['keyword'])
        supplier_name = config.DB_KEY[supp]
        try:
            if not hasattr(supplier, supplier_name):
                module_name = 'supplier.{0}'.format(supplier_name)
                if module_name not in sys.modules:
                    __import__(module_name)
                obj = sys.modules[module_name]
            else:
                obj = getattr(supplier, supplier_name)
            if hasattr(obj, 'api_search_data'):
                _fetch_function = getattr(obj, 'api_search_data')
            else:
                _fetch_function = getattr(obj, 'fetch_search_data')
        except Exception as e:
            config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -401
                kwargs['count'] = kwargs.get('count', 1) + 1
                err_list.append(kwargs)
            return None
        data_dict = {
            'detail': [],
            'list': [],
            'url': []
        }
        if self.optype == 'hot' and self.use:
            kwargs['hot_search'] = True
        del kwargs['keyword']
        try:
            _fetch_function(keyword, supp, data_dict, headers, **kwargs)
        except Exception as e:
            config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
            if kwargs.get('count', 1) < self.exception_threshold:
                kwargs['status'] = -402
                kwargs['count'] = kwargs.get('count', 1) + 1
                kwargs['keyword'] = keyword
                err_list.append(kwargs)
            return None
        if data_dict['list']:
            try:
                _fetch_function = getattr(obj, 'fetch_search_list')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
                if 'url' in res:
                    for url in res['url']:
                        data_dict['url'].append(url)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        if data_dict['url']:
            try:
                _fetch_function = getattr(obj, 'fetch_data')
            except Exception as e:
                _fetch_function = None
                print(util.traceback_info(e, return_all=1))
            if _fetch_function:
                res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
                if 'detail' in res:
                    for data in res['detail']:
                        data_dict['detail'].append(data)
        for data in data_dict['detail']:
            # Clean and normalize each record here before collecting it.
            data_list.append(data)
        return data_list
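
A call site for `fetch_search_data` is not shown above. The sketch below is one hypothetical way to drive it, assuming the method lives on a crawler class instance (here called `crawler`) and that `supp` is a valid key in `config.DB_KEY`; the keyword values and supplier id are placeholders, not taken from the original code.

    # Hypothetical driver for fetch_search_data; 'crawler', the keywords and the
    # supplier id are assumptions.
    data_list, err_list = [], []
    for kw in ('lm358', 'ne555'):          # placeholder keywords
        crawler.fetch_search_data(data_list=data_list,
                                  err_list=err_list,
                                  proxy=None,
                                  supp=1,  # placeholder id, must exist in config.DB_KEY
                                  keyword=kw)
    # data_list now holds detail records; err_list holds kwargs dicts tagged with
    # status -401/-402 that can be re-queued for retry.
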