def fetch_update_data(self, data_list=None, proxy=None, **kwargs):
    """Fetch update data.

    @return: status codes appended to data_list (these are not HTTP status codes):
        0    empty (ignore)
        -401 error (retry; program error such as a syntax error, or an error caused by
             deletion after an exception -- check the program)
        -402 data exception (retry; verify that the data was fetched correctly)
        -400 proxy exception (retry; can be ignored)
        -200 non-200 response, proxy or data exception (retry; watch out for infinite
             retry loops in this case)
        200  normal status
        404  product does not exist or has been deleted
    """
    if data_list is None:
        data_list = []
    # Decide which site the URL belongs to, then dispatch to that site's crawler module.
    update_url = kwargs.get('update_url', '')
    if not update_url:
        return
    if '360' in update_url:
        return
    supplier_name = update_url.split('.')[1]
    if not supplier_name:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if 'fetch_update_data' in dir(obj):
            _fetch_update_data = getattr(obj, 'fetch_update_data')
        else:
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
    except Exception as e:
        config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
        kwargs['status'] = -401
        data_list.append(kwargs)
        return None
    try:
        kwargs['headers'] = headers
        kwargs['proxy'] = proxy
        data_list.append(_fetch_update_data(**kwargs))
    except Exception as e:
        kwargs['status'] = -402
        if 'headers' in kwargs:
            del kwargs['headers']
        if 'proxy' in kwargs:
            del kwargs['proxy']
        data_list.append(kwargs)
        config.LOG.exception('STATUS: -402, ID: %(id)s error: %(e)s',
                             {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})
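# Usage sketch (an assumption, not part of the original codebase): the dynamic dispatch
# above resolves a crawler module under the `supplier` package from the second label of
# the URL's hostname and calls its fetch_update_data. The instance name `crawler`, the
# supplier name 'example_site', the URL, and the id below are all hypothetical.
#
#   results = []
#   crawler.fetch_update_data(
#       data_list=results,
#       proxy=None,
#       id='12345',                                     # hypothetical record id
#       update_url='http://www.example_site.com/p/1',   # split('.')[1] -> 'example_site'
#   )
#   # On failure, results holds the original kwargs with 'status' set to -401/-402;
#   # on success it holds whatever supplier.example_site.fetch_update_data returned.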
def fetch_search_data(self, data_list=None, err_list=None, proxy=None, supp=None, **kwargs):
    """Fetch product data for a search keyword (may be URLs or full detail records)."""
    if data_list is None:
        data_list = []
    if err_list is None:
        err_list = []
    if not supp or 'keyword' not in kwargs:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    keyword = util.u2b(kwargs['keyword'])
    supplier_name = config.DB_KEY[supp]
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if hasattr(obj, 'api_search_data'):
            _fetch_function = getattr(obj, 'api_search_data')
        else:
            _fetch_function = getattr(obj, 'fetch_search_data')
    except Exception as e:
        config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -401
            kwargs['count'] = kwargs.get('count', 1) + 1
            err_list.append(kwargs)
        return None
    data_dict = {
        'detail': [],
        'list': [],
        'url': []
    }
    if self.optype == 'hot' and self.use:
        kwargs['hot_search'] = True
    del kwargs['keyword']
    try:
        _fetch_function(keyword, supp, data_dict, headers, **kwargs)
    except Exception as e:
        config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -402
            kwargs['count'] = kwargs.get('count', 1) + 1
            kwargs['keyword'] = keyword
            err_list.append(kwargs)
        return None
    if data_dict['list']:
        try:
            _fetch_function = getattr(obj, 'fetch_search_list')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
            if 'url' in res:
                for url in res['url']:
                    data_dict['url'].append(url)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    if data_dict['url']:
        try:
            _fetch_function = getattr(obj, 'fetch_data')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    for data in data_dict['detail']:
        # This is where each record would be cleaned and normalized.
        data_list.append(data)
    return data_list
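# Usage sketch (an assumption, not part of the original codebase): fetch_search_data
# runs the keyword search through the supplier module's api_search_data/fetch_search_data,
# then expands list pages via fetch_search_list and URLs via fetch_data, accumulating
# detail records into data_list and retryable failures into err_list. The instance name
# `crawler`, the supplier key and the keyword below are hypothetical.
#
#   found, failed = [], []
#   crawler.fetch_search_data(
#       data_list=found,
#       err_list=failed,
#       proxy=None,
#       supp=1,                  # hypothetical key into config.DB_KEY
#       keyword='LM358',         # hypothetical search keyword
#   )
#   # found  -> detail records gathered from the supplier module
#   # failed -> kwargs dicts with 'status' -401/-402 and an incremented 'count',
#   #           re-queued only while 'count' stays below self.exception_threshold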