def put_queue_list(self, queue_name=None, message_list=None, print_info=True, exchange=''):
    '''Publish a list of messages to the queue.'''
    if not queue_name and not exchange:
        return None
    try:
        if not message_list:
            return None
        if isinstance(message_list, dict):
            message_list = [message_list]
        self.declare(queue_name, exchange=exchange)
        for message in message_list:
            if print_info:
                if 'abbreviation' in message:
                    print('abbreviation : %s submitted to queue %s' % (
                        Util.binary_type(message['abbreviation']), queue_name))
                elif 'cp_id' in message:
                    print('ID : %s submitted to queue %s' % (
                        Util.binary_type(message['cp_id']), queue_name))
            message = json.dumps(message)
            self.channel.basic_publish(
                exchange=exchange,
                routing_key=queue_name,
                body=message,
                properties=pika.BasicProperties(
                    delivery_mode=2,  # persistent message
                ))
        self.close()
    except Exception as e:
        print(e)
        return None
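# Usage sketch for put_queue_list (hypothetical names): assumes a queue client
# wrapper exposing put_queue_list as defined above; the queue name and message
# contents shown here are illustrative only.
def _demo_put_queue_list(client):
    messages = [
        {'cp_id': 1001, 'status': -402},          # failed update, queued for retry
        {'abbreviation': 'ssq', 'status': -400},  # proxy error, queued for retry
    ]
    # A single dict is also accepted; it is wrapped into a one-element list.
    client.put_queue_list(queue_name='update_exception_logs',
                          message_list=messages,
                          print_info=False)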
def main(**kwargs):
    sd = kwargs.get('sd', '')
    ed = kwargs.get('ed', '')
    interval = kwargs.get('interval', 60)
    date_list = util.specified_date(sd, ed)
    data = [{'url': '1'}, {'url': '2'}]
    while 1:
        proxy = util.get_prolist(10)
        for _data in data:
            url = _data.get('url', '')
            if not url:
                continue
            fetch_data(url=url, proxy=proxy, headers=default_headers, **kwargs)
        # Iterate over the date range when the URLs follow a date-based pattern
        for str_time in date_list:
            pass
        if not interval:
            break
        print('-------------- sleep %s sec -------------' % interval)
        time.sleep(interval)
def parse_more(item=None, response=None):
    if not item or not response:
        return -404
    root = lxml.html.fromstring(response.text.encode('utf-8'))
    data = {}
    # family_sn
    match = family_sn_pattern.search(response.url)
    data['family_sn'] = match.group(1) if match else item['goods_name']
    # catlog
    breadcrumb = root.xpath('//p[@class="breadcrumb"]/a')
    data['catlog'] = []
    for catlog in breadcrumb:
        catlog_name = util.cleartext(catlog.text_content())
        catlog_url = util.urljoin(response.url, catlog.xpath('./@href')[0])
        if catlog_name and catlog_url:
            data['catlog'].append([catlog_name, catlog_url])
        else:
            data['catlog'] = []
            break
    else:
        # Loop finished without break: every breadcrumb level was valid,
        # so append the product page itself as the final level.
        data['catlog'].append([data['family_sn'], response.url])
    # doc
    doc = root.xpath('//li[@class="pdf"]/a[@class="doclink"]/@title')
    data['doc'] = "http://cds.linear.com/docs/en/datasheet/{title}".format(
        title=doc[0]) if doc else ''
    item.update(data)
    return item
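# Minimal sketch of the breadcrumb for/else pattern used in parse_more: the else
# branch only runs when the loop finishes without break, i.e. every breadcrumb
# link was valid, in which case the product page itself is appended as the last
# catalog level. The data below is illustrative.
def _demo_catlog(breadcrumb, family_sn, page_url):
    catlog = []
    for name, href in breadcrumb:
        if name and href:
            catlog.append([name, href])
        else:
            catlog = []
            break
    else:
        catlog.append([family_sn, page_url])
    return catlog

# _demo_catlog([('Amplifiers', '/amps'), ('Op Amps', '/amps/op')], 'LT1001', 'http://example.com/p')
# -> [['Amplifiers', '/amps'], ['Op Amps', '/amps/op'], ['LT1001', 'http://example.com/p']]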
def process_item(self, item, spider):
    """Save the item data."""
    demo = item.get('demo')
    if not demo:
        raise DropItem("item data type error")
    # self.put_queue(item)
    data = copy.deepcopy(dict(item))
    if not data:
        raise DropItem("item data is empty")
    # info = self.mongo.find_one({'demo': data['demo']})
    demo_test = item.get('demo_test', '')
    if not demo_test:
        raise DropItem("demo_test is empty")
    # return
    condition = {'demo': demo}
    try:
        info = self.mysql.select(demo_test, condition=condition, limit=1)
        if not info:
            # self.mongo.insert(data)
            item['create_time'] = util.date()
            item['update_time'] = util.date()
            self.mysql.insert(demo_test, data=item)
            # _logger.info('success insert mysql : %s' % data['demo'])
        else:
            item['create_time'] = info['create_time']
            item['update_time'] = util.date()
            # self.mongo.update({'_id': info['_id']}, {"$set": data})
            self.mysql.update(demo_test, condition=condition, data=item)
            # _logger.info('success update mysql : %s' % data['demo'])
    except Exception as e:
        _logger.info('error op mysql : {0} : e {1}'.format(data['demo'], e))
    raise DropItem('success process')
def fetch_data(url, proxy=None, headers=None, **kwargs): """获取页面数据 @param proxy 代理ip,[代理数量,代理列表] @param headers 头部信息,如user_agent @param kwargs 扩展参数,如fetch_update其表示是否为获取更新 @return 获取数据异常时返回信息为负值,成功为字典类型数据 """ _headers = copy.copy(default_headers) if isinstance(headers, dict): _headers.update(util.rfc_headers(headers)) try: proxies = kwargs.get('proxies') if proxies is None and proxy: i = random.randint(0, proxy[0] - 1) proxies = { 'http': 'http://' + proxy[1][i], 'https': 'https://' + proxy[1][i] } resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies) except Exception as e: # 将进行重试,可忽略 logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' % (util.traceback_info(e), url)) return -400 # 强制utf-8 resp.encoding = 'utf-8' if '404.html' in resp.url: return 404 return _parse_detail_data(resp, headers=_headers, **kwargs)
def parse_price(self, resp):
    """Parse stock and tiered price data."""
    items = resp.meta.get('items')
    if not items:
        logger.error('request meta data error, url: %s', resp.url)
        return
    prices = {}
    try:
        data = json.loads(resp.body)
        for entprice in data['EntitledPrice']:
            tiered = []
            if 'RangePrice' not in entprice:
                entprice['RangePrice'] = []
            for vo in entprice['RangePrice']:
                qty = util.intval(vo['minimumQuantity']['value']) if 'minimumQuantity' in vo else 1
                price = util.floatval(vo['priceInRange']['value']) if 'priceInRange' in vo else 0
                if not qty or (tiered and qty < tiered[-1][0]):
                    continue
                tiered.append([qty, price])
            if not tiered:
                tiered.append([0, 0.0])
            prices[entprice['productId']] = tiered
    except Exception:
        logger.exception('parse stock price error, url: {0}---price_Json_error---{1}'.format(resp.url, resp.body))
    for item in items:
        if item['goods_sn'] in prices:
            item['tiered'] = prices[item['goods_sn']]
        yield item
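# Self-contained sketch of the tiered-price reduction in parse_price, run against
# a hand-made payload shaped like the 'EntitledPrice'/'RangePrice' structure the
# spider expects (field names taken from the code above, sample values invented).
def _demo_tiered():
    entprice = {
        'productId': '12345',
        'RangePrice': [
            {'minimumQuantity': {'value': '1'},   'priceInRange': {'value': '0.52'}},
            {'minimumQuantity': {'value': '100'}, 'priceInRange': {'value': '0.47'}},
            {'minimumQuantity': {'value': '50'},  'priceInRange': {'value': '0.49'}},  # out of order, skipped
        ],
    }
    tiered = []
    for vo in entprice['RangePrice']:
        qty = int(vo['minimumQuantity']['value'])
        price = float(vo['priceInRange']['value'])
        if not qty or (tiered and qty < tiered[-1][0]):
            continue
        tiered.append([qty, price])
    return tiered  # [[1, 0.52], [100, 0.47]]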
def fetch_update_data(self, data_list=[], proxy=None, **kwargs):
    '''Fetch update data.
    @return status codes appended to data_list:
        0    empty (ignore)
        -401 program error (retry; indicates a bug or a missing module, check the code)
        -402 data error (retry; verify the fetched data)
        -400 proxy error (retry; can be ignored)
        -200 non-200 response, proxy or data error (retry; watch out for retry loops)
        200  success (not the HTTP status code)
        404  product no longer exists / has been removed
    '''
    # Pick the supplier crawler module based on the update URL
    update_url = kwargs.get('update_url', '')
    if not update_url:
        return
    if '360' in update_url:
        return
    supplier_name = update_url.split('.')[1]
    if supplier_name is None:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if 'fetch_update_data' in dir(obj):
            _fetch_update_data = getattr(obj, 'fetch_update_data')
        else:
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
    except Exception as e:
        config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
        kwargs['status'] = -401
        data_list.append(kwargs)
        return None
    try:
        kwargs['headers'] = headers
        kwargs['proxy'] = proxy
        data_list.append(_fetch_update_data(**kwargs))
    except Exception as e:
        kwargs['status'] = -402
        if 'headers' in kwargs:
            del kwargs['headers']
        if 'proxy' in kwargs:
            del kwargs['proxy']
        data_list.append(kwargs)
        config.LOG.exception('STATUS: -402, ID: %(id)s error: %(e)s',
                             {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})
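# Minimal, hedged sketch of the dynamic dispatch used in fetch_update_data:
# derive a module name from the URL's second-level domain and import it from a
# hypothetical 'supplier' package. The package and function names are assumptions.
import sys

def _load_supplier_func(update_url, func_name='fetch_update_data'):
    supplier_name = update_url.split('.')[1]          # e.g. 'ti' from 'http://www.ti.com/...'
    module_name = 'supplier.{0}'.format(supplier_name)
    if module_name not in sys.modules:
        __import__(module_name)                       # raises ImportError if no such crawler module
    module = sys.modules[module_name]
    return getattr(module, func_name, None)           # None when the module lacks the entry point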
def parse_detail(self, data, category=None):
    """Parse detail data for a part within a series."""
    if category is None:
        category = {}
    item = GoodsItem()
    item['url'] = urlparse.urljoin(self.base_url, data['avn_pdp_seo_path'])
    item['goods_sn'] = data['uniqueID']
    item['goods_name'] = data['mfPartNumber_ntk'].upper()
    if not item['goods_name']:
        return None
    if 'packageTypeCode' in data:
        item['goods_other_name'] = '{0}/{1}'.format(item['goods_name'], data['packageTypeCode']).upper()
    item['provider_name'] = data['manufacturer']
    item['provider_url'] = ''
    item['goods_desc'] = data['shortDescription'] if 'shortDescription' in data else ''
    if 'avn_thumbnail' in data and data['avn_thumbnail']:
        item['goods_thumb'] = util.urljoin(self.base_url, data['avn_thumbnail'])
    else:
        item['goods_thumb'] = ''
    item['goods_img'] = item['goods_thumb'].replace('icon_thumb', 'icon_web')
    if 'auxDescription2' in data and data['auxDescription2']:
        item['doc'] = data['auxDescription2']
    else:
        item['doc'] = ''
    min_qty = int(data['xcatField1']) if 'xcatField1' in data else 1
    increment = int(data['multQuantity']) if 'multQuantity' in data else 1
    stock_qty = util.intval(data['inv_strlocqty']) if 'inv_strlocqty' in data else 0
    item['rohs'] = 1 if 'ROHSComplianceCode' in data and data['ROHSComplianceCode'] == 'Y' else 0
    item['tiered'] = [[0, 0.0]]
    item['stock'] = [stock_qty, min_qty]  # stock quantity and minimum order quantity
    item['increment'] = increment
    # Attributes
    item['attr'] = []
    if 'attributes' not in data:
        data['attributes'] = []
    for vo in data['attributes']:
        try:
            item['attr'].append([vo['name'], vo['values'][0]['value']])
        except Exception:
            pass
    # Categories
    item['catlog'] = []
    catelogs = data['parentCatgroup_id_path'].split('_')[-1].split(':')
    for vo in catelogs:
        if vo not in category:
            continue
        item['catlog'].append((category[vo], vo))
    item['region'] = 'AMERICAS'
    item['id'] = 16
    return item
def flush(self, table_name):
    '''Refresh the cached field list for a table.'''
    fields = self.get_fields(table_name)
    if not fields:
        return False
    self.fields[table_name] = fields
    if self.db_fields_cache:
        Util.file(
            '.fields/%s/%s_%s_%s' % (self.host, self.__class__.__name__, self.dbname, table_name),
            fields)
def exception_notice(etype=''):
    """Send an exception notification email."""
    now_minuter = util.date(format='%Y-%m-%d %H:%M')
    subject = '[HQChip] partner stock %s data update exception notice %s' % (PN2, now_minuter)
    if etype == 'mysql':
        except_msg = 'MySQL database connection error'
    elif etype == 'mongo':
        except_msg = 'MongoDB database connection error'
    else:
        except_msg = 'data fetch error'
    body = 'Partner stock %s data update failed to fetch data, reason: %s, please check!' % (PN2, except_msg)
    util.sendmail(config.EMAIL_NOTICE.get('accept_list'), subject=subject, body=body)
def fetch_data(url, proxy=None, headers=None, **kwargs): """获取页面数据 @param proxy 代理ip,[代理数量,代理列表] @param headers 头部信息,如user_agent @param kwargs 扩展参数,如fetch_update其表示是否为获取更新 @return 获取数据异常时返回信息为负值,成功为字典类型数据 :param url: """ if 'goods_sn' in kwargs: del kwargs['goods_sn'] _headers = copy.copy(default_headers) if isinstance(headers, dict): _headers.update(util.rfc_headers(headers)) if url[0:2] == '//': url = 'http:' + url try: proxies = None if proxy: i = random.randint(0, proxy[0] - 1) proxies = {'http': 'http://' + proxy[1][i]} ti_domain = urlparse.urlsplit(url)[1] if 'www.ti.com.cn' == ti_domain: product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE) product_path = product_path_pattern.search(url) if product_path: url = "http://www.ti.com/product/{path}".format( path=product_path.group(1)) elif 'store.ti.com' in ti_domain: kwargs['proxies'] = proxies return _parse_store_ti_com(url, **kwargs) resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies) except Exception as e: # 将进行重试,可忽略 logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' % (util.traceback_info(e), url)) return -400 # 是否需要添加500的判断 # 强制utf-8 resp.encoding = 'utf-8' if '404.html' in resp.url: return 404 if '/tool/' in resp.url: return _parse_tool_detail(resp, **kwargs) kwargs['proxies'] = proxies return _parse_detail_data(resp, headers=_headers, **kwargs)
def load_js(self):
    '''
    Load the JS file.
    :return:
    '''
    file_path = util.get_static_file(self.js_file)
    try:
        with open(file_path, 'r', encoding='utf-8') as fp:
            js_str = fp.read()
    except Exception as e:
        _logger.info('INFO: failed to load js file {0}'.format(util.traceback_info(e)))
        js_str = ''
    return js_str
def parse_resp(self, resp):
    '''
    First-level parsing: collect categories, then crawl the next level.
    :param resp:
    :return:
    '''
    item = GoodsItem()
    category = []
    date_list = util.specified_date(self.start_date, end_date=self.end_date)
    for category_url in category:
        if self.abbreviation and self.abbreviation not in category_url:
            # Skip categories other than the specified one
            # (used when a specific lottery type is given)
            continue
        # Crawl rules go here
        today_url = ''
        # Resolve the table used to store results for this category
        result_key = category_url.split('-')[1]
        demo_test = config.PKS_KEY_DICT.get(result_key, '')
        for history_date in date_list:
            date_time = ''.join(history_date.split('-'))
            url = today_url.replace('today', date_time)
            yield scrapy.Request(url=url, headers=self.headers, callback=self.parse_product,
                                 meta={'item': item})
def save_data(url, db_name, item):
    '''Save the crawled data.'''
    info = None
    if not info:
        item['create_time'] = util.date()
        mysql.insert(db_name, data=item)
        _logger.info('INFO: DB:%s saved, issue no. %s ; URL:%s' % (db_name, item['demo'], url))
    else:
        item['update_time'] = util.date()
        del item['open_time']
        del item['create_time']
        mysql.update(db_name, condition=[('demo', '=', item['demo'])], data=item)
        _logger.info('INFO: DB:%s record already exists, updated, issue no. %s ; URL:%s' % (
            db_name, item['demo'], url))
def handle_of_redirects(item=None):
    item = item if item else {}
    if not item:
        return -404
    search_url = 'http://www.linear.com.cn/search/index.php?q={search}'.format(search=item['goods_name'])
    _headers = copy.copy(default_headers)
    _headers.update({'Host': 'www.linear.com.cn'})
    resp = requests.get(url=search_url, headers=_headers, allow_redirects=False)
    location = util.urljoin(resp.url, resp.headers.get('Location'))
    if 'product/' in location or 'solutions/' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except Exception:
            logger.error("failed to fetch catalog and datasheet, URL: {url}".format(url=location))
            return -404
        return parse_more(item, response)
    elif 'search.php' in location:
        try:
            response = requests.get(url=location, headers=_headers)
        except Exception:
            logger.error("failed to fetch search result list, URL: {url}".format(url=location))
            return -404
        return filter_search_result(item, response)
def parse_resp(self, resp):
    search_match = self.product_url_pattern_0.search(urllib.unquote(resp.url))
    detail_match = self.product_url_pattern_1.search(
        urllib.unquote(resp.url)) or self.product_url_pattern_2.search(urllib.unquote(resp.url))
    print("=" * 30)
    print(resp.url)
    print(urllib.unquote(resp.url))
    print(detail_match)
    print(search_match)
    if detail_match:
        yield self.parse_detail(resp)
    elif search_match:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
        # Total number of search results
        try:
            total = soup.find('h3', class_='results')
            total = util.intval(total.get_text(strip=True)) if total else 0
        except Exception:
            total = 0
        pages = int(math.ceil(total / self.limit_num))
        if pages <= 1:
            return
        for x in range(1, pages + 1):
            page_url = "http://cn.futureelectronics.com/zh/search.aspx?dsNav=Ro:%d,Aro:%d" % (
                x * 10, x * 10)
            search_id = search_match.group(1)
            page_url = page_url + ',N:{search_id}'.format(search_id=search_id)
            yield Request(url=page_url, headers=self.headers, cookies=self.cookies)
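# Sketch of the pagination used in parse_resp above: the result count is split
# into pages of self.limit_num, and each page URL carries an 'Ro:'/'Aro:' offset
# stepping in multiples of 10. total=43, limit_num=10 and the search_id are
# illustrative values; float() is used so the math also holds under Python 2
# integer division.
import math

def _demo_page_urls(total=43, limit_num=10, search_id='4294952442'):
    pages = int(math.ceil(total / float(limit_num)))
    urls = []
    for x in range(1, pages + 1):
        page_url = "http://cn.futureelectronics.com/zh/search.aspx?dsNav=Ro:%d,Aro:%d" % (
            x * 10, x * 10)
        urls.append(page_url + ',N:{search_id}'.format(search_id=search_id))
    return urls  # 5 page URLs for 43 results at 10 per page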
def process_request(self, request, spider):
    scheme, url, port = util.get_host(request.url)
    try:
        proxies = get_proxies(settings['USE_PROXY'])
    except:
        raise NotConfigured
    request.meta["proxy"] = proxies[scheme]
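# Hedged usage note: a downloader middleware like the one above is typically
# enabled in the Scrapy project's settings.py. The module path and priority
# below are assumptions, as is the USE_PROXY setting consumed by get_proxies().
DOWNLOADER_MIDDLEWARES = {
    'project.middlewares.ProxyMiddleware': 543,  # hypothetical path to the class above
}
USE_PROXY = 1  # read via settings['USE_PROXY'] in process_request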
def write_update_info(self, num_list):
    '''Record update statistics.
    @param num_list number of records updated in this run
    '''
    if not num_list:
        return None
    mq.put('crawler_update_stats', {'data': num_list, 'time': util.date()})
def _init_args(self, **kwargs):
    start_url = kwargs.get('START_URL', '')
    self.abbreviation = kwargs.get('ABBREVIATION', '')
    self.start_date = kwargs.get('START_DATE', '')
    self.end_date = kwargs.get('END_DATE', '')
    self.end_date = self.end_date if self.end_date else util.date()
    if start_url:
        self.start_urls = [start_url]
    self.rules = (Rule(LinkExtractor(allow=filter_rules), callback='parse_resp', follow=True), )
def _check_table_info(self, table_name):
    # Only runs the first time a table is accessed
    if table_name not in self.fields:
        # If the table fields are not defined yet, fetch them automatically
        if self.db_fields_cache:
            self.fields[table_name] = Util.file(
                '.fields/%s/%s_%s_%s' % (self.host, self.__class__.__name__, self.dbname, table_name))
            if not self.fields[table_name]:
                self.flush(table_name)
        else:
            # Read the table info from the database on every call
            self.flush(table_name)
def get_proxies(proxies_type=1):
    '''
    Return a proxy of the given type; the pool is refilled with 20 proxies at a time.
    :param proxies_type: int, proxy type
    :return: proxies_dict
    '''
    if queue.qsize() > 0:
        return queue.get()
    if proxies_type == 1:
        proxies = util.get_abuyun_proxies()
        for i in range(20):
            queue.put(proxies)
    else:
        get_web_proxy()
    return queue.get()
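# Minimal sketch of the refill-then-get pattern in get_proxies, using a plain
# queue.Queue and a stubbed proxy source so it runs standalone; util.get_abuyun_proxies
# and get_web_proxy from the real code are replaced by a placeholder here.
try:
    import queue as _queue_mod   # Python 3
except ImportError:
    import Queue as _queue_mod   # Python 2

_demo_queue = _queue_mod.Queue()

def _demo_get_proxy():
    if _demo_queue.qsize() > 0:
        return _demo_queue.get()
    proxies = {'http': 'http://127.0.0.1:8888'}   # placeholder proxy
    for _ in range(20):                           # refill the pool with 20 copies
        _demo_queue.put(proxies)
    return _demo_queue.get()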
def run(args):
    if not isinstance(args, argparse.Namespace):
        print('invalid arguments')
        return
    interval = args.interval
    while 1:
        try:
            PutQueue(**args.__dict__)
            if args.interval <= 0:
                break
            print('------------- sleep %s sec -------------' % interval)
            time.sleep(interval)
        except Exception as e:
            if 'params_error' in str(e):
                break
            print(util.traceback_info(e, return_all=True))
def get_time_desc(t):
    """
    Build a human-readable duration string.
    :param t: duration in seconds
    :return:
    """
    _time_desc = ''
    h = int(t / 3600)
    if h >= 1:
        _time_desc += '%s hours ' % h
    m = int((t - h * 3600) / 60)
    if m >= 1:
        _time_desc += '%s minutes ' % m
    s = util.number_format(t - h * 3600 - m * 60, 3)
    if s >= 0:
        _time_desc += '%s seconds' % s
    return _time_desc
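# Standalone variant of get_time_desc for illustration, with round() standing in
# for util.number_format (an assumption about that helper's behaviour).
def _demo_time_desc(t):
    desc = ''
    h = int(t / 3600)
    if h >= 1:
        desc += '%s hours ' % h
    m = int((t - h * 3600) / 60)
    if m >= 1:
        desc += '%s minutes ' % m
    s = round(t - h * 3600 - m * 60, 3)
    if s >= 0:
        desc += '%s seconds' % s
    return desc

# _demo_time_desc(3725.5) -> '1 hours 2 minutes 5.5 seconds'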
def fetch_data(url, proxy=None, headers=None, **kwargs):
    '''Fetch page data.
    @param proxy proxy pool, as [proxy_count, proxy_list]
    @param headers extra request headers, e.g. user_agent
    @param kwargs extended options, e.g. fetch_update marks an update fetch
    @return a negative status code on request failure, a dict on success
    '''
    _headers = headers if isinstance(headers, dict) else default_headers
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        sess = requests.Session()
        rs = sess.get(url, headers=_headers, cookies=_cookies, timeout=30, proxies=proxies)
    except Exception as e:
        # Will be retried by the caller; safe to ignore here
        _logger.info('STATUS:-400 ; INFO: request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400
    if rs.status_code != 200:
        if rs.status_code == 500:
            _logger.debug('STATUS:-500 ; INFO: request blocked ; PROXY:%s ; URL:%s ; User-Agent:%s' % (
                proxies['http'] if proxy else '', url, _headers.get('user_agent', '')))
            return -500
        # Product removed (URL no longer exists)
        elif rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO: request failed ; URL:%s' % url)
            return 404
        _logger.debug('STATUS:-405 ; INFO: request failed, HTTP status %s ; PROXY:%s ; URL:%s' % (
            rs.status_code, proxies['http'] if proxy else '', url))
        return -405
    # Force utf-8 decoding
    rs.encoding = 'utf-8'
    return _parse_detail_data(rs.text, url=url, **kwargs)
def parse(self, resp):
    systems_catalog = 0
    try:
        product_dict = json.loads(resp.text.encode('utf-8'))
        systems_catalog = resp.meta.get('systemsCatalog')
        total_match_count_string = util.intval(product_dict.get('totalMatchCountString'))
        pages = int(math.ceil(total_match_count_string / self.limit_num))
        for pageNum in xrange(1, pages + 1):
            self.form_data['pageNum'] = str(pageNum)
            yield Request(url=self.processData_url, method='POST', headers=self.headers,
                          body=json.dumps(self.form_data),
                          meta={'systemsCatalog': systems_catalog},
                          callback=self.parse_detail)
    except:
        logger.exception('Parse error, systemsCatalog: %s', systems_catalog)
def fetch_data(self):
    '''Fetch page data.'''
    headers = self.headers if self.headers else DEFAULT_HEADER
    try:
        sess = requests.Session()
        print('fetching url: {0}'.format(self.url))
        if self.method == 'GET':
            rs = sess.get(self.url, headers=headers, cookies=None, timeout=30, proxies=None)
        elif self.method == 'POST':
            rs = sess.post(self.url, data=self.form_data, headers=headers, cookies=None,
                           timeout=30, proxies=None)
        else:
            _logger.info('INFO: request method not defined ; URL: {0}'.format(self.url))
            return -400
        print('rs', rs)
        print(rs.text)
    except Exception as e:
        # Will be retried by the caller; safe to ignore here
        _logger.info('STATUS:-400 ; INFO: request failed, %s ; URL:%s' % (util.traceback_info(e), self.url))
        return -400
    if rs.status_code != 200:
        if rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO: request failed ; URL:%s' % self.url)
            return 404
    # Use the detected encoding instead of forcing utf-8
    # rs.encoding = 'utf-8'
    rs.encoding = rs.apparent_encoding
    return self._parse_detail_data(rs.content)
def _fetch_data(self, fn, data_list=[], **kwargs):
    """Fetch data through the given function and normalize the result."""
    try:
        data = fn(**kwargs)
        # When fn is fetch_data, a failure or exception is returned as a status code
        if isinstance(data, dict):
            data['id'] = kwargs['id']
            data['status'] = 200
            data_list.append(data)
        elif fn.func_name == 'fetch_data':
            del kwargs['headers']
            kwargs['status'] = data
            kwargs['count'] = kwargs.get('count', 1)
            if data in (404, 405):
                kwargs['list'] = []
            data_list.append(kwargs)
        return data
    except Exception as e:
        print(util.binary_type(e))
        return None
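# Sketch of how _fetch_data normalizes results: a dict result is treated as a
# successful record (status 200), while an int status code would be folded back
# into the kwargs so the caller can retry. fake_fetch below is a stand-in, not
# a real supplier module.
def _demo_fetch_wrapper():
    data_list = []

    def fake_fetch(**kwargs):      # stand-in for a supplier fetch_data
        return {'goods_sn': 'ABC123', 'tiered': [[1, 0.5]]}

    data = fake_fetch(id=42)
    if isinstance(data, dict):
        data['id'] = 42
        data['status'] = 200
        data_list.append(data)
    return data_list               # [{'goods_sn': 'ABC123', 'tiered': [[1, 0.5]], 'id': 42, 'status': 200}]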
def start_requests(self):
    match = []
    url = self.start_urls[0]
    rs = requests.get(url, headers=self.headers)
    js_cookies = {}
    for vo in rs.cookies:
        js_cookies[vo.name] = vo.value
    rs = requests.get(url, headers=self.headers, cookies=js_cookies)
    js_cookies = _parse_incapsula_page(rs.text, cookies=js_cookies, headers=self.headers)
    resp = requests.get(
        url='https://www.ttiinc.com/content/ttiinc/en/manufacturers.html',
        headers=self.headers, cookies=js_cookies)
    manufacturers = re.findall(
        r'(/content/ttiinc/en/manufacturers/.*/(.*).html)', resp.text.encode('utf-8'))
    for v, k in manufacturers:
        self.manufacturers[k] = util.urljoin(self.tti, v)
    rs = requests.get(url, headers=self.headers, cookies=js_cookies)
    match = re.findall(r'/.*/part-search.html.*systemsCatalog=(\d+)', rs.text.encode('utf-8'))
    # if not match:
    #     with open(os.path.split(os.path.realpath(__file__))[0] + r'\tti_category_values.txt', 'r') as fp:
    #         for line in fp.readlines():
    #             match.append(line.strip())
    for systems_catalog in match:
        try:
            self.form_data['systemsCatalog'] = systems_catalog
            # print '*' * 50
            # print self.form_data
            yield Request(url=self.processData_url, method='POST', headers=self.headers,
                          body=json.dumps(self.form_data),
                          meta={'systemsCatalog': systems_catalog})
        except:
            logger.exception('Request error, systemsCatalog: %s', systems_catalog)
def update_data(self, queue_name=None):
    """Update data for the given queue."""
    if not queue_name:
        return 0
    qsize = mq.qsize(queue_name)
    # Number of records to update in this batch
    self.limit = self.limit if qsize > self.limit else qsize
    queue_list = []
    for i in range(self.limit):
        queue_data = mq.get(queue_name)
        if queue_data and queue_data not in queue_list:
            queue_list.append(queue_data)
    if not queue_list:
        print('waiting, queue %s is empty' % queue_name)
        return 0
    proxy = None
    if not self.no_proxy:
        proxy = self.get_prolist()
    tlist = []
    data_list = []
    total_num = 0
    for data in queue_list:
        # Skip invalid queue entries
        if 'id' not in data:
            continue
        if 'proxy' in data:
            del data['proxy']
        try:
            if len(tlist) > 30:
                for t in tlist:
                    t.join(45)
        except (KeyboardInterrupt, SystemExit):
            mq.put(queue_name, queue_data)
            return 0
        # Total number of valid queue entries (not the number of parts)
        total_num += 1
        t = threading.Thread(target=self.fetch_update_data, args=(data_list, proxy), kwargs=data)
        tlist.append(t)
        t.start()
        time.sleep(1)
    del data, queue_list
    # Wait for all worker threads to finish before processing the results
    for t in tlist:
        t.join(45)
    valid_num = 0
    delete_list = []
    for data in data_list:
        if not data:
            continue
        if data['status'] == 200:
            # Hand over to the queue of data waiting to be committed
            mq.put(config.WAIT_UPDATE_QUEUE, data['dlist'])
            valid_num += 1
            id = data.get('dlist').get('id')
            lottery_name = data.get('dlist').get('lottery_name')
            status = data.get('status')
            config.LOG.info('ID:{0} ; product: {1} ; fetched OK: {2} ; pushed to commit queue: {3}!'.format(
                id, lottery_name, status, config.WAIT_UPDATE_QUEUE))
            continue
        else:
            delete_list.append(data)
            count = data.get('count', '')
            if count and count < self.exception_threshold:
                # Retry count still below the threshold: put it back in the queue
                config.LOG.info('ID:%s, update status:%s, re-queueing!' % (data.get('id'), data['status']))
                # update_list.append(data)
                mq.put(queue_name, data)
            else:
                config.LOG.error('ID:%s, update status:%s, retry limit exceeded, saving to the log queue!' % (
                    data.get('id'), data['status']))
                if 'count' in data:
                    del data['count']
                if 'time' not in data:
                    data['time'] = util.date()
                # db.mongo['update_exception_logs'].insert(data)
                mq.put('update_exception_logs', data)
    self.write_update_info(valid_num)
    print('queue %s: %s records updated successfully this run, success rate: %s %%' % (
        queue_name, valid_num, valid_num * 1.0 / total_num * 100 if total_num > 0 else 0))
    print('done, waiting for the next queue!')
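# Simplified, self-contained sketch of the fan-out pattern in update_data: one
# worker thread per queue entry appending into a shared list, joined before the
# results are processed. The worker body here is a placeholder for
# fetch_update_data.
import threading
import time

def _demo_fanout(queue_list):
    data_list = []
    threads = []

    def worker(results, **data):
        time.sleep(0.01)                       # stand-in for the real fetch
        results.append({'id': data.get('id'), 'status': 200})

    for data in queue_list:
        t = threading.Thread(target=worker, args=(data_list,), kwargs=data)
        threads.append(t)
        t.start()
    for t in threads:                          # wait for every worker before processing
        t.join(45)
    return data_list

# _demo_fanout([{'id': 1}, {'id': 2}]) -> two result dicts with status 200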
def fetch_search_data(self, data_list=[], err_list=[], proxy=None, supp=None, **kwargs):
    """Fetch product data by search keyword (either product URLs or full details)."""
    if not supp or 'keyword' not in kwargs:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    keyword = util.u2b(kwargs['keyword'])
    supplier_name = config.DB_KEY[supp]
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if hasattr(obj, 'api_search_data'):
            _fetch_function = getattr(obj, 'api_search_data')
        else:
            _fetch_function = getattr(obj, 'fetch_search_data')
    except Exception as e:
        config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -401
            kwargs['count'] = kwargs.get('count', 1) + 1
            err_list.append(kwargs)
        return None
    data_dict = {
        'detail': [],
        'list': [],
        'url': []
    }
    if self.optype == 'hot' and self.use:
        kwargs['hot_search'] = True
    del kwargs['keyword']
    try:
        _fetch_function(keyword, supp, data_dict, headers, **kwargs)
    except Exception as e:
        config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -402
            kwargs['count'] = kwargs.get('count', 1) + 1
            kwargs['keyword'] = keyword
            err_list.append(kwargs)
        return None
    if data_dict['list']:
        try:
            _fetch_function = getattr(obj, 'fetch_search_list')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
            if 'url' in res:
                for url in res['url']:
                    data_dict['url'].append(url)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    if data_dict['url']:
        try:
            _fetch_function = getattr(obj, 'fetch_data')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    for data in data_dict['detail']:
        # Per-record cleaning and normalization would go here
        data_list.append(data)
    return data_list
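# Sketch of the data_dict routing used by fetch_search_data: a supplier search
# call may return full records ('detail'), intermediate list pages ('list'), or
# bare product URLs ('url'); list pages and URLs are crawled further until
# everything ends up under 'detail'. The expansion steps below are placeholders.
def _demo_route(data_dict):
    detail = data_dict.setdefault('detail', [])
    urls = data_dict.setdefault('url', [])
    for page in data_dict.get('list', []):
        urls.append('http://www.example.com/product/%s' % page)  # placeholder list-page expansion
    for url in urls:
        detail.append({'url': url})                              # placeholder detail fetch
    return detail

# _demo_route({'detail': [], 'list': ['p1'], 'url': []}) -> [{'url': 'http://www.example.com/product/p1'}]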