def process_item(self, item, spider):
    """Insert or update the MySQL row that corresponds to ``item``.

    The target table name is taken from ``item['demo_test']`` and the row is
    matched on ``item['demo']``. When no row matches, the item is inserted
    with fresh ``create_time``/``update_time`` values; otherwise the stored
    ``create_time`` is kept and only ``update_time`` is refreshed before the
    update.

    @param item    mapping-like item; must carry truthy ``demo`` and
                   ``demo_test`` values
    @param spider  not used by this method
    @raise DropItem  always: with an explanatory message when ``demo`` or
                     ``demo_test`` is missing, and with 'success process'
                     once the database step has been attempted (even if it
                     failed and was only logged)
    """
    demo = item.get('demo')
    if not demo:
        raise DropItem("item data type error")

    # A truthy ``demo`` already proves the item is non-empty, so the former
    # deep copy + emptiness check could never fire and has been removed.
    demo_test = item.get('demo_test', '')
    if not demo_test:
        raise DropItem("demo_test is empty")

    condition = {'demo': demo}
    try:
        info = self.mysql.select(demo_test, condition=condition, limit=1)
        # Read the clock once so a new row gets identical create/update
        # timestamps instead of two values that may straddle a second.
        now = util.date()
        if not info:
            item['create_time'] = now
            item['update_time'] = now
            self.mysql.insert(demo_test, data=item)
        else:
            # NOTE(review): presumably select(limit=1) returns a single row
            # mapping; confirm against the mysql helper.
            item['create_time'] = info['create_time']
            item['update_time'] = now
            self.mysql.update(demo_test, condition=condition, data=item)
    except Exception as e:
        # Best effort: a database failure is logged and the item is still
        # dropped below.
        _logger.info('error op mysql : {0} : e {1}'.format(demo, e))
    raise DropItem('success process')
def save_data(url, db_name, item):
    """Insert ``item`` into ``db_name`` or update the row that already exists.

    Rows are matched on ``item['demo']``. Previously the lookup result was
    hard-coded to ``None``, so the update branch was unreachable and every
    call inserted a new row.

    @param url      source URL, used only in the log message
    @param db_name  name of the target table passed to the mysql helper
    @param item     dict-like record; must contain ``demo``. It is mutated:
                    ``create_time`` is set on insert; on update
                    ``update_time`` is set and ``open_time``/``create_time``
                    are removed so the stored values are not overwritten
    """
    condition = [('demo', '=', item['demo'])]
    # NOTE(review): assumes the module-level ``mysql`` helper exposes
    # select(table, condition=..., limit=1) and accepts the same condition
    # format as update() -- confirm against the helper.
    info = mysql.select(db_name, condition=condition, limit=1)
    if not info:
        item['create_time'] = util.date()
        mysql.insert(db_name, data=item)
        _logger.info('INFO: DB:%s 数据保存成功, 期号%s ; URL:%s' % (db_name, item['demo'], url))
    else:
        item['update_time'] = util.date()
        # pop() instead of del: a freshly scraped item may not carry these
        # keys, and their absence must not abort the update.
        item.pop('open_time', None)
        item.pop('create_time', None)
        mysql.update(db_name, condition=condition, data=item)
        _logger.info('INFO: DB:%s 数据已存在 更新成功, 期号: %s ; URL:%s' % (db_name, item['demo'], url))
def write_update_info(self, num_list):
    """Publish update statistics to the ``crawler_update_stats`` queue.

    @param num_list  statistics for the current update round; any falsy
                     value (``None``, ``0``, an empty container) is skipped
                     and nothing is queued
    @return  always ``None``
    """
    if num_list:
        stats = {'data': num_list, 'time': util.date()}
        mq.put('crawler_update_stats', stats)
    return None
def _init_args(self, **kwargs):
    """Populate instance settings from upper-case keyword options.

    Recognised keys: ``START_URL``, ``ABBREVIATION``, ``START_DATE`` and
    ``END_DATE``; each defaults to an empty string when absent.

    - ``abbreviation`` and ``start_date`` are stored as given.
    - ``end_date`` falls back to ``util.date()`` when missing or empty.
    - ``start_urls`` is replaced only when a non-empty ``START_URL`` is given.
    - ``rules`` is always set to a single link-following rule that filters
      on ``filter_rules`` and calls back into ``parse_resp``.
    """
    self.abbreviation = kwargs.get('ABBREVIATION', '')
    self.start_date = kwargs.get('START_DATE', '')
    self.end_date = kwargs.get('END_DATE', '') or util.date()

    start_url = kwargs.get('START_URL', '')
    if start_url:
        self.start_urls = [start_url]

    link_rule = Rule(LinkExtractor(allow=filter_rules),
                     callback='parse_resp', follow=True)
    self.rules = (link_rule,)
def exception_notice(etype=''):
    """Send an e-mail notice about a failed data update.

    The subject and body are built from ``PN2`` and a minute-resolution
    timestamp, and the mail goes to the ``accept_list`` entry of
    ``config.EMAIL_NOTICE``.

    @param etype  ``'mysql'`` or ``'mongo'`` selects the matching
                  connection-failure reason; any other value uses the
                  generic data-fetch reason
    """
    timestamp = util.date(format='%Y-%m-%d %H:%M')
    subject = '【HQChip】合作库存 %s 数据更新异常通知 %s' % (PN2, timestamp)

    # Start from the generic reason and specialise for known failure types.
    reason = '数据获取异常'
    if etype == 'mysql':
        reason = 'mysql数据库连接异常'
    elif etype == 'mongo':
        reason = 'mongodb 数据库连接异常'

    body = "合作库存 %s 数据更新数据获取异常, 异常原因:%s,请注意检查!" % (PN2, reason)
    recipients = config.EMAIL_NOTICE.get('accept_list')
    util.sendmail(recipients, subject=subject, body=body)
def update_data(self, queue_name=None):
    """Process one batch of entries from ``queue_name``.

    Steps:
      1. Pull up to ``self.limit`` distinct, non-empty entries off the queue.
      2. Start one ``fetch_update_data`` thread per entry that has an ``id``.
         The thread receives the shared result list, the proxy and the
         entry's own keys as keyword arguments.
      3. Wait for the workers, then route every result:
         - status 200: its ``dlist`` goes to ``config.WAIT_UPDATE_QUEUE``;
         - otherwise, re-queued while ``count`` is below
           ``self.exception_threshold``;
         - otherwise, stored in the ``update_exception_logs`` queue.
      4. Record the number of successes via ``write_update_info`` and print
         a summary.

    @param queue_name  name of the queue to drain; falsy means "do nothing"
    @return  ``0`` when there is nothing to do or the wait for workers was
             interrupted; ``None`` after a batch has been processed
    """
    if not queue_name:
        return 0

    # Size the batch with a local value. Writing the result back to
    # self.limit (as before) shrank the limit permanently: one empty queue
    # set it to 0 and every later call processed nothing.
    batch_size = min(self.limit, mq.qsize(queue_name))

    queue_list = []
    for _ in range(batch_size):
        queue_data = mq.get(queue_name)
        if queue_data and queue_data not in queue_list:
            queue_list.append(queue_data)
    if not queue_list:
        print('等待中,队列 %s 为空' % queue_name)
        return 0

    proxy = None if self.no_proxy else self.get_prolist()

    tlist = []
    data_list = []  # handed to every worker; presumably workers append results
    total_num = 0
    for index, data in enumerate(queue_list):
        # Entries without an id are invalid and silently skipped.
        if 'id' not in data:
            continue
        if 'proxy' in data:
            del data['proxy']
        try:
            # Once more than 30 threads have been started, wait (up to 45s
            # each) for the earlier ones before starting another.
            if len(tlist) > 30:
                for t in tlist:
                    t.join(45)
        except (KeyboardInterrupt, SystemExit):
            # Put back everything that has not been dispatched yet. The old
            # code re-queued the stale ``queue_data`` left over from the
            # fetch loop, which lost the pending entries.
            for pending in queue_list[index:]:
                mq.put(queue_name, pending)
            return 0
        # Count of valid queue entries (not of individual products).
        total_num += 1
        t = threading.Thread(target=self.fetch_update_data,
                             args=(data_list, proxy), kwargs=data)
        tlist.append(t)
        t.start()
        time.sleep(1)

    # Results must only be read after the workers are done; without this
    # final join the last threads' results were routinely missed.
    for t in tlist:
        t.join(45)

    valid_num = 0
    for data in data_list:
        if not data:
            continue
        if data['status'] == 200:
            # Successful fetch: hand the payload to the persistence queue.
            mq.put(config.WAIT_UPDATE_QUEUE, data['dlist'])
            valid_num += 1
            dlist = data.get('dlist')
            config.LOG.info('ID:{0} ;产品: {1} ;数据获取成功:{2} ;提交到入库队列: {3} !'.format(
                dlist.get('id'), dlist.get('lottery_name'),
                data.get('status'), config.WAIT_UPDATE_QUEUE))
            continue

        count = data.get('count', '')
        if count and count < self.exception_threshold:
            # Retry budget not exhausted yet: put the entry back.
            config.LOG.info('ID:%s,更新状态:%s, 重新入队中!'
                            % (data.get('id'), data['status']))
            mq.put(queue_name, data)
        else:
            config.LOG.error('ID:%s,更新状态:%s, 重试次数超过阀值,保存日志中!'
                             % (data.get('id'), data['status']))
            data.pop('count', None)
            if 'time' not in data:
                data['time'] = util.date()
            mq.put('update_exception_logs', data)

    self.write_update_info(valid_num)
    success_rate = valid_num * 1.0 / total_num * 100 if total_num > 0 else 0
    print('队列 %s 本次共有 %s 条数据更新成功,成功率:%s %%'
          % (queue_name, valid_num, success_rate))
    print('完成 , 等待下一个队列!')