Exemple #1
0
    def process_item(self, item, spider):
        """Insert or update the item's row in MySQL, then drop the item.

        The target table name is read from ``item['demo_test']`` and the row
        is matched on ``item['demo']``. When no matching row is found the item
        is inserted; otherwise the stored ``create_time`` is kept and the row
        is updated with the item's current contents.

        :param item: mapping-like scraped item; must carry truthy ``demo``
            and ``demo_test`` values.
        :param spider: spider that produced the item (unused here).
        :raises DropItem: always -- with a validation message when ``demo``
            or ``demo_test`` is missing, and with ``'success process'`` once
            the database step has been attempted (whether or not it worked).
        """
        demo = item.get('demo')
        if not demo:
            raise DropItem("item data type error")

        demo_test = item.get('demo_test', '')
        if not demo_test:
            raise DropItem("demo_test is empty")

        condition = {'demo': demo}
        try:
            info = self.mysql.select(demo_test, condition=condition, limit=1)
            # Read the clock once so a freshly inserted row gets identical
            # create/update timestamps.
            now = util.date()
            if not info:
                item['create_time'] = now
                item['update_time'] = now
                self.mysql.insert(demo_test, data=item)
            else:
                # Preserve the original creation time of the existing row.
                item['create_time'] = info['create_time']
                item['update_time'] = now
                self.mysql.update(demo_test, condition=condition, data=item)
        except Exception as e:
            # Best-effort persistence: a database failure must not abort the
            # crawl, but it is reported at error level so it is not filtered
            # out together with routine info messages.
            _logger.error('error op mysql : {0}  : e {1}'.format(demo, e))
        # Raised unconditionally, so the item is never returned from this
        # method (presumably to stop any later pipeline stage -- confirm).
        raise DropItem('success process')
Exemple #2
0
def save_data(url, db_name, item):
    """Persist ``item`` into the table ``db_name`` and log the outcome.

    :param url: source URL of the record; only used in the log message.
    :param db_name: name passed to ``mysql.insert`` / ``mysql.update`` as the
        target table.
    :param item: mutable mapping holding the record; ``create_time`` is
        stamped onto it before the insert.
    """
    # NOTE(review): no lookup is performed here, so ``existing`` is always
    # None and the update path below is currently unreachable -- confirm
    # whether the existence check was disabled on purpose.
    existing = None

    if existing:
        item['update_time'] = util.date()
        del item['open_time']
        del item['create_time']
        match_on = [('demo', '=', item['demo'])]
        mysql.update(db_name, condition=match_on, data=item)
        message = 'INFO:  DB:%s 数据已存在 更新成功, 期号: %s ; URL:%s' % (db_name, item['demo'], url)
        _logger.info(message)
        return

    # New record: stamp the creation time, insert, then report success.
    item['create_time'] = util.date()
    mysql.insert(db_name, data=item)
    message = 'INFO:  DB:%s 数据保存成功, 期号%s ; URL:%s' % (db_name, item['demo'], url)
    _logger.info(message)
Exemple #3
0
    def write_update_info(self, num_list):
        """Publish update statistics to the ``crawler_update_stats`` queue.

        Nothing is published when ``num_list`` is falsy (empty, ``0`` or
        ``None``).

        :param num_list: statistics payload for this update run; stored under
            the ``data`` key together with the current time.
        :return: always ``None``.
        """
        if num_list:
            stats = {'data': num_list, 'time': util.date()}
            mq.put('crawler_update_stats', stats)
Exemple #4
0
    def _init_args(self, **kwargs):
        """Populate spider settings from upper-case keyword arguments.

        Recognised keys: ``START_URL``, ``ABBREVIATION``, ``START_DATE`` and
        ``END_DATE``. A missing or empty ``END_DATE`` falls back to the
        current date from ``util.date()``. ``start_urls`` is only assigned
        when a start URL is supplied. The crawl rules are always rebuilt.
        """
        self.abbreviation = kwargs.get('ABBREVIATION', '')
        self.start_date = kwargs.get('START_DATE', '')
        # An empty end date means "up to now".
        self.end_date = kwargs.get('END_DATE', '') or util.date()

        seed_url = kwargs.get('START_URL', '')
        if seed_url:
            self.start_urls = [seed_url]

        # Follow every link matched by ``filter_rules`` and hand the response
        # to the ``parse_resp`` callback.
        link_extractor = LinkExtractor(allow=filter_rules)
        crawl_rule = Rule(link_extractor, callback='parse_resp', follow=True)
        self.rules = (crawl_rule,)
def exception_notice(etype=''):
    """Email an exception notice to the configured recipients.

    :param etype: failure category. ``'mysql'`` and ``'mongo'`` select a
        database-connection message; any other value yields the generic
        data-fetch message.
    """
    # Start from the generic reason and specialise for known categories.
    except_msg = '数据获取异常'
    if etype == 'mysql':
        except_msg = 'mysql数据库连接异常'
    elif etype == 'mongo':
        except_msg = 'mongodb 数据库连接异常'

    # The subject carries a minute-resolution timestamp.
    timestamp = util.date(format='%Y-%m-%d %H:%M')
    subject = '【HQChip】合作库存 %s 数据更新异常通知 %s' % (PN2, timestamp)
    body = "合作库存 %s 数据更新数据获取异常, 异常原因:%s,请注意检查!" % (PN2, except_msg)

    recipients = config.EMAIL_NOTICE.get('accept_list')
    util.sendmail(recipients, subject=subject, body=body)
Exemple #6
0
    def update_data(self, queue_name=None):
        """Drain one batch from ``queue_name``, fetch updates and route results.

        Up to ``self.limit`` distinct entries are taken from the queue. Each
        entry that has an ``id`` key is handed to ``self.fetch_update_data``
        on its own thread (the worker is presumably expected to append a
        result dict to the shared list it receives -- confirm against
        ``fetch_update_data``). Once the workers have been joined, results
        with ``status == 200`` are pushed to ``config.WAIT_UPDATE_QUEUE``;
        other results are re-queued while their ``count`` is below
        ``self.exception_threshold`` and otherwise sent to the
        ``update_exception_logs`` queue.

        :param queue_name: name of the queue to process; a falsy value makes
            the call a no-op.
        :return: ``0`` when there is nothing to do or dispatching was
            interrupted, otherwise ``None``.
        """
        if not queue_name:
            return 0
        qsize = mq.qsize(queue_name)
        # Batch size for this call only. It is kept local so that one short
        # (or empty) queue does not permanently shrink ``self.limit``.
        batch_size = min(self.limit, qsize)
        queue_list = []

        for _ in range(batch_size):
            queue_data = mq.get(queue_name)
            # Skip empty reads and duplicates within the same batch.
            if queue_data and queue_data not in queue_list:
                queue_list.append(queue_data)

        if not queue_list:
            print('等待中,队列 %s 为空' % queue_name)
            return 0
        proxy = None
        if not self.no_proxy:
            proxy = self.get_prolist()
        tlist = []
        data_list = []
        total_num = 0

        for data in queue_list:
            # Entries without an id are invalid and ignored.
            if 'id' not in data:
                continue
            if 'proxy' in data:
                del data['proxy']

            try:
                # Throttle: once more than 30 workers are outstanding, wait
                # for them and keep only those still running.
                if len(tlist) > 30:
                    for t in tlist:
                        t.join(45)
                    tlist = [t for t in tlist if t.is_alive()]
            except (KeyboardInterrupt, SystemExit):
                # Put back the entry whose dispatch was interrupted.
                mq.put(queue_name, data)
                return 0

            # Number of valid queue entries dispatched (not product count).
            total_num += 1
            t = threading.Thread(target=self.fetch_update_data,
                                 args=(data_list, proxy), kwargs=data)
            tlist.append(t)
            t.start()
            time.sleep(1)

        # Wait for the outstanding workers so their results are in
        # ``data_list`` before it is processed.
        for t in tlist:
            t.join(45)

        valid_num = 0

        for data in data_list:
            if not data:
                continue
            if data['status'] == 200:
                # Successful fetch: hand over to the queue awaiting storage.
                mq.put(config.WAIT_UPDATE_QUEUE, data['dlist'])
                valid_num += 1
                item_id = data.get('dlist').get('id', )
                lottery_name = data.get('dlist').get('lottery_name', )
                status = data.get('status')
                config.LOG.info('ID:{0} ;产品: {1} ;数据获取成功:{2} ;提交到入库队列:  {3} !'.format(item_id, lottery_name, status,
                                                                                      config.WAIT_UPDATE_QUEUE))
                continue

            count = data.get('count', '')
            if count and count < self.exception_threshold:
                # Retry budget not exhausted yet: queue it again.
                config.LOG.info('ID:%s,更新状态:%s, 重新入队中!' % (data.get('id', ), data['status']))
                mq.put(queue_name, data)
            else:
                # Missing/zero count or budget exhausted: record the failure.
                config.LOG.error('ID:%s,更新状态:%s, 重试次数超过阀值,保存日志中!' % (data.get('id', ), data['status']))
                if 'count' in data:
                    del data['count']
                if 'time' not in data:
                    data['time'] = util.date()
                mq.put('update_exception_logs', data)

        self.write_update_info(valid_num)
        print('队列 %s 本次共有 %s 条数据更新成功,成功率:%s %%' %
              (queue_name, valid_num, valid_num * 1.0 / total_num * 100 if total_num > 0 else 0))
        print('完成 , 等待下一个队列!')