Beispiel #1
0
    def get_pintuan_goods_info(self):
        '''
        Crawl the zhe800 pintuan listing API and store every new product.

        Listing pages 0..99 are requested until one comes back as an empty
        list. The ``zid`` of every deal is collected, and each zid that is not
        yet present in the database is fetched, parsed and inserted.

        :return: None
        '''
        # zid -> page on which the zid was first seen. Keying on the zid alone
        # guarantees that a product listed on several pages is crawled and
        # inserted only once per run.
        zid_to_page = {}
        for page in range(0, 100):
            tmp_url = 'https://pina.m.zhe800.com/nnc/list/deals.json?page={0}&size=500'.format(
                str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            # NOTE(review): get_url_body is assumed to return a list of deal
            # dicts (empty list when there are no more pages) -- confirm.
            tmp_data = self.get_url_body(tmp_url=tmp_url)

            if tmp_data == []:
                print('该tmp_url得到的object为空list, 此处跳过!')
                break

            for deal in tmp_data:
                zid = deal.get('product', {}).get('zid', '')
                # Drop deals without a zid: the detail URL could not be built.
                if zid != '':
                    zid_to_page.setdefault(zid, page)

        zid_list = list(zid_to_page.items())
        print('该zid_list的总个数为: ', len(zid_list))
        print(zid_list)

        zhe_800_pintuan = Zhe800PintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            try:
                # A set gives O(1) "already stored?" checks in the loop below.
                db_goods_ids = {
                    row[0]
                    for row in my_pipeline.select_zhe_800_pintuan_all_goods_id()
                }
            except TypeError:
                # Same handling as run_forever: a non-iterable result is
                # treated as a failed query.
                print('TypeError错误, 原因数据库连接失败...(可能维护中)')
                db_goods_ids = None

            if db_goods_ids is not None:
                for zid, page in zid_list:
                    if zid in db_goods_ids:
                        print('该goods_id已经存在于数据库中, 此处跳过')
                        continue

                    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(zid)
                    goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)

                    zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    goods_data = zhe_800_pintuan.deal_with_data()

                    if goods_data == {}:
                        # Nothing could be parsed for this product; skip it.
                        continue

                    goods_data['goods_id'] = str(zid)
                    goods_data['spider_url'] = tmp_url
                    goods_data['username'] = '******'
                    goods_data['page'] = str(page)

                    zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                        data=goods_data, pipeline=my_pipeline)
                    # Pause between inserts to throttle the crawl.
                    sleep(.7)

        # Drop the parser explicitly before forcing a collection.
        del zhe_800_pintuan
        gc.collect()
Beispiel #2
0
def _refresh_single_goods(item, index, pipeline):
    """Re-crawl one stored goods row and delete or update it in the database.

    :param item: database row; ``item[0]`` is the goods_id and ``item[1] == 1``
        routes the row to deletion (reported as expired).
    :param index: 1-based position of the row in the current pass (log only).
    :param pipeline: connected database pipeline used for delete/update.
    :return: None
    """
    # The parser is created per row and released when this function returns,
    # so that it does not accumulate memory across the whole pass.
    zhe_800_pintuan = Zhe800PintuanParse()
    zhe_800_pintuan.get_goods_data(goods_id=item[0])
    data = zhe_800_pintuan.deal_with_data()
    if data == {}:
        # Nothing could be parsed for this goods_id; leave the row untouched.
        return

    data['goods_id'] = item[0]

    if item[1] == 1:
        pipeline.delete_zhe_800_pintuan_expired_goods_id(goods_id=item[0])
        print('该goods_id[{0}]已过期,删除成功!'.format(item[0]))
        return

    print(
        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
        % (item[0], index))
    zhe_800_pintuan.to_right_and_update_data(data=data, pipeline=pipeline)


def run_forever():
    """Continuously refresh every zhe800 pintuan goods row stored in the database.

    Each pass opens a database pipeline, loads all goods ids, and re-crawls
    them one by one, sleeping ``ZHE_800_PINTUAN_SLEEP_TIME`` between rows. After
    a pass the loop sleeps 5.5 hours when the Shanghai hour is 0, otherwise
    5 seconds. The function never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_zhe_800_pintuan_all_goods_id())
        except TypeError:
            # A non-iterable result is treated as a failed connection/query.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            for index, item in enumerate(result, start=1):
                if index % 50 == 0:
                    # Reconnect every 50 rows so that a single long-lived
                    # connection does not go unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    if tmp_sql_server.is_connect_success:
                        print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    _refresh_single_goods(item, index, tmp_sql_server)
                else:
                    # The database connection is down; skip this row.
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(ZHE_800_PINTUAN_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates right after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()