Exemple #1
0
    def get_pintuan_goods_info(self):
        """
        Crawl the juanpi group-buy ("pintuan") goods listing and store new goods.

        Pages 0..99 of the listing endpoint are requested in order until a page
        yields no goods. Each listed goods item is reduced to its id, formatted
        begin/end time, join count and page number, de-duplicated by
        ``goods_id`` (first occurrence wins). If the database pipeline reports
        a successful connection, every goods id not already returned by
        ``select_juanpi_pintuan_all_goods_id()`` is fetched through
        ``self.get_pintuan_goods_data`` and inserted.

        Items whose ``start_time``/``end_time`` cannot be converted to ``int``
        are reported and skipped instead of aborting the whole crawl.

        :return: None
        """
        seen_goods_ids = set()  # O(1) duplicate check across all pages
        pintuan_goods_id_list = []
        for page in range(0, 100):
            tmp_url = 'https://tuan.juanpi.com/pintuan/get_goods_list?page={0}&pageSize=20&cid=pinhaohuo_sx&show_type=wap'.format(str(page))
            print('正在抓取的页面地址为: ', tmp_url)

            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)
            if body == '':
                body = '{}'
            try:
                tmp_data = json.loads(body).get('data', {}).get('goods', [])
            except (ValueError, TypeError, AttributeError):
                # ValueError covers json.JSONDecodeError; TypeError/AttributeError
                # cover a non-text body or a payload that is not nested dicts.
                print('json.loads转换tmp_data时出错!')
                tmp_data = []

            sleep(.3)

            # A missing, null or non-list 'goods' value ends the crawl just like
            # an empty list does.
            if not isinstance(tmp_data, list) or not tmp_data:
                print('该tmp_url得到的goods为空list, 此处跳过!')
                break

            for item in tmp_data:
                goods_id = item.get('goods_id', '')
                if goods_id in seen_goods_ids:
                    continue
                try:
                    begin_timestamp = int(item.get('start_time', ''))
                    end_timestamp = int(item.get('end_time', ''))
                except (TypeError, ValueError):
                    # Not marked as seen, so a later well-formed occurrence of
                    # the same goods_id can still be collected.
                    print('该goods_id[{0}]的start_time或end_time无法转换为时间戳, 此处跳过!'.format(goods_id))
                    continue

                seen_goods_ids.add(goods_id)
                pintuan_goods_id_list.append({
                    'goods_id': goods_id,
                    'begin_time': self.timestamp_to_regulartime(begin_timestamp),
                    'end_time': self.timestamp_to_regulartime(end_timestamp),
                    'all_sell_count': str(item.get('join_number_int', '')),
                    'page': page,
                })

        print('该pintuan_goods_id_list的总个数为: ', len(pintuan_goods_id_list))
        print(pintuan_goods_id_list)

        juanpi_pintuan = JuanPiParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            # First column of every returned row is used as the stored goods_id.
            db_goods_ids = {
                row[0]
                for row in my_pipeline.select_juanpi_pintuan_all_goods_id()
            }

            index = 1  # counts goods actually fetched, not goods skipped
            for item in pintuan_goods_id_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                if index % 5 == 0:
                    # Periodically replace the parser object to keep the
                    # script's memory usage down.
                    del juanpi_pintuan
                    juanpi_pintuan = JuanPiParse()
                    gc.collect()

                # The pintuan time window is the same as the on/off-shelf time
                # already captured for the goods, so it is not replaced here.
                goods_data = self.get_pintuan_goods_data(
                    juanpi_pintuan=juanpi_pintuan,
                    goods_id=goods_id,
                    all_sell_count=item.get('all_sell_count', ''),
                    page=item.get('page', 0))

                if goods_data != {}:  # an empty dict means nothing to store
                    juanpi_pintuan.insert_into_juuanpi_pintuan_table(
                        data=goods_data, pipeline=my_pipeline)

                sleep(.6)
                index += 1
        else:
            print('数据库连接失败,数据库可能关闭或者维护中')

        del juanpi_pintuan
        gc.collect()
Exemple #2
0
def _pintuan_end_timestamp(raw_schedule):
    """Return the end time stored in *raw_schedule* as integer Unix seconds.

    *raw_schedule* is JSON text whose first element is a mapping with an
    ``'end_time'`` value formatted as ``'%Y-%m-%d %H:%M:%S'``; it is
    interpreted in the process's local time zone by ``time.mktime``.

    :return: the timestamp, or ``None`` when the text cannot be parsed
    """
    try:
        end_time = json.loads(raw_schedule)[0].get('end_time')
        return int(time.mktime(time.strptime(end_time, '%Y-%m-%d %H:%M:%S')))
    except (ValueError, TypeError, IndexError, KeyError, AttributeError,
            OverflowError):
        return None


def _refresh_pintuan_goods(item, index, juanpi_pintuan, pipeline):
    """Delete one stored goods row if finished, otherwise re-crawl and update it.

    :param item: database row; ``item[0]`` is the goods_id, ``item[1]`` the JSON
        schedule text, ``item[2]`` a flag compared against ``1`` (presumably a
        deleted/sold-out marker, going by the log message; verify against the
        query)
    :param index: 1-based position of the row, used only for logging
    :param juanpi_pintuan: parser object used to fetch and update the goods
    :param pipeline: connected database pipeline
    """
    goods_id = item[0]
    pintuan_end_time = _pintuan_end_timestamp(item[1])
    if pintuan_end_time is None:
        # A single malformed row must not stop the endless update loop.
        print('该goods_id[{0}]的end_time解析失败, 此处跳过!'.format(goods_id))
        return

    if item[2] == 1 or pintuan_end_time < int(time.time()):
        pipeline.delete_juanpi_pintuan_expired_goods_id(goods_id=goods_id)
        print('该goods_id[{0}]已过期或者售完,删除成功!'.format(goods_id))
        return

    print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (goods_id, index))
    juanpi_pintuan.get_goods_data(goods_id=goods_id)
    data = juanpi_pintuan.deal_with_data()
    if data != {}:  # an empty dict means there is nothing to update
        data['goods_id'] = goods_id
        juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=pipeline)


def run_forever():
    """Endlessly refresh every juanpi pintuan goods row stored in the database.

    Each cycle opens a database pipeline, loads all stored rows, and for each
    row either deletes it (flagged or past its end time) or re-crawls it and
    writes the update back. The parser object is replaced every 6 rows and the
    database pipeline every 50 rows. Between cycles the function sleeps 5
    seconds, or 5.5 hours when the Shanghai-time hour is 0. It never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_juanpi_pintuan_all_goods_id())
        except TypeError:
            # The query result was not iterable; treated as a failed connection.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # Created here and replaced periodically to keep memory usage down.
            juanpi_pintuan = JuanPiParse()
            for index, item in enumerate(result, start=1):
                if index % 6 == 0:
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:
                    # Reconnect every 50 rows so one long-lived connection
                    # does not become unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    _refresh_pintuan_goods(item, index, juanpi_pintuan, tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # pause updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()