Example #1
 async def _get_new_jd_obj(self, index):
     if index % 10 == 0:  # can't keep sharing one object any longer, otherwise driver access throws exceptions!
         try:
             del self.jd
         except:
             pass
         collect()
         self.jd = JdParse(logger=self.lg)
Example #2
def get_one_jd_data(**kwargs):
    '''
    fetch the data for a jd url
    :param kwargs:
    :return:
    '''
    username = kwargs.get('username', '18698570079')
    wait_to_deal_with_url = kwargs.get('wait_to_deal_with_url', '')
    my_lg = kwargs.get('my_lg')

    jd = JdParse(logger=my_lg)
    goods_id = jd.get_goods_id_from_url(
        wait_to_deal_with_url)  # get the goods_id; this returns a list
    if goods_id == []:  # if no goods_id could be obtained, return an error
        my_lg.info('the fetched goods_id is empty!')
        try:
        del jd  # release it every time
        except:
            pass
        gc.collect()

        return {'goods_id': ''}

    # improved check: based on the parsed goods_id, decide whether this is jd.com (JD supermarket included), JD worldwide (jd.hk), or JD pharmacy (yiyaojd.com)
    #####################################################
    if goods_id[0] == 0:  # [0, '1111']
        wait_to_deal_with_url = 'https://item.jd.com/' + goods_id[
            1] + '.html'  # build a clean, canonical jd item url
    elif goods_id[0] == 1:  # [1, '1111']
        wait_to_deal_with_url = 'https://item.jd.hk/' + goods_id[1] + '.html'
    elif goods_id[0] == 2:  # [2, '1111', 'https://xxxxx']
        wait_to_deal_with_url = 'https://item.yiyaojd.com/' + goods_id[
            1] + '.html'

    tmp_result = jd.get_goods_data(goods_id=goods_id)
    data = jd.deal_with_data(goods_id=goods_id)  # on success this returns a dict of data
    if data == {} or tmp_result == {}:
        my_lg.info('the fetched data is empty!')
        try:
            del jd
        except:
            pass
        gc.collect()

        return {'goods_id': goods_id[1], 'msg': 'data is empty!'}

    wait_to_save_data = add_base_info_2_processed_data(
        data=data,
        spider_url=wait_to_deal_with_url,
        username=username,
        goods_id=goods_id[1])
    try:
        del jd
    except:
        pass

    return wait_to_save_data
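
A minimal call sketch (not from the original repo): the logger below is a plain standard-library stand-in for the project's own logger, and the jd item url is a made-up example value.

import logging

logging.basicConfig(level=logging.INFO)
result = get_one_jd_data(
    wait_to_deal_with_url='https://item.jd.com/100000000000.html',  # hypothetical jd item url
    my_lg=logging.getLogger('jd_spider'))                           # stand-in logger
print(result)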
Example #3
 async def _get_new_jd_obj(self, index):
     if index % 10 == 0:  # can't keep sharing one object any longer, otherwise driver access throws exceptions!
         try:
             del self.jd
         except:
             pass
         collect()
         self.jd = JdParse(
             logger=self.lg,
             is_real_times_update_call=True,
         )
Example #4
    async def _update_db(self):
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
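                # TasksParamsListObj slices result into batches of size self.concurrency;
                # its __next__ raises AssertionError once all batches are handed out (caught below).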
                self.jd = JdParse(
                    logger=self.lg,
                    is_real_times_update_call=True,
                )
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all batches consumed, exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JDDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('creating task for goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('all data updated'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.jd
            except:
                pass
            collect()
Example #5
class JDUpdater(AsyncCrawler):
    """regular jd goods updater"""
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/jd/实时更新/'
        )
        self.sql_cli = None
        self.goods_index = 1
        # concurrency level
        self.concurrency = 10

    async def _get_db_old_data(self):
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jd_select_str_1))
        except TypeError:
            self.lg.error('TypeError: database connection failed... (possibly under maintenance)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jd_obj(self, index):
        if index % 10 == 0:  # can't keep sharing one object any longer, otherwise driver access throws exceptions!
            try:
                del self.jd
            except:
                pass
            collect()
            self.jd = JdParse(logger=self.lg)

    async def _get_tmp_item(self, site_id, goods_id):
        tmp_item = []
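        # site_id 7/8 map to type 0 (item.jd.com), 9 to type 1 (item.jd.hk) and 10 to type 2
        # (item.yiyaojd.com), mirroring the url construction shown in get_one_jd_data (Example #2 above).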
        if site_id == 7 or site_id == 8:  # convert the value taken from the db to the corresponding type first
            tmp_item.append(0)
        elif site_id == 9:
            tmp_item.append(1)
        elif site_id == 10:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        update the info of a single jd item
        :param db_goods_info_obj:
        :param index:
        :return:
        '''
        res = False
        await self._get_new_jd_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli, index=index, logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info('------>>>| updating goods_id ({0}) | --------->>>@ index ({1})'.format(
                db_goods_info_obj.goods_id,
                index))
            tmp_item = await self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id,)
            data = self.jd.get_goods_data(goods_id=tmp_item)
            if data.get('is_delete', 1) == 1:
                self.lg.info('this item has been taken down...')
                self.sql_cli._update_table_2(
                    sql_str=jd_update_str_2,
                    params=(str(get_shanghai_time()), tmp_item[1],),
                    logger=self.lg)
                await async_sleep(1.2)
                index += 1
                self.goods_index = index

                return db_goods_info_obj.goods_id, index

            data = self.jd.deal_with_data(goods_id=tmp_item)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='jd',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                self.jd.to_right_and_update_data(data, pipeline=self.sql_cli)

            else:  # the returned data is empty
                pass
        else:  # db connection failed
            self.lg.error('database connection failed; the db may be down or under maintenance')
            pass

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.2)  # throttle a bit to avoid the proxy use being detected

        return db_goods_info_obj.goods_id, index

    async def _update_db(self):
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is None:
                pass
            else:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.jd = JdParse(logger=self.lg)
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                        # self.lg.info(str(slice_params_list))
                    except AssertionError:  # all batches consumed, exit normally
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JDDbGoodsInfoObj(item=item, logger=self.lg)
                        self.lg.info('creating task for goods_id: {}'.format(db_goods_info_obj.goods_id))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(
                            db_goods_info_obj=db_goods_info_obj,
                            index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('all data updated'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(5.5)
            try:
                del self.jd
            except:
                pass
            collect()

    def __del__(self):
        try:
            del self.lg
        except: pass
        try:
            del self.loop
        except:pass
        collect()
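

# A hedged driver sketch (not part of the original file), assuming AsyncCrawler
# sets self.loop to an asyncio event loop:
#   updater = JDUpdater()
#   updater.loop.run_until_complete(updater._update_db())
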
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = '''
        select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice 
        from dbo.GoodsInfoAutoGet 
        where (SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10) and GETDATE()-ModfiyTime>1 and IsDelete=0 and MainGoodsID is not null
        '''

        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError as e:
            print('TypeError: database connection failed... (possibly under maintenance)')
            continue

        print('------>>> below are all qualifying goods_id returned by the database <<<------')
        print(result)
        print('--------------------------------------------------------')
        print('total number of items to update:', len(result))

        print('about to start the real-time update, please wait...'.center(100, '#'))
        index = 1

        # memory optimization: declaring this outside the loop would hold a lot of memory, so declare it here and delete/release it afterwards
        jd = JdParse()

        for item in result:  # real-time update of each item
            # # memory optimization: re-declare inside the loop and release it afterwards
            # jd = JdParse()
            if index % 10 == 0:
                try: del jd
                except: pass
                gc.collect()
                jd = JdParse()

            if index % 50 == 0:  # reconnect every 50 iterations to avoid a single long-lived connection becoming unresponsive
                print('resetting and establishing a new database connection...')
                # try:
                #     del tmp_sql_server
                # except:
                #     pass
                # gc.collect()
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                print('new database connection established successfully...')

            if tmp_sql_server.is_connect_success:
                print('------>>>| updating goods_id (%s) | --------->>>@ index (%d)' % (item[1], index))
                tmp_item = []
                if item[0] == 7 or item[0] == 8:  # convert the value taken from the db to the corresponding type first
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)

                tmp_item.append(item[1])
                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)
                if data != {}:
                    data['goods_id'] = item[1]

                    data['my_shelf_and_down_time'], data['delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        MyShelfAndDownTime=item[3]
                    )
                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[4],
                        old_taobao_price=item[5],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price']
                    )

                    # print('------>>>| crawled data: ', data)
                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:  # the returned data is empty
                    pass
            else:  # db connection failed
                print('database connection failed; the db may be down or under maintenance')
                pass
            index += 1
            # try:
            #     del jd
            # except:
            #     pass
            gc.collect()
            sleep(1.2)
        print('all data updated'.center(100, '#'))  # sleep(60*60)
        try: del jd
        except: pass
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60*60*5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
Example #7
    def get_all_user_and_their_recommend_goods_list(self):
        for index in range(1, 100):
            t = str(time.time().__round__()) + str(randint(100, 999))  # time.time().__round__() keeps the timestamp to whole seconds

            # url of the influencer ("daren") recommendations (ajax request)
            tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&darenType=0&perDarenFeedNum=3&totalpage=1&_={}&callback=jsonpCBKC&g_ty=ls'.format(
                str(index), t
            )
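            # t is interpolated into the "_" query parameter, which looks like a cache-busting timestamp value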

            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.set_page_load_timeout(15)  # set to 15 seconds to avoid bad data

            try:
                self.driver.get(tmp_url)
                self.driver.implicitly_wait(15)
            except Exception as e:  # on timeout, stop loading and continue with the remaining steps
                print('-->>time out after 15 seconds when loading page')
                self.driver.execute_script('window.stop()')  # when the page load exceeds the timeout, stop it via JavaScript so the following steps can run
                # pass

            body = self.driver.page_source
            body = re.compile(r'\n').sub('', body)
            body = re.compile(r'\t').sub('', body)
            body = re.compile(r'  ').sub('', body)
            # print(body)
            body = re.compile(r'square\((.*)\)').findall(body)

            if body != []:
                body = body[0]
                try:
                    data = json.loads(body)
                    # pprint(data)
                except:
                    print('error while converting body to data with json.loads!')
                    return []

                if data.get('user_list') is None:  # no more data; the response is square({"errmsg":"","iRet":0,"totalnum":347} )
                    print('the user_list obtained from body is None!')
                    pass

                else:
                    user_list = data.get('user_list', [])
                    # pprint(user_list)

                    for item in user_list:
                        # influencer nickname
                        nick_name = item.get('nickname', '')

                        # influencer avatar
                        head_url = item.get('headurl', '')
                        head_url = re.compile(r'http:').sub('', head_url)
                        if re.compile(r'^http').findall(head_url) != []:
                            pass
                        else:
                            head_url = 'http:' + head_url

                        # personal profile / signature
                        profile = item.get('profile', '')

                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        sql_str = r'select SiteID, GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=7 or SiteID=8 or SiteID=9 or SiteID=10'
                        _ = my_pipeline._select_table(sql_str=sql_str)
                        db_goods_id = [j[1] for j in list(_)] if _ is not None else []
                        # print(db_goods_id)
                        sql_str = r'select share_id from dbo.jd_youxuan_daren_recommend'
                        db_share_id = [j[0] for j in list(my_pipeline._select_table(sql_str=sql_str))]
                        # print(db_share_id)
                        jd = JdParse()

                        # goods info recommended by the influencer
                        feed_list = item.get('feed_list', [])
                        for feed_list_item in feed_list:
                            if feed_list_item.get('shareid', '') in db_share_id:
                                print('share_id ({}) already exists in the database, skipping!'.format(feed_list_item.get('shareid', '')))
                                pass
                            else:
                                # share_id
                                share_id = feed_list_item.get('shareid', '')
                                article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                                print('------>>>| url of the jd curated influencer article being crawled: ', 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)

                                # image info
                                tmp_share_img_url_list = []
                                for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                    if re.compile(r'^//').findall(item2) == []:
                                        tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                    else:
                                        tmp_share_img_url = 'http:' + item2
                                    tmp_share_img_url_list.append(tmp_share_img_url)
                                share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]

                                # build the div holding the influencer's own photos
                                tmp_img_div_desc = ''
                                for item4 in tmp_share_img_url_list:
                                    tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                    tmp_img_div_desc += tmp_img_div
                                my_img_div = '<div>' + tmp_img_div_desc + '</div>'
                                # print(my_img_div)

                                # get the goods_id and first_text
                                share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                                try:
                                    self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                    self.driver.get(share_url)
                                    self.driver.implicitly_wait(15)
                                except Exception as e:  # on timeout, stop loading and continue with the remaining steps
                                    print('-->>time out after 15 seconds when loading page')
                                    self.driver.execute_script('window.stop()')  # when the page load exceeds the timeout, stop it via JavaScript so the following steps can run
                                    # pass
                                feed_detail_body = self.driver.page_source
                                feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'  ').sub('', feed_detail_body)

                                feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                                # print(feed_data)

                                if feed_data != []:
                                    feed_data = feed_data[0]
                                    try:
                                        feed_data = json.loads(feed_data)
                                    except:
                                        print('json.loads on feed_data failed, skipping!')
                                        break   # break out; execution then falls through to the outer else below

                                    # article title
                                    title = feed_data.get('feeddata', {}).get('title', '')
                                    title = re.compile(r'12.12').sub('', title)

                                    # the influencer's comment content
                                    tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                    tmp_comment_content = re.compile(r'&amp;').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                    comment_content = tmp_comment_content

                                    if title == '':
                                        # title came back empty, so use comment_content as the title and clear comment_content
                                        title = comment_content
                                        comment_content = ''
                                    # print('the article title is: ', title)
                                    # print("the influencer's comment content is: ", comment_content)

                                    # first_text (the first paragraph of comment text in the article)
                                    first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                    first_text = re.compile(r'12.12').sub('', first_text)
                                    first_text = re.compile(r'11.11').sub('', first_text)
                                    # print('first_text is: ', first_text)

                                    sku_id = feed_data.get('feeddata', {}).get('skuid')
                                    if sku_id == '0':
                                        # sku_id == '0' means there is no sku_id
                                        sku_id = ''
                                    # print('sku_id is: ', sku_id)

                                    share_id = feed_list_item.get('shareid', '')
                                    tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                    # pprint(tmp_div_body_dict)

                                    if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                        # tmp_div_body_dict['sku_info'] being [] means the second part has no goods_id, so fall back to the first sku_id
                                        goods_id_list = [{'goods_id': sku_id}]
                                    else:
                                        # list of goods_id recommended by this article (the first entry is the goods_id used when there is no div_body)
                                        goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                    tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                                    # print('goods_id list recommended by this article: ', goods_id_list)
                                    # print(tmp_div_body)

                                else:
                                    print('failed to get feed_data!')
                                    return []

                                # post-processing
                                if comment_content == '':
                                    comment_content = first_text

                                '''
                                timezone handling: convert the time to Shanghai time
                                '''
                                tz = pytz.timezone('Asia/Shanghai')  # create the timezone object
                                now_time = datetime.datetime.now(tz)
                                # truncate to whole seconds and drop the timezone info
                                now_time = re.compile(r'\..*').sub('', str(now_time))
                                # convert the string back to a datetime
                                now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                                create_time = now_time  # creation time

                                result = {
                                    'nick_name': nick_name,                     # influencer nickname
                                    'head_url': head_url,                       # influencer avatar
                                    'profile': profile,                         # personal profile / signature
                                    'share_id': share_id,                       # the share_id of this post
                                    'article_url': article_url,                 # original article url
                                    'title': title,                             # article title
                                    'comment_content': comment_content,         # the influencer's comment content
                                    'share_img_url_list': share_img_url_list,   # list of the influencer's own photos
                                    # 'first_text': first_text,                 # the first paragraph of comment text in the article
                                    'goods_id_list': goods_id_list,             # list of all goods_id recommended in the article
                                    'div_body': tmp_div_body,                   # main div of the article body
                                    'create_time': create_time,                 # article creation time
                                }
                                # pprint(result)
                                print(result)
                                params = self._get_db_insert_params(item=result)
                                sql_str = r'insert into dbo.jd_youxuan_daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_img_url_list, goods_id_list, div_body, create_time) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                                my_pipeline._insert_into_table(sql_str=sql_str, params=params)

                                print('about to crawl all goods recommended in this article'.center(30, '-'))
                                for i in goods_id_list:
                                    if i.get('goods_id', '') in db_goods_id:
                                        print('goods_id ({}) already exists in the database, skipping!'.format(i.get('goods_id', '')))
                                        pass
                                    else:
                                        tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                        goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                        jd.get_goods_data(goods_id=goods_id)
                                        tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                        tmp_jd_data['spider_url'] = tmp_goods_id_url
                                        tmp_jd_data['username'] = '******'
                                        tmp_jd_data['goods_id'] = goods_id[1]

                                        jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                                print('all goods recommended in this article have been crawled'.center(30, '-'))

            else:
                print('body is an empty list!')
Example #8
    def _jd_keywords_spider(self, **kwargs):
        '''
        crawl jd goods for the given keyword
        :param kwargs:
        :return:
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        '''The initial url can simply be [https://item.jd.com/xxxxx.html], because jd redirects it to the correct address; it is also fine to store this url as-is'''
        # so jd goods are not stored by sub-site here; everything is saved as a regular item with site_id = 7
        goods_url_list = [
            'https://item.jd.com/{0}.html'.format(str(item))
            for item in goods_id_list
        ]

        self.my_lg.info('about to start crawling goods for this keyword, please wait...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # flag used to decide whether this goods was inserted into the db
            try:
                goods_id = re.compile('\/(\d+)\.html').findall(item)[0]
            except IndexError:
                self.my_lg.error('error while extracting goods_id with re, please check!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('goods_id [{0}] already exists in the db!'.format(goods_id))
                result = True  # the case where it already existed
                pass
            else:
                jd = JdParse(logger=self.my_lg)
                if self.add_goods_index % 20 == 0:  # reconnect every 20 iterations to avoid a single long-lived connection becoming unresponsive
                    self.my_lg.info('resetting and establishing a new database connection...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('new database connection established successfully...')

                if self.my_pipeline.is_connect_success:
                    goods_id = jd.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ the original goods url was: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info(
                            '------>>>| updating goods_id (%s) | --------->>>@ index (%s)'
                            % (goods_id[1], str(self.add_goods_index)))
                        tt = jd.get_goods_data(goods_id)
                        data = jd.deal_with_data(goods_id)
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = item

                            result = jd.old_jd_goods_insert_into_new_table(
                                data, self.my_pipeline)
                        else:
                            pass
                else:
                    self.my_lg.info('database connection failed; the db may be down or under maintenance')
                    pass
                self.add_goods_index += 1
                sleep(1)
                try:
                    del jd
                except:
                    pass
                gc.collect()

            if result:  # only proceed when the goods_id was just inserted or already existed in the db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('all goods for this keyword have been crawled!')

        return True
def run_forever():
    while True:
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_jd_all_goods_id_url())
        except TypeError as e:
            print('TypeError: database connection failed... (possibly under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> below are all qualifying goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('about to start the real-time update, please wait...'.center(100, '#'))
            index = 1
            for item in result:  # real-time update of each item
                data = {}
                # memory optimization: declaring this outside the loop would hold a lot of memory, so declare it here and delete/release it afterwards
                jd = JdParse()
                if index % 50 == 0:  # reconnect every 50 iterations to avoid a single long-lived connection becoming unresponsive
                    print('resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('new database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| updating goods_id (%s) | --------->>>@ index (%d)'
                        % (item[1], index))
                    tmp_item = []
                    if item[0] == 7 or item[0] == 8:  # convert the value taken from the db to the corresponding type first
                        tmp_item.append(0)
                    elif item[0] == 9:
                        tmp_item.append(1)
                    elif item[0] == 10:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    jd.get_goods_data(goods_id=tmp_item)
                    data = jd.deal_with_data(goods_id=tmp_item)
                    if data != {}:
                        data['goods_id'] = item[1]
                        '''
                        set the latest on-shelf / off-shelf time for the refreshed item state
                        '''
                        # 1. is_delete going 0 -> 1 sets down_time   2. is_delete going 1 -> 0 sets shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[2]:
                            if data['is_delete'] == 0 and item[2] == 1:
                                # is_delete going from 0 to 1 means the item went from on-shelf to off-shelf
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                            else:
                                # is_delete going from 1 to 0 means the item went from off-shelf back to on-shelf
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                        else:
                            if item[3] is None or item[
                                    3] == '{"shelf_time": "", "down_time": ""}' or len(
                                        item[3]) == 35:  # 35 is just the length of that initial default string
                                if data['is_delete'] == 0:  # the on-shelf state
                                    my_shelf_and_down_time['shelf_time'] = str(
                                        get_shanghai_time())
                                else:  # the off-shelf state
                                    my_shelf_and_down_time['down_time'] = str(
                                        get_shanghai_time())
                            else:
                                # otherwise keep the original value unchanged
                                tmp_shelf_and_down_time = item[3]
                                my_shelf_and_down_time = json.loads(
                                    tmp_shelf_and_down_time)  # convert it to a dict first
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print(my_shelf_and_down_time)

                        # print('------>>>| crawled data: ', data)
                        jd.to_right_and_update_data(data,
                                                    pipeline=tmp_sql_server)
                    else:  # the returned data is empty
                        pass
                else:  # db connection failed
                    print('database connection failed; the db may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del tmall
                # except:
                #     pass
                gc.collect()
                # sleep(1)
            print('all data updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
Example #10
def run_forever():
    while True:
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/jd/实时更新/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        # and GETDATE()-ModfiyTime>1 and IsDelete=0
        try:
            result = list(tmp_sql_server._select_table(sql_str=jd_select_str_1))
        except TypeError:
            my_lg.error('TypeError: database connection failed... (possibly under maintenance)')
            continue

        my_lg.info('------>>> below are all qualifying goods_id returned by the database <<<------')
        my_lg.info(str(result))
        my_lg.info('--------------------------------------------------------')
        my_lg.info('total number of items to update: {}'.format(len(result)))

        my_lg.info('about to start the real-time update, please wait...'.center(100, '#'))
        index = 1

        # memory optimization: declaring this outside the loop would hold a lot of memory, so declare it here and delete/release it afterwards
        jd = JdParse(logger=my_lg)
        for item in result:  # real-time update of each item
            # # memory optimization: re-declare inside the loop and release it afterwards
            # jd = JdParse()
            if index % 10 == 0:
                try: del jd
                except: pass
                gc.collect()
                jd = JdParse(logger=my_lg)

            if index % 50 == 0:  # reconnect every 50 iterations to avoid a single long-lived connection becoming unresponsive
                my_lg.info('resetting and establishing a new database connection...')
                tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                my_lg.info('new database connection established successfully...')

            if tmp_sql_server.is_connect_success:
                my_lg.info('------>>>| updating goods_id ({}) | --------->>>@ index ({})'.format(item[1], index))
                tmp_item = []
                if item[0] == 7 or item[0] == 8:  # convert the value taken from the db to the corresponding type first
                    tmp_item.append(0)
                elif item[0] == 9:
                    tmp_item.append(1)
                elif item[0] == 10:
                    tmp_item.append(2)

                tmp_item.append(item[1])
                jd.get_goods_data(goods_id=tmp_item)
                data = jd.deal_with_data(goods_id=tmp_item)
                if data != {}:
                    data['goods_id'] = item[1]

                    data['shelf_time'], data['delete_time'] = get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=item[2],
                        shelf_time=item[5],
                        delete_time=item[6])
                    my_lg.info('shelf_time: {0}, delete_time: {1}'.format(data['shelf_time'], data['delete_time']))

                    data['_is_price_change'], data['_price_change_info'] = _get_price_change_info(
                        old_price=item[3],
                        old_taobao_price=item[4],
                        new_price=data['price'],
                        new_taobao_price=data['taobao_price'])

                    site_id = jd._from_jd_type_get_site_id_value(jd_type=data['jd_type'])
                    try:
                        old_sku_info = format_price_info_list(
                            price_info_list=json_2_dict(item[7]),
                            site_id=site_id)
                    except AttributeError:  # handle values that have already been formatted
                        old_sku_info = item[7]
                    data['_is_price_change'], data['sku_info_trans_time'] = get_sku_info_trans_record(
                        old_sku_info=old_sku_info,
                        new_sku_info=format_price_info_list(data['price_info_list'], site_id=site_id),
                        is_price_change=item[8] if item[8] is not None else 0
                    )
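                    # note: this assignment overwrites the data['_is_price_change'] value set by _get_price_change_info above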

                    jd.to_right_and_update_data(data, pipeline=tmp_sql_server)
                else:  # the returned data is empty
                    pass
            else:  # db connection failed
                my_lg.error('database connection failed; the db may be down or under maintenance')
                pass
            index += 1
            gc.collect()
            sleep(1.2)
        my_lg.info('all data updated'.center(100, '#'))  # sleep(60*60)
        try: del jd
        except: pass
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60*60*5.5)
        else:
            sleep(5)
        gc.collect()