Example #1
def run_forever():
    while True:
        #### Update the data in real time
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_jd_all_goods_id_url())
        except TypeError as e:
            print('TypeError: database connection failed... (it may be under maintenance)')
            result = None
        if result is None:
            pass
        else:
            print('------>>> Below are all the matching goods_id returned by the database <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('About to start the real-time data update, please be patient...'.center(100, '#'))
            index = 1
            for item in result:  # Update the data in real time
                data = {}
                # Memory optimization: declaring JdParse outside the loop would keep a large object alive, so create it here and release it after use
                jd = JdParse()
                if index % 50 == 0:  # Reconnect every 50 iterations to avoid errors from a single unresponsive long-lived connection
                    print('Resetting and establishing a new database connection...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('New database connection established successfully...')

                if tmp_sql_server.is_connect_success:
                    print('------>>>| Updating goods_id (%s) | --------->>>@ index (%d)' % (item[1], index))
                    tmp_item = []
                    if item[0] == 7 or item[0] == 8:  # Convert the site-type code read from the database into the corresponding value
                        tmp_item.append(0)
                    elif item[0] == 9:
                        tmp_item.append(1)
                    elif item[0] == 10:
                        tmp_item.append(2)
                    tmp_item.append(item[1])
                    jd.get_goods_data(goods_id=tmp_item)
                    data = jd.deal_with_data(goods_id=tmp_item)
                    if data != {}:
                        data['goods_id'] = item[1]
                        '''
                        Record the shelf/off-shelf time for the last refreshed goods status
                        '''
                        # 1. is_delete 0 -> 1 records the off-shelf time down_time  2. is_delete 1 -> 0 records the on-shelf time shelf_time
                        my_shelf_and_down_time = {
                            'shelf_time': '',
                            'down_time': '',
                        }
                        if data['is_delete'] != item[2]:
                            if data['is_delete'] == 1 and item[2] == 0:
                                # is_delete went 0 -> 1: the goods changed from on-shelf to off-shelf
                                my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # is_delete went 1 -> 0: the goods changed from off-shelf to on-shelf
                                my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                        else:
                            if (item[3] is None
                                    or item[3] == '{"shelf_time": "", "down_time": ""}'
                                    or len(item[3]) == 35):  # 35 is the length of that initial default string
                                if data['is_delete'] == 0:  # On-shelf status
                                    my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
                                else:  # Off-shelf status
                                    my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
                            else:
                                # Otherwise keep the original value unchanged
                                tmp_shelf_and_down_time = item[3]
                                my_shelf_and_down_time = json.loads(tmp_shelf_and_down_time)  # Convert the JSON string to a dict first
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time
                        # print(my_shelf_and_down_time)

                        # print('------>>>| The scraped data is: ', data)
                        jd.to_right_and_update_data(data,
                                                    pipeline=tmp_sql_server)
                    else:  # The returned data is empty
                        pass
                else:  # The database connection failed
                    print('Database connection failed; the database may be down or under maintenance')
                    pass
                index += 1
                # try:
                #     del tmall
                # except:
                #     pass
                gc.collect()
                # sleep(1)
            print('All data has been updated'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # Do not update after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        # del ali_1688
        gc.collect()
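
The shelf/off-shelf bookkeeping in Example #1 can be illustrated in isolation. The sketch below is a minimal, self-contained illustration of that logic and is not code from the original project: update_shelf_and_down_time is a hypothetical helper, and datetime.datetime.now() stands in for get_shanghai_time().

import datetime
import json

def update_shelf_and_down_time(old_is_delete, new_is_delete, stored_json):
    """Return the updated shelf/down-time JSON string for one goods record.

    The database column holds a JSON string such as
    '{"shelf_time": "", "down_time": ""}'; is_delete == 0 means on-shelf.
    """
    if stored_json:
        times = json.loads(stored_json)        # reuse the stored value
    else:
        times = {'shelf_time': '', 'down_time': ''}
    now = str(datetime.datetime.now())         # the project uses get_shanghai_time()
    if new_is_delete != old_is_delete:
        if new_is_delete == 1:                 # is_delete 0 -> 1: goods went off-shelf
            times['down_time'] = now
        else:                                  # is_delete 1 -> 0: goods went back on-shelf
            times['shelf_time'] = now
    return json.dumps(times, ensure_ascii=False)

# Example: a record that was on-shelf (0) and is now off-shelf (1)
print(update_shelf_and_down_time(0, 1, '{"shelf_time": "", "down_time": ""}'))
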
Example #2
    def get_all_user_and_their_recommend_goods_list(self):
        for index in range(1, 100):
            # URL of the influencer recommendation feed (ajax request)
            tmp_url = 'https://wq.jd.com/shopgroup_feed/GetDarenFeeds?pageno={}&pagesize=5&perDarenFeedNum=3&g_tk=1975813451'.format(str(index))

            self.from_ip_pool_set_proxy_ip_to_phantomjs()
            self.driver.set_page_load_timeout(15)  # 15-second page-load timeout to avoid data errors

            try:
                self.driver.get(tmp_url)
                self.driver.implicitly_wait(15)
            except Exception as e:  # On timeout, stop loading and continue with the subsequent steps
                print('-->>time out after 15 seconds when loading page')
                self.driver.execute_script('window.stop()')  # If the page takes longer than the timeout, stop loading via JavaScript so the following steps can run
                # pass

            body = self.driver.page_source
            body = re.compile(r'\n').sub('', body)
            body = re.compile(r'\t').sub('', body)
            body = re.compile(r'  ').sub('', body)
            # print(body)
            body = re.compile(r'square\((.*)\)').findall(body)

            if body != []:
                body = body[0]
                try:
                    data = json.loads(body)
                    # pprint(data)
                except Exception:
                    print('json.loads failed to convert body into data!')
                    return []

                if data.get('user_list') is None:   # No more data; the response is square({"errmsg":"","iRet":0,"totalnum":347} )
                    print('user_list extracted from body is None!')
                    pass

                else:
                    user_list = data.get('user_list', [])
                    # pprint(user_list)

                    for item in user_list:
                        # Influencer nickname
                        nick_name = item.get('nickname', '')

                        # Influencer avatar
                        head_url = item.get('headurl', '')
                        head_url = re.compile(r'http:').sub('', head_url)
                        if re.compile(r'^http').findall(head_url) != []:
                            pass
                        else:
                            head_url = 'http:' + head_url

                        # Personal signature (bio)
                        profile = item.get('profile', '')

                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        db_goods_id = [j[1] for j in list(my_pipeline.select_jd_all_goods_id_url())]
                        # print(db_goods_id)
                        db_share_id = [j[0] for j in list(my_pipeline.select_jd_youxuan_daren_recommend_all_share_id())]
                        # print(db_share_id)
                        jd = JdParse()

                        # Info of the goods recommended by the influencer
                        feed_list = item.get('feed_list', [])
                        for feed_list_item in feed_list:
                            if feed_list_item.get('shareid', '') in db_share_id:
                                print('share_id ({}) already exists in the database, skipping!'.format(feed_list_item.get('shareid', '')))
                                pass
                            else:
                                # share_id
                                share_id = feed_list_item.get('shareid', '')
                                article_url = 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id
                                print('------>>>| URL of the JD Youxuan influencer article being scraped: ', 'https://wqs.jd.com/shoppingv2/detail.html?shareid=' + share_id)

                                # Image info
                                tmp_share_img_url_list = []
                                for item2 in feed_list_item.get('sharepicurl', '').split(','):
                                    if re.compile(r'^//').findall(item2) == []:
                                        tmp_share_img_url = 'https://img14.360buyimg.com/evalpic/s800x800_' + item2
                                    else:
                                        tmp_share_img_url = 'http:' + item2
                                    tmp_share_img_url_list.append(tmp_share_img_url)
                                share_img_url_list = [{'img_url': item5} for item5 in tmp_share_img_url_list]

                                # Build the div containing the influencer's own photos
                                tmp_img_div_desc = ''
                                for item4 in tmp_share_img_url_list:
                                    tmp_img_div = r'<img src="{}" style="height:auto;width:100%;"/>'.format(item4)
                                    tmp_img_div_desc += tmp_img_div
                                my_img_div = '<div>' + tmp_img_div_desc + '</div>'
                                # print(my_img_div)

                                # Get the goods_id and first_text
                                share_url = 'https://wq.jd.com/shopgroup_feed/FeedDetail?shareid=' + feed_list_item.get('shareid', '') + '&g_tk=1975813451'
                                try:
                                    self.from_ip_pool_set_proxy_ip_to_phantomjs()
                                    self.driver.get(share_url)
                                    self.driver.implicitly_wait(15)
                                except Exception as e:  # On timeout, stop loading and continue with the subsequent steps
                                    print('-->>time out after 15 seconds when loading page')
                                    self.driver.execute_script('window.stop()')  # If the page takes longer than the timeout, stop loading via JavaScript so the following steps can run
                                    # pass
                                feed_detail_body = self.driver.page_source
                                feed_detail_body = re.compile(r'\n').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'\t').sub('', feed_detail_body)
                                feed_detail_body = re.compile(r'  ').sub('', feed_detail_body)

                                feed_data = re.compile(r'square\((.*)\)').findall(feed_detail_body)
                                # print(feed_data)

                                if feed_data != []:
                                    feed_data = feed_data[0]
                                    try:
                                        feed_data = json.loads(feed_data)
                                    except Exception:
                                        print('json.loads failed to convert feed_data, skipping!')
                                        break   # Break out of the feed_list loop on failure

                                    # Article title
                                    title = feed_data.get('feeddata', {}).get('title', '')
                                    title = re.compile(r'12.12').sub('', title)

                                    # Influencer's comment content
                                    tmp_comment_content = feed_data.get('feeddata', {}).get('commentcontent', '')
                                    tmp_comment_content = re.compile(r'&amp;').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'\n').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'12.12').sub('', tmp_comment_content)
                                    tmp_comment_content = re.compile(r'11.11').sub('', tmp_comment_content)
                                    comment_content = tmp_comment_content

                                    if title == '':
                                        # Since the scraped title is empty, set title = comment_content and clear comment_content
                                        title = comment_content
                                        comment_content = ''
                                    # print('Article title: ', title)
                                    # print("Influencer's comment content: ", comment_content)

                                    # first_text (the first paragraph of comment text in the article)
                                    first_text = feed_data.get('feeddata', {}).get('firsttext', '')
                                    first_text = re.compile(r'12.12').sub('', first_text)
                                    first_text = re.compile(r'11.11').sub('', first_text)
                                    # print('first_text: ', first_text)

                                    sku_id = feed_data.get('feeddata', {}).get('skuid')
                                    if sku_id == '0':
                                        # sku_id == '0' means there is no sku_id
                                        sku_id = ''
                                    # print('sku_id: ', sku_id)

                                    share_id = feed_list_item.get('shareid', '')
                                    tmp_div_body_dict = self.get_div_body(share_id=share_id)
                                    # pprint(tmp_div_body_dict)

                                    if tmp_div_body_dict['sku_info'] == [] and sku_id != '':
                                        # If tmp_div_body_dict['sku_info'] is [], the second part has no goods_id, so assign the first sku_id instead
                                        goods_id_list = [{'goods_id': sku_id}]
                                    else:
                                        # List of goods_id recommended in this article (the first element is the goods_id used when there is no div_body)
                                        goods_id_list = [{'goods_id': item6} for item6 in tmp_div_body_dict['sku_info']]
                                    tmp_div_body = '<div>' + '<h3>{}</h3>'.format(title) + '<p>{}</p>'.format(comment_content) + my_img_div + tmp_div_body_dict['div_body']
                                    # print('goods_id list recommended in this article: ', goods_id_list)
                                    # print(tmp_div_body)

                                else:
                                    print('Failed to get feed_data!')
                                    return []

                                # Post-processing
                                if comment_content == '':
                                    comment_content = first_text

                                '''
                                Time zone handling: convert the time to Shanghai time
                                '''
                                tz = pytz.timezone('Asia/Shanghai')  # Create the timezone object
                                now_time = datetime.datetime.now(tz)
                                # Truncate to second precision and drop the timezone info
                                now_time = re.compile(r'\..*').sub('', str(now_time))
                                # Convert the string back to a datetime object
                                now_time = datetime.datetime.strptime(now_time, '%Y-%m-%d %H:%M:%S')
                                create_time = now_time      # Creation time

                                result = {
                                    'nick_name': nick_name,                     # Influencer nickname
                                    'head_url': head_url,                       # Influencer avatar
                                    'profile': profile,                         # Personal signature
                                    'share_id': share_id,                       # share_id of the share
                                    'article_url': article_url,                 # Original article URL
                                    'title': title,                             # Article title
                                    'comment_content': comment_content,         # Influencer's comment content
                                    'share_img_url_list': share_img_url_list,   # List of the influencer's own photos
                                    # 'first_text': first_text,                   # The first paragraph of comment text in the article
                                    'goods_id_list': goods_id_list,             # List of goods_id of all goods recommended in the article
                                    'div_body': tmp_div_body,                   # Main article div
                                    'create_time': create_time,                 # Article creation time
                                }
                                # pprint(result)
                                print(result)
                                my_pipeline.insert_into_jd_youxuan_daren_recommend_table(item=result)

                                print('About to start scraping all recommended goods in this article'.center(30, '-'))
                                for i in goods_id_list:
                                    if i.get('goods_id', '') in db_goods_id:
                                        print('goods_id ({}) already exists in the database, skipping!'.format(i.get('goods_id', '')))
                                        pass
                                    else:
                                        tmp_goods_id_url = 'https://item.jd.com/' + i.get('goods_id', '') + '.html'
                                        goods_id = jd.get_goods_id_from_url(jd_url=tmp_goods_id_url)
                                        jd.get_goods_data(goods_id=goods_id)
                                        tmp_jd_data = jd.deal_with_data(goods_id=goods_id)
                                        tmp_jd_data['spider_url'] = tmp_goods_id_url
                                        tmp_jd_data['username'] = '******'
                                        tmp_jd_data['goods_id'] = goods_id[1]

                                        jd.insert_into_jd_table(data=tmp_jd_data, pipeline=my_pipeline)
                                print('All recommended goods in this article have been scraped'.center(30, '-'))

            else:
                print('body is an empty list!')
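
Both requests in Example #2 unwrap a JSONP-style payload of the form square({...}) with a regex before calling json.loads. The snippet below is a minimal, standalone sketch of that unwrapping, not code from the original project: unwrap_square_jsonp is a hypothetical helper, and sample_body only mirrors the empty-result response quoted in the comments above.

import json
import re

def unwrap_square_jsonp(body):
    """Extract and parse the JSON object wrapped as square(...) in a page body."""
    matches = re.compile(r'square\((.*)\)').findall(body)
    if not matches:
        return None                      # no square(...) wrapper found
    try:
        return json.loads(matches[0])
    except ValueError:                   # json.JSONDecodeError is a subclass
        return None

sample_body = 'square({"errmsg":"","iRet":0,"totalnum":347})'
print(unwrap_square_jsonp(sample_body))  # {'errmsg': '', 'iRet': 0, 'totalnum': 347}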