Example #1
0
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO 将wb_repost.user_id加入待爬队列(seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').
                                                    get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid, We can get the parent's nickname, but the name can be changed
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Example #2
0
def get_repost_list(html, mid):
    """
       Get repost details
       :param html: page source
       :param mid: weibo mid
       :return: list of repost infos
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').
                                                    get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid, We can get the parent's nickname, but the name can be changed
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Example #3
0
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # when it comes to errors, set the args to default(root)
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
Example #4
0
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1
    # todo 这里需要衡量是否有用网络调用的必要性
    for page_num in range(2, limit):
        # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
        #               routing_key='comment_page_info')
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # 补上user_id,方便可视化
    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # 设置成根用户的uid和用户名
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
Example #5
0
def crawl_repost_page(mid, uid):
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = user_get.get_profile(uid)

    if total_page < limit:
        limit = total_page + 1

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    for index, repost_obj in enumerate(repost_datas):
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # when it comes to errors, set the args to default(root)
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id
        repost_datas[index] = repost_obj

    weibo_repost.save_reposts(repost_datas)
Example #6
0
 def test_get_name(self):
     from db.redis_db import IdNames
     print(IdNames.fetch_uid_by_name('腐剧基地'))
Example #7
0
def get_repost_list(html, mid):
    """
       获取转发列表
       :param html: 
       :param mid:
       :return: 
       """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).text.strip().\
                split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk',
                                                          'ignore').decode(
                                                              'gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO 将wb_repost.user_id加入待爬队列(seed_ids)
            wb_repost.user_id = repost.find(attrs={
                'class': 'WB_face W_fl'
            }).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text
            wb_repost.repost_time = repost.find(attrs={
                'class': 'WB_from S_txt2'
            }).find('a').get('title')
            wb_repost.weibo_url = repost_url.format(
                repost.find(attrs={
                    'class': 'WB_from S_txt2'
                }).find('a').get('href'))
            parents = repost.find(attrs={
                'class': 'WB_text'
            }).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # 把当前转发的用户id和用户名存储到redis中,作为中间结果
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # 第一个即是最上层用户,由于拿不到上层用户的uid,只能拿昵称,但是昵称可以修改,所以入库前还是得把uid拿到
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error('解析上层用户名发生错误,具体信息是{}'.format(e))
                    wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Example #8
0
 def test_store_and_fetch_name_id(self):
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     rs = IdNames.fetch_uid_by_name(FAKE_STR)
     assert rs == FAKE_ID
Example #9
0
 def test_get_name(self):
     from db.redis_db import IdNames
     print(IdNames.fetch_uid_by_name('腐剧基地'))
Example #10
0
 def test_store_and_fetch_name_id(self):
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     rs = IdNames.fetch_uid_by_name(FAKE_STR)
     assert rs == FAKE_ID