Exemple #1
0
def get_repost_list(html, mid):
    """
    Parse the repost entries out of a repost page.

    Every element carrying ``action-type="feed_list_item"`` is turned into a
    ``WeiboRepost``. An entry that fails to parse is logged and skipped, so one
    malformed entry does not discard the rest of the page.

    :param html: page source
    :param mid: weibo mid, stored on every result as ``root_weibo_id``
    :return: list of repost infos (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = []

    for repost in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            # Looked up once and reused for both the repost text and the parent's name.
            # A missing node raises AttributeError here and the entry is skipped.
            text_node = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})

            # Only the part before the first '//@' is kept as this repost's own content.
            own_text = text_node.text.strip().split('//@')[0]
            # Round trip through gbk drops every character gbk cannot represent.
            wb_repost.repost_cont = own_text.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the to-crawl queue (seed_ids)
            # [3:] strips the first three characters of the usercard value
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text

            # The same link carries both the repost time (title) and the repost url (href).
            from_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = REPOST_URL.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            # text_node is guaranteed to be a tag at this point (it was dereferenced
            # above), so no separate "no text node" branch is needed.
            try:
                # We can't get the parent's uid, We can get the parent's nickname, but the name can be changed
                at_link = text_node.find(attrs={'extra-data': 'type=atname'})
                # [5:] strips the first five characters of the usercard value
                # (presumably a 'name=' prefix -- confirm against the page markup).
                wb_repost.parent_user_name = at_link.get('usercard')[5:] if at_link else ''
            except Exception as e:
                # Best effort: a broken parent link must not discard the whole repost.
                parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Exemple #2
0
def get_repost_list(html, mid):
    """
    Get repost details.

    Builds one ``WeiboRepost`` per ``action-type="feed_list_item"`` element of
    the page. Entries that raise while being parsed are logged through
    ``parser.error`` and left out of the result.

    :param html: page source
    :param mid: weibo mid, copied to ``root_weibo_id`` of every result
    :return: list of repost infos (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = []

    for repost in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            # Resolve the text node once; it feeds both the repost content and
            # the parent-name lookup. A missing node raises AttributeError and
            # the entry is skipped by the outer handler.
            content_node = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})

            # Keep only what precedes the first '//@' separator.
            leading_text = content_node.text.strip().split('//@')[0]
            # Encoding to gbk with 'ignore' removes characters gbk cannot represent.
            wb_repost.repost_cont = leading_text.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # [3:] drops the first three characters of the usercard attribute
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text

            # One link provides both the time (title attribute) and the url (href).
            source_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = source_link.get('title')
            wb_repost.weibo_url = REPOST_URL.format(source_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            # content_node was already dereferenced above, so it is a real tag
            # here and an "is it missing?" check would never fire.
            try:
                # We can't get the parent's uid, We can get the parent's nickname, but the name can be changed
                mention = content_node.find(attrs={'extra-data': 'type=atname'})
                if mention:
                    # [5:] drops the first five characters of the usercard attribute
                    # (presumably a 'name=' prefix -- confirm against the page markup).
                    wb_repost.parent_user_name = mention.get('usercard')[5:]
                else:
                    wb_repost.parent_user_name = ''
            except Exception as e:
                # Best effort: keep the repost even when the parent link is malformed.
                parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                wb_repost.parent_user_name = ''

        except Exception as e:
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
def get_repost_list(html, mid):
    """
    Get the repost list.

    Each ``action-type="feed_list_item"`` element of the page becomes one
    ``WeiboRepost``. An entry that raises while being parsed is logged through
    ``parser.error`` and skipped.

    :param html: page source
    :param mid: weibo mid, stored as ``root_weibo_id`` on every result
    :return: list of parsed reposts (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = []

    for repost in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        wb_repost = WeiboRepost()
        try:
            # Fetched once and reused for the parent-name lookup further down.
            # A missing node raises AttributeError and the entry is skipped.
            text_node = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})

            # Only the text before the first '//@' is kept as this repost's content.
            own_text = text_node.text.strip().split('//@')[0]
            # The gbk round trip discards characters gbk cannot represent.
            wb_repost.repost_cont = own_text.encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the to-crawl queue (seed_ids)
            # [3:] strips the first three characters of the usercard value
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
                text

            # The same link holds the repost time (title) and the repost url (href).
            from_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = repost_url.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Store the current repost user's id and name as an intermediate
            # result (in redis, according to the original comment).
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            # text_node has already been dereferenced above, so it is a real tag
            # here; a separate "node missing" branch could never be taken.
            try:
                # The first @-mention is the topmost (parent) user. The parent's uid
                # is not available here, only the nickname; nicknames can change, so
                # the uid still has to be resolved before the record is persisted.
                at_link = text_node.find(attrs={'extra-data': 'type=atname'})
                if at_link:
                    # [5:] strips the first five characters of the usercard value
                    # (presumably a 'name=' prefix -- confirm against the page markup).
                    wb_repost.parent_user_name = at_link.get('usercard')[5:]
                else:
                    wb_repost.parent_user_name = ''
            except Exception as e:
                # Best effort: a malformed parent link must not drop the repost.
                parser.error('解析上层用户名发生错误,具体信息是{}'.format(e))
                wb_repost.parent_user_name = ''

        except Exception as e:
            # NOTE(review): the message text says "comment" (评论) although this
            # function parses reposts; left untouched in case logs are matched on it.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
Exemple #4
0
 def test_store_and_fetch_name_id(self):
     """A name stored together with an id must resolve back to that id."""
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     # Look the id up again by the name it was stored under.
     assert IdNames.fetch_uid_by_name(FAKE_STR) == FAKE_ID
 def test_store_and_fetch_name_id(self):
     """Store a (name, id) pair, then check that looking up the name returns the id."""
     IdNames.store_id_name(FAKE_STR, FAKE_ID)
     fetched_uid = IdNames.fetch_uid_by_name(FAKE_STR)
     assert fetched_uid == FAKE_ID