コード例 #1
0
ファイル: comment.py プロジェクト: mousechen/WeiboSpider
def get_comment_list(html, wb_id):
    """
    Parse the comments of one weibo out of a comment-page response.

    :param html: raw response; handed to ``get_html_cont`` to obtain the markup
    :param wb_id: id of the weibo the comments belong to; stored on every
        returned comment as ``weibo_id``
    :return: list of ``WeiboComment`` objects. Empty when no markup could be
        extracted or when the markup has no ``node-type="comment_list"``
        element. Comments that fail to parse are logged and left out.
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    container = soup.find(attrs={'node-type': 'comment_list'})
    if container is None:
        # find() yields None when nothing matches; chaining .find_all() on it
        # would raise AttributeError outside the per-comment try below.
        return comment_list
    comments = container.find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; both the body and the author link
            # are read from it.
            text_node = comment.find(attrs={'class': 'WB_text'})
            wb_comment.comment_cont = text_node.text.strip()
            wb_comment.comment_id = comment['comment_id']
            # Drops the first three characters of the 'usercard' attribute
            # (presumably an "id=" prefix -- TODO confirm).
            wb_comment.user_id = text_node.find('a').get('usercard')[3:]
            # TODO: normalise the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped so the
            # remaining comments are still returned.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
コード例 #2
0
ファイル: comment.py プロジェクト: dittoyi/weibospider
def get_comment_list(html, wb_id):
    """
    Parse the comments of one weibo out of a comment-page response.

    :param html: raw response; handed to ``get_html_cont`` to obtain the markup
    :param wb_id: id of the weibo the comments belong to; stored on every
        returned comment as ``weibo_id``
    :return: list of ``WeiboComment`` objects. Empty when no markup could be
        extracted or when the markup has no ``node-type="comment_list"``
        element. Comments that fail to parse are logged and left out.
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    container = soup.find(attrs={'node-type': 'comment_list'})
    if container is None:
        # find() yields None when nothing matches; chaining .find_all() on it
        # would raise AttributeError outside the per-comment try below.
        return comment_list
    comments = container.find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; both the body and the author link
            # are read from it.
            text_node = comment.find(attrs={'class': 'WB_text'})
            wb_comment.comment_cont = text_node.text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # Drops the first three characters of the 'usercard' attribute
            # (presumably an "id=" prefix -- TODO confirm).
            wb_comment.user_id = text_node.find('a').get('usercard')[3:]
            # TODO: normalise the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped so the
            # remaining comments are still returned.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
コード例 #3
0
ファイル: comment.py プロジェクト: thekingofcity/weibospider
def get_comment_list(html, wb_id):
    """
    Parse the comments of one weibo out of a comment-page response.

    The comment body is rebuilt from the direct children of the ``WB_text``
    node: the first ``<a>`` (read separately as the screen name) is skipped,
    later ``<a>`` elements contribute their text, ``<img>`` elements
    contribute their ``title``/``alt`` or a value derived from the image file
    name, and text nodes are kept from the first one that starts with ':'.

    :param html: raw response; handed to ``get_html_cont`` to obtain the markup
    :param wb_id: id of the weibo the comments belong to; stored on every
        returned comment as ``weibo_id``
    :return: list of ``WeiboComment`` objects. Empty when no markup could be
        extracted or when the markup has no ``node-type="comment_list"``
        element. Comments that fail to parse are logged and left out.
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    container = soup.find(attrs={'node-type': 'comment_list'})
    if container is None:
        # find() yields None when nothing matches; chaining .find_all() on it
        # would raise AttributeError outside the per-comment try below.
        return comment_list
    comments = container.find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; body, screen name and user id are
            # all read from it.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        # The first link is the author; it is stored as the
                        # screen name below, not as part of the body.
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)

                elif content.name == 'img':
                    # Prefer 'title', then 'alt', then a value derived from
                    # the image file name (without its extension).
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(
                                e, comment))
                            img_title = ''
                    parts.append(img_title)

                else:
                    # NOTE(review): a non-<a>/<img> tag reaching this branch
                    # after the colon is appended as a Tag, which makes the
                    # join below raise and the comment get skipped -- confirm
                    # whether that is intended.
                    if first_colon:
                        # Text before the first node starting with ':' is
                        # dropped; that node is kept without its colon.
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)
            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # Drops the first three characters of the 'usercard' attribute
            # (presumably an "id=" prefix -- TODO confirm).
            wb_comment.user_id = author_link.get('usercard')[3:]

            create_time_str = comment.find(attrs={
                'class': 'WB_from S_txt2'
            }).text
            try:
                create_time = get_create_time_from_text(create_time_str)
            except ValueError as e:
                # Fall back to the project's default handler when the text
                # cannot be parsed.
                create_time = get_create_time_from_text_default_error_handler(
                    create_time_str, e)
            wb_comment.create_time = create_time.strftime("%Y-%m-%d %H:%M:%S")

            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped so the
            # remaining comments are still returned.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
コード例 #4
0
def get_comment_list(html, wb_id):
    """
    Parse the comments of one weibo out of a comment-page response.

    The comment body is rebuilt from the direct children of the ``WB_text``
    node: the first ``<a>`` (read separately as the screen name) is skipped,
    later ``<a>`` elements contribute their text, ``<img>`` elements
    contribute their ``title``/``alt`` or a value derived from the image file
    name, and text nodes are kept from the first one that starts with ':'.
    For every comment with a non-empty user id ``get_profile`` is called.

    :param html: raw response; handed to ``get_html_cont`` to obtain the markup
    :param wb_id: id of the weibo the comments belong to; stored on every
        returned comment as ``weibo_id``
    :return: list of ``WeiboComment`` objects. Empty when no markup could be
        extracted or when the markup has no ``node-type="comment_list"``
        element. Comments that fail to parse are logged and left out.
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    container = soup.find(attrs={'node-type': 'comment_list'})
    if container is None:
        # find() yields None when nothing matches; chaining .find_all() on it
        # would raise AttributeError outside the per-comment try below.
        return comment_list
    comments = container.find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; body, screen name and user id are
            # all read from it.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        # The first link is the author; it is stored as the
                        # screen name below, not as part of the body.
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)

                elif content.name == 'img':
                    # Prefer 'title', then 'alt', then a value derived from
                    # the image file name (without its extension).
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(
                                e, comment))
                            img_title = ''
                    parts.append(img_title)

                else:
                    # NOTE(review): a non-<a>/<img> tag reaching this branch
                    # after the colon is appended as a Tag, which makes the
                    # join below raise and the comment get skipped -- confirm
                    # whether that is intended.
                    if first_colon:
                        # Text before the first node starting with ':' is
                        # dropped; that node is kept without its colon.
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)
            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # Drops the first three characters of the 'usercard' attribute
            # (presumably an "id=" prefix -- TODO confirm).
            wb_comment.user_id = author_link.get('usercard')[3:]
            # Fetch the basic profile of the commenting user.
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)

            raw_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.create_time = _normalize_comment_time(raw_time)

            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped so the
            # remaining comments are still returned.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list


def _normalize_comment_time(raw_time, now=None):
    """
    Turn the timestamp text of a comment into ``%Y-%m-%d %H:%M`` form.

    Handled shapes: ``N分钟前`` (N minutes before *now*), ``今天<time>``
    (today's date followed by whatever comes after the marker) and
    ``[YYYY年]M月D日<rest>`` (zero-padded and dash-separated; the current
    year is supplied when missing). A ``第N楼`` marker is removed first.
    Text that already starts with a four-digit year and has no ``月``/``日``
    is returned as is.

    :param raw_time: timestamp text as read from the page
    :param now: reference ``datetime``; defaults to ``datetime.datetime.now()``
    :return: the normalised timestamp string
    :raises ValueError: if the text before ``分钟`` is not an integer
    """
    if now is None:
        now = datetime.datetime.now()

    # Strip the floor marker up front so it can neither break the int()
    # below nor leave whitespace in front of the date.
    text = re.sub(r'第\d*楼', '', raw_time).strip()

    if '分钟前' in text:
        minutes = int(text.split('分钟')[0])
        moment = now - datetime.timedelta(minutes=minutes)
        return moment.strftime('%Y-%m-%d %H:%M')
    if '今天' in text:
        return now.strftime('%Y-%m-%d') + text.split('今天')[-1]

    # Supply the current year only when none is present. Checking for any
    # four leading digits (rather than the literal '201') keeps this correct
    # in every decade.
    if not re.match(r'\d{4}', text):
        # The '年' keeps the year apart from the month digits so that
        # "5月1日" becomes "2024年5月1日", not the ambiguous "20245月1日".
        separator = '年' if '月' in text else ''
        text = '{}{}{}'.format(now.year, separator, text)

    def _to_dashed(match):
        year, month, day = match.groups()
        return '{}-{:02d}-{:02d}'.format(year, int(month), int(day))

    return re.sub(r'(\d{4})年(\d{1,2})月(\d{1,2})日', _to_dashed, text)
コード例 #5
0
ファイル: comment.py プロジェクト: ResolveWang/WeiboSpider
def get_comment_list(html, wb_id):
    """
    Parse the comments of one weibo out of a comment-page response.

    The comment body is rebuilt from the direct children of the ``WB_text``
    node: the first ``<a>`` (read separately as the screen name) is skipped,
    later ``<a>`` elements contribute their text, ``<img>`` elements
    contribute their ``title``/``alt`` or a value derived from the image file
    name, and text nodes are kept from the first one that starts with ':'.

    :param html: raw response; handed to ``get_html_cont`` to obtain the markup
    :param wb_id: id of the weibo the comments belong to; stored on every
        returned comment as ``weibo_id``
    :return: list of ``WeiboComment`` objects. Empty when no markup could be
        extracted or when the markup has no ``node-type="comment_list"``
        element. Comments that fail to parse are logged and left out.
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    container = soup.find(attrs={'node-type': 'comment_list'})
    if container is None:
        # find() yields None when nothing matches; chaining .find_all() on it
        # would raise AttributeError outside the per-comment try below.
        return comment_list
    comments = container.find_all(attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; body, screen name and user id are
            # all read from it.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        # The first link is the author; it is stored as the
                        # screen name below, not as part of the body.
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)

                elif content.name == 'img':
                    # Prefer 'title', then 'alt', then a value derived from
                    # the image file name (without its extension).
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
                            img_title = ''
                    parts.append(img_title)

                else:
                    # NOTE(review): a non-<a>/<img> tag reaching this branch
                    # after the colon is appended as a Tag, which makes the
                    # join below raise and the comment get skipped -- confirm
                    # whether that is intended.
                    if first_colon:
                        # Text before the first node starting with ':' is
                        # dropped; that node is kept without its colon.
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)
            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text

            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # Drops the first three characters of the 'usercard' attribute
            # (presumably an "id=" prefix -- TODO confirm).
            wb_comment.user_id = author_link.get('usercard')[3:]

            # Date normalisation.
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            now = datetime.datetime.now()
            if '分钟前' in create_time:
                # "N分钟前": N minutes before now.
                reduce_minute = create_time.strip().split('分钟')[0]
                real_time = now - datetime.timedelta(minutes=int(reduce_minute))
                wb_comment.create_time = real_time.strftime('%Y-%m-%d %H:%M')
            elif '今天' in create_time:
                # "今天<time>": today's date followed by the rest of the text.
                wb_comment.create_time = (
                    now.strftime('%Y-%m-%d') + create_time.strip().split('今天')[-1])
            elif '楼' in create_time:
                # Remove a "第N楼" marker; raw string so \d is a regex escape.
                wb_comment.create_time = re.sub(r'第\d*楼', '', create_time)
            else:
                wb_comment.create_time = create_time
            # Prepend the current year only when the text does not already
            # start with one. Testing for any four leading digits instead of
            # the literal '201' keeps this correct from 2020 onwards.
            # NOTE(review): the year is joined without a separator, so
            # "5月1日" becomes e.g. "20245月1日" -- confirm the format
            # consumers expect before changing it.
            if not re.match(r'\d{4}', wb_comment.create_time):
                wb_comment.create_time = str(now.year) + wb_comment.create_time

            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped so the
            # remaining comments are still returned.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list