def get_comment_list(html, wb_id):
    """
    Parse the comments contained in a weibo comment page.

    :param html: raw page source; handed to ``get_html_cont`` to extract the comment markup
    :param wb_id: id of the weibo the comments belong to; copied onto every parsed comment
    :return: list of ``WeiboComment`` objects; empty when no markup or no comment container is found
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    container = soup.find(attrs={'node-type': 'comment_list'})
    # ``find`` yields None when the node is absent; calling ``find_all`` on it
    # would raise AttributeError outside of any handler, so bail out early.
    if container is None:
        return []

    comment_list = []
    for comment in container.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; it feeds both the content and the author id.
            text_node = comment.find(attrs={'class': 'WB_text'})
            wb_comment.comment_cont = text_node.text.strip()
            wb_comment.comment_id = comment['comment_id']
            # The first three characters of the ``usercard`` attribute are dropped
            # (presumably an ``id=`` prefix -- confirm against real markup).
            wb_comment.user_id = text_node.find('a').get('usercard')[3:]
            # TODO: normalize the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped, the rest are kept.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_comment_list(html, wb_id):
    """
    Parse the comments contained in a weibo comment page.

    :param html: raw page source; handed to ``get_html_cont`` to extract the comment markup
    :param wb_id: id of the weibo the comments belong to; copied onto every parsed comment
    :return: list of ``WeiboComment`` objects; empty when no markup or no comment container is found
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html.parser')
    container = soup.find(attrs={'node-type': 'comment_list'})
    # ``find`` yields None when the node is absent; calling ``find_all`` on it
    # would raise AttributeError outside of any handler, so bail out early.
    if container is None:
        return []

    comment_list = []
    for comment in container.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; it feeds both the content and the author id.
            text_node = comment.find(attrs={'class': 'WB_text'})
            wb_comment.comment_cont = text_node.text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # The first three characters of the ``usercard`` attribute are dropped
            # (presumably an ``id=`` prefix -- confirm against real markup).
            wb_comment.user_id = text_node.find('a').get('usercard')[3:]
            # TODO: normalize the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped, the rest are kept.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_comment_list(html, wb_id):
    """
    Parse the comments contained in a weibo comment page.

    The comment text is rebuilt from the children of the ``WB_text`` node:
    the first link (the author) is skipped, later links contribute their
    text, images contribute their title/alt (or a name decoded from the
    image file name), and plain text is appended once the leading colon
    has been seen.

    :param html: raw page source; handed to ``get_html_cont`` to extract the comment markup
    :param wb_id: id of the weibo the comments belong to; copied onto every parsed comment
    :return: list of ``WeiboComment`` objects; empty when no markup or no comment container is found
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html5lib')
    container = soup.find(attrs={'node-type': 'comment_list'})
    # ``find`` yields None when the node is absent; calling ``find_all`` on it
    # would raise AttributeError outside of any handler, so bail out early.
    if container is None:
        return []

    comment_list = []
    for comment in container.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; it feeds content, screen name and user id.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    # The first link is the comment author, not part of the text.
                    if first_author:
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        # Last resort: take the file name (without extension) from
                        # the image URL and let ``parse_emoji`` translate it.
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(
                                e, comment))
                            img_title = ''
                    parts.append(img_title)
                else:
                    # NOTE(review): text seen before the fragment that starts with
                    # ':' is discarded; that fragment loses its leading ':' and
                    # every later fragment is kept verbatim -- confirm intended.
                    if first_colon:
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)

            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # The first three characters of the ``usercard`` attribute are dropped
            # (presumably an ``id=`` prefix -- confirm against real markup).
            wb_comment.user_id = author_link.get('usercard')[3:]

            create_time_str = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            try:
                create_time = get_create_time_from_text(create_time_str)
            except ValueError as e:
                # Unparseable text is delegated to the shared default handler.
                create_time = get_create_time_from_text_default_error_handler(
                    create_time_str, e)
            wb_comment.create_time = create_time.strftime("%Y-%m-%d %H:%M:%S")
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped, the rest are kept.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_comment_list(html, wb_id):
    """
    Parse the comments contained in a weibo comment page.

    The comment text is rebuilt from the children of the ``WB_text`` node:
    the first link (the author) is skipped, later links contribute their
    text, images contribute their title/alt (or a name decoded from the
    image file name), and plain text is appended once the leading colon
    has been seen. For every comment with a non-empty user id,
    ``get_profile`` is called with that id.

    The creation time is normalised towards ``%Y-%m-%d %H:%M``: relative
    ("N分钟前") and "今天" stamps are resolved against the local clock, floor
    markers ("第N楼") are removed, a missing year is filled in with the
    current year, and Chinese "年/月/日" markers are replaced by dashes with
    zero-padded month and day.

    :param html: raw page source; handed to ``get_html_cont`` to extract the comment markup
    :param wb_id: id of the weibo the comments belong to; copied onto every parsed comment
    :return: list of ``WeiboComment`` objects; empty when no markup or no comment container is found
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html5lib')
    container = soup.find(attrs={'node-type': 'comment_list'})
    # ``find`` yields None when the node is absent; calling ``find_all`` on it
    # would raise AttributeError outside of any handler, so bail out early.
    if container is None:
        return []

    comment_list = []
    for comment in container.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; it feeds content, screen name and user id.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    # The first link is the comment author, not part of the text.
                    if first_author:
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        # Last resort: take the file name (without extension) from
                        # the image URL and let ``parse_emoji`` translate it.
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(
                                e, comment))
                            img_title = ''
                    parts.append(img_title)
                else:
                    # NOTE(review): text seen before the fragment that starts with
                    # ':' is discarded; that fragment loses its leading ':' and
                    # every later fragment is kept verbatim -- confirm intended.
                    if first_colon:
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)

            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # The first three characters of the ``usercard`` attribute are dropped
            # (presumably an ``id=`` prefix -- confirm against real markup).
            wb_comment.user_id = author_link.get('usercard')[3:]
            # Fetch the commenter's basic profile.
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)

            # --- creation time normalisation ---
            raw_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in raw_time:
                # "N分钟前": subtract N minutes from the local clock.
                minutes_ago = int(raw_time.strip().split('分钟')[0])
                posted = datetime.datetime.now() - datetime.timedelta(minutes=minutes_ago)
                created = posted.strftime('%Y-%m-%d %H:%M')
            elif '今天' in raw_time:
                # "今天 HH:MM": prefix today's date to whatever follows the marker.
                today = datetime.datetime.now().strftime('%Y-%m-%d')
                created = today + raw_time.strip().split('今天')[-1]
            elif '楼' in raw_time:
                # Drop the floor marker; strip so leftover blanks cannot hide a leading year.
                created = re.sub(r'第\d*楼', '', raw_time).strip()
            else:
                created = raw_time.strip()

            # Fill in the year when the text does not start with one. Any four-digit
            # year counts (the check used to be hard-wired to the prefix '201', which
            # prepended a second year to every stamp from 2020 onwards).
            if not re.match(r'\d{4}(?!\d)', created):
                year = str(datetime.datetime.now().year)
                # Chinese-style dates need the 年 separator, otherwise the year
                # digits fuse with the month ("20243月5日") and break the
                # conversion below.
                created = year + ('年' if '月' in created else '') + created

            # Convert Chinese markers to the standard "%Y-%m-%d %H:%M" layout.
            if '月' in created and '日' in created:
                # zfill pads single digits only, so an already padded value is untouched.
                created = re.sub(r'(\d+)月', lambda m: m.group(1).zfill(2) + '-', created)
                created = re.sub(r'(\d+)日', lambda m: m.group(1).zfill(2), created)
                created = created.replace('月', '-').replace('日', '')
            if '年' in created:
                created = created.replace('年', '-')

            wb_comment.create_time = created
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped, the rest are kept.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_comment_list(html, wb_id):
    """
    Parse the comments contained in a weibo comment page.

    The comment text is rebuilt from the children of the ``WB_text`` node:
    the first link (the author) is skipped, later links contribute their
    text, images contribute their title/alt (or a name decoded from the
    image file name), and plain text is appended once the leading colon
    has been seen.

    The creation time is lightly normalised: relative ("N分钟前") and "今天"
    stamps are resolved against the local clock, floor markers ("第N楼") are
    removed, and the current year is prepended when the text does not start
    with a four-digit year.

    :param html: raw page source; handed to ``get_html_cont`` to extract the comment markup
    :param wb_id: id of the weibo the comments belong to; copied onto every parsed comment
    :return: list of ``WeiboComment`` objects; empty when no markup or no comment container is found
    """
    cont = get_html_cont(html)
    if not cont:
        return []

    soup = BeautifulSoup(cont, 'html5lib')
    container = soup.find(attrs={'node-type': 'comment_list'})
    # ``find`` yields None when the node is absent; calling ``find_all`` on it
    # would raise AttributeError outside of any handler, so bail out early.
    if container is None:
        return []

    comment_list = []
    for comment in container.find_all(attrs={'class': 'list_li S_line1 clearfix'}):
        wb_comment = WeiboComment()
        try:
            # Look the text node up once; it feeds content, screen name and user id.
            text_node = comment.find(attrs={'class': 'WB_text'})

            parts = []
            first_author = True
            first_colon = True
            for content in text_node.contents:
                if not content:
                    continue
                if content.name == 'a':
                    # The first link is the comment author, not part of the text.
                    if first_author:
                        first_author = False
                        continue
                    if content.text:
                        parts.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        # Last resort: take the file name (without extension) from
                        # the image URL and let ``parse_emoji`` translate it.
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
                            img_title = ''
                    parts.append(img_title)
                else:
                    # NOTE(review): text seen before the fragment that starts with
                    # ':' is discarded; that fragment loses its leading ':' and
                    # every later fragment is kept verbatim -- confirm intended.
                    if first_colon:
                        if content.find(':') == 0:
                            parts.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        parts.append(content)

            wb_comment.comment_cont = ''.join(parts)

            author_link = text_node.find('a')
            wb_comment.comment_screen_name = author_link.text
            wb_comment.comment_id = comment['comment_id']
            # TODO: add wb_comment.user_id to the crawl queue (seed_ids)
            # The first three characters of the ``usercard`` attribute are dropped
            # (presumably an ``id=`` prefix -- confirm against real markup).
            wb_comment.user_id = author_link.get('usercard')[3:]

            # --- creation time normalisation ---
            raw_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in raw_time:
                # "N分钟前": subtract N minutes from the local clock.
                minutes_ago = int(raw_time.strip().split('分钟')[0])
                posted = datetime.datetime.now() - datetime.timedelta(minutes=minutes_ago)
                created = posted.strftime('%Y-%m-%d %H:%M')
            elif '今天' in raw_time:
                # "今天 HH:MM": prefix today's date to whatever follows the marker.
                today = datetime.datetime.now().strftime('%Y-%m-%d')
                created = today + raw_time.strip().split('今天')[-1]
            elif '楼' in raw_time:
                # Drop the floor marker; strip so leftover blanks cannot hide a leading year.
                created = re.sub(r'第\d*楼', '', raw_time).strip()
            else:
                created = raw_time.strip()

            # Prepend the current year when the text does not start with one. Any
            # four-digit year counts (the check used to be hard-wired to the prefix
            # '201', which prepended a second year to every stamp from 2020 onwards).
            if not re.match(r'\d{4}(?!\d)', created):
                created = str(datetime.datetime.now().year) + created

            wb_comment.create_time = created
            wb_comment.weibo_id = wb_id
        except Exception as e:
            # Best effort: a malformed comment is logged and skipped, the rest are kept.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list