def get_dialogue(html, wb_id, cid):
    """
    Get the dialogue list
    :param html:
    :param wb_id:
    :param cid:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'lxml')
    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None, None
    weibo_dialogue = WeiboDialogue()
    uids = []
    try:
        for dialogue in dialogues:
            user_id = dialogue.find('a').get('usercard')[3:]
            uids.append(user_id)
            dialogue_list.append({'uid': user_id, 'text': dialogue.text.strip()})
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
        weibo_dialogue.dialogue_rounds = len(dialogues)
    except Exception as e:
        parser.error('解析对话失败,具体信息是{}'.format(e))
    return weibo_dialogue, uids
def get_comment_id(html, wb_id):
    """
    Get the ids of comments that are replies (contain 回复@)
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'lxml')
    comment_ids = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).\
        find_all(attrs={'class': 'list_li S_line1 clearfix'})
    for comment in comments:
        try:
            comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            if '回复@' in comment_cont:
                comment_ids.append(comment['comment_id'])
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
    return comment_ids
def get_dialogue(html, wb_id, cid):
    """
    Get the dialogue list
    :param html:
    :param wb_id:
    :param cid:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'html.parser')
    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None, None
    weibo_dialogue = WeiboDialogue()
    uids = []
    try:
        for dialogue in dialogues:
            user_id = dialogue.find('a').get('usercard')[3:]
            uids.append(user_id)
            dialogue_list.append({'uid': user_id, 'text': dialogue.text.strip()})
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
        weibo_dialogue.dialogue_rounds = len(dialogues)
    except Exception as e:
        parser.error('解析对话失败,具体信息是{}'.format(e))
    return weibo_dialogue, uids
def get_praise_list(html, wb_id):
    """
    Get the praise (like) list
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all('li')
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)
    for praise in praises:
        wb_praise = WeiboPraise()
        try:
            wb_praise.user_id = praise['uid']
            wb_praise.weibo_id = wb_id
        except Exception as e:
            parser.error('解析点赞失败,具体信息是{}'.format(e))
        else:
            praise_list.append(wb_praise)
    return praise_list
def get_dialogue(html, wb_id, cid):
    """
    Get the dialogue list
    :param html:
    :param wb_id:
    :param cid:
    :return:
    """
    cont = get_html_cont(html)
    soup = BeautifulSoup(cont, 'lxml')
    # print(soup.prettify())
    dialogue_list = []
    dialogues = soup.find_all(attrs={'class': 'WB_text'})
    if len(dialogues) < 2:
        return None
    weibo_dialogue = WeiboDialogue()
    try:
        for dialogue in dialogues:
            # print(dialogue.text.strip())
            dialogue_list.append(dialogue.text.strip())
        weibo_dialogue.weibo_id = wb_id
        weibo_dialogue.dialogue_id = cid
        weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
    except Exception as e:
        parser.error('解析对话失败,具体信息是{}'.format(e))
    return weibo_dialogue
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', {}).get('page', {}).get('totalpage', 1)
    except Exception as e:
        parser.error('Get total page error, the reason is {}'.format(e))
        page_count = 1
    return page_count
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', {}).get('page', {}).get('totalpage', 1)
    except Exception as e:
        parser.error('Errors occurred when parsing total page of repost, the detail is {}'.format(e))
        page_count = 1
    return page_count
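# A minimal, self-contained check of get_total_page(); the payload below is a
# made-up illustration of the JSON shape the function expects, not a captured
# Weibo response.
def _demo_get_total_page():
    sample = json.dumps({'data': {'page': {'totalpage': 5}}})
    assert get_total_page(sample) == 5
    # anything that fails to parse falls back to a single page
    assert get_total_page('not json') == 1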
def handle_error(*keys):
    # Reads like the inner wrapper of an error-handling decorator: `func` and
    # `return_value` are captured from the enclosing scope.
    try:
        return func(*keys)
    except Exception as e:
        parser.error('Failed to parse the page, {} is raised, here are details:{}'.format(
            e, format_tb(e.__traceback__)[0]))
        return return_value
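# A hedged sketch of the enclosing scope that handle_error above seems to
# assume. The decorator name, the wraps usage, and the return_value default
# are assumptions for illustration, not the repository's actual decorator.
from functools import wraps
from traceback import format_tb


def handle_exception_sketch(return_value=None):
    def decorator(func):
        @wraps(func)
        def handle_error(*keys):
            try:
                return func(*keys)
            except Exception as e:
                parser.error('Failed to parse the page, {} is raised, here are details:{}'.format(
                    e, format_tb(e.__traceback__)[0]))
                return return_value
        return handle_error
    return decorator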
def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template: {}'.format(feed_infos))
        return 0
    return int(info_num)
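# A hedged sketch of how get_feed_info() is used: the <li> fragment below is a
# hand-written stand-in for the real feed action bar markup.
def _demo_get_feed_info():
    demo = BeautifulSoup('<ul><li>转发 12</li><li>评论 3</li></ul>', 'html.parser')
    feed_infos = demo.find_all('li')
    assert get_feed_info(feed_infos, '转发') == 12
    assert get_feed_info(feed_infos, '评论') == 3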
def get_repost_list(html, mid):
    """
    Get repost details
    :param html: page source
    :param mid: weibo mid
    :return: list of repost infos
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}).\
                text.strip().split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO add wb_repost.user_id to the crawl queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).\
                find(attrs={'class': 'WB_text'}).find('a').text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = REPOST_URL.format(
                repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid
            # Save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # We can't get the parent's uid; we can get the parent's nickname,
                    # but the nickname can change
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name, the detail is {}".format(e))
                    wb_repost.parent_user_name = ''
        except Exception as e:
            parser.error('repost parse error occurred, the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('get_mid()发生异常,具体异常为{e}'.format(e=e))
        return ''
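# A hedged sketch of get_mid()'s regex fallback: when the feed_list_item node
# cannot be found, the mid is pulled straight from the raw page. The mid value
# below is made up.
def _demo_get_mid_fallback():
    assert get_mid('<p>mid=4123456789012345&page=1</p>') == '4123456789012345'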
def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={'node-type': 'comment_btn_text'}).find('span').\
            find('em').find_next_sibling().text
        if comments == '评论':
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
def get_create_time_from_text_default_error_handler(
        create_time_str: str, e: Exception) -> datetime.datetime:
    """[default error handler will return datetime of now]

    Arguments:
        create_time_str {str} -- [origin str]
        e {Exception} -- [Exception]

    Returns:
        datetime -- [datetime of now]
    """
    parser.error('解析评论时间失败,原时间为"{}",具体信息是{}'.format(create_time_str, e))
    return datetime.datetime.now()
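# The companion parser get_create_time_from_text() is not shown in this
# section; below is a minimal hypothetical sketch of what such a parser might
# look like, assuming only the two relative formats seen in the comment markup
# ("X分钟前" and "今天 HH:MM") and raising ValueError for anything else so the
# default error handler above can take over. Name and behavior are assumptions.
def get_create_time_from_text_sketch(create_time_str: str) -> datetime.datetime:
    text = create_time_str.strip()
    if '分钟前' in text:
        minutes = int(text.split('分钟')[0])
        return datetime.datetime.now() - datetime.timedelta(minutes=minutes)
    if '今天' in text:
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        return datetime.datetime.strptime(today + text.replace('今天', ''), '%Y-%m-%d %H:%M')
    raise ValueError('unrecognized create time: {}'.format(text))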
def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={'node-type': 'like_status'}).find_all('em')[1].text
        else:
            likes = soup.find_all(attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0
def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={'node-type': 'feed_list_content'}).\
                find(attrs={'render': 'ext', 'extra-data': 'type=atname'}).text
            return content[1:]
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('__get__statushtml()错误,具体错误是{e}'.format(e=e))
            parser.error('网页代码为{page}'.format(page=html))
    return cont
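# A hedged, self-contained check of _get_statushtml(): the FM.view(...) call
# below mimics the script structure the parser looks for (a JSON argument
# whose 'html' key carries the rendered detail markup); the content is made up.
def _demo_get_statushtml():
    payload = json.dumps({'ns': 'pl.content.weiboDetail.index',
                          'html': '<div class="WB_text">hello</div>'})
    fake_page = '<script>FM.view({})</script>'.format(payload)
    assert 'WB_text' in _get_statushtml(fake_page)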
def get_praise_list(html: str, wb_id: str):
    """[get praise list]

    Arguments:
        html {str} -- [web page]
        wb_id {str} -- [weibo mid]

    Raises:
        in -- [can't get wanted dom]

    Returns:
        WeiboPraise list -- [list contains praises in this html]
        ext_param -- [extra parameters to get next page]
    """
    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)
    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            get_profile(user_id)
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('解析点赞失败,具体信息是{}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)

    return praise_list, ext_param
def get_praise_list(html: str, wb_id: str):
    """[get praise list]

    Arguments:
        html {str} -- [web page]
        wb_id {str} -- [weibo mid]

    Raises:
        in -- [can't get wanted dom]

    Returns:
        WeiboPraise list -- [list contains praises in this html]
        ext_param -- [extra parameters to get next page]
    """
    cont = get_html_cont(html)
    if not cont:
        return list(), ''

    soup = BeautifulSoup(cont, 'html.parser')
    praise_list = list()
    praises = soup.find_all(attrs={'class': 'list_li S_line1 clearfix'})
    # pattern = re.compile(r'<li uid=\\"\d{10}\\">')
    # praises = pattern.findall(cont)
    for praise in praises:
        try:
            user_id = praise.find('img').get('usercard')[3:]
            wb_praise = WeiboPraise(user_id, wb_id)
        except Exception as e:
            parser.error('解析点赞失败,具体信息是{}'.format(e))
        else:
            praise_list.append(wb_praise)

    like_loading = soup.find(attrs={'node-type': 'like_loading'})
    feed_like_more = soup.find(attrs={'action-type': 'feed_like_more'})
    if like_loading:
        action_data = like_loading.get('action-data', '')
    elif feed_like_more:
        action_data = feed_like_more.get('action-data', '')
    else:
        action_data = ''
    ext_param = htmllib.unescape(action_data)

    return praise_list, ext_param
def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                # TODO note that an exception may be raised here
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('__get__statushtml()错误,具体错误是{e}'.format(e=e))
            parser.error('网页代码为{page}'.format(page=html))
    return cont
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).\
        find_all(attrs={'class': 'list_li S_line1 clearfix'})
    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # todo normalize the date format
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_href = user_cont.find('a').get('href', '')
    if not user_href:
        parser.warning('Failed to get user id')
        return None
    wb_data.uid = parse_url(user_href).path[3:]

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # todo no video test data found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # todo normalize the date: values like "今天 XX:XX" or "X分钟前" still appear here
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None

    return wb_data, is_all_cont
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).\
        find_all(attrs={'class': 'list_li S_line1 clearfix'})
    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
                            img_title = ''
                    cont.append(img_title)
                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)
            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={'class': 'WB_text'}).find('a').text
            wb_comment.comment_id = comment['comment_id']
            # TODO add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # normalize the date
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))
            else:
                wb_comment.create_time = create_time
            if not wb_comment.create_time.startswith('201'):
                wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'card-feed'})
    user_avator = user_cont.find(attrs={'class': 'avator'})
    # usercard = user_cont.find('img').get('usercard', '')
    usercard = user_avator.find('a').get('href', '')
    # this only works for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('?')[0][12:]

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).\
            find('a').get("href", "")[2:]
        # wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={"class": "content"}).find(attrs={"class": "from"}).find('a').text.strip()
        if "年" not in create_time and "月" in create_time:
            create_time = "2019年" + create_time
        elif "今天" in create_time:
            create_time = create_time.replace("今天", datetime.datetime.now().strftime("%Y年%m月%d日 "))
        create_time = datetime.datetime.strptime(create_time, "%Y年%m月%d日 %H:%M")
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    except Exception:
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            wb_data.weibo_cont = each.find(attrs={"node-type": "feed_list_content_full"}).text.strip()
        except Exception:
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # todo no video test data found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # todo normalize the date: values like "今天 XX:XX" or "X分钟前" still appear here
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[3].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None

    return wb_data, is_all_cont
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).\
        find_all(attrs={'class': 'list_li S_line1 clearfix'})
    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
                            img_title = ''
                    cont.append(img_title)
                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)
            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={'class': 'WB_text'}).find('a').text
            wb_comment.comment_id = comment['comment_id']
            # TODO add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # crawl basic profile info for newly seen users
            if wb_comment.user_id:
                get_profile(wb_comment.user_id)
            # normalize the date
            create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            if '分钟前' in create_time:
                now = datetime.datetime.now()
                reduce_minute = create_time.strip().split('分钟')[0]
                delta = datetime.timedelta(minutes=int(reduce_minute))
                real_time = now - delta
                wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M'))
            elif '今天' in create_time:
                now = datetime.datetime.now().strftime('%Y-%m-%d')
                real_time = now + create_time.strip().split('今天')[-1]
                wb_comment.create_time = str(real_time)
            elif '楼' in create_time:
                wb_comment.create_time = str(re.sub(r'第\d*楼', '', create_time))
            else:
                wb_comment.create_time = create_time
            if not wb_comment.create_time.startswith('201'):
                wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time

            # convert the Chinese-style timestamp to the standard "%Y-%m-%d %H:%M" format
            create_time_copy = wb_comment.create_time
            if '月' in create_time_copy and '日' in create_time_copy:
                month = create_time_copy.split("年")[-1].split("月")[0]
                day = create_time_copy.split("年")[-1].split("月")[-1].split("日")[0]
                # zero-pad single-digit month and day
                if month and int(month) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(month) + "月", "0" + str(month) + "月")
                if day and int(day) < 10:
                    wb_comment.create_time = wb_comment.create_time.replace(
                        str(day) + "日", "0" + str(day) + "日")
                wb_comment.create_time = wb_comment.create_time.replace("月", "-")
                wb_comment.create_time = wb_comment.create_time.replace("日", "")
                if '年' in wb_comment.create_time:
                    wb_comment.create_time = wb_comment.create_time.replace("年", "-")
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    usercard = user_cont.find('img').get('usercard', '')
    # this only works for logged-in users
    if not usercard:
        return None
    wb_data.uid = usercard.split('&')[0][3:]

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html:
    :param wb_id:
    :return:
    """
    cont = get_html_cont(html)
    if not cont:
        return list()
    soup = BeautifulSoup(cont, 'html5lib')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).\
        find_all(attrs={'class': 'list_li S_line1 clearfix'})
    for comment in comments:
        wb_comment = WeiboComment()
        try:
            cont = []
            first_author = True
            first_colon = True
            for content in comment.find(attrs={'class': 'WB_text'}).contents:
                if not content:
                    continue
                if content.name == 'a':
                    if first_author:
                        first_author = False
                        continue
                    else:
                        if content.text:
                            cont.append(content.text)
                elif content.name == 'img':
                    img_title = content.get('title', '')
                    if img_title == '':
                        img_title = content.get('alt', '')
                    if img_title == '':
                        img_src = content.get('src', '')
                        img_src = img_src.split('/')[-1].split('.', 1)[0]
                        try:
                            img_title = parse_emoji.softband_to_utf8(img_src)
                        except Exception as e:
                            parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
                            img_title = ''
                    cont.append(img_title)
                else:
                    if first_colon:
                        if content.find(':') == 0:
                            cont.append(content.replace(':', '', 1))
                            first_colon = False
                    else:
                        cont.append(content)
            wb_comment.comment_cont = ''.join(cont)
            wb_comment.comment_screen_name = comment.find(attrs={'class': 'WB_text'}).find('a').text
            wb_comment.comment_id = comment['comment_id']
            # TODO add wb_comment.user_id to the crawl queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]

            create_time_str = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            try:
                create_time = get_create_time_from_text(create_time_str)
            except ValueError as e:
                create_time = get_create_time_from_text_default_error_handler(create_time_str, e)
            create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S")
            wb_comment.create_time = create_time_str
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list
def get_weibo_info_1(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except Exception as why:
        parser.error('Failed to get weibo id, the error is {}, the page source is {}'.format(why, html))
        return None

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0
        try:
            m = re.search(r'uid=(\d+)',
                          str(feed_action.find(attrs={'action-type': 'feed_list_forward'})['action-data']))
            wb_data.uid = m.group(1)
        except Exception as why:
            parser.error('Failed to get uid, the error is {}, the page source is {}'.format(why, html))
            return None

    try:
        a_tag = each.find(attrs={'class': 'from'})
        wb_data.weibo_url = "https:" + a_tag.a['href']
        create_time = a_tag.a.text.replace("\n", "").strip()
        if "秒前" in create_time:
            create_time = (datetime.datetime.now() -
                           datetime.timedelta(seconds=int(create_time.replace("秒前", "")))
                           ).strftime("%Y-%m-%d %H:%M")
        elif "分钟前" in create_time:
            create_time = (datetime.datetime.now() -
                           datetime.timedelta(minutes=int(create_time.replace("分钟前", "")))
                           ).strftime("%Y-%m-%d %H:%M")
        elif "今天" in create_time:
            create_time = datetime.datetime.now().strftime("%Y-%m-%d") + " " + create_time.replace("今天", "")
        else:
            create_time = str(datetime.datetime.now().year) + '-' + create_time.replace('月', '-').replace('日', '')
        wb_data.create_time = create_time
        if len(a_tag.contents) >= 4:
            wb_data.device = a_tag.contents[3].text
        else:
            wb_data.device = ''
    except Exception as why:
        parser.error(why)
        wb_data.weibo_url = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont