def get_weibo_list(html):
    """Parse a timeline page into a list of weibo records.

    A repost (``is_origin == 0``) contributes two records: the repost
    itself and the original weibo it forwards.

    :param html: raw HTML of the weibo timeline page
    :return: list of parsed weibo data objects
    """
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all(attrs={'action-type': 'feed_list_item'})

    results = []
    for item in items:
        parsed = get_weibo_info_detail(item, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, truncated posts are re-fetched by id; the
        # already-parsed text is kept when the fetch yields nothing.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)

        # Reposts also carry the forwarded original post.
        if weibo.is_origin == 0:
            fwd_parsed = get_weibo_forward_info_detail(
                weibo.weibo_forward_id, item, html)
            if fwd_parsed is None:
                continue
            fwd_weibo = fwd_parsed[0]
            if fwd_parsed[1] == 0 and CRAWLING_MODE == 'accurate':
                full_fwd_cont = status.get_cont_of_weibo(fwd_weibo.weibo_id)
                if full_fwd_cont:
                    fwd_weibo.weibo_cont = full_fwd_cont
            results.append(fwd_weibo)
    return results
def get_weibo_list(html):
    """Parse a timeline page into weibo records and their picture urls.

    :param html: raw HTML of the weibo timeline page
    :return: tuple ``(weibo_datas, weibo_pics)`` — the parsed weibo data
        objects and the collected picture entries
    """
    if not html:
        # Fix: always return the same shape as the normal path; returning a
        # bare list() here broke callers that unpack the 2-tuple result.
        return [], []
    soup = BeautifulSoup(html, "html.parser")
    feed_list = soup.find_all(attrs={'action-type': 'feed_list_item'})
    weibo_datas = []
    weibo_pics = []
    for data in feed_list:
        r = get_weibo_info_detail(data, html)
        if r is not None:
            wb_data = r[0]
            if r[1] == 0:
                # Fix: keep the already-parsed content when the full-content
                # fetch fails (consistent with the other parser variants);
                # the old code overwrote it with a possibly-empty result.
                weibo_cont = status.get_cont_of_weibo(wb_data.weibo_id)
                wb_data.weibo_cont = weibo_cont if weibo_cont else wb_data.weibo_cont
            # Collect picture entries when present (r[2] may be empty).
            if r[2]:
                weibo_pics.extend(r[2])
            weibo_datas.append(wb_data)
    return weibo_datas, weibo_pics
def get_search_info(html):
    """Parse a search-results page into a list of weibo records.

    :param html: raw HTML of the search results page
    :return: list of parsed weibo data objects
    """
    # The search result arrives either as plain HTML or embedded in an
    # encoded script payload that must be extracted first.
    content = _search_page_parse(html) if '举报' not in html else html
    if content == '':
        return list()
    # todo BeautifulSoup fails to parse some pages (see ../tests/fail.html);
    # xpath would be more robust but is not implemented for cost reasons.
    soup = BeautifulSoup(
        content.encode('utf-8', 'ignore').decode('utf-8'), "html.parser")
    feed_list = soup.find_all(attrs={'action-type': 'feed_list_item'})
    search_list = []
    for each in feed_list:
        r = get_weibo_info(each, html)
        if r is not None:
            wb_data = r[0]
            if r[1] == 0:
                # Fix: keep the already-parsed content when the full-content
                # fetch fails (consistent with the other parser variants);
                # the old code overwrote it with a possibly-empty result.
                weibo_cont = status.get_cont_of_weibo(wb_data.weibo_id)
                wb_data.weibo_cont = weibo_cont if weibo_cont else wb_data.weibo_cont
            search_list.append(wb_data)
    return search_list
def get_search_info(html):
    """Parse a logged-in search response into a list of weibo records.

    :param html: response content for search with login
    :return: search results as parsed weibo data objects
    """
    # With a logged-in session the response is already plain HTML; no
    # script-payload extraction step is required here.
    content = html
    if content == '':
        return []
    # todo BeautifulSoup fails to parse some pages (see ../tests/fail.html);
    # xpath would be more robust but is not implemented for cost reasons.
    normalized = content.encode('utf-8', 'ignore').decode('utf-8')
    soup = BeautifulSoup(normalized, "html.parser")
    feeds = soup.find_all(attrs={'action-type': 'feed_list_item'})

    results = []
    for feed in feeds:
        parsed = get_weibo_info_1(feed, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, truncated posts are re-fetched by id; the
        # already-parsed text is kept when the fetch yields nothing.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)
    return results
def test_get_weibo_detail_cont(self):
    """Fetch and print the full content of a known weibo by its id."""
    from page_get import status
    cont = status.get_cont_of_weibo('4129510280252577')
    print(cont)
def get_search_info(html):
    """Parse a search-results page into a list of weibo records.

    :param html: raw HTML of the search results page
    :return: list of parsed weibo data objects
    """
    # The search result arrives either as plain HTML or embedded in an
    # encoded script payload that must be extracted first.
    content = _search_page_parse(html) if '举报' not in html else html
    if content == '':
        return []
    # todo BeautifulSoup fails to parse some pages (see ../tests/fail.html);
    # xpath would be more robust but is not implemented for cost reasons.
    normalized = content.encode('utf-8', 'ignore').decode('utf-8')
    soup = BeautifulSoup(normalized, "html.parser")
    feeds = soup.find_all(attrs={'action-type': 'feed_list_item'})

    results = []
    for feed in feeds:
        parsed = get_weibo_info(feed, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, truncated posts are re-fetched by id; the
        # already-parsed text is kept when the fetch yields nothing.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)
    return results
def get_weibo_list(html):
    """Parse a timeline page into a list of weibo records.

    :param html: raw HTML of the weibo timeline page
    :return: list of parsed weibo data objects
    """
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all(attrs={'action-type': 'feed_list_item'})

    results = []
    for item in items:
        parsed = get_weibo_info_detail(item, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, truncated posts are re-fetched by id; the
        # already-parsed text is kept when the fetch yields nothing.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)
    return results