Example #1
0
def get_weibo_list(html):
    """
    Build the list of weibo info parsed from a feed page.

    :param html: feed page HTML; falsy input yields an empty list
    :return: list of weibo data objects; for a forwarded weibo the
             original weibo's data is appended right after it
    """
    if not html:
        return []
    page = BeautifulSoup(html, "html.parser")
    items = page.find_all(attrs={'action-type': 'feed_list_item'})
    results = []
    for item in items:
        parsed = get_weibo_info_detail(item, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, replace truncated text with the full content
        # when the detail fetch returns something non-empty.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)

        # A non-origin weibo is a forward; also parse the forwarded weibo.
        if weibo.is_origin == 0:
            forward_parsed = get_weibo_forward_info_detail(
                weibo.weibo_forward_id, item, html)
            if forward_parsed is None:
                continue
            forward = forward_parsed[0]
            if forward_parsed[1] == 0 and CRAWLING_MODE == 'accurate':
                forward_cont = status.get_cont_of_weibo(forward.weibo_id)
                if forward_cont:
                    forward.weibo_cont = forward_cont
            results.append(forward)

    return results
Example #2
0
def get_weibo_list(html):
    """
    Get the list of weibo info together with the attached pictures.

    :param html: feed page HTML; may be empty or None
    :return: tuple (weibo_datas, weibo_pics) — both lists are empty when
             *html* is falsy
    """
    if not html:
        # Bug fix: keep the return shape consistent with the normal path
        # (a 2-tuple) so callers that unpack two values don't crash; the
        # original returned a bare empty list here.
        return [], []
    soup = BeautifulSoup(html, "html.parser")
    feed_list = soup.find_all(attrs={'action-type': 'feed_list_item'})
    weibo_datas = []
    weibo_pics = []
    for data in feed_list:
        r = get_weibo_info_detail(data, html)
        if r is not None:
            wb_data = r[0]
            if r[1] == 0:
                # Bug fix: keep the truncated text when the detail fetch
                # returns nothing instead of clobbering it with a falsy
                # value (matches the guarded pattern of the other parsers).
                weibo_cont = status.get_cont_of_weibo(wb_data.weibo_id)
                if weibo_cont:
                    wb_data.weibo_cont = weibo_cont

            # If pictures are present, collect them
            if r[2]:
                weibo_pics.extend(r[2])
            weibo_datas.append(wb_data)
    return weibo_datas, weibo_pics
Example #3
0
def get_search_info(html):
    """
    Extract search results from a search-result page.

    :param html: raw search page content
    :return: list of parsed weibo data; empty when the page can't be decoded
    """
    # Search results come back in two forms: plain HTML or an encoded
    # payload that must be extracted first (the marker '举报' only appears
    # in the plain form).
    content = _search_page_parse(html) if '举报' not in html else html

    if content == '':
        return list()

    # todo BeautifulSoup fails to parse some pages (see ../tests/fail.html);
    # an xpath-based parser would help but is not implemented yet due to cost.
    soup = BeautifulSoup(
        content.encode('utf-8', 'ignore').decode('utf-8'), "html.parser")

    feed_list = soup.find_all(attrs={'action-type': 'feed_list_item'})
    search_list = []
    for each in feed_list:
        r = get_weibo_info(each, html)
        if r is not None:
            wb_data = r[0]
            if r[1] == 0:
                # Bug fix: keep the truncated text when the detail fetch
                # returns a falsy value instead of overwriting the content
                # with None (consistent with the guarded variant below).
                weibo_cont = status.get_cont_of_weibo(wb_data.weibo_id)
                if weibo_cont:
                    wb_data.weibo_cont = weibo_cont
            search_list.append(wb_data)
    return search_list
Example #4
0
def get_search_info(html):
    """
    Parse the search-result page fetched while logged in.

    :param html: response content for search with login
    :return: list of parsed weibo data
    """
    # Logged-in search pages arrive as plain HTML, so no extra decode
    # step is required before parsing.
    content = html

    if content == '':
        return []
    # todo BeautifulSoup drops some nodes on certain pages
    # (see ../tests/fail.html); an xpath parser would fix this but is
    # postponed for cost reasons.
    normalized = content.encode('utf-8', 'ignore').decode('utf-8')
    soup = BeautifulSoup(normalized, "html.parser")
    results = []
    for node in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        parsed = get_weibo_info_1(node, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, swap in the full content when available.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)
    return results
Example #5
0
 def test_get_weibo_detail_cont(self):
     """
     Smoke test: fetch and print the full content of one known weibo.
     :return:
     """
     from page_get import status
     cont = status.get_cont_of_weibo('4129510280252577')
     print(cont)
Example #6
0
 def test_get_weibo_detail_cont(self):
     """
     Smoke test: fetch one known weibo's full content and print it.
     :return:
     """
     from page_get import status
     full_content = status.get_cont_of_weibo('4129510280252577')
     print(full_content)
Example #7
0
def get_search_info(html):
    """
    Extract search results from a search-result page.

    :param html: raw search page content
    :return: list of parsed weibo data; empty when the page can't be decoded
    """
    # The page comes back either as plain HTML or as an encoded payload;
    # the marker '举报' only appears in the plain form.
    if '举报' not in html:
        content = _search_page_parse(html)
    else:
        content = html

    if content == '':
        return []
    # todo BeautifulSoup cannot parse some pages (see ../tests/fail.html);
    # switching to xpath would help but is postponed for cost reasons.
    cleaned = content.encode('utf-8', 'ignore').decode('utf-8')
    soup = BeautifulSoup(cleaned, "html.parser")

    results = []
    for node in soup.find_all(attrs={'action-type': 'feed_list_item'}):
        parsed = get_weibo_info(node, html)
        if parsed is None:
            continue
        weibo = parsed[0]
        # In accurate mode, replace truncated text with the full content
        # when the detail fetch returns something non-empty.
        if parsed[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        results.append(weibo)
    return results
Example #8
0
def get_weibo_list(html):
    """
    Parse a feed page into a list of weibo data objects.

    :param html: feed page HTML
    :return: list of parsed weibo data; empty when *html* is falsy
    """
    if not html:
        return []
    document = BeautifulSoup(html, "html.parser")
    entries = document.find_all(attrs={'action-type': 'feed_list_item'})
    parsed_weibos = []
    for entry in entries:
        info = get_weibo_info_detail(entry, html)
        if info is None:
            continue
        weibo = info[0]
        # In accurate mode, swap in the full content when the detail
        # fetch returns something non-empty.
        if info[1] == 0 and CRAWLING_MODE == 'accurate':
            full_cont = status.get_cont_of_weibo(weibo.weibo_id)
            if full_cont:
                weibo.weibo_cont = full_cont
        parsed_weibos.append(weibo)
    return parsed_weibos