Ejemplo n.º 1
0
def resultlist():
    """Turn the article list held in ``items`` into rows appended to ``res_list``.

    ``items``, ``no_session``, ``res_list``, ``auto_img`` and ``requ`` are
    names taken from the enclosing module; none of them is defined here.

    ``items`` is evaluated to a dict whose ``'general_msg_list'`` entry is
    itself a string that evaluates to a dict with a ``'list'`` of articles.
    Each article supplies ``app_msg_ext_info`` (``cover``, ``title``,
    ``content_url``, ``author``) and ``comm_msg_info`` (``datetime``).

    Returns:
        The string ``"need update key"`` when ``no_session`` is found in
        ``items``; otherwise ``res_list`` after one
        ``[title, img_src, time, content_str, source_src, url]`` row has been
        appended for every article whose fetched content is non-empty.
    """
    if no_session in items:
        return "need update key"

    # SECURITY: ``items`` is evaluated with eval(), which executes arbitrary
    # expressions. If this payload comes from a remote service it should be
    # parsed with json.loads() or ast.literal_eval() instead. Left as-is
    # because the exact payload format is not visible from this function.
    general_msg_list = eval(items)
    res_spider_list = eval(general_msg_list['general_msg_list'])

    for item in res_spider_list['list']:
        ext_info = item["app_msg_ext_info"]
        comm_info = item["comm_msg_info"]

        img_src = auto_img.auto_img_option(
            _upgrade_scheme(ext_info['cover']), "szjyyjy_cover_")
        title = ext_info['title']

        # "datetime" is a timestamp; render it as e.g. 2016-12-22.
        published = datetime.datetime.fromtimestamp(comm_info["datetime"])
        date_str = published.strftime('%Y-%m-%d')

        href = _upgrade_scheme(ext_info['content_url'])
        source_src = ext_info["author"]

        content_str = ','.join(str(part) for part in requ(href))
        if content_str == '':
            # Nothing could be extracted from the article page; skip the row.
            continue
        res_list.append(
            [title, img_src, date_str, content_str, source_src, href])
    return res_list


def _upgrade_scheme(raw_url):
    """Strip backslashes from *raw_url* and upgrade a leading http:// to https://.

    Only the scheme prefix is rewritten. A blanket
    ``replace("http", "https")`` would turn an existing ``https://`` into
    ``httpss://`` and would also rewrite "http" inside paths or query strings.
    """
    cleaned = raw_url.replace("\\", "")
    if cleaned.startswith("http://"):
        return "https://" + cleaned[len("http://"):]
    return cleaned
Ejemplo n.º 2
0
def requ(href):
    """Fetch the page at *href* and flatten its article body into content dicts.

    The body is the element with class ``artview_detail``; the source is the
    node that follows the element with class ``org_txt``. ``requests``,
    ``BeautifulSoup``, ``headers`` and ``auto_img`` come from the enclosing
    module.

    Entries are produced in document order:

    * a text child of a body element ->
      ``{"type": <parent tag name>, "content": <text>}``
    * an ``<a>`` child -> ``{"type": "a", "href": ..., "content": <first child>}``,
      kept only when the string form of that first child contains no ``<``
    * an ``<img>`` directly inside a ``<div>`` child ->
      ``{"type": "img", "src": <auto_img.auto_img_option(src, "yxlx_cont_")>}``

    Args:
        href: URL of the article page.

    Returns:
        A two-element list ``[source, constent_list]``. ``source`` is ``None``
        when the ``org_txt`` element is missing or has no following sibling;
        ``constent_list`` is empty when the body element is missing.
    """
    # Send the shared headers so the server treats this as a browser visit.
    # SECURITY: verify=False disables TLS certificate verification. Kept to
    # preserve existing behaviour; remove it once the target certificate
    # chain validates.
    page = requests.get(url=href, headers=headers, verify=False)
    soup = BeautifulSoup(page.text, 'lxml')

    org_label = soup.find(attrs={'class': 'org_txt'})
    source = org_label.nextSibling if org_label is not None else None

    constent_list = []
    detail = soup.find(attrs={'class': 'artview_detail'})
    if detail is None:
        # Page layout not recognised; report "no content" instead of raising.
        return [source, constent_list]

    for block in detail.contents:
        # Text nodes at the top level have no children to classify, and <br>
        # carries no content.
        if isinstance(block, str) or block.name == 'br':
            continue

        for child in block.contents:
            if isinstance(child, str):
                # Bare newlines are layout noise; any other text is content.
                if child != '\n':
                    constent_list.append(
                        {"type": block.name, 'content': child})
                continue

            tagname = child.name
            if tagname == 'br':
                continue

            if tagname == "a":
                link_target = child.attrs.get("href")
                if link_target is None or not child.contents:
                    # Anchor without a target or without any child: nothing
                    # usable to record.
                    continue
                link_entry = {
                    "type": tagname,
                    "href": link_target,
                    'content': child.contents[0],
                }
                # Keep the link only when its first child is plain text
                # (its string form contains no markup).
                if "<" not in str(link_entry['content']):
                    constent_list.append(link_entry)
            elif tagname == 'div':
                for node in child.contents:
                    if getattr(node, "name", None) != 'img':
                        continue
                    constent_list.append({
                        "type": node.name,
                        'src': auto_img.auto_img_option(
                            node.attrs["src"], "yxlx_cont_"),
                    })

    return [source, constent_list]
Ejemplo n.º 3
0
def requ(href):
    """Fetch the page at *href* and extract content dicts from its ``con`` element.

    For every child element of the node with class ``con`` only the FIRST
    child is classified:

    * ``<img>``     -> ``{"type": "img", "content": <auto_img.auto_img_option(src, "tiyan_cont_")>}``
    * ``<strong>``  -> ``{"type": "strong", "content": <its first child>}``
    * ``<section>`` -> the element is passed to ``sectionReverse``; if the
      first child of the result is an element with children, an entry
      ``{"type": <that child's tag name>, "content": <its first child>}`` is
      added. A plain-text first child is ignored.

    Elements whose first child is text are skipped. ``requests``,
    ``BeautifulSoup``, ``headers``, ``auto_img`` and ``sectionReverse`` come
    from the enclosing module.

    Args:
        href: URL of the article page.

    Returns:
        The list of content dicts in document order; empty when the ``con``
        element is missing.
    """
    # Send the shared headers so the server treats this as a browser visit.
    page = requests.get(url=href, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    constent_list = []
    container = soup.find(attrs={'class': 'con'})
    if container is None:
        # Page layout not recognised; report "no content" instead of raising.
        return constent_list

    for item in container.contents:
        # Text nodes have no children to inspect; empty elements have nothing
        # to classify.
        if isinstance(item, str) or len(item.contents) == 0:
            continue

        # The previous version looped once per child but inspected
        # contents[0] on every pass, appending one shared dict repeatedly.
        # Classify the first child exactly once and build a fresh dict.
        first = item.contents[0]
        if isinstance(first, str):
            continue

        tagname = first.name
        if tagname == "img":
            constent_list.append({
                "type": tagname,
                'content': auto_img.auto_img_option(
                    first.attrs['src'], "tiyan_cont_"),
            })
        elif tagname == "strong":
            if first.contents:
                constent_list.append(
                    {"type": tagname, 'content': first.contents[0]})
        elif tagname == "section":
            reve = sectionReverse(item)
            if len(reve) > 0 and len(reve.contents) != 0:
                inner = reve.contents[0]
                # A plain-text first child was previously stored in a local
                # named ``source`` that was never used or returned, so it is
                # simply ignored here.
                if not isinstance(inner, str) and len(inner.contents) != 0:
                    constent_list.append(
                        {"type": inner.name, 'content': inner.contents[0]})

    return constent_list
Ejemplo n.º 4
0
def resultlist():
    """Build one result row from ``item_href`` and append it to ``res_list``.

    ``item_href``, ``item_source``, ``base_url``, ``res_list``, ``auto_img``
    and ``requ`` are names taken from the enclosing module.

    The cover image is a fixed URL passed through
    ``auto_img.auto_img_option``. The title is the link text minus its last
    three characters. The date comes from characters 28-37 of a node reached
    via ``item_href.nextSibling.next.next`` (presumably a 10-digit Unix
    timestamp embedded in that node; confirm against the page markup).

    Returns:
        ``[]`` when the fetched content joins to an empty string; otherwise
        ``res_list`` with ``[title, img_src, time, content_str, item_source,
        href]`` appended.
    """
    cover_url = "https://mmbiz.qpic.cn/mmbiz_jpg/TLo8OEdyVibx4pb3W4MIE06BjWoOodiaX3ZcXb2iajJeeek2CEhbeian4mAURRzL6t0Fdy1ervKbChIDruMiaUY3laQ/0?wx_fmt=jpeg"
    img_src = auto_img.auto_img_option(cover_url, "sougou_cover_")

    title = item_href.text[:-3]

    raw_stamp = item_href.nextSibling.next.next[28:38]
    published = datetime.datetime.fromtimestamp(int(raw_stamp))
    date_str = published.strftime('%Y-%m-%d')  # e.g. 2016-12-22

    href = base_url + item_href.attrs['href']

    content_str = ','.join(map(str, requ(href)))
    if content_str == '':
        return []

    row = [title, img_src, date_str, content_str, item_source, href]
    res_list.append(row)
    return res_list
Ejemplo n.º 5
0
def resultlist():
    """Convert every listing entry in ``items`` into a row appended to ``res_list``.

    ``items``, ``res_list``, ``auto_img`` and ``requ`` are names taken from the
    enclosing module. Entries equal to ``'\\n'`` are skipped. For the others,
    the cover image URL (``data-original``) and the title (``alt``) are read
    from the first child of the link at ``contents[1].contents[3]``, whose
    ``href`` is the article URL. ``requ(href)`` is indexed as
    ``[source, content_parts]``.

    Returns:
        ``res_list`` after appending
        ``[title, img_src, time, content_str, source_src, url]`` for every
        entry whose joined content is non-empty.
    """
    for entry in items:
        if entry == '\n':
            continue

        link = entry.contents[1].contents[3]
        thumb = link.contents[0]

        cover_url = thumb.attrs['data-original']
        img_src = auto_img.auto_img_option(cover_url, "yxlx_cover_")
        title = thumb.attrs['alt']
        published = entry.contents[4].contents[3].contents[0].contents[0]
        href = link.attrs['href']

        fetched = requ(href)
        content_str = ','.join(map(str, fetched[1]))
        source_src = fetched[0]

        if content_str == '':
            # Article body could not be extracted; leave this entry out.
            continue
        res_list.append(
            [title, img_src, published, content_str, source_src, href])
    return res_list
Ejemplo n.º 6
0
def resultlist():
    """Convert every listing entry in ``items`` into a row appended to ``res_list``.

    ``items``, ``base_url``, ``res_list``, ``auto_img`` and ``requ`` are names
    taken from the enclosing module. Entries equal to ``'\\n'`` are skipped.
    The cover ``src`` and the article ``href`` are both prefixed with
    ``base_url``; an empty ``src`` is kept as the empty string and is not
    passed to ``auto_img``. The source label is a fixed string.

    Returns:
        ``res_list`` after appending
        ``[title, img_src, time, content_str, source_src, url]`` for every
        entry whose joined content is non-empty.
    """
    for entry in items:
        if entry == '\n':
            continue

        raw_src = entry.contents[1].contents[0].contents[0].attrs['src']
        img_src = (
            raw_src
            if raw_src == ""
            else auto_img.auto_img_option(base_url + raw_src, "tiyan_cover_")
        )

        title_link = entry.contents[3].contents[0]
        title = title_link.contents[0]
        published = entry.contents[7].contents[0]
        href = base_url + title_link.attrs['href']

        content_str = ','.join(map(str, requ(href)))
        source_src = "中国研学旅行网 "

        if content_str == '':
            # Article body could not be extracted; leave this entry out.
            continue
        res_list.append(
            [title, img_src, published, content_str, source_src, href])
    return res_list