import datetime
import json

import requests
from bs4 import BeautifulSoup

import auto_img  # project helper: downloads an image and returns a local path

# `items`, `no_session`, `res_list`, `headers`, `base_url`, `item_href`, and
# `item_source` are module-level values prepared by the surrounding spider code.


def resultlist():
    # `no_session` marks an expired WeChat session key; the caller must refresh it.
    if no_session in items:
        return "need update key"
    general_msg_list = json.loads(items)
    # `general_msg_list` is itself a JSON-encoded string, so decode it again.
    res_spider_list = json.loads(general_msg_list['general_msg_list'])
    for item in res_spider_list['list']:
        item_app_msg_ext_info = item["app_msg_ext_info"]
        item_comm_msg_info = item["comm_msg_info"]
        # Strip escaped slashes and force https (first occurrence only).
        img_src_url = item_app_msg_ext_info['cover'].replace(
            "\\", "").replace("http://", "https://", 1)
        img_src = auto_img.auto_img_option(img_src_url, "szjyyjy_cover_")
        title = item_app_msg_ext_info['title']
        # Convert the publish timestamp to a date string, e.g. 2016-12-22.
        datetime_struct = datetime.datetime.fromtimestamp(
            item_comm_msg_info["datetime"])
        time = datetime_struct.strftime('%Y-%m-%d')
        href = item_app_msg_ext_info['content_url'].replace(
            "\\", "").replace("http://", "https://", 1)
        source_src = item_app_msg_ext_info["author"]
        url = href
        content = requ(href)
        content_str = ','.join(str(i) for i in content)
        if content_str == '':
            continue
        res_list.append([title, img_src, time, content_str, source_src, url])
    return res_list
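
# For reference: a minimal sketch of the payload `resultlist()` expects in
# `items`. The field names are taken from the code above; the values are
# illustrative, and real WeChat /mp/profile_ext responses carry many more
# fields. Note that `general_msg_list` is itself a JSON-encoded string,
# hence the double decode above:
_example_items = json.dumps({
    "general_msg_list": json.dumps({
        "list": [{
            "comm_msg_info": {"datetime": 1482368400},
            "app_msg_ext_info": {
                "title": "Example article",
                "cover": "http://mmbiz.qpic.cn/example/0?wx_fmt=jpeg",
                "content_url": "http://mp.weixin.qq.com/s?__biz=example",
                "author": "Example author",
            },
        }]
    })
})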
def requ(href):
    # Send a browser-like User-Agent so the server treats us as a regular visitor.
    page = requests.get(url=href, headers=headers, verify=False)
    soup = BeautifulSoup(page.text, 'lxml')
    # `.contents` is not a plain list, but it can be iterated over all child nodes.
    items = soup.find(attrs={'class': 'artview_detail'})
    source = soup.find(attrs={'class': 'org_txt'}).next_sibling
    content_list = []
    for item in items.contents:
        # Skip whitespace text nodes and <br> tags at the top level.
        if isinstance(item, str) or item.name == 'br':
            continue
        for child in item.contents:
            if child == '\n':
                continue
            if isinstance(child, str):
                # Plain text node: record it under its parent's tag name.
                content_list.append({'type': item.name, 'content': child})
            elif child.name == 'br':
                continue
            elif child.name == 'a':
                content_obj = {
                    'type': child.name,
                    'href': child.attrs['href'],
                    'content': child.contents[0],
                }
                # Keep the link only if its text is not nested markup.
                if '<' not in str(content_obj['content']):
                    content_list.append(content_obj)
            elif child.name == 'div':
                for grandchild in child.contents:
                    if getattr(grandchild, 'name', None) == 'img':
                        img_src_url = grandchild.attrs['src']
                        content_list.append({
                            'type': grandchild.name,
                            'src': auto_img.auto_img_option(img_src_url, 'yxlx_cont_'),
                        })
    return [source, content_list]
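
# `headers` is defined at module level elsewhere; a minimal sketch, assuming a
# plain browser User-Agent is enough (the exact string is illustrative):
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.132 Safari/537.36',
}

# Because requ() passes verify=False, urllib3 emits InsecureRequestWarning on
# every call; it can be silenced like this:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)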
def requ(href):
    # Variant of requ() for pages whose article body sits in a `.con` container.
    # Send a browser-like User-Agent so the server treats us as a regular visitor.
    page = requests.get(url=href, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    items = soup.find(attrs={'class': 'con'})
    content_list = []
    for item in items.contents:
        # Skip whitespace text nodes and empty tags.
        if isinstance(item, str) or len(item.contents) == 0:
            continue
        first = item.contents[0]
        if isinstance(first, str):
            # Plain-text children (including the h2 title and the .info2 meta
            # block) are intentionally skipped; only img/strong/section
            # children are collected below.
            continue
        tagname = first.name
        if tagname == 'img':
            img_src_url = first.attrs['src']
            content_list.append({
                'type': tagname,
                'content': auto_img.auto_img_option(img_src_url, 'tiyan_cont_'),
            })
        elif tagname == 'strong':
            content_list.append({'type': tagname, 'content': first.contents[0]})
        elif tagname == 'section':
            # sectionReverse() (defined elsewhere) unwraps nested <section> tags.
            reve = sectionReverse(item)
            if reve is not None and len(reve.contents) > 0:
                inner = reve.contents[0]
                if isinstance(inner, str):
                    source = inner  # article source line; captured but not returned
                elif len(inner.contents) > 0:
                    content_list.append(
                        {'type': inner.name, 'content': inner.contents[0]})
    return content_list
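
# `sectionReverse` is called above but defined elsewhere in the project. A
# hypothetical sketch of what it might do, assuming its job is to drill
# through nested <section> wrappers to the innermost one (the name
# `section_reverse_sketch` and this behaviour are assumptions, not the
# project's actual implementation):
def section_reverse_sketch(tag):
    # Descend as long as the tag still has a direct <section> child.
    while tag is not None and tag.find('section', recursive=False) is not None:
        tag = tag.find('section', recursive=False)
    return tag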
def resultlist():
    # `item_href`, `item_source`, and `base_url` come from the surrounding
    # loop over the result list; the cover image is fixed for this source.
    img_src_url = "https://mmbiz.qpic.cn/mmbiz_jpg/TLo8OEdyVibx4pb3W4MIE06BjWoOodiaX3ZcXb2iajJeeek2CEhbeian4mAURRzL6t0Fdy1ervKbChIDruMiaUY3laQ/0?wx_fmt=jpeg"
    img_src = auto_img.auto_img_option(img_src_url, "sougou_cover_")
    title = item_href.text[:-3]
    # The publish timestamp is embedded in the sibling script text; slice out
    # the ten digits and convert them to a date string, e.g. 2016-12-22.
    timestamp = int(item_href.next_sibling.next_element.next_element[28:38])
    time = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
    href = base_url + item_href.attrs['href']
    content = requ(href)
    content_str = ','.join(str(i) for i in content)
    if content_str == '':
        return []
    res_list.append([title, img_src, time, content_str, item_source, href])
    return res_list
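
# `auto_img.auto_img_option` is a project helper used throughout this file.
# Judging from the call sites, it takes a remote image URL plus a filename
# prefix and returns a path to a locally stored copy. A hypothetical sketch of
# that contract (the save directory and naming scheme are assumptions):
import os
import uuid


def auto_img_option_sketch(img_src_url, prefix, save_dir='static/img'):
    resp = requests.get(img_src_url, headers=headers, verify=False)
    os.makedirs(save_dir, exist_ok=True)
    local_path = os.path.join(save_dir, prefix + uuid.uuid4().hex + '.jpg')
    with open(local_path, 'wb') as f:
        f.write(resp.content)
    return local_path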
def resultlist():
    for item in items:
        if item == '\n':
            continue
        img_src_url = item.contents[1].contents[3].contents[0].attrs['data-original']
        img_src = auto_img.auto_img_option(img_src_url, "yxlx_cover_")
        title = item.contents[1].contents[3].contents[0].attrs['alt']
        time = item.contents[4].contents[3].contents[0].contents[0]
        href = item.contents[1].contents[3].attrs['href']
        # This requ() variant returns [source, content_list].
        content = requ(href)
        content_str = ','.join(str(i) for i in content[1])
        source_src = content[0]
        url = href
        if content_str == '':
            continue
        res_list.append([title, img_src, time, content_str, source_src, url])
    return res_list
def resultlist():
    for item in items:
        if item == '\n':
            continue
        # Some entries have no cover image; keep the empty string in that case.
        img_src_source = item.contents[1].contents[0].contents[0].attrs['src']
        if img_src_source == "":
            img_src = img_src_source
        else:
            img_src = auto_img.auto_img_option(
                base_url + img_src_source, "tiyan_cover_")
        title = item.contents[3].contents[0].contents[0]
        time = item.contents[7].contents[0]
        href = base_url + item.contents[3].contents[0].attrs['href']
        content = requ(href)
        content_str = ','.join(str(i) for i in content)
        source_src = "中国研学旅行网"
        url = href
        if content_str == '':
            continue
        res_list.append([title, img_src, time, content_str, source_src, url])
    return res_list
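
# A hypothetical driver for the resultlist() variants above, assuming the
# surrounding module has already fetched the listing page into `items` and
# initialised `res_list = []` (both names come from the code above):
if __name__ == '__main__':
    for title, img_src, time_str, content_str, source_src, url in resultlist():
        print(time_str, title, url)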