def parse_blogs_info(blogs_urls):
    global pre_page_last_img_info
    imgs_info = []
    blog_num = 0
    blen = len(blogs_urls)
    # Loop len(blogs_info) times; each pass parses the first element of blogs_info and removes it when done
    for blog_url in blogs_urls:
        print("博客 %s 开始解析" % (blog_url))
        content = requests.get(blog_url, headers=useragentutil.get_headers()).content.decode("utf-8")
        author_page_parse = etree.HTML(requests.get(blog_url.split("/post")[0]).content.decode("utf-8"))
        author_name = author_page_parse.xpath("//title/text()")[0].replace("\n", "").replace(" ", "")
        author_ip = re.search(r"http(s)*://(.*).lofter.com/", blog_url).group(2)
        # Get the blog's publish time
        public_time = get_time(blog_url, author_page_parse)
        # Author homepages have different page structures, so instead of xpath the image links
        # are matched directly with a regex
        # imgs_url = re.findall(r'"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)"', content)
        imgs_url = re.findall(r'"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"', content)
        # The image file index continues from the previous one
        img_index = 0
        blog_num += 1
        # Filter the image links extracted from the blog page
        imgs_url = l4_author_img.img_fliter(imgs_url, "img")
        print(imgs_url)
        # Collect the image info for the next step (saving)
        for img_url in imgs_url:
            img_index += 1
            # Decide the image type
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"
            img_info = {}
            img_info["img_url"] = img_url
            author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\\", "&"). \
                replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?"). \
                replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")")
            img_info["pic_name"] = author_name_in_filename + "[" + author_ip + "] " + public_time + "(" + str(
                img_index) + ")." + img_type
            imgs_info.append(img_info)
            # The idea was to save both a png and a jpg copy when a png is found: pngs have an alpha
            # channel and some can only be viewed fully as jpg. The result was poor, so it is disabled.
            # if img_type == "png":
            #     img_info_2 = {}
            #     img_info_2["img_url"] = img_url
            #     img_info_2["pic_name"] = author_name + "[" + author_ip + "] " + public_time + "(" + str(
            #         img_index) + ").jpg"
            #     imgs_info.append(img_info_2)
        blen -= 1
        print("解析完成,获取到图片链接%d,总获取图片数%d,已解析完成%d个链接,剩余%d" % (len(imgs_url), len(imgs_info), blog_num, blen))
        print()
    return imgs_info

def parse_proxy_url(temp_url):
    '''Request the proxy listing site and return the page source.'''
    request = urllib.request.Request(temp_url, headers=useragentutil.get_headers())
    # Send the request and read the result
    rsp = urllib.request.urlopen(request).read().decode()
    return rsp

def save_long_article(long_articles_info, file_path, save_img_in_text):
    if not os.path.exists(file_path + "/long article"):
        os.makedirs(file_path + "/long article")
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"
    for l_info in long_articles_info:
        # Assemble the document
        l_head = l_info["title"] + " by " + l_info["author name"] + "[" + l_info["author ip"] + "]" + "\n发表时间:" \
                 + l_info["public time"] + "\n原文连接:" + l_info["url"] + "\ntags:" + \
                 is_tag_null(", ".join(l_info["tags"]))
        l_tail = ""
        if l_info["long article url"]:
            l_tail += "文章中包含的外部连接"
            for external_links in l_info["long article url"]:
                l_tail += "\n" + external_links
        if l_info["long article img"]:
            l_tail += "\n\n文章中包含的图片连接:"
            for illustration in l_info["long article img"]:
                l_tail += "\n" + illustration
        long_article = l_head + "\n\n\n" + l_info["long article content"] + "\n\n\n" + l_tail
        filename = l_info["title in filename"] + " by " + l_info["author name in filename"] + ".txt"
        count += 1
        # Progress output
        if count % 5 == 0 or count == len(long_articles_info) or count == 1:
            print("保存进度 {}/{}\t\t{}".format(count, len(long_articles_info), filename), end="\t\t")
        # Save the text file
        write_text(long_article, filename, file_path + "/long article")
        # Save the images embedded in the text
        if save_img_in_text:
            if l_info["long article img"]:
                for img_url in l_info["long article img"]:
                    # re_url = re.findall(r'http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*', img_url)
                    re_url = re.findall(r'http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*', img_url)
                    if not re_url:
                        print("\n图片 {} 不是lofter站内图 可能会保存失败".format(img_url), end="\t")
                    try:
                        img = requests.get(img_url, headers=useragentutil.get_headers()).content
                    except:
                        print("保存失败,请尝试手动保存", end="\t")
                        continue
                    img_name = l_info["title in filename"] + " by " + l_info["author name in filename"] + ".jpg"
                    img_name = filename_check(img_name, img, file_path + "/long article", "jpg")
                    write_img(img, img_name, file_path + "/long article")
        if count % 5 == 0 or count == len(long_articles_info) or count == 1:
            print("保存完成")

def main():
    """Download the skin images."""
    # Read the JSON file
    hero_lists = read_hero_skin_by_json()
    # Iterate over the heroes
    for hero_element in hero_lists:
        hero_name = hero_element["hero_name"]
        hero_skins = hero_element["hero_skin_list"]
        # e.g. [{'skin_name': '流云之翼', 'skin_url': 'http://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/506/506-bigskin-1.jpg'}, ...]
        headers = useragentutil.get_headers()
        for skins_element in hero_skins:
            # Skin name
            skin_name = skins_element["skin_name"]
            # Skin URL
            skin_url = skins_element["skin_url"]
            # Download the image
            response = requests.get(skin_url, headers=headers)
            image_content = response.content
            # Write the image to disk
            with open("./herofile/" + hero_name + "/" + skin_name + ".jpg", "wb") as image_file:
                image_file.write(image_content)
            print("正在下载——>%s-->皮肤%s图片.." % (hero_name, skin_name))
        print("----%s的所有皮肤图片下载成功" % hero_name)
    print("所有英雄图片下载成功")

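# Hedged sketch (not in the original module): main() above writes each skin to
# ./herofile/<hero_name>/<skin_name>.jpg, so the per-hero directory must already exist
# (presumably created elsewhere in the project). A minimal helper would be:
import os

def ensure_hero_dir(hero_name):
    """Create ./herofile/<hero_name> if it is missing and return its path."""
    hero_dir = os.path.join("./herofile", hero_name)
    os.makedirs(hero_dir, exist_ok=True)
    return hero_dir
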
def get_html_datas(url_datas):
    """Fetch the rendered source of the hero page (network content)."""
    headers = useragentutil.get_headers()
    # Render the page with a headless browser and grab its source
    driver = webdriver.PhantomJS("./phantomjs-2.1.1-windows/bin/phantomjs.exe")
    driver.get(url_datas)
    html_content = driver.page_source
    driver.quit()
    return html_content

def parse_offer_url(self, temp_url):
    """Fetch the whole page content."""
    offer_response = requests.get(temp_url,
                                  headers=useragentutil.get_headers(),
                                  proxies=proxypool.get_proxy())
    offer_html_content = offer_response.content.decode("utf-8")
    # Rate limiting: wait a random interval before the next request
    wait_time = random.randint(0, 5)
    print("动态限制访问频率,%ds 后继续爬取数据..." % wait_time)
    time.sleep(wait_time)
    return offer_html_content

def save_text(texts_info, file_path, save_img_in_text):
    if not os.path.exists(file_path + "/text"):
        os.makedirs(file_path + "/text")
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"
    for text_info in texts_info:
        count += 1
        # Assemble the document
        text_head = text_info["author name"] + "[" + text_info["author ip"] + "]\n发表时间:" + text_info["public time"] \
                    + "\n原文连接:" + text_info["url"] + "\ntags:" + is_tag_null(", ".join(text_info["tags"]))
        text_tial = get_tail(text_info)
        text = text_head + "\n\n\n" + text_info["content"] + "\n\n\n" + text_tial
        first_tag = "无tag"
        if text_info["tags"]:
            first_tag = text_info["tags"][0]
        filename = text_info["author name in filename"] + "-" + first_tag + "-" + text_info["public time"] + ".txt"
        filename = filename_check(filename, text, file_path + "/text", "txt")
        # Progress output
        if count % 10 == 0 or count == len(texts_info):
            try:
                print("保存进度 {}/{}\t\t{}".format(count, len(texts_info), filename), end="\t\t")
            except:
                print("保存进度 {}/{}\t\t{}".format(count, len(texts_info), "文件名异常"), end="\t\t")
        # Save the text file
        write_text(text, filename, file_path + "/text")
        # Save the images embedded in the text
        if save_img_in_text:
            if text_info["illustration"]:
                for img_url in text_info["illustration"]:
                    img_name = text_info["author name in filename"] + "-" + first_tag + "-" + \
                               text_info["public time"] + ".jpg"
                    # img_name = text_info + " by " + text_info["author name in filename"] + ".jpg"
                    img = requests.get(img_url, headers=useragentutil.get_headers()).content
                    img_name = filename_check(img_name, img, file_path + "/text", "jpg")
                    write_img(img, img_name, file_path + "/text")
        if count % 10 == 0 or count == len(texts_info):
            print("保存完成")

def catch_work_info(self, temp_url):
    """Extract the job-responsibility information."""
    try:
        work_response = requests.get(temp_url,
                                     headers=useragentutil.get_headers(),
                                     proxies=proxypool.get_proxy())
        work_html_content = work_response.content.decode("gbk")
        work_parser = lxml.html.etree.HTML(work_html_content)
        work_infos = "".join(
            work_parser.xpath("//div[@class='bmsg job_msg inbox']//text()")
        ).strip().replace(" ", "")  # Clean the data
        # print("工作职责:", work_infos)
    except Exception:
        work_infos = "暂无数据"
    return work_infos

def get_offer_pages(self):
    """Dynamically get the number of result pages, as an int."""
    offer_page_response = requests.get(self.offer_index_url,
                                       headers=useragentutil.get_headers(),
                                       proxies=proxypool.get_proxy())
    # Get the page source
    page_html_content = offer_page_response.content.decode("gbk")
    # Parse the data
    metree = lxml.html.etree
    page_parser = metree.HTML(page_html_content)
    # Extract the pager text
    pages_content = page_parser.xpath(
        "//div[@class='dw_page']//span[@class='td']/text()")[0]
    pages = int(re.search(r"共(\d+)页", pages_content)[1])
    return pages

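# Hedged illustration (not in the original module): shows how the pager regex above pulls the
# page count out of the pager text. The sample string is hypothetical and only mimics the
# "共N页" fragment the XPath is expected to return.
import re

def _demo_page_count():
    sample_pager_text = "1 / 共55页"  # hypothetical pager text
    return int(re.search(r"共(\d+)页", sample_pager_text)[1])  # -> 55
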
def download_hero_image(hero_list):
    """Download the hero avatar images."""
    dir_name = "./herofile"
    for hero_element in hero_list:
        hero_name = hero_element["hero_name"]
        hero_url = hero_element["image_url"]
        headers = useragentutil.get_headers()
        response = requests.get(hero_url, headers=headers)
        image_content = response.content
        hero_path = dir_name + "/" + hero_name + "/1" + hero_name + ".jpg"
        # File operation: write the image
        with open(hero_path, "wb") as hero_file:
            hero_file.write(image_content)
        print("正在下载--(%s)--图片" % hero_name)
    print("所有图片头像已经下载成功")

def catch_company_info(self, temp_url):
    """Extract the company profile information."""
    try:
        company_response = requests.get(temp_url,
                                        headers=useragentutil.get_headers(),
                                        proxies=proxypool.get_proxy())
        company_html_content = company_response.content.decode("gbk")
        # Extract the data
        company_parser = lxml.html.etree.HTML(company_html_content)
        company_infos = "".join(
            company_parser.xpath("//div[@class='con_txt']//text()")
        ).strip().replace(" ", "")
    except Exception:
        company_infos = "暂无数据"
    return company_infos

def download_img(imgs_info, imgs_info_saved, author_name, author_ip, file_update_interval):
    """
    :param imgs_info: image info still to be saved
    :param imgs_info_saved: image info that has already been saved
    :param author_name: author name
    :param author_ip: the author's lofter third-level domain
    :return: None
    """
    author_name_in_filename = author_name.replace("/", "&").replace("|", "&").replace("\\", "&"). \
        replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?"). \
        replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")")
    dir_path = "./dir/img/" + author_name_in_filename + "[" + author_ip + "]"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    save_num = len(imgs_info_saved)
    for img_index in range(len(imgs_info)):
        pic_name = imgs_info[0]["pic_name"]
        pic_name_in_filename = pic_name.replace("/", "&").replace("|", "&").replace("\r", " ").replace(
            "\\", "&").replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("\n", "").replace("(", "(").replace(")", ")").strip()
        pic_url = imgs_info[0]["img_url"]
        img_path = dir_path + "/" + pic_name_in_filename
        print("获取图片 %s" % (pic_url))
        content = requests.get(pic_url, headers=useragentutil.get_headers()).content
        with open(img_path, "wb") as op:
            op.write(content)
        save_num += 1
        imgs_info_saved.append(imgs_info[0])
        del imgs_info[0]
        print("图片已保存,共保存图片%d (本次运行已保存%d),余%d" % (save_num, img_index + 1, len(imgs_info)))
        if img_index % file_update_interval == 0 or len(imgs_info) == 0:
            file_update("./dir/author_img_file/imgs_info.json", imgs_info)
            file_update("./dir/author_img_file/imgs_info_saved.json", imgs_info_saved)
            time.sleep(1)
            print("文件刷新")
    with open("./dir/author_img_file/imgs_info.json", "w") as op:
        op.write("finished")

def download_img(imgs_info):
    dir_path = "./dir/img/this"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    num = 0
    list_len = len(imgs_info)
    for img_info in imgs_info:
        pic_name = img_info["pic_name"]
        pic_url = img_info["img_url"]
        img_path = dir_path + "/" + pic_name
        print("获取图片 %s,%s" % (pic_url, pic_name))
        content = requests.get(pic_url, headers=useragentutil.get_headers()).content
        with open(img_path, "wb") as op:
            op.write(content)
        num += 1
        list_len -= 1
        print("图片已保存,共保存图片%d ,余%d" % (num, list_len))
        if num % 8 == 0:
            time.sleep(1)

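# Hedged usage sketch (not in the original module): download_img expects the list produced by
# parse_blogs_info above, i.e. dicts carrying "img_url" and "pic_name". The entry below is
# hypothetical and only illustrates the expected shape; calling the function would really
# request the URL.
def _demo_download_img():
    sample_imgs_info = [{
        "img_url": "https://imglf1.lf127.net/img/example.png",  # hypothetical lofter image URL
        "pic_name": "某作者[authorip] 2021-01-01(1).png",        # hypothetical file name
    }]
    download_img(sample_imgs_info)
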
def catch_work_info(self, temp_url):
    """Extract the job listing links."""
    try:
        work_response = requests.get(temp_url,
                                     headers=useragentutil.get_headers(),
                                     proxies=proxypool.get_proxy())
        work_html_content = work_response.content.decode("utf-8")
        work_parser = lxml.html.etree.HTML(work_html_content)
        work_infos = work_parser.xpath(
            "//div[@class='tabs_box pllist active']//ul[@class='clearfix']/li")
        url = []
        for li in work_infos:
            url.append(li.xpath("./a/@href"))
    except Exception:
        url = "暂无数据"
    return url

def run(author_url, start_time, end_time, target_titles, merger_chapter):
    author_page_parse = etree.HTML(
        requests.get(author_url, headers=useragentutil.get_headers()).content.decode("utf-8"))
    # id is a numeric parameter needed to fetch the archive page; ip is the author's lofter
    # third-level domain, chosen by the author at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)
    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"
    query_num = 50
    data = l4_author_img.make_data(author_id, query_num)
    head = l4_author_img.make_head(author_url)
    print("作者名%s,lofter ip %s,主页链接 %s" % (author_name, author_ip, author_url))
    path = "./dir/article"
    arthicle_path = "./dir/article/{}".format(author_name)
    blog_infos = parse_archive_page(archive_url, head, data, author_url, query_num,
                                    start_time, end_time, target_titles, merger_chapter)
    if not blog_infos:
        print("作者主页中无带标题的博客,无需爬取,程序退出")
        exit()
    for x in [path, arthicle_path]:
        if not os.path.exists(x):
            os.makedirs(x)
    if target_titles and merger_chapter:
        save_chapter(blog_infos, target_titles, author_name, author_ip)
    else:
        save_file(blog_infos, author_name, author_ip)
    # print("end")
    print("运行结束")

def main():
    '''Check the proxy IPs: keep the good ones, discard the bad ones.'''
    proxy_ip_datas = get_proxy_fromjson()
    url = "https://www.sohu.com/"
    value_proxy_list = []
    for proxy in proxy_ip_datas:
        try:
            proxy_response = requests.get(url, headers=useragentutil.get_headers(), proxies=proxy)
            if proxy_response.status_code == 200:
                value_proxy_list.append(proxy)
                # with open("./ip_path/proxypool.json", "w", encoding="utf-8") as file
                json.dump(value_proxy_list,
                          open("./ip_path/proxypool.json", "w", encoding="utf-8"),
                          ensure_ascii=False, indent=2)
            print("正在处理ip:{}".format(proxy))
        except Exception:
            print("异常ip:{}".format(proxy))

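# Hedged sketch (not in the original module): every entry kept in value_proxy_list is passed
# straight to requests' proxies= argument, so it is assumed to be a mapping in the usual
# requests format; the addresses below are hypothetical placeholders.
EXAMPLE_PROXY = {
    "http": "http://127.0.0.1:8080",   # hypothetical HTTP proxy
    "https": "http://127.0.0.1:8080",  # hypothetical HTTPS proxy
}
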
# coding:utf-8
"""
Approach:
    (1) Fetch the full page content of the Sina homepage.
    (2) Save the fetched content to a file.
"""
import requests
import useragentutil

sina_url = "https://www.sina.com.cn/"

# Fetch the full page content of the Sina homepage
headers = useragentutil.get_headers()
# print(headers)
response = requests.get(sina_url, headers=headers)
# Get the content
# html_content = response.text
html_content = response.content.decode("utf-8")
# print(html_content)

# Save the fetched content to a file
writer = open("./file/sina.html", "w", encoding="utf-8")
writer.write(html_content)
writer.close()
print("新浪网页面数据已保存成功!")

def make_data(mode, url=""):
    """
    :param mode: mode; the supported modes are share, like1, like2 and tag
    :param url: URL used to build the data. share and like1 need the user's homepage URL,
                tag needs the tag page URL. like2 does not use it, because the information
                lives in the cookies.
    :return:
    """
    if (mode == "like1" or mode == "share" or mode == "tag") and url == "":
        print("{}模式生成data需要url参数".format(mode))
        return {}
    base_data = {
        'callCount': '1',
        'httpSessionId': '',
        'scriptSessionId': '${scriptSessionId}187',
        'c0-id': '0',
        "batchId": "472351"
    }
    get_num = 100
    got_num = 0
    if mode == "share" or mode == "like1":
        userId = ""
        user_page_parse = etree.HTML(
            requests.get(url, headers=useragentutil.get_headers()).content.decode("utf-8"))
        try:
            userId = user_page_parse.xpath(
                "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
        except:
            print("\n链接与模式不匹配")
            exit()
        data_parme = {
            'c0-scriptName': 'BlogBean',
            "c0-methodName": "",
            'c0-param0': 'number:' + str(userId),
            'c0-param1': 'number:' + str(get_num),
            'c0-param2': 'number:' + str(got_num),
            'c0-param3': 'string:'
        }
        if mode == "like1":
            data_parme["c0-methodName"] = "queryLikePosts"
        else:
            data_parme["c0-methodName"] = "querySharePosts"
    elif mode == "like2":
        data_parme = {
            "c0-scriptName": "PostBean",
            "c0-methodName": "getFavTrackItem",
            "c0-param0": "number:" + str(get_num),
            "c0-param1": "number:" + str(got_num),
        }
    elif mode == "tag":
        # Parameter 8 needs the current timestamp
        url_search = re.search("http[s]{0,1}://www.lofter.com/tag/(.*?)/(.*)", url)
        type = url_search.group(2)
        if type == "":
            type = "new"
        data_parme = {
            'c0-scriptName': 'TagBean',
            'c0-methodName': 'search',
            'c0-param0': 'string:' + url_search.group(1),
            'c0-param1': 'number:0',
            'c0-param2': 'string:',
            'c0-param3': 'string:' + type,
            'c0-param4': 'boolean:false',
            'c0-param5': 'number:0',
            'c0-param6': 'number:' + str(get_num),
            'c0-param7': 'number:' + str(got_num),
            'c0-param8': 'number:' + str(int(time.time() * 1000)),
            'batchId': '870178'
        }
    else:
        print("data-模式错误")
        data_parme = {}
    data = {**base_data, **data_parme}
    return data

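# Hedged usage sketch (not in the original module): make_data builds the DWR POST body for one
# of the four modes. The URLs below are hypothetical placeholders; share/like1 would actually
# request the homepage to resolve the blog id.
def _demo_make_data():
    share_data = make_data("share", "https://someauthor.lofter.com/")      # author homepage (hypothetical)
    tag_data = make_data("tag", "https://www.lofter.com/tag/sometag/new")  # tag page (hypothetical)
    like2_data = make_data("like2")  # like2 needs no url; the account info lives in the cookies
    return share_data, tag_data, like2_data
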
def save_img(imgs_info, file_path, img_save_info, classify_by_tag, prior_tags,
             agg_non_prior_tag, print_level):
    if not os.path.exists(file_path + "/img"):
        os.makedirs(file_path + "/img")
    if classify_by_tag and prior_tags:
        for x in ["prior", "other"]:
            if not os.path.exists(file_path + "/img/" + x):
                os.makedirs(file_path + "/img/" + x)
    # saved_num is the number of blogs already saved
    count = 0
    saved_num = img_save_info["已保存"]
    for img_info in imgs_info:
        # Skip ahead to where the last run stopped
        if count < saved_num:
            count += 1
            continue
        print_end = lambda x: "\n" if x == 1 else " "
        print("正在保存:博客序号{} {}".format(count + 1, img_info["url"]), end=print_end(print_level))
        for img_url in img_info["img urls"]:
            # Decide whether the image is a jpg, png or gif
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"
            if print_level:
                print("正在保存图片 {} ".format(img_url), end="\t\t")
            # Check whether the image is hosted on lofter
            # re_url = re.findall(r'http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*', img_url)
            re_url = re.findall(r'http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*', img_url)
            if not re_url:
                print("\n图片 {} 不是lofter站内图 ".format(img_url), end="\t")
            try:
                img = requests.get(img_url, headers=useragentutil.get_headers()).content
            except:
                print("保存失败,请尝试手动保存")
                continue
            filename = img_info["author name in filename"] + "[" + img_info["author ip"] + "] " \
                       + img_info["public time"] + "." + img_type
            # Choose the save path according to the auto-classification options
            key_tag_path = img_info["key tag"].replace("/", "&").replace("|", "&").replace("\\", "&") \
                .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
                .replace("*", "·").replace("(", "(").replace(")", ")")
            # Tag classification disabled
            if not classify_by_tag:
                img_path = file_path + "/img"
            # Tag classification enabled, priority tags disabled
            elif classify_by_tag and not prior_tags:
                if not os.path.exists(file_path + "/img/" + key_tag_path):
                    os.makedirs(file_path + "/img/" + key_tag_path)
                img_path = file_path + "/img/" + key_tag_path
            # Tag classification enabled, priority tags enabled
            else:
                # The key tag is one of the priority tags
                if img_info["key tag"] in prior_tags:
                    img_path = file_path + "/img/prior/" + key_tag_path
                # The key tag is not a priority tag
                else:
                    # Non-priority tags are aggregated
                    if agg_non_prior_tag:
                        img_path = file_path + "/img/other"
                    # Non-priority tags are not aggregated
                    else:
                        img_path = file_path + "/img/other/" + key_tag_path
            # Deduplicate the file name and save
            filename = filename_check(filename, img, img_path, img_type)
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            write_img(img, filename, img_path)
            if print_level:
                print("保存完成")
        if not print_level:
            print("保存完成")
        else:
            print("\n" + "-----------" * 10)
        # Increase the saved count; flush the progress file every 7 blogs
        saved_num += 1
        count += 1
        if saved_num % 7 == 0 or saved_num == len(imgs_info):
            img_save_info["已保存"] = saved_num
            with open(file_path + "/img_save_info.json", "w", encoding="utf-8") as i_op1:
                i_op1.write(json.dumps(img_save_info, indent=4, ensure_ascii=False))

def save_article(articles_info, file_path, classify_by_tag, prior_tags, agg_non_prior_tag,
                 save_img_in_text, print_level):
    # When tag classification is enabled, create the prior and other folders first
    if classify_by_tag and prior_tags:
        for x in ["prior", "other"]:
            if not os.path.exists(file_path + "/article/" + x):
                os.makedirs(file_path + "/article/" + x)
    count = 0
    is_tag_null = lambda x: x if x != "" else "无"
    for article_info in articles_info:
        # Assemble the document
        article_head = article_info["title"] + " by " + article_info["author name"] + "[" + article_info[
            "author ip"] + "]" + "\n发表时间:" + article_info["public time"] + "\n原文连接:" + article_info["url"] \
            + "\ntags:" + is_tag_null(", ".join(article_info["tags"]))
        article_tail = get_tail(article_info)
        article = article_head + "\n\n\n" + article_info["content"] + "\n\n\n" + article_tail
        filename_title = article_info["title in filename"]
        filename = filename_title + " by " + article_info["author name in filename"] + ".txt"
        # Progress output
        count += 1
        if print_level:
            try:
                print("保存:文章序号{} {} 原文链接:{}".format(
                    articles_info.index(article_info) + 1, filename, article_info["url"]), end="\t\t")
            except:
                print("保存:文章序号{} 原文链接:{}".format(
                    articles_info.index(article_info) + 1, article_info["url"]), end="\t\t")
        else:
            if count % 20 == 0 or count == len(articles_info) or count == 1:
                try:
                    print("保存进度 {}/{}\t\t{}".format(count, len(articles_info), filename), end="\t\t")
                except:
                    print("保存进度 {}/{}\t\t".format(count, len(articles_info)), end="\t\t")
        # Choose the save path
        key_tag_path = article_info["key tag"].replace("/", "&").replace("|", "&").replace("\\", "&") \
            .replace("<", "《").replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?") \
            .replace("*", "·").replace("(", "(").replace(")", ")")
        # Tag classification disabled
        if not classify_by_tag:
            article_path = file_path + "/article"
        # Tag classification enabled, priority tags disabled
        elif classify_by_tag and not prior_tags:
            article_path = file_path + "/article/" + key_tag_path
        # Tag classification enabled, priority tags enabled
        else:
            # The key tag is one of the priority tags
            if article_info["key tag"] in prior_tags:
                article_path = file_path + "/article/prior/" + key_tag_path
            # The key tag is not a priority tag
            else:
                # Non-priority tags are aggregated
                if agg_non_prior_tag:
                    article_path = file_path + "/article/other"
                # Non-priority tags are not aggregated
                else:
                    article_path = file_path + "/article/other/" + key_tag_path
        # Create the folder if it does not exist
        if not os.path.exists(article_path):
            os.makedirs(article_path)
        # Save the text file
        write_text(article, filename, article_path)
        # Save the images embedded in the article
        if save_img_in_text:
            if article_info["illustration"]:
                for img_url in article_info["illustration"]:
                    if print_level:
                        print("准备保存文章中的图片 {}".format(img_url), end="\t\t")
                    img_name = filename_title + " by " + article_info["author name in filename"] + ".jpg"
                    img = requests.get(img_url, headers=useragentutil.get_headers()).content
                    img_name = filename_check(img_name, img, article_path, "jpg")
                    write_img(img, img_name, article_path)
                    if print_level:
                        print("保存完成")
        # Final output
        if print_level:
            print("保存完成")
        else:
            if count % 20 == 0 or count == len(articles_info):
                print("保存完成")

def get_parse(url):
    content = requests.get(url, headers=useragentutil.get_headers()).content
    parse = etree.HTML(content)
    return parse

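# Hedged usage sketch (not in the original module): get_parse wraps a GET request and returns an
# lxml element tree ready for XPath queries; the URL below is a hypothetical placeholder.
def _demo_get_parse():
    parse = get_parse("https://someauthor.lofter.com/")  # hypothetical author homepage
    return parse.xpath("//title/text()")
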
def parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip, target_tags,
                     tags_filter_mode, file_update_interval):
    """
    :param blogs_info: blog info not yet parsed
    :param parsed_blogs_info: blog info already parsed
    :param author_name: author name
    :param author_ip: the author's lofter third-level domain
    :param target_tags: keep only blogs carrying these tags
    :param tags_filter_mode: how blogs are filtered
    :return: None. The parsed image info is written to ./dir/imgs_info.json
    """
    global pre_page_last_img_info
    imgs_info = get_file_contetn("./dir/author_img_file/imgs_info.json")  # image info from the last run
    parsed_num = len(parsed_blogs_info)
    # Loop len(blogs_info) times; each pass parses the first element of blogs_info and removes it
    # afterwards. blogs_info is flushed to disk periodically so an interrupted run can resume.
    for blog_num in range(len(blogs_info)):
        blog_url = blogs_info[0]["blog_url"]
        img_time = blogs_info[0]["time"]
        print("博客 %s 开始解析" % blog_url, end=" ")
        content = requests.get(blog_url, headers=useragentutil.get_headers()).content.decode("utf-8")
        blog_tags = re.findall(r'"http[s]{0,1}://.*?.lofter.com/tag/(.*?)"', content)
        blog_tags = list(map(lambda x: unquote(x, "utf-8").replace("\xa0", " "), blog_tags))
        if target_tags:
            if not tag_filter(blog_tags, target_tags, tags_filter_mode):
                del blogs_info[0]
                parsed_num += 1
                print("该篇博客被过滤掉,剩余%d" % (len(blogs_info)))
                # Flush the progress files
                if (blog_num % file_update_interval == 0) or len(blogs_info) == 0:
                    file_update("./dir/author_img_file/blogs_info.json", blogs_info)
                    file_update("./dir/author_img_file/imgs_info.json", imgs_info)
                    file_update("./dir/author_img_file/blogs_info_parsed.json", parsed_blogs_info)
                    print("文件刷新")
                    time.sleep(random.randint(1, 2))
                continue
        # Author homepages have different page structures, so instead of xpath the image links are
        # matched directly with a regex; this also picks up some comment avatars and recommended images.
        # Image links from roughly before September use the nosdn format, later ones use imglf.
        # imgs_url = re.findall(r'"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)"', content)
        imgs_url = re.findall(r'"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"', content)
        # If nothing valid survives the filter, no usable image was found; fall back to the old regex
        if not img_fliter(imgs_url, "img"):
            print("使用旧正则表达式", end="\t")
            imgs_url = re.findall(r'"(http[s]{0,1}://imglf\d.nosdn\d*.[0-9]{0,3}\d.net.*?)"', content)
        # Filter the image links
        imgs_url = img_fliter(imgs_url, "img")
        # If this blog was published on the same day as the previous one, continue the file index
        img_index = 0
        if img_time == pre_page_last_img_info["last_file_time"]:
            img_index = pre_page_last_img_info["index"]
        # Collect the image info for the next step (saving)
        count = 0
        for img_url in imgs_url:
            # Decide whether the image is a jpg, png or gif
            is_gif = re.findall("gif", img_url)
            is_png = re.findall("png", img_url)
            if is_gif:
                img_type = "gif"
            elif is_png:
                img_type = "png"
            else:
                img_type = "jpg"
            img_info = {}
            img_info["img_url"] = img_url
            img_index += 1
            img_info["pic_name"] = author_name + "[" + author_ip + "] " + img_time + "(" + str(
                img_index) + ")." + img_type
            imgs_info.append(img_info)
            count += 1
        # Used to check whether the next blog was published on the same day
        pre_page_last_img_info["last_file_time"] = img_time
        pre_page_last_img_info["index"] = img_index
        # next_some_time marks whether the next blog shares this publish time; if it does the files
        # must not be flushed, otherwise pre_page_last_img_info could be lost if the run is interrupted
        try:
            if blogs_info[0]["time"] == blogs_info[1]["time"]:
                next_some_time = 1
            else:
                next_some_time = 0
        except:
            next_some_time = 0
        parsed_num += 1
        parsed_blogs_info.append(blogs_info[0])
        del blogs_info[0]
        print("解析完成,获取到图片链接%d,总获取图片数%d,已解析完成%d个链接(本次运行中已解析%d个链接),剩余%d"
              % (count, len(imgs_info), parsed_num, blog_num + 1, len(blogs_info)))
        # print(imgs_url)
        # print("--------" * 10)
        # Every file_update_interval blogs, flush the unparsed blogs, the extracted image info and
        # the parsed blogs to disk
        if (blog_num % file_update_interval == 0 and not next_some_time) or len(blogs_info) == 0:
            file_update("./dir/author_img_file/blogs_info.json", blogs_info)
            file_update("./dir/author_img_file/imgs_info.json", imgs_info)
            file_update("./dir/author_img_file/blogs_info_parsed.json", parsed_blogs_info)
            print("文件刷新")
            time.sleep(random.randint(1, 2))
    with open("./dir/author_img_file/blogs_info.json", "w") as op:
        op.write("finished")

def run(author_url, start_time, end_time, target_tags, tags_filter_mode, file_update_interval):
    author_page_parse = etree.HTML(
        requests.get(author_url, headers=useragentutil.get_headers()).content.decode("utf-8"))
    # id is a numeric parameter needed to fetch the archive page; ip is the author's lofter
    # third-level domain, chosen by the author at registration
    author_id = author_page_parse.xpath(
        "//body/iframe[@id='control_frame']/@src")[0].split("blogId=")[1]
    author_ip = re.search(r"http[s]*://(.*).lofter.com/", author_url).group(1)
    try:
        author_name = author_page_parse.xpath("//title//text()")[0]
    except:
        author_name = input("解析作者名时出现异常,请手动输入\n")
    archive_url = author_url + "dwr/call/plaincall/ArchiveBean.getArchivePostByTime.dwr"
    query_num = 50
    data = make_data(author_id, query_num)
    head = make_head(author_url)
    try:
        print("作者名%s,lofter ip %s,主页链接 %s" % (author_name, author_ip, author_url))
    except:
        print("作者名中有异常符号,无法显示,lofter ip %s,主页链接 %s" % (author_ip, author_url))
    if target_tags:
        print("tag过滤已经打开,仅保存含有tag中包含%s的图片," % (" [" + ",".join(target_tags) + "] "), end="")
        if tags_filter_mode == "in":
            print("没有tag的图片将会保留")
        else:
            print("没有tag的图片将不会保留")
    else:
        print("tag过滤未打开,将保存所有图片")
    print("tag过滤和模式参数为:target_tags,tags_filter_mode,请根据需求自行修改")
    start_command = input("输入ok以启动程序\n")
    if start_command != "ok":
        print("程序退出")
        exit()
    deal_file("init")
    dir_path = "./dir/author_img_file"
    # Check how far blog parsing has progressed
    if is_file_in(dir_path + "/blogs_info.json") == "finished":
        print("所有博客已解析完毕,跳转至图片下载")
    elif is_file_in(dir_path + "/blogs_info.json"):
        blogs_info = get_file_contetn(dir_path + "/blogs_info.json")
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        print("读取到上次运行保存的博客信息:未解析博链接%d条,已解析链接%d条,接上次继续运行"
              % (len(blogs_info), len(parsed_blogs_info)))
        parse_blogs_info(blogs_info, parsed_blogs_info, author_name, author_ip, target_tags,
                         tags_filter_mode, file_update_interval)
    else:
        print("开始获取归档页面数据,链接 %s (不能直接点开)" % archive_url)
        blog_infos = parse_archive_page(url=archive_url, data=data, header=head,
                                        author_url=author_url, query_num=query_num,
                                        start_time=start_time, end_time=end_time)
        parsed_blogs_info = get_file_contetn(dir_path + "/blogs_info_parsed.json")
        file_update(dir_path + "/blogs_info.json", blog_infos)
        print("归档页面数据保存完毕,开始解析博客页面")
        parse_blogs_info(blog_infos, parsed_blogs_info, author_name, author_ip, target_tags,
                         tags_filter_mode, file_update_interval)
    print("博客解析完毕,开始图片下载")
    # Check how far image saving has progressed
    if is_file_in(dir_path + "/imgs_info.json") == "finished":
        print("该作者首页的所有图片已保存完毕,无需操作")
    else:
        imgs_info = get_file_contetn(dir_path + "/imgs_info.json")
        imgs_info_saved = get_file_contetn(dir_path + "/imgs_info_saved.json")
        download_img(imgs_info, imgs_info_saved, author_name, author_ip, file_update_interval)
    print("所有图片保存完毕")
    deal_file("del")
    print("程序运行结束")

def save_file(blog_infos, author_name, author_ip, get_comm):
    all_file_name = []
    print("开始保存文章内容")
    # Take one blog out first to test which template matches
    first_parse = get_parse(blog_infos[0]["url"])
    template_id = parse_template.matcher(first_parse)
    print("文字匹配模板为模板{}".format(template_id))
    if template_id == 0:
        print("文字匹配模板是根据作者主页自动匹配的,模板0是一个匹配度比较广的模板,使用模板0说明没有其他的模板匹配成功,"
              "除了文章主体之外可能会爬到一些其他的内容,也有可能出现文章部分内容缺失")
        input1 = input("输入ok确定继续爬取,或输入任意其他文字退出\n")
        if not input1 == "ok":
            print("退出")
            exit()
    # Start saving
    arthicle_path = "./dir/article/{}".format(author_name)
    for blog_info in blog_infos:
        # Extract the blog info
        title = blog_info["title"]
        print_title = blog_info["print_title"]
        public_time = blog_info["time"]
        url = blog_info["url"]
        blog_type = blog_info["blog_type"]
        print("准备保存:{} ,原文连接: {} ".format(print_title, url), end=" ")
        # File head
        if blog_info["blog_type"] == "article":
            article_head = "{} by {}[{}]\n发表时间:{}\n原文链接: {}".format(title, author_name, author_ip,
                                                                    public_time, url)
        else:
            article_head = "{}\n原文链接: {}".format(title, url)
        # Body
        content = requests.get(url, headers=useragentutil.get_headers()).content
        parse = etree.HTML(content)
        article_content = parse_template.get_content(parse, template_id, title, blog_type)
        comm_list = []
        # Comments
        if get_comm:
            referer_url = parse.xpath("//div[@class='main comment']//iframe/@src")[0]
            param0 = re.search(r"pid=(\d+)&bid=", referer_url).group(1)
            number1 = 50
            number2 = 0
            comm_url = "https://www.lofter.com/dwr/call/plaincall/PostBean.getPostResponses.dwr"
            headers = {
                'Host': 'www.lofter.com',
                'Origin': 'https://www.lofter.com',
                'Referer': "https:" + referer_url,
                'Accept-Encoding': 'gzip, deflate',
            }
            all_comm_str = ""
            while True:
                comm_data = {"callCount": "1",
                             "scriptSessionId": "${scriptSessionId}187",
                             "httpSessionId": "",
                             "c0-scriptName": "PostBean",
                             "c0-methodName": "getPostResponses",
                             "c0-id": "0",
                             "c0-param0": "number:{}".format(param0),
                             "c0-param1": "number:{}".format(number1),
                             "c0-param2": "number:{}".format(number2),
                             "batchId": "334950"}
                number2 += number1
                comm_response = requests.post(comm_url, data=comm_data, headers=headers)
                comm_text = comm_response.content.decode("utf-8")
                all_comm_str += comm_text
                comm_infos = comm_text.split("anonymousUser")[1:]
                if not comm_infos:
                    break
                for comm_info in comm_infos:
                    # Each comment in the response carries an s\d+ id
                    comm_sid = re.search(r"(s\d+)\.appVersion", comm_info).group(1)
                    # Comment content
                    comm_content = re.search(comm_sid + r'\.content="(.*?)";', comm_info).group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')
                    # Comment publish time
                    comm_publish_time = re.search(comm_sid + r'\.publishTime=(\d+);', comm_info).group(1)
                    public_time = time.strftime("%Y-%m-%d %H:%M",
                                                time.localtime(int(comm_publish_time) / 1000))
                    # Publisher info
                    publisher_sid = re.search(comm_sid + r"\.publisherMainBlogInfo=(.*?);", comm_info).group(1)
                    # Nickname
                    re_publisher_nickname = re.search(publisher_sid + r'\.blogNickName="(.*?)";', comm_info)
                    if not re_publisher_nickname:
                        re_publisher_nickname = re.search(publisher_sid + r'\.blogNickName="(.*?)";', all_comm_str)
                    publisher_nickname = re_publisher_nickname.group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')
                    # Username
                    re_publisher_blogname = re.search(publisher_sid + r'\.blogName="(.*?)";', comm_info)
                    if not re_publisher_blogname:
                        re_publisher_blogname = re.search(publisher_sid + r'\.blogName="(.*?)";', all_comm_str)
                    publisher_blogname = re_publisher_blogname.group(1) \
                        .encode('utf8', errors="replace").decode('unicode_escape')
                    # Reply target
                    reply_blogsid = re.search(comm_sid + r"\.replyBlogInfo=(.*?);", comm_info).group(1)
                    if not reply_blogsid == "null":
                        re_reply_nickname = re.search(reply_blogsid + r'\.blogNickName="(.*?)";', comm_info)
                        if not re_reply_nickname:
                            re_reply_nickname = re.search(reply_blogsid + r'\.blogNickName="(.*?)";', all_comm_str)
                        reply_nickname = re_reply_nickname.group(1).encode('utf8', errors="replace").decode(
                            'unicode_escape')
                        re_reply_blogname = re.search(reply_blogsid + r'\.blogName="(.*?)";', comm_info)
                        if not re_reply_blogname:
                            re_reply_blogname = re.search(reply_blogsid + r'\.blogName="(.*?)";', all_comm_str)
                        reply_blogname = re_reply_blogname.group(1)
                    else:
                        reply_nickname = ""
                        reply_blogname = ""
                    if reply_nickname:
                        comm = "{} {}[{}] 回复 {}[{}]:{}".format(public_time, publisher_nickname,
                                                               publisher_blogname, reply_nickname,
                                                               reply_blogname, comm_content)
                    else:
                        comm = "{} {}[{}]:{}".format(public_time, publisher_nickname,
                                                     publisher_blogname, comm_content)
                    comm_list.append(comm)
            comm_list = comm_list[::-1]
        # File tail: the images embedded in the article
        # Match the new link format first
        illustration = re.findall(r'"(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)"',
                                  content.decode("utf-8"))
        # If nothing valid survives the filter, no usable image was found; fall back to the old format
        if not l4_author_img.img_fliter(illustration, blog_type):
            illustration = re.findall(r'"(http[s]{0,1}://imglf\d.nosdn\d*.[0-9]{0,3}\d.net.*?)"',
                                      content.decode("utf-8"))
        illustration = l4_author_img.img_fliter(illustration, blog_type)
        '''
        illustration = re.findall('(http[s]{0,1}://imglf\d{0,1}.lf\d*.[0-9]{0,3}.net.*?)\?', tmp_str)
        if illustration == []:
            # Match the old format
            illustration = re.findall('"(http[s]{0,1}://imglf\d{0,1}.nosdn\d*.[0-9]{0,3}.net.*?)\?',
                                      "\n".join(img_src))
        '''
        if illustration:
            article_tail = "博客中包含的图片:\n" + "\n".join(illustration)
        else:
            article_tail = ""
        # Full text
        article = article_head + "\n\n\n" + article_content + "\n\n\n" + article_tail + \
                  ("\n\n\n-----评论-----\n\n" + "\n".join(comm_list) if comm_list else "")
        article = article.encode("utf-8", errors="replace").decode("utf-8", errors="replace")
        # File name
        if blog_info["blog_type"] == "article":
            # Articles are named "<title> by <author>", with illegal characters replaced
            file_name = "{} by {}.txt".format(title, author_name)
            file_name = file_name.replace("/", "&").replace("|", "&").replace("\\", "&").replace("<", "《") \
                .replace(">", "》").replace(":", ":").replace('"', '”').replace("?", "?").replace("*", "·") \
                .replace("\n", "").replace("(", "(").replace(")", ")").replace(",", ",")
            file_name = l13_like_share_tag.filename_check(file_name, article, arthicle_path, "txt")
        else:
            # Plain text posts only need a duplicate-name check
            file_name = l13_like_share_tag.filename_check(title + ".txt", article, arthicle_path, "txt")
        # Write the file
        with open(arthicle_path + "/" + file_name, "w", encoding="utf-8") as op:
            op.write(article)
        try:
            print("{} 保存完毕".format(file_name))
        except:
            print("{} 保存完毕".format(print_title))
        all_file_name.append(file_name)
    return all_file_name