def multi_start(page=1, max_page=20):
    """Crawl listing pages [page, max_page), merge the per-page JSON files
    into total.json, best-effort write to the database, then download images.

    Parameters
    ----------
    page : int
        First listing page to crawl (inclusive).
    max_page : int
        Exclusive upper bound on pages — matches init()'s ``while page < max_page``.
    """
    global proxy_list
    init_proxy.create_proxy()
    print(now() + "获取代理成功")
    with open('proxy.json', 'r') as f:
        proxy_list = json.loads(f.read(-1))
    dir_name = init(proxy_list[index], page, max_page)
    print(now() + "梳理获取的数据")
    # Merge every per-page result file into one list.
    # BUG FIX: init() writes files for pages page .. max_page-1, but the
    # original loop read range(1, max_page - 1), which skipped the final
    # page file and assumed the crawl always started at page 1.
    new_list = []
    for i in range(page, max_page):
        with open(dir_name + '/' + str(i) + ".json", 'r') as f:
            new_list += json.loads(f.read(-1))
    with open(dir_name + '/total.json', 'w') as f:
        f.write(json.dumps(new_list))
    print(now() + "梳理完成")
    # Write the merged result to the database (best effort — a failure here
    # must not stop the image download phase).
    try:
        read_comic_to_db.driver(dir_name + '/total.json')
    except Exception:
        print(now() + "写入数据库失败,跳过写入操作")
    # After merging, start reading/downloading the images.
    multi_thread_after(dir_name + '/total.json')
def comic_json_after(comic_json, get_proxy_again=False):
    """Fetch the image info for a single comic entry.

    When ``get_proxy_again`` is true, a fresh proxy pool is fetched and the
    global ``proxy_list`` is reloaded from proxy.json before crawling.
    """
    global proxy_list, headers
    if get_proxy_again:
        init_proxy.create_proxy()
        print(now() + "获取代理成功")
        with open('proxy.json', 'r') as f:
            proxy_list = json.loads(f.read())
    read_comic_img_info(comic_json['comicLink'], headers, proxy_list[index])
def multi_thread_after(total_json_file):
    """Download image data for every comic listed in *total_json_file*.

    Refreshes the proxy pool, loads the merged comic list, fans the entries
    out to a fixed pool of daemon worker threads, and blocks until every
    entry has been processed.
    """
    global proxy_list
    init_proxy.create_proxy()
    print(now() + "获取代理成功")
    with open('proxy.json', 'r') as f:
        proxy_list = json.loads(f.read(-1))
    # Re-assemble the file contents into a single string (per-line strip).
    data = ''
    with open(total_json_file, 'r') as f:
        for line in f.readlines():
            data += line.strip()
    json_list_data = json.loads(data)
    # comic_list is the work queue holding one dict per comic.
    comic_list = Queue()
    # NUM is the number of concurrent worker threads.
    NUM = 15
    # JOBS is the number of queued tasks, i.e. the size of comic_list.
    JOBS = 0
    # Before the workers start, enqueue every comic from json_list_data.
    for comic in json_list_data:
        JOBS += 1
        comic_list.put(comic)
    print(now() + "要执行" + str(JOBS) + "个任务")

    # Worker loop: keep pulling comics off the queue and processing them.
    def working():
        while True:
            comic_json = comic_list.get()
            print(now() + "执行新任务:下载[" + comic_json['comicId'] + "]漫画")
            comic_json_after(comic_json)
            sleep(1)
            # BUG FIX: task_done() must be called on the same queue get()
            # was called on.  The original paired comic_list.get() with
            # q.task_done() on a second queue that was only filled after
            # the workers had started, which could raise
            # ValueError("task_done() called too many times") inside a
            # worker and/or leave q.join() hanging.
            comic_list.task_done()

    # Start NUM daemon threads waiting on the queue.
    for i in range(NUM):
        print(now() + "创建线程[" + str(i) + "]中")
        t = Thread(target=working)
        t.daemon = True  # Thread.setDaemon() is deprecated since Python 3.10
        t.start()
    # Block until every queued comic has been marked done.
    comic_list.join()
def after(total_json_file):
    """Sequentially fetch image info for every comic in *total_json_file*.

    Single-threaded counterpart of multi_thread_after(): refreshes the proxy
    pool, loads the merged JSON list, and processes the comics one by one.
    """
    global proxy_list
    init_proxy.create_proxy()
    print(now() + "获取代理成功")
    with open('proxy.json', 'r') as f:
        proxy_list = json.loads(f.read())
    # Re-join the file contents with surrounding whitespace stripped per line.
    with open(total_json_file, 'r') as f:
        raw = ''.join(line.strip() for line in f)
    for comic in json.loads(raw):
        comic_json_after(comic)
def init(proxy, page=1, max_page=20, use_proxy=True, dir_name=None):
    """Crawl listing pages [page, max_page) via *proxy*, writing one JSON
    file per page into *dir_name*; returns the directory name.

    Rotates through the global proxy_list (tracked by the global index)
    whenever a request fails or the response looks blocked.
    """
    global index, proxy_list
    if dir_name is None:
        # First create a folder named after the current time.
        dir_name = time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime(int(time.time())))
        # Folder names may not contain colons on Windows.
        if platform.system() == "Windows":
            dir_name = time.strftime("[%Y-%m-%d %H-%M-%S]", time.localtime(int(time.time())))
        dir_name = "ComicHentai" + dir_name
        os.mkdir(dir_name)
    proxies = {
        "http": proxy,
    }
    while page < max_page:
        querystring = {
            "page": page,
            "f_apply": "Search",
            "f_search": "Chinese"
        }
        response = None
        # Retry (rotating proxies) until this page is fetched successfully.
        while response is None:
            try:
                if use_proxy:
                    response = requests.request("GET", url, headers=headers, params=querystring,
                                                proxies=proxies, timeout=10)
                else:
                    response = requests.request("GET", url, headers=headers, params=querystring,
                                                timeout=10)
                # A squid error page or the Korean warning page means the
                # proxy itself is broken.
                if response.text.count('squid') > 0 or response.text.count("http://warning.or.kr") > 0:
                    print(now() + "当前代理出现异常,切换代理")
                    raise ConnectionError("has been blocked")
                if response.status_code != 200:
                    print(now() + "非正常返回结果,代码[" + str(response.status_code) + "],切换代理")
                    raise ConnectionError("has been blocked")
                # Site-specific ban message for excessive pageloads.
                if response.text.count(
                        'Your IP address has been temporarily banned for excessive pageloads which indicates that you are using automated mirroring/harvesting software.') > 0:
                    print(now() + "当前代理已被Ban,切换代理")
                    raise ConnectionError("has been blocked")
                print(now() + "入侵成功!当前为第" + str(page) + "页")
            except ConnectionError:
                # The proxy is unusable — switch to the next one.
                index += 1
                if index == len(proxy_list):
                    # Exhausted the list: fetch a fresh proxy pool and reload it.
                    index = 0
                    init_proxy.create_proxy()
                    print(now() + "重新获取代理成功")
                    with open('proxy.json', 'r') as f:
                        proxy_list = json.loads(f.read(-1))
                print(
                    now() + "代理[" + proxies.get('http') + "]不可用,切换至[" + proxy_list[index] + "],当前为第" + str(page) + "页")
                proxies = {
                    "http": proxy_list[index]
                }
                continue
            except ReadTimeout:
                # The proxy timed out — switch to the next one.
                index += 1
                if index == len(proxy_list):
                    index = 0
                    init_proxy.create_proxy()
                    print(now() + "重新获取代理成功")
                    with open('proxy.json', 'r') as f:
                        proxy_list = json.loads(f.read(-1))
                print(now() + "代理[" +
                      proxies.get('http') + "]超时,切换至[" + proxy_list[index] + "],当前为第" + str(page) + "页")
                proxies = {
                    "http": proxy_list[index]
                }
                continue
        # Grab the comic listing (div class "ig") and iterate its entries.
        comic_list = pq(response.text).find('.ig')
        comic_json_list = []
        list_len = 0
        for elements in comic_list:
            comic = read_basic_comic_info(elements)
            # Per-image crawling happens later, in read_comic_img_info().
            # imgList = read_basic_comic_info()
            # comic["imgList"] = json.dumps(imgList)
            comic_json_list.append(comic)
            list_len += 1
            print(now() + "读取漫画信息中[" + json.dumps(comic) + "]")
        comic_json_list = json.dumps(comic_json_list)
        print(now() + "读取了 " + str(list_len) + " 个漫画 , 准备写入")
        if list_len == 0:
            # An empty page usually means a bad fetch: retry the same page.
            print(now() + "读取漫画失败,重新读取")
            continue
        # Finally write this page's data back out as JSON.
        with open(dir_name + "/" + str(page) + ".json", 'w') as f:
            f.write(comic_json_list)
        page += 1
        print(now() + "数据写入成功,等待睡眠2秒后进行下一页查询")
        time.sleep(2)
    return dir_name
def read_comic_img_info(comic_link, headers, proxy, use_proxy=True):
    """Walk every page of the comic at *comic_link*, download each image,
    and write the collected image URLs to ComicData/<comic_id>.json.

    Rotates through the global proxy_list (tracked by the global index)
    whenever a request fails or the response looks blocked.  Comics whose
    JSON file already exists are skipped.
    """
    if not os.path.exists('ComicData') or not os.path.isdir('ComicData'):
        os.mkdir('ComicData')
    global index, proxy_list
    if len(proxy_list) == 0:
        print(now() + '代理列表为空,重新获取')
        init_proxy.create_proxy()
        print(now() + "获取代理成功")
        with open('proxy.json', 'r') as f:
            proxy_list = json.loads(f.read(-1))
    # assumes the link's 6th path segment is the comic id — TODO confirm
    comic_id = comic_link.split('/')[5]
    if os.path.exists('ComicData/' + comic_id + ".json"):
        print(now() + "漫画[" + comic_id + "]的数据已存在")
        return
    print(now() + "正在读取漫画[" + comic_id + "]的数据")
    this_page_link = comic_link
    prev_page_link = ''
    proxy = {
        "http": proxy
    }
    page = 0
    img_list = []
    total_page = 999
    # When the current page equals the previous one, the comic is finished.
    while this_page_link != prev_page_link:
        if this_page_link is None:
            break
        try:
            if use_proxy:
                response = requests.request("GET", this_page_link, headers=headers,
                                            proxies=proxy, timeout=10)
            else:
                response = requests.request("GET", this_page_link, headers=headers, timeout=10)
            # The proxy itself is broken (squid error / Korean warning page).
            # (normalized "count(...) > 0" on both sides for consistency;
            # truthiness of count() made the original behave the same)
            if response.text.count('squid') > 0 or response.text.count("http://warning.or.kr") > 0:
                print(now() + "读取漫画[" + comic_id + "]:" + "当前代理出现异常,切换代理")
                raise ConnectionError("has been blocked")
            if response.status_code != 200:
                print(now() + "读取漫画[" + comic_id + "]:" + "非正常返回结果,代码[" + str(response.status_code) + "],切换代理")
                raise ConnectionError("has been blocked")
            # print(now()+response.text)
            if page == 0:
                # Directory page: extract the link to the first image page.
                tmp = pq(response.text).find('.gi').eq(0).find('a').attr('href')
                if tmp is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取数据错误,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                if tmp.count("509.gif") > 0:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前代理已被Ban,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                prev_page_link = this_page_link
                this_page_link = tmp
                page += 1
                print(now() + "读取漫画[" + comic_id + "]:" + "目录页读取完成,开始进入第一页")
            else:
                # Image page: extract the image source and the next-page link.
                img_link = pq(response.text)("#sm").attr('src')
                # Invalid image-page link.
                if img_link is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取数据错误,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                if img_link.count("509.gif") > 0:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前代理已被Ban,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                print(now() + "读取漫画[" + comic_id + "]:" + "第" + str(page) + "页数据为[" + img_link + "]")
                # Download with up to 5 retries.
                download_count = 0
                result = download_img(img_link, comic_id)
                while download_count < 5 and not result:
                    download_count += 1
                    print(
                        now() + "读取漫画[" + comic_id + "]:" + "图片下载失败[" + img_link + "],重试第" + str(download_count) + "次中")
                    result = download_img(img_link, comic_id)
                if download_count >= 5 and not result:
                    print(now() + "读取漫画[" + comic_id + "]:" + "下载图片失败,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                # Download finished.
                img_list.append(img_link)
                td = pq(response.text)('#ia').children().eq(0).children().children()
                total_page = int(td.eq(1).text().split('/')[1])
                tmp = td.eq(2).children().attr('href')
                prev_page_link = this_page_link
                this_page_link = tmp
                print(now() + "读取漫画[" + comic_id + "]:" + "第" + str(page) + "页读取完成,总共有" + str(total_page) + "页")
                if total_page == page:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前漫画已读取完毕")
                    break
                if this_page_link is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取下一页出现错误")
                    raise ConnectionError("has been blocked")
                print(now() + "读取漫画[" + comic_id + "]:" + "下一页为[" + this_page_link + "]")
                page += 1
        except ConnectionError:
            # The proxy is unusable — switch to the next one.
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = 0
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]不可用,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {
                "http": proxy_list[index]
            }
            continue
        except TooManyRedirects:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]链接不可用,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {
                "http": proxy_list[index]
            }
            continue
        except ReadTimeout:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]超时,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {
                "http": proxy_list[index]
            }
            continue
        except RequestException as e:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            # BUG FIX: "{0:s}".format(e) raises TypeError for exception
            # objects (non-empty format spec on object.__format__), which
            # crashed the handler itself.  "{0}" stringifies safely.
            print(now() + "读取漫画[" + comic_id + "]:" + "出现请求异常 [{0}]".format(e))
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]出现异常,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {
                "http": proxy_list[index]
            }
            continue
        except Exception as e:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            # BUG FIX: same "{0:s}" TypeError as above.
            print(now() + "读取漫画[" + comic_id + "]:" + "出现异常 [{0}]".format(e))
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]出现异常,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {
                "http": proxy_list[index]
            }
            continue
        print(now() + "读取漫画[" + comic_id + "]:" + "数据查询成功,等待睡眠2秒后进行下一页查询")
        time.sleep(2)
    if this_page_link == prev_page_link and total_page != page:
        print(now() + "读取漫画[" + comic_id + "]:" + "漫画[" + comic_id + "]写入出现问题,页数不符")
    print(now() + "读取漫画[" + comic_id + "]:" + "当前漫画所有页读取完成,进行写入")
    json_list = json.dumps(img_list)
    with open("ComicData/" + comic_id + ".json", 'w') as f:
        f.write(json_list)
    print(now() + "漫画[" + comic_id + "]写入完成")
def init(proxy, page=1, max_page=20, use_proxy=True, dir_name=None):
    """Crawl listing pages [page, max_page) via *proxy*, writing one JSON
    file per page into *dir_name*; returns the directory name.

    Second (re-formatted) copy of init() — NOTE(review): this duplicates the
    earlier definition and, being later in the file, is the one that wins.
    Rotates through the global proxy_list (tracked by the global index)
    whenever a request fails or the response looks blocked.
    """
    global index, proxy_list
    if dir_name is None:
        # First create a folder named after the current time.
        dir_name = time.strftime("%Y-%m-%d %H%M%S", time.localtime(int(time.time())))
        # Folder names may not contain colons on Windows (format already has none).
        if platform.system() == "Windows":
            dir_name = time.strftime("%Y-%m-%d %H%M%S", time.localtime(int(time.time())))
        dir_name = "data-" + dir_name
        os.mkdir(dir_name)
    proxies = {
        "http": proxy,
    }
    while page < max_page:
        querystring = {
            "page": page,
            "f_apply": "Search",
            "f_search": "Chinese"
        }
        response = None
        # Retry (rotating proxies) until this page is fetched successfully.
        while response is None:
            try:
                if use_proxy:
                    response = requests.request("GET", url, headers=headers, params=querystring,
                                                proxies=proxies, timeout=10)
                else:
                    response = requests.request("GET", url, headers=headers, params=querystring,
                                                timeout=10)
                # A squid error page or the Korean warning page means the
                # proxy itself is broken.
                if response.text.count('squid') > 0 or response.text.count(
                        "http://warning.or.kr") > 0:
                    print(now() + "当前代理出现异常,切换代理")
                    raise ConnectionError("has been blocked")
                if response.status_code != 200:
                    print(now() + "非正常返回结果,代码[" + str(response.status_code) + "],切换代理")
                    raise ConnectionError("has been blocked")
                # Site-specific ban message for excessive pageloads.
                if response.text.count(
                        'Your IP address has been temporarily banned for excessive pageloads which indicates that you are using automated mirroring/harvesting software.'
                ) > 0:
                    print(now() + "当前代理已被Ban,切换代理")
                    raise ConnectionError("has been blocked")
                print(now() + "入侵成功!当前为第" + str(page) + "页")
            except ConnectionError:
                # The proxy is unusable — switch to the next one.
                index += 1
                if index == len(proxy_list):
                    # Exhausted the list: fetch a fresh proxy pool and reload it.
                    index = 0
                    init_proxy.create_proxy()
                    print(now() + "重新获取代理成功")
                    with open('proxy.json', 'r') as f:
                        proxy_list = json.loads(f.read(-1))
                print(now() + "代理[" + proxies.get('http') + "]不可用,切换至[" + proxy_list[index] + "],当前为第" + str(page) + "页")
                proxies = {"http": proxy_list[index]}
                continue
            except ReadTimeout:
                # The proxy timed out — switch to the next one.
                index += 1
                if index == len(proxy_list):
                    index = 0
                    init_proxy.create_proxy()
                    print(now() + "重新获取代理成功")
                    with open('proxy.json', 'r') as f:
                        proxy_list = json.loads(f.read(-1))
                print(now() + "代理[" + proxies.get('http') + "]超时,切换至[" + proxy_list[index] + "],当前为第" + str(page) + "页")
                proxies = {"http": proxy_list[index]}
                continue
        # Grab the comic listing (div class "ig") and iterate its entries.
        comic_list = pq(response.text).find('.ig')
        comic_json_list = []
        list_len = 0
        for elements in comic_list:
            comic = read_basic_comic_info(elements)
            # Per-image crawling happens later, in read_comic_img_info().
            # imgList = read_basic_comic_info()
            # comic["imgList"] = json.dumps(imgList)
            comic_json_list.append(comic)
            list_len += 1
            print(now() + "读取漫画信息中[" + json.dumps(comic) + "]")
        comic_json_list = json.dumps(comic_json_list)
        print(now() + "读取了 " + str(list_len) + " 个漫画 , 准备写入")
        if list_len == 0:
            # An empty page usually means a bad fetch: retry the same page.
            print(now() + "读取漫画失败,重新读取")
            continue
        # Finally write this page's data back out as JSON.
        with open(dir_name + "/" + str(page) + ".json", 'w') as f:
            f.write(comic_json_list)
        page += 1
        print(now() + "数据写入成功,等待睡眠2秒后进行下一页查询")
        time.sleep(2)
    return dir_name
def read_comic_img_info(comic_link, headers, proxy, use_proxy=True):
    """Walk every page of the comic at *comic_link*, download each image,
    and write the collected image URLs to ComicData/<comic_id>.json.

    Second (re-formatted) copy — NOTE(review): this duplicates the earlier
    definition and, being later in the file, is the one that wins.  Rotates
    through the global proxy_list (tracked by the global index) whenever a
    request fails or the response looks blocked.  Comics whose JSON file
    already exists are skipped.
    """
    if not os.path.exists('ComicData') or not os.path.isdir('ComicData'):
        os.mkdir('ComicData')
    global index, proxy_list
    if len(proxy_list) == 0:
        print(now() + '代理列表为空,重新获取')
        init_proxy.create_proxy()
        print(now() + "获取代理成功")
        with open('proxy.json', 'r') as f:
            proxy_list = json.loads(f.read(-1))
    # assumes the link's 6th path segment is the comic id — TODO confirm
    comic_id = comic_link.split('/')[5]
    if os.path.exists('ComicData/' + comic_id + ".json"):
        print(now() + "漫画[" + comic_id + "]的数据已存在")
        return
    print(now() + "正在读取漫画[" + comic_id + "]的数据")
    this_page_link = comic_link
    prev_page_link = ''
    proxy = {"http": proxy}
    page = 0
    img_list = []
    total_page = 999
    # When the current page equals the previous one, the comic is finished.
    while this_page_link != prev_page_link:
        if this_page_link is None:
            break
        try:
            if use_proxy:
                response = requests.request("GET", this_page_link, headers=headers,
                                            proxies=proxy, timeout=10)
            else:
                response = requests.request("GET", this_page_link, headers=headers, timeout=10)
            # The proxy itself is broken (squid error / Korean warning page).
            # (normalized "count(...) > 0" on both sides for consistency;
            # truthiness of count() made the original behave the same)
            if response.text.count('squid') > 0 or response.text.count(
                    "http://warning.or.kr") > 0:
                print(now() + "读取漫画[" + comic_id + "]:" + "当前代理出现异常,切换代理")
                raise ConnectionError("has been blocked")
            if response.status_code != 200:
                print(now() + "读取漫画[" + comic_id + "]:" + "非正常返回结果,代码[" + str(response.status_code) + "],切换代理")
                raise ConnectionError("has been blocked")
            # print(now()+response.text)
            if page == 0:
                # Directory page: extract the link to the first image page.
                tmp = pq(response.text).find('.gi').eq(0).find('a').attr('href')
                if tmp is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取数据错误,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                if tmp.count("509.gif") > 0:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前代理已被Ban,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                prev_page_link = this_page_link
                this_page_link = tmp
                page += 1
                print(now() + "读取漫画[" + comic_id + "]:" + "目录页读取完成,开始进入第一页")
            else:
                # Image page: extract the image source and the next-page link.
                img_link = pq(response.text)("#sm").attr('src')
                # Invalid image-page link.
                if img_link is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取数据错误,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                if img_link.count("509.gif") > 0:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前代理已被Ban,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                print(now() + "读取漫画[" + comic_id + "]:" + "第" + str(page) + "页数据为[" + img_link + "]")
                # Download with up to 5 retries.
                download_count = 0
                result = download_img(img_link, comic_id)
                while download_count < 5 and not result:
                    download_count += 1
                    print(now() + "读取漫画[" + comic_id + "]:" + "图片下载失败[" + img_link + "],重试第" + str(download_count) + "次中")
                    result = download_img(img_link, comic_id)
                if download_count >= 5 and not result:
                    print(now() + "读取漫画[" + comic_id + "]:" + "下载图片失败,重新读取,切换代理")
                    raise ConnectionError("has been blocked")
                # Download finished.
                img_list.append(img_link)
                td = pq(response.text)('#ia').children().eq(0).children().children()
                total_page = int(td.eq(1).text().split('/')[1])
                tmp = td.eq(2).children().attr('href')
                prev_page_link = this_page_link
                this_page_link = tmp
                print(now() + "读取漫画[" + comic_id + "]:" + "第" + str(page) + "页读取完成,总共有" + str(total_page) + "页")
                if total_page == page:
                    print(now() + "读取漫画[" + comic_id + "]:" + "当前漫画已读取完毕")
                    break
                if this_page_link is None:
                    print(now() + "读取漫画[" + comic_id + "]:" + "读取下一页出现错误")
                    raise ConnectionError("has been blocked")
                print(now() + "读取漫画[" + comic_id + "]:" + "下一页为[" + this_page_link + "]")
                page += 1
        except ConnectionError:
            # The proxy is unusable — switch to the next one.
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = 0
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]不可用,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {"http": proxy_list[index]}
            continue
        except TooManyRedirects:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]链接不可用,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {"http": proxy_list[index]}
            continue
        except ReadTimeout:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]超时,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {"http": proxy_list[index]}
            continue
        except RequestException as e:
            index += 1
            if index == len(proxy_list):
                index = 0
                init_proxy.create_proxy()
                print(now() + "重新获取代理成功")
                with open('proxy.json', 'r') as f:
                    proxy_list = json.loads(f.read(-1))
            if index >= len(proxy_list):
                index = len(proxy_list) - 1
            # BUG FIX: "{0:s}".format(e) raises TypeError for exception
            # objects (non-empty format spec on object.__format__), which
            # crashed the handler itself.  "{0}" stringifies safely.
            print(now() + "读取漫画[" + comic_id + "]:" + "出现未知异常 [{0}]".format(e))
            print(now() + "读取漫画[" + comic_id + "]:" + "代理[" + proxy.get('http') + "]出现异常,切换至[" +
                  proxy_list[index] + "],当前为" + ("目录" if page == 0 else "第" + str(page)) + "页")
            proxy = {"http": proxy_list[index]}
            continue
        print(now() + "读取漫画[" + comic_id + "]:" + "数据查询成功,等待睡眠2秒后进行下一页查询")
        time.sleep(2)
    if this_page_link == prev_page_link and total_page != page:
        print(now() + "读取漫画[" + comic_id + "]:" + "漫画[" + comic_id + "]写入出现问题,页数不符")
    print(now() + "读取漫画[" + comic_id + "]:" + "当前漫画所有页读取完成,进行写入")
    json_list = json.dumps(img_list)
    with open("ComicData/" + comic_id + ".json", 'w') as f:
        f.write(json_list)
    print(now() + "漫画[" + comic_id + "]写入完成")
import init_proxy

# Module-level side effect: fetch a fresh proxy pool immediately on run.
# NOTE(review): every other call site in this file invokes create_proxy()
# with no arguments — confirm that 'fpl' is a valid argument (e.g. a
# provider/source selector) for init_proxy.create_proxy.
init_proxy.create_proxy('fpl')