import os
import re

import requests
from bs4 import BeautifulSoup

# file_path_wuma, writerLog, get_format_filename, get_inner_link and
# Process_SubPage are defined elsewhere in the script.

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
}  # initial request headers

# Torrent listings, 13 pages
URL_1024 = 'http://1024.stv919.pw/pw/thread.php?fid=22&page=2'  # fid=22 has content
# http://1024.stv919.pw/pw/thread.php?fid=14&page=7
URL_youma = 'http://1024.stv919.pw/pw/thread.php?fid=22&page='  # crawled up to page 13
URL_wm = 'http://1024.stv919.pw/pw/thread.php?fid=7&page='  # crawled up to page 13

for num in range(4, 111):
    start_html = requests.get(URL_wm + str(num), headers=headers)
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')
    writerLog(URL_wm + str(num))
    for a in bsObj.find("tbody", {"style": "table-layout:fixed;"}).findAll("a"):
        if ('href' in a.attrs) and ('title' not in a.attrs):
            if re.match(r'^htm_data/.+\.html', a.attrs['href']):
                a_path = get_format_filename(a.text)  # Chinese title taken from the link text
                if not os.path.exists(os.path.join(file_path_wuma, a_path)):
                    os.makedirs(os.path.join(file_path_wuma, a_path))
                os.chdir(os.path.join(file_path_wuma, a_path))  # cd into the folder created above
                f = open(a_path + '.txt', 'w', encoding='utf-8')  # 'r' read-only, 'w' write, 'a' append
                f.write(get_inner_link(a.attrs['href']))
                f.close()
                Process_SubPage(os.path.join(file_path_wuma, a_path), a.attrs['href'])  # handle the sub-page: download images and the torrent
                # print(get_inner_link(a.attrs['href']))
                print(a_path + ': done')
                # time.sleep(0.5)  # even with a delay the server may still ban us
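# writerLog is called above but not defined in this snippet. A minimal sketch,
# assuming (hypothetically) that it appends each crawled listing URL to a
# progress log so an interrupted run can be resumed:
def writerLog(url):
    with open('crawl.log', 'a', encoding='utf-8') as log:  # 'crawl.log' is an assumed file name
        log.write(url + '\n')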
import os
import re

import requests
from bs4 import BeautifulSoup

file_path = r'D:\MyProjectFile\Python\studyproject\Python3\StudyPro1\datebase'  # storage root (raw string so the backslashes are not escape sequences)

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
}  # initial request headers

URL_1024 = 'http://1024.91lulea.click/pw/thread.php?fid=22&page=4'
# This is the site's "日本骑兵" section; I have not tested the other sections, so no guarantees.
# Just tested: "亚洲无码" works too, so presumably every section shares the same HTML layout.

start_html = requests.get(URL_1024, headers=headers)
start_html.encoding = 'utf-8'
bsObj = BeautifulSoup(start_html.text, 'html.parser')

for a in bsObj.find("tbody", {"style": "table-layout:fixed;"}).findAll("a"):
    if ('href' in a.attrs) and ('title' not in a.attrs):
        if re.match(r'^htm_data/.+\.html', a.attrs['href']):
            a_path = get_format_filename(a.text)
            if not os.path.exists(os.path.join(file_path, a_path)):
                os.makedirs(os.path.join(file_path, a_path))
            os.chdir(os.path.join(file_path, a_path))  # cd into the folder created above
            f = open(a_path + '.txt', 'w', encoding='utf-8')  # 'r' read-only, 'w' write, 'a' append
            f.write(get_inner_link(a.attrs['href']))
            f.close()
            Process_SubPage(os.path.join(file_path, a_path), a.attrs['href'])  # handle the sub-page: download images and the torrent
            print(get_inner_link(a.attrs['href']))
            print(a_path + ': done')
            # time.sleep(0.5)  # even with a delay the server may still ban us
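# get_format_filename and get_inner_link are used above but defined elsewhere.
# Minimal sketches of what the call sites appear to assume; the exact bodies,
# the stripped character set, and the base URL are guesses, not the original code:
import re

def get_format_filename(name):
    # Drop the characters Windows forbids in folder names (\ / : * ? " < > |)
    # so the thread title can double as a directory name.
    return re.sub(r'[\\/:*?"<>|]', '', name).strip()

def get_inner_link(href):
    # Listing hrefs are relative ('htm_data/...'); resolve against the site root.
    return 'http://1024.91lulea.click/pw/' + href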
# This fragment continues from a setup like the one above (imports, headers,
# bsObj); filePath, get_inner_link, get_image_header, get_format_filename and
# Process_SubPage come from elsewhere in the script. The opening of the loop
# was truncated; by analogy with the snippets above it presumably iterated the
# <h3> nodes of the same fixed-layout <tbody>:
for a in bsObj.find("tbody", {"style": "table-layout:fixed;"}).findAll("h3"):
    attrs = a.find("a").attrs['href']
    # if re.match(r'^htm_data/.+\.html', attrs):
    print(attrs)
    # Resolve the torrent page URL and extract the torrent id
    seedStr = get_inner_link(attrs)
    seed_html = requests.get(seedStr, headers=get_image_header())
    seed_html.encoding = 'utf-8'
    seedObj = BeautifulSoup(seed_html.text, 'html.parser')
    for seed_a in seedObj.find("div", {"id": "read_tpc"}).findAll("a"):
        if re.match(r'^http://www?\d+.+\.html$', seed_a.attrs['href']):
            seedUrl = seed_a.attrs['href']
            seedNum = seedUrl[-12:-5]  # the 7-character torrent id embedded in the URL
            print(seedNum)
            a_path = get_format_filename(a.text)  # build the local file path from the movie title
            if not os.path.exists(os.path.join(filePath, seedNum)):
                os.makedirs(os.path.join(filePath, seedNum))
            os.chdir(os.path.join(filePath, seedNum))  # cd into the folder created above
            f = open(seedNum + '.txt', 'w', encoding='utf-8')  # 'r' read-only, 'w' write, 'a' append
            f.write(a_path)
            f.close()
            Process_SubPage(os.path.join(filePath, seedNum), attrs)  # handle the sub-page: download images and the torrent
            print(get_inner_link(attrs))
            print(a_path + ': done')
            time.sleep(0.5)  # even with a delay the server may still ban us
# # Torrent page URL: crawl the torrent itself
# seedStr = get_inner_link(a.attrs['href'])
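# The comments above note that even a fixed 0.5 s sleep can still get the
# crawler banned. One common mitigation, sketched here as a suggestion rather
# than part of the original script (polite_get is a hypothetical helper):
# randomized delays between requests plus a simple retry with backoff.
import random
import time

import requests

def polite_get(url, headers, retries=3):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            if resp.status_code == 200:
                time.sleep(random.uniform(0.5, 2.0))  # randomized pause between requests
                return resp
        except requests.RequestException:
            pass
        time.sleep(2 ** attempt)  # back off: 1 s, 2 s, 4 s before retrying
    return None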