Example #1
0
# Root directory under which one sub-folder per matching thread link is created.
# Raw string: the backslashes are Windows path separators, not escape sequences.
file_path = r'D:\MyProjectFile\Python\studyproject\Python3\StudyPro1\datebase'
# Headers sent with the initial listing-page request.
headers = {
    'User-Agent':
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Accept':
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'Accept-Encoding': 'gzip',
}
URL_1024 = 'http://1024.91lulea.click/pw/thread.php?fid=22&page=4'
# The URL above targets one section of the site. Author's note: other sections
# were not tested at first and are not guaranteed to work; a second section was
# tried afterwards and also worked, so all sections probably share the same
# HTML layout.

start_html = requests.get(URL_1024, headers=headers)
start_html.encoding = 'utf-8'
bsObj = BeautifulSoup(start_html.text, 'html.parser')
for a in bsObj.find("tbody", {"style": "table-layout:fixed;"}).findAll("a"):
    # Only anchors that carry an href and have no title attribute are handled.
    if 'href' not in a.attrs or 'title' in a.attrs:
        continue
    href = a.attrs['href']
    # The dot before "html" is escaped so that only a literal ".html" matches.
    if not re.match(r'^htm_data/.+\.html', href):
        continue

    a_path = get_format_filename(a.text)
    # Build the per-thread directory once and reuse it everywhere below.
    target_dir = os.path.join(file_path, a_path)
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)  # the .txt file below is opened relative to this folder

    # Resolved once and reused for both the file contents and the log line
    # (assumes get_inner_link has no side effects).
    inner_link = get_inner_link(href)
    # 'w' truncates any existing file; the with-block closes it even on error.
    with open(a_path + '.txt', 'w') as f:
        f.write(inner_link)

    # Process the sub-page (per the original author: downloads images and
    # torrent files).
    Process_SubPage(target_dir, href)
    print(inner_link)
    print(a_path + ':处理完毕')
    # NOTE: per the original author, adding a delay such as time.sleep(0.5)
    # here did not stop the server from banning the client.
Example #2
0
        # Paging: build the listing URL for page x + 1 of the i-th source and
        # pick the matching local output directory.
        print(x)
        url_for_1024 = URL_Arr[i] + str(x + 1)
        filePath = file_path_Arr[i]

        # Fetch and parse the listing page.
        start_html = requests.get(url_for_1024, headers=headers)
        start_html.encoding = 'utf-8'
        bsObj = BeautifulSoup(start_html.text, 'html.parser')
        # Walk every <h3> inside the fixed-layout <tbody>; the first <a> inside
        # each <h3> supplies the thread href.
        for a in bsObj.find("tbody", {
                "style": "table-layout:fixed;"
        }).findAll("h3"):
            attrs = a.find("a").attrs['href']
            # if re.match(r'^htm_data/.+.html', attrs):
            print(attrs)
            # Get the torrent (seed) name: resolve the thread href to a URL and
            # fetch that page.
            seedStr = get_inner_link(attrs)
            seed_html = requests.get(seedStr, headers=get_image_header())
            seed_html.encoding = 'utf-8'
            seedObj = BeautifulSoup(seed_html.text, 'html.parser')
            # Scan the anchors inside the div with id "read_tpc" for links that
            # match the pattern below.
            # NOTE(review): the dots in the pattern are unescaped, so each one
            # matches any character - confirm whether a literal "." was intended.
            for seed_a in seedObj.find("div", {"id": "read_tpc"}).findAll("a"):
                if re.match(r'^http://www?\d+.+.html$', seed_a.attrs['href']):
                    seedUrl = seed_a.attrs['href']
                    # The 7 characters that sit immediately before the final 5
                    # characters of the URL are used as the seed identifier.
                    seedNum = seedUrl[-12:-5]
                    print(seedNum)

                    # Build a local file-name component from the <h3> text (the
                    # title); it is not used in the lines visible here.
                    a_path = get_format_filename(a.text)  # build the local file path, title name

                    if not os.path.exists(os.path.join(filePath, seedNum)):
                        os.makedirs(os.path.join(filePath, seedNum))
                    os.chdir(filePath + '/' + seedNum)  # switch into the folder created above
                    # NOTE(review): no close() is visible in this excerpt -
                    # confirm the file is closed further down.
                    f = open(seedNum + '.txt', 'w')  # r = read-only, w = write, a = append