# NOTE(review): this chunk was whitespace-mangled (all newlines stripped); the
# statement boundaries below are a best-effort reconstruction — confirm against
# version control. The final BaseCommon.loglist(...) call was truncated
# mid-expression in the source ("... + base_url +"); its tail is completed by
# analogy with the sibling scraper chunk that logs get_text() + '=' + base_url
# + href.
about_3 = about_2.find_all('p')  # every <p> of the book's "about" section
c = 0
for each_about_3 in about_3:
    print(each_about_3.get_text().strip() == '')
    # skip paragraphs that are pure whitespace
    if each_about_3.get_text().strip() == '':
        continue
    print(each_about_3.get_text())
    BaseCommon.loglist(base_log, '简介:' + each_about_3.get_text())

# chapter list: every <div class="sections">
targets_url = bf.find_all('div', class_='sections')
for each_url in targets_url:
    bf_2 = BeautifulSoup(str(each_url), 'lxml')
    # FIX: was a list comprehension used only for its side effects
    # ([s.extract() for s in bf_2("p")]); a plain loop states the intent.
    # Dropping every <p> leaves only the <a> chapter links.
    for stripped_tag in bf_2("p"):
        stripped_tag.extract()
    targets_url_2 = bf_2.find_all('a')
    for each_url_2 in targets_url_2:
        print(each_url_2.get('href'))
        print(each_url_2.get_text())
        # log one "chapter-title=absolute-url" line per link
        # TODO(review): tail reconstructed — original chunk ends at "base_url +"
        BaseCommon.loglist(
            base_log,
            each_url_2.get_text() + '=' + base_url + each_url_2.get('href'))
import os
import sys
import json

# project-local helper utilities
from python_lib import BaseCommon

if __name__ == '__main__':
    # base_folder = 'F:/py3project/91hanman/'          # home machine (must end with /)
    # base_folder = 'F:/py3workspace/python-spider/'   # company machine
    # FIX: os and sys are used below but were not imported in this chunk.
    base_folder = os.getcwd() + "\\"
    base_url = 'https://www.manhuaw.cc'
    # config file sits next to the script, named <script-basename>.ini
    base_config_file = base_folder + '\\' + sys.argv[0][sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist yet
    # FIX: "== False" replaced with the idiomatic "not"
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # book lists
    manhua_list = []
    # FIX: ten hand-written appends collapsed into one loop; the resulting
    # list is element-for-element identical to the original.
    manhua_vip_list = []
    for page in range(1, 11):
        manhua_vip_list.append(
            '1-vip-%d=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-%d'
            % (page, page))
import os
import sys
import configparser

# project-local helper utilities
from python_lib import BaseCommon

if __name__ == '__main__':
    # base_folder = 'F:/py3project/91hanman/'          # home machine (must end with /)
    # base_folder = 'F:/py3workspace/python-spider/'   # company machine
    # FIX: os is used below but was not imported in this chunk.
    base_folder = os.getcwd() + "\\"
    base_url = 'http://www.xxmh.cc'
    # config file sits next to the script, named <script-basename>.ini
    base_config_file = base_folder + '\\' + sys.argv[0][
        sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist yet
    # FIX: "== False" replaced with the idiomatic "not"
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # book list (one "name=path" entry per book)
    manhua_list = []
    # HOME
    manhua_list.append('xxmh-我还小狼性老公太凶猛=/book/48')  # NOTE: interrupted at chapter 83
    manhua_list.append('xxmh-逗腐教室=/book/49')
    # COMPANY
    print(manhua_list)
    print(len(manhua_list))
    manhua_list_count = 0
    # ConfigParser holds the resume position between runs
    config = configparser.ConfigParser()
    config.read(base_config_file, encoding="GBK")
    #exit()
import os
import sys
import configparser

# project-local helper utilities
from python_lib import BaseCommon

if __name__ == '__main__':
    # base_folder = 'F:/py3project/91hanman/'          # home machine (must end with /)
    # base_folder = 'F:/py3workspace/python-spider/'   # company machine
    # FIX: os is used below but was not imported in this chunk.
    base_folder = os.getcwd() + "\\"
    base_url = 'https://lieqiman.com'
    # config file sits next to the script, named <script-basename>.ini
    base_config_file = base_folder + '\\' + sys.argv[0][
        sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist yet
    # FIX: "== False" replaced with the idiomatic "not"
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # book list (one "name=path" entry per book)
    manhua_list = []
    manhua_list.append('baiyihuangdao=/mh/xe/141.html')
    #manhua_list.append('zhenjiajiedi=/mh/xe/142.html')
    manhua_list.append('nvyuanjianwenlu=/mh/xe/143.html')
    manhua_list.append('yinmandelianqing=/mh/xe/144.html')
    manhua_list.append('mingxingdachu=/mh/xe/145.html')
    manhua_list.append('dushixiejiang=/mh/xe/146.html')
    #manhua_list.append('caokongzhe=/mh/xe/147.html')
    print(manhua_list)
    print(len(manhua_list))
    manhua_list_count = 0
    # ConfigParser initialization continues past the end of this chunk
import os
import socket
import configparser
import sys

# project-local helper utilities
from python_lib import BaseCommon

if __name__ == '__main__':
    # base_folder = 'F:/py3project/91hanman/'          # must end with /
    # base_folder = 'F:/py3workspace/python-spider/'   # must end with /
    # FIX: os is used below but was not imported in this chunk.
    base_folder = os.getcwd() + "\\"
    base_url = 'https://www.91hanman.com'
    # config file sits next to the script, named <script-basename>.ini
    base_config_file = base_folder + '\\' + sys.argv[0][sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist yet
    # FIX: "== False" replaced with the idiomatic "not"
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
        #BaseCommon.loglist(base_config_file,"[currentManga]"+"\n"+"mangacount = 0"+"\n"+"[currentChapter]"+"\n"+"chaptercount = 0"+"\n")
    print(base_config_file)
    # book list (one "url=name" entry per book)
    manhua_list = []
    manhua_list.append('https://www.91hanman.com/book/webBookDetail/4=91hanman-恋爱辅助器')
    manhua_list.append('https://www.91hanman.com/book/webBookDetail/23=91hanman-制服诱惑')
    manhua_list.append('https://www.91hanman.com/book/webBookDetail/134=91hanman-红杏出墙')
    print(manhua_list)
    manhua_list_count = 0
    # ConfigParser holds the resume position between runs
    config = configparser.ConfigParser()
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", "Referer": "http://www.google.com/bot.html" } #网页下载太长 需要加入参数 stream=True 来判断完整性下载 req = requests.get(url=url, headers=headers, stream=True) req.encoding = 'utf-8' html = req.text bf = BeautifulSoup(html, 'lxml') targets_url = bf.find_all(class_='detail-chapters-list-item') for each_url in targets_url: #pdb.set_trace() # 运行到这里会自动暂停 bf_2 = BeautifulSoup(str(each_url), 'lxml') print(bf_2.a.get('href')) print(bf_2.a.span.span.get_text()) BaseCommon.loglist( base_log, bf_2.a.span.span.get_text() + '=' + base_url + bf_2.a.get('href')) list_url.append(bf_2.a.span.span.get_text() + '=' + base_url + bf_2.a.get('href')) #pdb.set_trace() # 运行到这里会自动暂停 # for each in targets_url: # list_url.append(each.img.get('alt') + '=' + each.get('href')) print('连接采集完成') print(list_url) print(len(list_url)) count = 0 for each_img in list_url: