Example 1
about_3 = about_2.find_all('p')
#print(str(about_3).encode('utf-8'))
#print(len(about_3))
#pdb.set_trace()
#print(about_2.original_encoding)
c = 0
for each_about_3 in about_3:
    # c+=1
    # print(c)
    # print(each_about_3.get_text())
    # print(type(each_about_3.get_text()))
    print(each_about_3.get_text().strip() == '')
    if each_about_3.get_text().strip() == '':
        continue
    print(each_about_3.get_text())
    BaseCommon.loglist(base_log, '简介:' + each_about_3.get_text())
#exit()
# chapter list
targets_url = bf.find_all('div', class_='sections')
for each_url in targets_url:
    #pdb.set_trace()  # execution pauses here automatically
    bf_2 = BeautifulSoup(str(each_url), 'lxml')
    [s.extract() for s in bf_2("p")]  # remove the specified tags
    targets_url_2 = bf_2.find_all('a')
    for each_url_2 in targets_url_2:
        print(each_url_2.get('href'))
        print(each_url_2.get_text())
        #pdb.set_trace()
        BaseCommon.loglist(
            base_log,
            each_url_2.get_text() + '=' + base_url +
            each_url_2.get('href'))

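Example 1 strips every <p> tag with extract() before collecting links, so intro paragraphs do not leak into the chapter list. A minimal standalone sketch of that technique; the markup here is made up for illustration:

from bs4 import BeautifulSoup

# Made-up markup standing in for one 'sections' div.
html = '<div class="sections"><p>intro text</p><a href="/ch/1">Chapter 1</a></div>'
soup = BeautifulSoup(html, 'lxml')

# extract() detaches each matched tag from the tree and returns it.
for p in soup.find_all('p'):
    p.extract()

for a in soup.find_all('a'):
    print(a.get('href'), a.get_text())  # -> /ch/1 Chapter 1
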
Example 2
import json
import os
import sys
# import custom helper functions
from python_lib import BaseCommon

if __name__ == '__main__':

    #base_folder ='F:/py3project/91hanman/'  # must end with / (home)
    #base_folder ='F:/py3workspace/python-spider/'  # must end with / (company)
    base_folder = os.getcwd() + "\\"
    base_url = 'https://www.manhuaw.cc'
    base_config_file = base_folder + sys.argv[0][sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # define a batch of entries
    manhua_list = []
    manhua_vip_list = []
    manhua_vip_list.append('1-vip-1=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-1')
    manhua_vip_list.append('1-vip-2=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-2')
    manhua_vip_list.append('1-vip-3=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-3')
    manhua_vip_list.append('1-vip-4=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-4')
    manhua_vip_list.append('1-vip-5=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-5')
    manhua_vip_list.append('1-vip-6=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-6')
    manhua_vip_list.append('1-vip-7=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-7')
    manhua_vip_list.append('1-vip-8=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-8')
    manhua_vip_list.append('1-vip-9=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-9')
    manhua_vip_list.append('1-vip-10=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-10')
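
The ten append calls above follow a strict 1-vip-N pattern. A minimal sketch that builds the identical list with a loop, assuming the pattern really does hold for every page:

    manhua_vip_list = [
        '1-vip-{0}=https://www.manhuaw.cc/index/api/getpage/tp/1-vip-{0}'.format(i)
        for i in range(1, 11)
    ]
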
Example 3
import os
import sys
import configparser
# import custom helper functions
from python_lib import BaseCommon

if __name__ == '__main__':

    #base_folder ='F:/py3project/91hanman/'  # must end with / (home)
    #base_folder ='F:/py3workspace/python-spider/'  # must end with / (company)
    base_folder = os.getcwd() + "\\"
    base_url = 'http://www.xxmh.cc'
    base_config_file = base_folder + sys.argv[0][
        sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # define a batch of entries
    manhua_list = []
    # HOME
    manhua_list.append('xxmh-我还小狼性老公太凶猛=/book/48')  # interrupted at 83
    manhua_list.append('xxmh-逗腐教室=/book/49')
    # COMPANY

    print(manhua_list)
    print(len(manhua_list))
    manhua_list_count = 0
    # initialize the ConfigParser object
    config = configparser.ConfigParser()
    config.read(base_config_file, encoding="GBK")
    #exit()
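
Each of these scripts bootstraps its .ini by writing an empty file through BaseCommon.loglist and then reading it back with GBK encoding. A minimal sketch of the same create-if-missing step using configparser alone; the file name and the [progress]/count names are assumptions, not the scripts' real ones:

import configparser
import os

config = configparser.ConfigParser()
if not os.path.exists('example.ini'):
    # Assumed section/option names, for illustration only.
    config['progress'] = {'count': '0'}
    with open('example.ini', 'w', encoding='GBK') as f:
        config.write(f)
config.read('example.ini', encoding='GBK')
print(config.sections())
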
Example 4
import os
import sys
import configparser
# import custom helper functions
from python_lib import BaseCommon

if __name__ == '__main__':

    #base_folder ='F:/py3project/91hanman/'  # must end with / (home)
    #base_folder ='F:/py3workspace/python-spider/'  # must end with / (company)
    base_folder = os.getcwd() + "\\"
    base_url = 'https://lieqiman.com'
    base_config_file = base_folder + sys.argv[0][
        sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
    print(base_config_file)
    # define a batch of entries
    manhua_list = []
    manhua_list.append('baiyihuangdao=/mh/xe/141.html')
    #manhua_list.append('zhenjiajiedi=/mh/xe/142.html')
    manhua_list.append('nvyuanjianwenlu=/mh/xe/143.html')
    manhua_list.append('yinmandelianqing=/mh/xe/144.html')
    manhua_list.append('mingxingdachu=/mh/xe/145.html')
    manhua_list.append('dushixiejiang=/mh/xe/146.html')
    #manhua_list.append('caokongzhe=/mh/xe/147.html')

    print(manhua_list)
    print(len(manhua_list))
    manhua_list_count = 0
    # initialize the ConfigParser object
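
All of these scripts derive the .ini path by slicing sys.argv[0] by hand (rfind(os.sep) + 1:-3), which silently misbehaves if the script is launched with a /-separated path on Windows or has an extension other than .py. A sketch of the same derivation with os.path, shown as an alternative rather than the scripts' actual code:

import os
import sys

# Strip the directory and the extension, whatever the separator or suffix.
script_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
base_config_file = os.path.join(os.getcwd(), script_name + '.ini')
print(base_config_file)
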
Example 5
import os
import socket
import configparser
import sys

import requests
from bs4 import BeautifulSoup
from python_lib import BaseCommon


if __name__ == '__main__':
    #base_folder ='F:/py3project/91hanman/'  # must end with /
    #base_folder ='F:/py3workspace/python-spider/'  # must end with /
    base_folder = os.getcwd() + "\\"
    base_url = 'https://www.91hanman.com'
    base_config_file = base_folder + sys.argv[0][sys.argv[0].rfind(os.sep) + 1:-3] + '.ini'
    # create the config file if it does not exist
    if not os.path.exists(base_config_file):
        BaseCommon.loglist(base_config_file, "")
        #BaseCommon.loglist(base_config_file,"[currentManga]"+"\n"+"mangacount = 0"+"\n"+"[currentChapter]"+"\n"+"chaptercount = 0"+"\n")

    print(base_config_file)

    manhua_list = []

    manhua_list.append('https://www.91hanman.com/book/webBookDetail/4=91hanman-恋爱辅助器')
    manhua_list.append('https://www.91hanman.com/book/webBookDetail/23=91hanman-制服诱惑')
    manhua_list.append('https://www.91hanman.com/book/webBookDetail/134=91hanman-红杏出墙')


    print(manhua_list)
    manhua_list_count = 0
    # initialize the ConfigParser object
    config = configparser.ConfigParser()
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            "Referer": "http://www.google.com/bot.html"
        }
        # large page; pass stream=True so the full body can be downloaded reliably
        req = requests.get(url=url, headers=headers, stream=True)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='detail-chapters-list-item')
        for each_url in targets_url:
            #pdb.set_trace()  # execution pauses here automatically
            bf_2 = BeautifulSoup(str(each_url), 'lxml')
            print(bf_2.a.get('href'))
            print(bf_2.a.span.span.get_text())
            BaseCommon.loglist(
                base_log,
                bf_2.a.span.span.get_text() + '=' + base_url +
                bf_2.a.get('href'))
            list_url.append(bf_2.a.span.span.get_text() + '=' + base_url +
                            bf_2.a.get('href'))

        #pdb.set_trace()  # execution pauses here automatically

        # for each in targets_url:
        #     list_url.append(each.img.get('alt') + '=' + each.get('href'))

        print('Link collection complete')
        print(list_url)
        print(len(list_url))

        count = 0
        for each_img in list_url:
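
The final loop is truncated in this example. A minimal sketch, assuming each entry in list_url is a name=url pair like the lines logged above, of how such a stream=True download typically completes; the function name, '.jpg' suffix, and chunk size are assumptions for illustration:

import requests

def download_entries(list_url, headers):
    # Each entry is assumed to be a 'name=url' pair, as logged above.
    count = 0
    for each_img in list_url:
        name, _, img_url = each_img.partition('=')
        resp = requests.get(url=img_url, headers=headers, stream=True)
        with open(name + '.jpg', 'wb') as f:
            # iter_content keeps memory use flat for large downloads.
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
        count += 1
        print(count, name)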