コード例 #1
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def dowaloadTxt():
    """Download the TXT file for one novel page on jjxs.la.

    Fetches the hard-coded novel detail page, follows the first link in
    the download-list box to the mirror page, then scans its table for
    the direct (uncompressed) TXT link and downloads it into a fixed
    folder/filename.

    NOTE(review): depends on module-level ``baseurl``, ``SpiderHtml`` and
    ``DownloadBinaryFile``. The function name keeps its original
    (misspelled) public spelling so existing callers keep working.
    """
    test = 'https://www.jjxs.la/txt/31534.htm'
    bs = SpiderHtml(test).getBeautifulSoup(baseurl)
    # The first <a> inside the download list box points at the mirror page.
    download_link = bs.find('ul', class_='downlistbox').find('a')['href']
    print(download_link)
    bs = SpiderHtml(baseurl + download_link).getBeautifulSoup(baseurl)
    trs = bs.find_all('td')[0].find_all('tr')
    for tr in trs:
        for anchor in tr.find_all('a'):
            # Only the direct, no-unzip-needed TXT link is wanted.
            if anchor.text == 'TXT电子书下载地址【无需解压缩】':
                print(anchor['href'])
                folder = '穿越小说'
                # exist_ok avoids the check-then-create race of the
                # original exists()/makedirs() pair.
                os.makedirs(folder, exist_ok=True)
                DownloadBinaryFile(baseurl + anchor['href'],
                                   folder + '/小说1.txt').load()
コード例 #2
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def getTxtListInfo(cate, link, num):
    """Scrape page *num* of category *cate* and append rows to the CSV.

    Builds the listing URL from ``baseurl`` + *link*, parses every
    ``div.listbg`` entry under ``div#catalog`` and appends one
    ``[link, title, date]`` row per novel to the file named by the
    module-level ``txt_list``.

    Returns 0 unconditionally (kept for backward compatibility).
    """
    print('-----开始获取' + cate + str(num) + '下的小说-----')
    url = baseurl + link + 'index_' + str(num) + '.html'
    print(url)
    bs = SpiderHtml(url=url).getBeautifulSoup(baseurl)
    info_list = bs.find('div', id='catalog').find_all('div', class_='listbg')

    # BUG FIX: the file handle was opened but never closed; the context
    # manager guarantees it is flushed and closed even on error.
    with open(txt_list, 'a+', newline='') as f:
        csv_write = csv.writer(f)
        try:
            for item in info_list:
                # The date lives in either a 'newDate' or an 'oldDate'
                # span; fall back to '' if neither exists instead of
                # aborting the whole page on one malformed entry.
                span = (item.find('span', class_='newDate')
                        or item.find('span', class_='oldDate'))
                date = span.text if span is not None else ''
                csv_write.writerow([item.a['href'],   # 链接 (link)
                                    item.a['title'],  # 小说名 (title)
                                    date])
        except Exception as e:
            # Best-effort: report the failure, keep rows written so far.
            print(e)

    return 0
コード例 #3
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def runInfo():
    """Fetch one mmjpg gallery page and download its cover image.

    The image host rejects requests without a Referer header, so the
    Referer-aware downloader is used (original comment: 需要加referer).
    """
    print()
    page = SpiderHtml('http://www.mmjpg.com/mm/1316').getBeautifulSoup(base)
    img_src = page.find('div', id='content').a.img['src']
    print(img_src)
    DownloadUtils.DownloadBinaryFileWithReferer(img_src, '1.jpg', base).load()
コード例 #4
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def getInfoNum(url, name):
    """Return the page count of the gallery at *url*, or 0 if absent.

    The count is read from the text of the second-to-last <a> inside
    ``div#page``. *name* is used only for the progress printout.
    """
    soup = SpiderHtml(url).getBeautifulSoup(base)
    page = 0
    try:
        page = soup.find('div', id='page').find_all('a')[-2].text
    except (AttributeError, IndexError):
        # No pagination block (find() returned None) or too few links:
        # treat as zero pages. Narrowed from a silent `except Exception`
        # that printed only a blank line.
        pass
    print(name + "  共" + str(page) + '页  ' + '链接:' + url)
    return int(page)
コード例 #5
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def getListNum(url, name):
    """Return the number of list pages for a category, or 1 on failure.

    Reads the last pagination link's href inside ``div.page`` and takes
    its final path segment as the page count. The broad catch stays
    because the fetch itself is inside the try, but the exception is now
    printed instead of being swallowed with a blank line.
    """
    page = 1
    try:
        soup = SpiderHtml(url).getBeautifulSoup(base)
        href = soup.find('div', class_='page').find_all('a')[-1]['href']
        page = href.split('/')[-1]
    except Exception as e:
        # Network error or missing pagination markup: fall back to one
        # page, but say why instead of printing an empty line.
        print(e)
    print(name + "  共  " + str(page) + "页")
    return int(page)
コード例 #6
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def runList():
    """Print link, name and publish time for every item on the tag page."""
    print()
    page = SpiderHtml('http://www.mmjpg.com/tag/xinggan').getBeautifulSoup(
        base)
    items = page.find(class_='pic').find('ul').find_all('li')
    print(len(items))
    for item in items:
        # Each <li> carries the link plus two <span>s: name, then time.
        spans = item.find_all('span')
        print('链接是:' + item.a['href'])  # link
        print('名字是:' + spans[0].text)  # name
        print('发布时间:' + spans[1].text)  # time
コード例 #7
0
ファイル: runSpider.py プロジェクト: lvm0306/HSpider
def runTags():
    """Scrape the tag list (``div#morelist``) and append rows to the tag CSV.

    Each appended row is ``[link, thumbnail src, lazy-load src, count,
    name]``. Depends on module-level ``url``, ``base``, ``SpiderHtml``
    and ``tag_csv_name``.
    """
    soup = SpiderHtml(url=url).getBeautifulSoup(base)
    tags = soup.find(id='morelist').find_all('li')
    print(len(tags))
    # BUG FIX: csv files must be opened with newline='' (per the csv
    # module docs) or Windows inserts a blank line between rows.
    with open(tag_csv_name, "a+", encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        for tag in tags:
            print(tag)
            # Renamed the row variable: the original shadowed builtin `list`.
            row = [
                tag.a['href'],          # link
                tag.a.img['src'],       # img
                tag.a.img['data-img'],  # lazy-load img
                tag.find('i').text,     # num
                tag.a.text,             # name
            ]
            writer.writerow(row)