def dowaloadTxt():
    """Fetch the download page of one novel and save its plain TXT file.

    Follows the '下载' link on the novel's detail page, then scans the
    download table for the uncompressed-TXT anchor and downloads it into
    the '穿越小说' folder.  Side effects: network I/O, directory creation,
    file write.  Returns None.
    """
    test = 'https://www.jjxs.la/txt/31534.htm'
    bs = SpiderHtml(test).getBeautifulSoup(baseurl)
    # First anchor inside the download-list box points at the download page.
    download_link = bs.find('ul', class_='downlistbox').find('a')['href']
    print(download_link)
    bs = SpiderHtml(baseurl + download_link).getBeautifulSoup(baseurl)
    trs = bs.find_all('td')[0].find_all('tr')
    for tr in trs:
        for anchor in tr.find_all('a'):
            # Only the uncompressed-TXT link is wanted; other rows hold
            # zip/rar mirrors.
            if anchor.text == 'TXT电子书下载地址【无需解压缩】':
                print(anchor['href'])
                folder = '穿越小说'
                # exist_ok avoids the check-then-create race of the old
                # os.path.exists() + makedirs() pair.
                os.makedirs(folder, exist_ok=True)
                DownloadBinaryFile(baseurl + anchor['href'],
                                   folder + '/小说1.txt').load()
def getTxtListInfo(cate, link, num):
    """Scrape page *num* of category *cate* and append rows to the CSV.

    Each row written to the ``txt_list`` CSV is (link, title, date).
    The date comes from the 'newDate' span when present, otherwise from
    'oldDate'.  Always returns 0; parse failures are printed, not raised.
    """
    print('-----开始获取' + cate + str(num) + '下的小说-----')
    url = baseurl + link + 'index_' + str(num) + '.html'
    print(url)
    bs = SpiderHtml(url=url).getBeautifulSoup(baseurl)
    info_list = bs.find('div', id='catalog').find_all('div', class_='listbg')
    # 'with' guarantees the handle is closed; the original leaked an open
    # file object on every call.
    with open(txt_list, 'a+', newline='') as f:
        csv_write = csv.writer(f)
        try:
            for item in info_list:
                row = [item.a['href'],   # 链接
                       item.a['title']]  # 小说名
                try:
                    date = item.find('span', class_='newDate').text
                except Exception:
                    # Older entries carry the date in a different span.
                    date = item.find('span', class_='oldDate').text
                row.append(date)
                csv_write.writerow(row)
        except Exception as e:
            print(e)
    return 0
def runInfo():
    """Download the cover image of one album page.

    The image host rejects requests without a Referer header, so the
    referer-aware downloader is used.  Side effects only; returns None.
    """
    print()
    # 需要加referer — the site 403s plain requests.
    page = SpiderHtml('http://www.mmjpg.com/mm/1316').getBeautifulSoup(base)
    img_src = page.find('div', id='content').a.img['src']
    print(img_src)
    DownloadUtils.DownloadBinaryFileWithReferer(img_src, '1.jpg', base).load()
def getInfoNum(url, name):
    """Return the page count of album *url*, or 0 if the pager is absent.

    *name* is only used for the progress message.  Parse failures fall
    back to 0 instead of raising.
    """
    print()
    soup = SpiderHtml(url).getBeautifulSoup(base)
    page = 0
    try:
        # The second-to-last pager anchor holds the last page number
        # (the final anchor is the '下一页' link).
        page = soup.find('div', id='page').find_all('a')[-2].text
    except Exception as e:
        # Was a silent 'print()' — surface why the pager parse failed.
        print(e)
    print(name + " 共" + str(page) + '页 ' + '链接:' + url)
    return int(page)
def getListNum(url, name):
    """Return the number of list pages under *url*, defaulting to 1.

    The last pager anchor's href ends in the final page number
    (e.g. '.../tag/xinggan/12').  Any fetch/parse failure falls back
    to 1 instead of raising.
    """
    page = 1
    try:
        soup = SpiderHtml(url).getBeautifulSoup(base)
        page = soup.find('div', class_='page').find_all('a')[-1]['href']
        # Trailing path segment is the page number.
        page = page.split('/')[-1]
    except Exception as e:
        # Was a silent 'print()' — surface why the pager parse failed.
        print(e)
    print(name + " 共 " + str(page) + "页")
    return int(page)
def runList():
    """Print link, name and publish date for every album on one tag page.

    Pure console output; returns None.
    """
    print()
    soup = SpiderHtml('http://www.mmjpg.com/tag/xinggan').getBeautifulSoup(
        base)
    items = soup.find(class_='pic').find('ul').find_all('li')
    print(len(items))
    for item in items:
        # 获取名字,链接 — two spans per item: [0] name, [1] publish date.
        spans = item.find_all('span')
        print('链接是:' + item.a['href'])        # link
        print('名字是:' + spans[0].text)         # name
        print('发布时间:' + spans[1].text)       # time
def runTags():
    """Scrape every tag from the index page and append rows to the tag CSV.

    Each CSV row is (link, img src, lazy-load img, album count, tag name).
    Reads module-level ``url`` and ``tag_csv_name``.  Returns None.
    """
    soup = SpiderHtml(url=url).getBeautifulSoup(base)
    tags = soup.find(id='morelist').find_all('li')
    print(len(tags))
    # newline='' stops csv.writer from emitting blank rows on Windows
    # (and matches how the txt-list CSV is opened elsewhere).
    with open(tag_csv_name, "a+", encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        for tag in tags:
            print(tag)
            # 'row' instead of the original 'list', which shadowed the builtin.
            row = [
                tag.a['href'],          # link
                tag.a.img['src'],       # img
                tag.a.img['data-img'],  # lazy-load img
                tag.find('i').text,     # num
                tag.a.text,             # name
            ]
            writer.writerow(row)