def runInfo():
    # The Referer header is required for this site.
    soup = SpiderHtml('http://www.mmjpg.com/mm/1316').getBeautifulSoup(base)
    content = soup.find('div', id='content').a.img['src']
    print(content)
    DownloadUtils.DownloadBinaryFileWithReferer(content, '1.jpg', base).load()
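# DownloadBinaryFileWithReferer's internals are not shown in this collection; a
# minimal sketch of what a Referer-aware binary download could look like, using
# plain requests (names here are illustrative, not the actual DownloadUtils API):
import requests

def download_with_referer(aim_url, save_url, referer):
    # Many image hosts return 403 unless the Referer matches their own domain.
    resp = requests.get(aim_url, headers={'Referer': referer}, timeout=30)
    resp.raise_for_status()
    with open(save_url, 'wb') as f:
        f.write(resp.content)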
def getTxtListInfo(cate, link, num):
    print('----- Start fetching novels under ' + cate + str(num) + ' -----')
    url = baseurl + link + 'index_' + str(num) + '.html'
    print(url)
    bs = SpiderHtml(url=url).getBeautifulSoup(baseurl)
    info_list = bs.find('div', id='catalog').find_all('div', class_='listbg')
    with open(txt_list, 'a+', newline='') as f:
        csv_write = csv.writer(f)
        try:
            for i in info_list:
                temp = []
                temp.append(i.a['href'])   # link
                temp.append(i.a['title'])  # novel title
                date = ''
                try:
                    date = i.find('span', class_='newDate').text
                except Exception:
                    date = i.find('span', class_='oldDate').text
                temp.append(date)
                csv_write.writerow(temp)
        except Exception as e:
            print(e)
    return 0
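# Hypothetical driver for getTxtListInfo(): the category name and link format
# below are illustrative; real values come from the category CSV built elsewhere
# in this collection.
if __name__ == '__main__':
    for page_num in range(1, 4):
        getTxtListInfo('玄幻', '/xuanhuan/', page_num)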
def getInfoNum(url, name):
    soup = SpiderHtml(url).getBeautifulSoup(base)
    page = 0
    try:
        page = soup.find('div', id='page').find_all('a')[-2].text
    except Exception as e:
        print(e)
    print(name + ': ' + str(page) + ' pages, link: ' + url)
    return int(page)
def getListNum(url, name):
    page = 1
    try:
        soup = SpiderHtml(url).getBeautifulSoup(base)
        page = soup.find('div', class_='page').find_all('a')[-1]['href']
        page = page.split('/')[-1]
    except Exception as e:
        print(e)
    print(name + ': ' + str(page) + ' pages')
    return int(page)
def runList():
    soup = SpiderHtml('http://www.mmjpg.com/tag/xinggan').getBeautifulSoup(base)
    li_s = soup.find(class_='pic').find('ul').find_all('li')
    print(len(li_s))
    for i in li_s:
        # Extract the name, link, and publish time of each entry.
        print('Link: ' + i.a['href'])
        print('Name: ' + i.find_all('span')[0].text)
        print('Published: ' + i.find_all('span')[1].text)
def runTags():
    soup = SpiderHtml(url=url).getBeautifulSoup(base)
    tags = soup.find(id='morelist').find_all('li')
    print(len(tags))
    with open(tag_csv_name, 'a+', encoding='utf-8') as f:
        writer = csv.writer(f)
        for i in tags:
            print(i)
            row = []  # renamed from `list`, which shadowed the builtin
            row.append(i.a['href'])          # link
            row.append(i.a.img['src'])       # cover image
            row.append(i.a.img['data-img'])  # lazy-loaded image
            row.append(i.find('i').text)     # count
            row.append(i.a.text)             # name
            writer.writerow(row)
def getImageUrl(url, name):
    html = SpiderHtml(url).getHtmlWithReferer(base)
    soup = bs(html, 'html.parser')
    # Read the page count from the pagination bar.
    page = int(soup.find(class_='pagenavi').find_all('a')[-2].span.text)
    for i in range(page):
        html = SpiderHtml(url + '/' + str(i)).getHtmlWithReferer(base)
        image_link = bs(html, 'html.parser').find('div', class_='main-image').p.a.img['src']
        print(name)
        print(image_link)
        # Create the target folder if it does not exist yet.
        if not os.path.exists(folder + '\\' + name):
            os.makedirs(folder + '\\' + name)
        DownloadBinaryFileWithReferer(aim_url=image_link,
                                      save_url=folder + '\\' + name + '\\' + str(i) + '.jpg',
                                      referer=base).load()
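# Side note (assumes Python 3.2+): the exists()/makedirs() pair above has a
# check-then-create race when several spiders run at once; a single call avoids it.
import os

def ensure_dir(path):
    # exist_ok makes the call idempotent and race-free.
    os.makedirs(path, exist_ok=True)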
def getType():
    html = SpiderHtml(test_url).getHtmlWithReferer(base)
    li_s = bs(html, 'html.parser').find('ul', class_='menu').find_all('li')[1:]
    for i in li_s:
        print('Now processing: ' + i.a.text + i.a['href'])
        runSpyder(base + str(i.a['href']), i.a.text)
        time.sleep(1)
def runSpyder(url, name):
    # Fetch the listing: name - link - image address.
    html = SpiderHtml(url).getHtmlWithReferer(base)
    # Work out the page count.
    page = 0
    page_url = ''
    try:
        a_s = bs(html, 'html.parser').find('div', class_='pages').find_all('a')
        if len(a_s) > 1:
            page = a_s[-1]['href'].split('.')[0].split('-')[-1]
            page_url = '-'.join(a_s[-1]['href'].split('.')[0].split('-')[:-1])
            print(page_url)
    except Exception as e:
        print(name + ' has only one page: ' + str(e))
    # page may still be the int 0 here, so cast before concatenating.
    print(name + ': ' + str(page) + ' pages in this category')
    try:
        for i in range(int(page)):
            print('Category ' + name + ', page ' + str(i + 1) + ', link: '
                  + base + str(page_url) + '-' + str(i + 1) + '.html')
            spider_title(base + str(page_url), name)
            time.sleep(0.3)
    except Exception as e:
        print('Error: ' + str(e))
def runStarName():
    # Fetch the listing: name - link - image address.
    html = SpiderHtml(test_url).getHtmlWithReferer(base)
    div_s = bs(html, 'html.parser').find_all('div', class_='item')
    for div in div_s:
        print('Actor name: ' + div.a.div.img['title'])
        print('Actor link: ' + div.a['href'])
        print('Actor image: ' + div.a.div.img['src'])
def runSpider(url):
    html = SpiderHtml(url).getHtmlWithReferer(base)
    p_s = bs(html, 'html.parser').find_all('ul', class_='archives')
    for i in p_s:
        a_s = i.find_all('a')
        for j in a_s:
            # Collect every album URL and hand it to the image downloader.
            print(j['href'])
            print(j.text)
            getImageUrl(j['href'], j.text)
def runMovieLink():
    html = SpiderHtml('https://www.javhoo.ca/av/juy-578').getHtmlWithReferer(base)
    div_s = bs(html, 'html.parser').find(id='comments').find_all('a')
    print(str(len(div_s)) + ' resources available')
    print(div_s[-1])
    print('Magnet link title: ' + div_s[-1]['title'])
    print('Magnet link address: ' + div_s[-1]['href'])
    print('Content size: ' + div_s[-2].text)
    print('Shared at: ' + div_s[-1].text)
def getCatePages(link, cate):
    html = SpiderHtml(baseurl + link).getHtml()
    bs = BeautifulSoup(html, 'html.parser')
    a_list = bs.find('div', id='pages').find_all('a')
    pages = int(a_list[-1]['href'].split('/')[-1].split('.')[0].split('_')[1])
    print(cate + ': ' + str(pages) + ' pages in total')
    with open(csv_cate2, 'a+', newline='') as out:
        csv_write = csv.writer(out)
        csv_write.writerow([cate, link, pages])
    return 0
def getCate():
    html = SpiderHtml(baseurl).getHtml()
    bs = BeautifulSoup(html, 'html.parser')
    div1 = bs.find('div', id='navber').find_all('li')
    with open(csv_cate, 'a+', newline='') as out:
        csv_write = csv.writer(out)
        for i in div1[1:]:
            temp = []
            temp.append(i.a['title'])
            temp.append(i.a['href'])
            csv_write.writerow(temp)
    return 0
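# Hypothetical driver: build the category CSV first, then look up each
# category's page count. Column order (title, link) mirrors what getCate()
# writes above.
if __name__ == '__main__':
    getCate()
    with open(csv_cate, 'r', newline='') as f:
        for row in csv.reader(f):
            if row:
                getCatePages(row[1], row[0])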
def runSpyder():
    # Fetch the listing: name - link - image address.
    html = SpiderHtml(test_url).getHtmlWithReferer(base)
    # The assignment below was half-commented in the original and a_s was used
    # before assignment; restored so the function runs.
    a_s = bs(html, 'html.parser').find(id='pageContent').find_all('ul', class_='thumbs')
    print(len(a_s))
    a_s = a_s[-1]
    print(len(a_s.find_all('li')))
    for i in a_s.find_all('li'):
        print(i)
def downloadTxt():  # renamed from the misspelled dowaloadTxt
    # csv_file = csv.reader(open(txt_list, 'r'))
    # txts = []
    # for i in csv_file:
    #     temp = []
    #     temp.append(i[0])
    #     temp.append(i[1])
    #     temp.append(i[2])
    #     txts.append(temp)
    test = 'https://www.jjxs.la/txt/31534.htm'
    bs = SpiderHtml(test).getBeautifulSoup(baseurl)
    download_link = bs.find('ul', class_='downlistbox').find('a')['href']
    print(download_link)
    bs = SpiderHtml(baseurl + download_link).getBeautifulSoup(baseurl)
    trs = bs.find_all('td')[0].find_all('tr')
    for i in trs:
        a_s = i.find_all('a')
        for j in a_s:
            # Match the "TXT e-book download (no unzip needed)" anchor text on the page.
            if j.text == 'TXT电子书下载地址【无需解压缩】':
                print(j['href'])
                folder = '穿越小说'  # output folder ("time-travel novels")
                if not os.path.exists(folder):
                    os.makedirs(folder)
                DownloadBinaryFile(baseurl + j['href'], folder + '/小说1.txt').load()
def runStarList():
    # Test run against a single star page.
    html = SpiderHtml('https://www.dmmsee.net/star/2pv').getHtmlWithReferer(base)
    div_s = bs(html, 'html.parser').find('div', id='waterfall').find_all('div', class_='item')
    print(div_s[1])
    # The first item is the actor profile; the rest are works.
    # Goal: title, cover image, detail page, serial number, release date.
    print('Detail page: ' + div_s[1].find('a', class_='movie-box')['href'])
    print('Title: ' + div_s[1].find('div', class_='photo-frame').img['title'])
    print('Cover image: ' + div_s[1].find('div', class_='photo-frame').img['src'])
    print('Serial number: ' + div_s[1].find_all('date')[0].text)
    print('Release date: ' + div_s[1].find_all('date')[1].text)
def runStarName():
    # Fetch the listing: name - link - image - details.
    html = SpiderHtml(test_url).getHtmlWithReferer(base)
    div_s = bs(html, 'html.parser').find(id='content').find_all(
        class_='wf-container loading-effect-none iso-container description-on-hover hover-style-two hover-fade content-align-left'
    )[0].find_all('article')
    print(div_s[0])
    for article in div_s:
        print('Actor name: ' + article.div.a['href'].split('/')[-1])
        print('Actor link: ' + article.div.a['href'])
        print('Actor image: ' + article.div.a.img['data-src'])
        print('Actor details: ' + article.find('div', class_='testimonial-content').text)
def spider_title(url, name):
    # Fetch the listing: name - link - image address.
    html = SpiderHtml(url).getHtmlWithReferer(base)
    li_s = bs(html, 'html.parser').find('ul', class_='videos').find_all('li')
    # Append one row per video to the CSV.
    with open('test.csv', 'a+', encoding='utf-8') as f:
        writer = csv.writer(f)
        for i in li_s:
            row = []  # renamed from `list`, which shadowed the builtin
            row.append(name)
            row.append(i.div.a['href'])         # link
            row.append(i.div.a['title'])        # title
            row.append(i.div.a.div.img['src'])  # image link
            writer.writerow(row)
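# Hypothetical read-back check for the CSV spider_title() appends to; column
# order (category, link, title, image) mirrors the writer above. The guard skips
# blank rows, which appear on Windows because the file is opened without newline=''.
import csv

with open('test.csv', 'r', encoding='utf-8') as f:
    for row in csv.reader(f):
        if row:
            print(row[0], row[2], row[1])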
def download(code, filefold, name):
    if not os.path.exists(filefold):
        os.makedirs(filefold)
    print('Downloading https://geo.datav.aliyun.com/areas_v3/bound/' + code + '.json')
    html = SpiderHtml('https://geo.datav.aliyun.com/areas_v3/bound/' + code +
                      '.json').getHtmlWithReferer('https://geo.datav.aliyun.com/')
    with open(filefold + '\\' + name + '.json', 'w') as file:
        # This write was commented out in the original, leaving an empty file; restored.
        file.write(html)
    writeInfo('Downloaded: https://geo.datav.aliyun.com/areas_v3/bound/' + code + '.json')
    writeInfo('Saved to: ' + filefold + '\\' + name)
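# Example call: 100000 is the national adcode in Aliyun's DataV GeoAtlas, and
# the "_full" variant includes child-region boundaries; the folder and file
# names here are illustrative.
download('100000_full', 'geojson', 'china')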
def runSpider():
    html = SpiderHtml(url).getHtmlWithReferer(base)
    soup = bs(html, 'html.parser')
    div = soup.find_all('div', class_='content')
    ul = div[0].find_all('ul')
    lis = ul[1].find_all('li')
    print(len(lis))
    for i in lis:
        print('Name: ' + i.find('img')['title'])
        print('Image address: ' + i.find('img')['src'])
        # The <svgmtsi> glyph obfuscation in the address still needs handling.
        print('Address: ' + i.find('div', class_='tag-addr').text)
def getPagesBefore():
    # e.g. https://www.qishus.com/xuanhuan/list1_1.html
    test_url = 'https://www.qishus.com/xuanhuan/list1_1.html'
    bs = SpiderHtml(test_url).getBeautifulSoup(baseurl)
    a_s = bs.find('code')
    print(a_s)
    print(type(a_s))
    a_s2 = a_s.find_all('a')
    print(a_s2)
    print(type(a_s2))
    print(a_s2[-1]['href'])
    print(a_s2[-1].text)
def getCate():
    html = SpiderHtml(url).getHtmlWithReferer(base)
    soup = bs(html, 'html.parser')
    divs = soup.find_all('tr')
    write = CsvUtil(cate_file, 'a+')
    print(len(divs))
    for i in divs[3:]:
        tds = i.find_all('td')
        a = []
        for j in tds[1:3]:
            temp = j.text
            t = temp.replace(' ', '').replace('\xa0', '')
            a.append(t)
            print(t + '---' + str(t.isdigit()))
            if not t.isdigit():
                # Non-numeric cells: also record how many &nbsp; paddings they carried.
                nbsp_count = temp.count('\xa0')
                a.append(nbsp_count)
        write.write(a)
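# CsvUtil is not shown in this collection; a minimal sketch of the interface
# this call site assumes (an append-mode csv.writer wrapper; details may differ):
import csv

class CsvUtil:
    def __init__(self, path, mode='a+'):
        self.path = path
        self.mode = mode

    def write(self, row):
        # Open per write so rows reach disk even if a crawl dies midway.
        with open(self.path, self.mode, newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)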
def runStarList():
    # Test run against a single star page.
    html = SpiderHtml(
        'https://www.javhoo.ca/star/%E6%B3%A2%E5%A4%9A%E9%87%8E%E7%B5%90%E8%A1%A3'
    ).getHtmlWithReferer(base)
    div_s = bs(html, 'html.parser').find(
        class_='wf-container loading-effect-fade-in iso-container bg-under-post description-under-image content-align-left'
    ).find_all('article')
    # len() must be cast to str before concatenation.
    print(str(len(div_s)) + ' items on this page')
    # The first item is the actor profile; the rest are works.
    # Goal: title, cover image, detail page, serial number, release date.
    for div in div_s[1:]:
        print('Detail page: ' + div.a['href'])
        print('Title: ' + div.a['title'])
        print('Cover image: ' + div.a.img['data-src'])
        print('Serial number: ' + div.find('date').text.split('/')[0])
        print('Release date: ' + div.find('date').text.split('/')[1])
def getTxtList(link, page, title):
    base = 'https://www.qishus.com'
    link1 = base + link
    print('Start crawling === ' + title)
    for i in range(1, int(page)):
        time.sleep(0.3)
        print('Page ' + str(i) + ' === ' + title)
        _links = link1.split('_')
        print(_links[0] + '_' + str(i) + '.html')
        try:
            html = SpiderHtml(_links[0] + '_' + str(i) + '.html').getHtmlWithReferer(baseurl)
            bs1 = bs(html, 'html.parser')
            divs = bs1.find_all('div', class_='mainListInfo')
            if len(divs) != 0:
                # Separate loop variable so the page counter `i` survives for error reporting.
                for j in range(len(divs)):
                    note_title = divs[j].find_all('a')[0]['title']
                    note_title = note_title.replace('TXT全集下载', '').replace(' ', '')
                    writeCsv(note_title)
            if len(divs) == 0:
                # Some pages use a different list container.
                divs2 = bs1.find_all('div', class_='list_b')
                for j in range(len(divs2)):
                    note_title = divs2[j].find_all('a')[0]['title']
                    note_title = note_title.replace('TXT全集下载', '').replace(' ', '')
                    writeCsv(note_title)
        except Exception as e:
            writeError('Page ' + str(i) + ' === ' + title + ' failed\n' + str(e))
            print('Page ' + str(i) + ' === ' + title + ' failed\n' + str(e))
def getCate():
    html = SpiderHtml(baseurl).getHtmlWithReferer(baseurl)
    bs = BeautifulSoup(html, 'html.parser')
    ul = bs.find('ul', id='globalNavUL')
    print(ul)
    a_list = ul.find_all('a')
    print(a_list[1:])
    with open(cate_csv_name, 'a+', newline='') as f:
        csv_write = csv.writer(f)
        try:
            for i in a_list[1:]:
                print(i)
                temp = []
                temp.append(i['href'])   # link
                temp.append(i['title'])  # title
                csv_write.writerow(temp)
        except Exception as e:
            print(e)
from utils.SpiderUtil import SpiderHtml
from utils.UserAgentSeed import argent, getHeaders, getHeadersWithReferer
from urllib import request as req

imageurl = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1594661017603&di=68ae790cb6f7103ed19d07cd04c60a3a&imgtype=0&src=http%3A%2F%2Fpic.feizl.com%2Fupload%2Fallimg%2F170615%2F1TH010Z-7.jpg'
videourl = 'https://aweme.snssdk.com/aweme/v1/playwm/?video_id=v0200f410000bd08252gd9fg6tidtt50&line=0'
radiourl = 'https://translate.google.cn/translate_tts?ie=UTF-8&q=%E4%B8%8B%E8%BD%BD%E8%A7%86%E5%B1%8F%E5%89%8D%E5%85%88%E8%8E%B7%E5%8F%96%E5%88%B0%E8%A7%86%E5%B1%8F%E7%9A%84%E9%93%BE%E6%8E%A5%EF%BC%8C%E8%BF%99%E9%87%8C%E6%88%91%E5%B0%B1%E5%85%88%E9%9A%8F%E4%BE%BF%E9%80%89%E5%8F%96%E4%B8%80%E4%B8%AASRC%E4%BD%9C%E4%B8%BA%E5%8F%82%E7%85%A7&tl=zh-CN&total=1&idx=0&textlen=33&tk=240950.338665&client=t&hint=en'
image_pathurl = 'image.jpg'
video_pathurl = 'mp4.mp4'
radio_pathurl = 'mp3.mp3'
txt_url = 'https://www.jjxs.la/e/DownSys/doaction.php?enews=DownSoft&classid=47&id=31513&pathid=0&pass=ee247a67a5adcf1dfb1abecbd1ff5635&p=:::'

# Earlier experiments, kept for reference:
# DownloadBinaryFileWithProgressBar(aim_url=imageurl, save_url=image_pathurl).load()
# DownloadBinaryFile(aim_url='https://i.meizitu.net/2018/11/21c31.jpg',
#                    save_url='F:\\space\\torrent\\meizitu\\甜美御姐五官精致 丰满好身材衣服都要绷不住了\\1.jpg').load()
# DownloadBinaryFileWithProgressBar(aim_url=txt_url, save_url='1.txt').load()
# req.urlretrieve('https://i.meizitu.net/2018/11/21c01.jpg', '1.jpg')
# response = requests.get(txt_url, headers=getHeaders())
# with open('1.txt', 'wb') as f:
#     f.write(response.content)

imageurl = 'https://per.spdb.com.cn/bank_financing/financial_product/zxlc/201704/P020200617777637682661.pdf'
# DownloadBinaryFileWithProgressBar(aim_url=imageurl, save_url='1.pdf').load()
# DownloadBinaryFile(aim_url=txt_url, save_url='2.txt').load()
# html = requests.get(imageurl)  # superseded by the SpiderHtml call below

html = SpiderHtml(imageurl).getHtml()
print(html)
import requests
from utils.SpiderUtil import SpiderHtml
from bs4 import BeautifulSoup as bs
import csv

# base = 'http://fine.gq/'
base = 'http://cape.gq.gq/'

html = SpiderHtml(base).getHtml()
soup = bs(html, 'html.parser')
items = soup.find_all(class_='item item-0')
with open('item.csv', 'a+', encoding='utf-8') as f:
    writer = csv.writer(f)
    for i in items:
        a_s = i.find_all('a')
        for j in a_s:
            row = []  # renamed from `list`, which shadowed the builtin
            row.append(j['href'])
            row.append(j.text)
            writer.writerow(row)
def runMovieLink():
    html = SpiderHtml('https://www.javhoo.ca/av/juy-578').getHtmlWithReferer('https://www.javhoo.ca')
    div_s = bs(html, 'html.parser').find(id='comments').find_all('a')
    print(div_s)
def runSpyder():
    # Fetch the listing: name - link - image address.
    html = SpiderHtml(test_url).getHtmlWithReferer(base)