import logging
import requests
from bs4 import BeautifulSoup


def save_one_page_movie(link):
    # Scrape one listing page of an actor's movies and save each movie's
    # details; relies on the global `page` set in the main loop below.
    con2 = get_and_sleep(link)
    soup2 = BeautifulSoup(con2, "html.parser")
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each movie entry
        link2 = la2.find("span")
        list3 = link2.find_all("date")
        savecontent("test" + str(page) + "a.html", "\n" + link2.contents[0])    # title
        savecontent("test" + str(page) + "a.html", "\n" + list3[0].get_text())  # ID code
        savecontent("test" + str(page) + "a.html", "\n" + list3[1].get_text())  # release date
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3, "html.parser")
        # The third <p> in the info column of the detail page holds the duration.
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")[2]
        duration = movie.get_text()
        savecontent("test" + str(page) + "a.html", "\n" + duration)  # duration
        savecontent("test" + str(page) + "a.html", "\n" + link3)     # detail-page link
    # Return the link to the next page of results, or None on the last page.
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        return nextlink
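# save_one_page_movie and the main block below rely on two helpers that are
# not shown in this file: get_and_sleep (fetch a URL, then pause so requests
# are spaced out) and savecontent (append text to a local file). A minimal
# sketch of what they might look like follows; the delay value and file
# handling are assumptions, not the original implementation.
import time


def get_and_sleep(url, delay=2):
    # Uses the module-level requests session created in __main__ (assumed),
    # then sleeps so the crawler stays polite.
    resp = session.get(url)
    time.sleep(delay)
    return resp.text


def savecontent(filename, text):
    # Append text to filename, creating the file on first write.
    with open(filename, "a", encoding="utf-8") as fp:
        fp.write(text)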
def jian(self, con2):
    # Handle a queue item of type "2": one listing page of an actor's movies.
    # con2["value"] is expected to hold the fetched page HTML and
    # con2["actor"] the actor's name.
    soup2 = BeautifulSoup(con2["value"], "html.parser")
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each movie entry
        link2 = la2.find("span")
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3, "html.parser")
        title = soup3.find_all("div", attrs={"class": "container"})[1].find("h3")  # detail-page title (currently unused)
        # The <p> tags in the info column hold the ID code, release date, and duration.
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")
        self.fp.write("\n" + link2.contents[0]       # title
                      + "\n" + movie[0].get_text()   # ID code
                      + "\n" + movie[1].get_text()   # release date
                      + "\n" + movie[2].get_text()   # duration
                      + "\n演员:" + con2["actor"])   # "演员" = "actor"; output format kept as-is
        self.fp.flush()
    # Re-enqueue the next page of this actor's listing, if there is one.
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        con2["value"] = "https://avmo.pw" + nextlink
        self.url_queue.put(con2)
def find_imgs(self, uri):
    # Download every <input type="image"> on the page at HOST + uri and
    # return a list of {url, hash} records for the saved files.
    url = HOST + uri
    soup = BeautifulSoup(self.get(url), "html.parser")
    img_list = []
    for tag in soup.find_all('input', type="image"):
        img = tag['src']
        content = self.get(img)
        # Name the file by its content hash so duplicate images are stored once.
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        img_list.append({
            'url': img,
            'hash': filename,
        })
    return img_list
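# find_imgs assumes a HOST constant and a self.get method (HTTP fetch
# returning bytes) defined elsewhere in its class, plus two module-level
# helpers: sha1() returning a hex digest string and save() writing bytes to
# disk. A minimal sketch of those two helpers, under those assumptions:
import hashlib
import os


def sha1(content):
    # Hex SHA-1 digest of raw bytes, used as a content-addressed filename.
    return hashlib.sha1(content).hexdigest()


def save(content, filename, directory="imgs"):
    # Write raw bytes to directory/filename, creating the directory if needed.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, filename), "wb") as fp:
        fp.write(content)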
def jia(self, con1):
    # Handle a queue item of type "1": one page of the actress index,
    # e.g. https://avmo.pw/cn/actresses/page/2.
    # con1["value"] is expected to hold the fetched page HTML.
    soup1 = BeautifulSoup(con1["value"], "html.parser")
    actor_list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
    for la in actor_list:  # each actor
        actor = la.find("span").get_text()  # actor name
        link = la["href"]
        # Enqueue the actor's first listing page as a type "2" item.
        item = {"type": "2", "value": link, "actor": actor}
        self.url_queue.put(item)
    # Re-enqueue the next index page, if there is one.
    next = soup1.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        item = {"type": "1", "value": "https://avmo.pw" + nextlink}
        self.url_queue.put(item)
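# jia and jian are written as methods of a threaded crawler that is not shown
# here: it owns a url_queue of {"type", "value", ...} items and a shared
# output file handle self.fp. A minimal skeleton of how such a worker loop
# might dispatch items, under the assumption that it fetches each queued URL
# and stores the HTML back into item["value"] before calling jia/jian (the
# class name and method layout here are illustrative, not the original):
import queue
import threading


class Crawler(object):
    # jia and jian above would be attached as methods of this class.

    def __init__(self, outfile):
        self.url_queue = queue.Queue()
        self.fp = open(outfile, "a", encoding="utf-8")

    def worker(self):
        while True:
            item = self.url_queue.get()
            # Fetch the queued URL and hand the HTML to the right handler.
            item["value"] = get_and_sleep(item["value"])
            if item["type"] == "1":
                self.jia(item)   # actress index page
            else:
                self.jian(item)  # one actor's movie listing page
            self.url_queue.task_done()

    def run(self, seed, n_threads=4):
        self.url_queue.put({"type": "1", "value": seed})
        for _ in range(n_threads):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
        self.url_queue.join()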
if __name__ == '__main__':
    session = requests.session()
    logging.basicConfig(
        filename='server.log',
        level=logging.INFO,
        format='%(asctime)s,%(levelname)s,%(filename)s:%(lineno)d,%(threadName)s:%(message)s',
        datefmt='[/%Y/%m%d-%H:%M:%S]')
    try:
        for page in range(1, 200):  # upper bound is exclusive
            # Fetch one page of the actress index and pull out names and links.
            con1 = get_and_sleep('https://avmo.pw/cn/actresses/page/' + str(page))
            soup1 = BeautifulSoup(con1, "html.parser")
            actor_list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
            for la in actor_list:  # each actor
                # "@@@" marks the start of an actor's record in the output file.
                savecontent("test" + str(page) + "a.html",
                            "\n@@@" + la.find("span").get_text())  # actor name
                link = la["href"]
                # First page of this actor's movies, then follow "next" links
                # until the last page returns None.
                nextlink = save_one_page_movie(link)
                while nextlink is not None:
                    nextlink = save_one_page_movie("https://avmo.pw" + nextlink)
    except:
        logging.exception("exception")