import logging
import requests
from bs4 import BeautifulSoup


def save_one_page_movie(link):
    # Scrape one listing page of an actor's movies and save each movie's
    # details; relies on the global `page` set in the main loop below.
    con2 = get_and_sleep(link)
    soup2 = BeautifulSoup(con2, "html.parser")
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each movie entry
        link2 = la2.find("span")
        list3 = link2.find_all("date")
        savecontent("test" + str(page) + "a.html", "\n" + link2.contents[0])    # title
        savecontent("test" + str(page) + "a.html", "\n" + list3[0].get_text())  # ID code
        savecontent("test" + str(page) + "a.html", "\n" + list3[1].get_text())  # release date
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3, "html.parser")
        # The third <p> in the info column of the detail page holds the duration.
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")[2]
        duration = movie.get_text()
        savecontent("test" + str(page) + "a.html", "\n" + duration)  # duration
        savecontent("test" + str(page) + "a.html", "\n" + link3)     # detail-page link
    # Return the link to the next page of results, or None on the last page.
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        return nextlink
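# save_one_page_movie and the main block below rely on two helpers that are
# not shown in this file: get_and_sleep (fetch a URL, then pause so requests
# are spaced out) and savecontent (append text to a local file). A minimal
# sketch of what they might look like follows; the delay value and file
# handling are assumptions, not the original implementation.
import time


def get_and_sleep(url, delay=2):
    # Uses the module-level requests session created in __main__ (assumed),
    # then sleeps so the crawler stays polite.
    resp = session.get(url)
    time.sleep(delay)
    return resp.text


def savecontent(filename, text):
    # Append text to filename, creating the file on first write.
    with open(filename, "a", encoding="utf-8") as fp:
        fp.write(text)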
def jian(self, con2):
    # Handle a queue item of type "2": one listing page of an actor's movies.
    # con2["value"] is expected to hold the fetched page HTML and
    # con2["actor"] the actor's name.
    soup2 = BeautifulSoup(con2["value"], "html.parser")
    list2 = soup2.find_all("a", attrs={"class": "movie-box"})
    for la2 in list2:  # each movie entry
        link2 = la2.find("span")
        link3 = la2["href"]
        con3 = get_and_sleep(link3)
        soup3 = BeautifulSoup(con3, "html.parser")
        title = soup3.find_all("div", attrs={"class": "container"})[1].find("h3")  # detail-page title (currently unused)
        # The <p> tags in the info column hold the ID code, release date, and duration.
        movie = soup3.find("div", attrs={"class": "col-md-3 info"}).find_all("p")
        self.fp.write("\n" + link2.contents[0]       # title
                      + "\n" + movie[0].get_text()   # ID code
                      + "\n" + movie[1].get_text()   # release date
                      + "\n" + movie[2].get_text()   # duration
                      + "\n演员:" + con2["actor"])   # "演员" = "actor"; output format kept as-is
        self.fp.flush()
    # Re-enqueue the next page of this actor's listing, if there is one.
    next = soup2.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        con2["value"] = "https://avmo.pw" + nextlink
        self.url_queue.put(con2)
def find_imgs(self, uri):
    # Download every <input type="image"> on the page at HOST + uri and
    # return a list of {url, hash} records for the saved files.
    url = HOST + uri
    soup = BeautifulSoup(self.get(url), "html.parser")
    img_list = []
    for tag in soup.find_all('input', type="image"):
        img = tag['src']
        content = self.get(img)
        # Name the file by its content hash so duplicate images are stored once.
        filename = sha1(content) + img[img.rfind('.'):]
        save(content, filename)
        img_list.append({
            'url': img,
            'hash': filename,
        })
    return img_list
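# find_imgs assumes a HOST constant and a self.get method (HTTP fetch
# returning bytes) defined elsewhere in its class, plus two module-level
# helpers: sha1() returning a hex digest string and save() writing bytes to
# disk. A minimal sketch of those two helpers, under those assumptions:
import hashlib
import os


def sha1(content):
    # Hex SHA-1 digest of raw bytes, used as a content-addressed filename.
    return hashlib.sha1(content).hexdigest()


def save(content, filename, directory="imgs"):
    # Write raw bytes to directory/filename, creating the directory if needed.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, filename), "wb") as fp:
        fp.write(content)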
def jia(self, con1):
    # Handle a queue item of type "1": one page of the actress index,
    # e.g. https://avmo.pw/cn/actresses/page/2.
    # con1["value"] is expected to hold the fetched page HTML.
    soup1 = BeautifulSoup(con1["value"], "html.parser")
    actor_list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
    for la in actor_list:  # each actor
        actor = la.find("span").get_text()  # actor name
        link = la["href"]
        # Enqueue the actor's first listing page as a type "2" item.
        item = {"type": "2", "value": link, "actor": actor}
        self.url_queue.put(item)
    # Re-enqueue the next index page, if there is one.
    next = soup1.find("a", attrs={"name": "nextpage"})
    if next is not None:
        nextlink = next["href"]
        item = {"type": "1", "value": "https://avmo.pw" + nextlink}
        self.url_queue.put(item)
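# jia and jian are written as methods of a threaded crawler that is not shown
# here: it owns a url_queue of {"type", "value", ...} items and a shared
# output file handle self.fp. A minimal skeleton of how such a worker loop
# might dispatch items, under the assumption that it fetches each queued URL
# and stores the HTML back into item["value"] before calling jia/jian (the
# class name and method layout here are illustrative, not the original):
import queue
import threading


class Crawler(object):
    # jia and jian above would be attached as methods of this class.

    def __init__(self, outfile):
        self.url_queue = queue.Queue()
        self.fp = open(outfile, "a", encoding="utf-8")

    def worker(self):
        while True:
            item = self.url_queue.get()
            # Fetch the queued URL and hand the HTML to the right handler.
            item["value"] = get_and_sleep(item["value"])
            if item["type"] == "1":
                self.jia(item)   # actress index page
            else:
                self.jian(item)  # one actor's movie listing page
            self.url_queue.task_done()

    def run(self, seed, n_threads=4):
        self.url_queue.put({"type": "1", "value": seed})
        for _ in range(n_threads):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
        self.url_queue.join()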
if __name__ == '__main__':
    session = requests.session()
    logging.basicConfig(
        filename='server.log',
        level=logging.INFO,
        format='%(asctime)s,%(levelname)s,%(filename)s:%(lineno)d,%(threadName)s:%(message)s',
        datefmt='[/%Y/%m%d-%H:%M:%S]')
    try:
        for page in range(1, 200):  # upper bound is exclusive
            # Fetch one page of the actress index and pull out names and links.
            con1 = get_and_sleep('https://avmo.pw/cn/actresses/page/' + str(page))
            soup1 = BeautifulSoup(con1, "html.parser")
            actor_list = soup1.find_all("a", attrs={"class": "avatar-box text-center"})
            for la in actor_list:  # each actor
                # "@@@" marks the start of an actor's record in the output file.
                savecontent("test" + str(page) + "a.html",
                            "\n@@@" + la.find("span").get_text())  # actor name
                link = la["href"]
                # First page of this actor's movies, then follow "next" links
                # until the last page returns None.
                nextlink = save_one_page_movie(link)
                while nextlink is not None:
                    nextlink = save_one_page_movie("https://avmo.pw" + nextlink)
    except:
        logging.exception("exception")