def book_spider(book_tag):
    """Scrape one tag's book listing and append a formatted report to the
    module-level ``file_content`` string.

    NOTE(review): the URL is hard-coded and never uses ``book_tag`` --
    presumably it should be built from the tag; confirm against the
    original listing URL before relying on the output.
    """
    global file_content
    url = "http://bbs.csdn.net/topics/310046216"
    source_code = requests.get(url)  # plain GET, no headers or session
    plain_text = source_code.text
    # Pin the parser explicitly so BeautifulSoup neither warns nor varies
    # its behavior depending on which parsers are installed.
    soup = BeautifulSoup(plain_text, 'html.parser')
    title_divide = '\n' + '--' * 30 + '\n' + '--' * 30 + '\n'
    file_content += title_divide + '\t' * 4 + book_tag + ':' + title_divide
    count = 1
    for book_info in soup.findAll('div', {'class': 'info'}):
        title = book_info.findAll(
            'a', {'onclick': re.compile(r"\"moreurl(.+)")})[0].get('title')
        pub = book_info.findAll('div', {'class': 'pub'})[0].string.strip()
        rating = book_info.findAll(
            'span', {'class': 'rating_nums'})[0].string.strip()
        people_num = book_info.findAll(
            'span', {'class': 'pl'})[0].string.strip()
        file_content += "*%d\t《%s》\t评分:%s%s\n\t%s\n\n" % (
            count, title, rating, people_num, pub)
        count += 1
def get_single_book_data(book_url):
    """Fetch a single book page and print its rating.

    The rating is taken from the first <strong> inside each
    <p class="rating_self clearfix"> element.
    """
    source_code = requests.get(book_url)
    plain_text = source_code.text
    # Pin the parser explicitly (the original left BeautifulSoup to guess).
    soup = BeautifulSoup(plain_text, 'html.parser')
    for rating in soup.findAll('p', {'class': 'rating_self clearfix'}):
        # Use the print() function for consistency with the rest of the
        # file; the original Python-2 print statement and a trailing
        # unterminated triple-quoted block of dead code are removed.
        print(rating.strong.string)
def fetch_tv_info(username, password): match = re.search(pattern, get_home_page()) challenge = "" if match is not None: # get the challenge challenge = match.group(2) print "challenge string: " + challenge # now login response = login(username, password, challenge) content = response.read() # open the TV page opener = urllib2.build_opener() response = execute_opener(opener, "http://iptv.bg/watch") content = response.read() #print content soup = BeautifulSoup(content, fromEncoding='utf-8') tvTags = soup.findAll(name='li', attrs={'class': 'listmode_tv'}) tv_info = [] for tag in tvTags: name = tag.find(name='div', attrs={'class': 'tv_info'}).find(name='b').getText() logo = tag.find(name='img').get('src', default='') url = tag.findAll( name='div', attrs={'class': 'noprint player_soft'})[-1].find(name='a').get('href', default='') thumbnail = tag.findAll( name='div', attrs={'class': 'noprint player_soft'})[-1].find(name='a').get('href', default='') info_tag = tag.find(name='div', attrs={'class': 'tv_info'}) info = '' thumbnail = '' if info_tag is not None: thumbnail = info_tag.find(name='img').get('src', default='') detail_tag = info_tag.find(name='em').find(name='abbr') if detail_tag is not None: info = detail_tag.get('title', default='Unknown') tv_info += [{'name': name, 'logo': logo, 'path': url, 'thumbnail': thumbnail, 'info': info}] return tv_info
def geturl(self, webpage, key=None):
    """Extract hrefs from every <a> tag in ``webpage`` and enqueue them.

    The page bytes are assumed to be GBK-encoded and are re-encoded to
    UTF-8 before parsing. If ``key`` is given, only links whose tag text
    contains ``key`` are queued; otherwise every href is pushed onto the
    module-level ``dlLinksNext`` queue. Decode/encode failures are logged
    and swallowed (best-effort crawl).
    """
    global dlLinksNext
    try:
        webpage = unicode(webpage, 'gbk').encode('utf-8')
        soup = BeautifulSoup(webpage)
        for link in soup.findAll('a'):
            # Queue unconditionally when no filter key was supplied.
            if not key or key in str(link):
                dlLinksNext.put(link.get('href'))
    except UnicodeDecodeError:
        self.loger.logInfo('UnicodeDecodeError')
    except UnicodeEncodeError:
        # BUG FIX: the original logged 'UnicodeDecodeError' here as well,
        # which mislabeled encode failures in the log.
        self.loger.logInfo('UnicodeEncodeError')
import requests
from bs4 import BeautifulSoup  # BUG FIX: the package is bs4, not bs3

# Quick scrape of the Yelp SF landing page: dump the prettified HTML and
# every anchor tag found on the page.
url = "https://www.yelp.com/sf"
yelp_r = requests.get(url)
print(yelp_r.status_code)  # should be 200
yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')
print(yelp_soup.prettify())
print(yelp_soup.findAll('a'))
for link in yelp_soup.findAll('a'):
    print(link)