Example #1
    def __init__(self, qr):
        url, urlText, pub_details, abstract, cite = [], [], [], [], []
        author, year, pub_journal = [], [], []

        for div in qr:
            temp = bs4(str(div.findAll("div", {"class": "gs_ri"})))
            url.append(temp.find('a')['href'])  # get the url
            urlText.append(temp.find('a').text)  # get the title of the url
            pub_details.append(bs4(
                str(div.findAll("div", {"class": "gs_a"}))).text)  # get publishing details
            abstract.append(bs4(
                str(div.findAll("div", {"class": "gs_rs"}))).text)  # get abstract of publication
            cite.append(bs4(
                str(div.findAll("div", {"class": "gs_fl"}))).find('a').text)  # get number of citations

        # publishing details come back as "authors - venue, year - publisher"
        for pub in pub_details:
            temp = pub.split('-')
            author.append(temp[0])
            year.append(temp[1])
            try:
                pub_journal.append(temp[2])
            except IndexError:
                pub_journal.append('NA')

        self.url = url
        self.urlText = urlText
        self.abstract = abstract
        self.cite = re.sub("[A-Za-z]", "", str(cite))  # strip letters, keep only citation counts
        self.author = author
        self.pub_journal = pub_journal
        self.year = year
Example #2
class CheckMp3Path(object):
    def __init__(self, thread_num=30):
        self.data_file = "prder"
        self.session = requests.session()
        self.session.headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
        }
        self.get = self.session.get
        self.thread_num = thread_num
        self.html_parser = htmlP()

    def start(self):
        records = self.load_file()
        index = 0
        que = Queue.Queue()
        threads = []
        for i in records:
            index += 1
            que.put([i, self.req])
        for j in xrange(self.thread_num):
            threads.append(Down(que, "Thread-" + str(j), DB()))
        for j in threads:
            j.setDaemon(True)
            j.start()
        for j in threads:
            j.join()
        print "END ALL"

    def req(self, album):
        url = "http://www.xiami.com/search/album?key=%s" % album
        print "Searching:", url
        con = self.get(url).content
        try:
            href = re.findall('CDcover100.+?href="(.+?)"', con, re.S)[0]
        except IndexError, e:
            print e, url
            return "", ""

        print "Album page:", href
        con = self.get(href).content
        try:
            # the regexes below match the Chinese labels on the xiami.com album page:
            # "语种" = language, "发行时间" = release date, "专辑风格" = genre
            language = re.findall("语种.+?td>.+?<td.+?>(.+?)</td", con,
                                  re.S)[0].strip()
            language = bs4(language).text
            language = self.html_parser.unescape(language).encode("utf8")
            #rtime = '-'.join(re.findall("<span>发行时间:(\d+)年(\d+)月(\d+)日</span", con, re.S)[0])
            #rtime = re.findall("发行时间.+?top\">(.+?)</td", con, re.S)[0].replace("年", '-').replace("月", '-').replace("日", '')
            genre = re.findall("专辑风格.+?<a.+?>(.+?)</a", con, re.S)[0].strip()
            genre = bs4(genre).text
            genre = self.html_parser.unescape(genre).encode("utf8")
        except Exception, e:
            print e
            return "", ""
        return language, genre
Example #3
def searchResults(topic, n=None):
    queryRes = []
    query = addQuotes(topic)

    if n is None:
        n = 10
    for i in range(0, n):
        # url to scrape (Scholar pages results 10 at a time, so start = i * 10)
        url_scrape = 'https://scholar.google.com/scholar?start=%d' % (i * 10) + '&q=%s' % query + '&hl=en&as_sdt=0,44'

        # delay between requests (0 here; raise it to avoid being rate-limited)
        time.sleep(0)
        # get the html data
        html = requests.get(url=url_scrape)
        # Convert html text to beautiful soup object
        soup = bs4(html.text)
        '''
        f = open('TopicQuery_HTML.txt','wb')
        f.write(str(soup))
        f.close()
        '''

        # Get the query results from html page
        q_table = soup.findAll("div", {"class": "gs_r"})
        queryRes.append(queryResult(q_table))

    return queryRes
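
Examples #1 and #3 lean on names that fall outside the snippets. A minimal setup sketch, assuming the bs4 package and a hypothetical addQuotes helper (both are guesses, not the original code):

import re
import time
import requests
from bs4 import BeautifulSoup as bs4

def addQuotes(topic):
    # hypothetical helper: wrap the topic in quotes for an exact-phrase search
    return '"%s"' % topic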
Example #4
def save_data_from_link(link):
	final_data={}
	try:
		page=html.fromstring(requests.get(link).text)
		final_data['title']=page.xpath("//h3[@class='post-title entry-title']//div//p//text()")[0]
		all_dd=page.xpath("//dl[@class='dl-horizontal']//dd")
		all_dt=page.xpath("//dl[@class='dl-horizontal']//dt")
		for item in zip(all_dt,all_dd):
			final_data[bs4(etree.tostring(item[0])).text]=bs4(etree.tostring(item[1])).text
		final_data['about']=html2text(etree.tostring(page.xpath("//div[@id='info']")[0]))
		final_data['event']=html2text(etree.tostring(page.xpath("//div[@id='events']")[0]))
		final_data['register']=html2text(etree.tostring(page.xpath("//div[@id='register']")[0]))
		final_data['contact']=html2text(etree.tostring(page.xpath("//div[@id='contact']")[0]))
		final_data['tags']=page.xpath("//ul[@class='list-unstyled list-inline blog-tags']//a//text()")
		#about,event,register,contact are in Markdown-structured text
	except Exception:
		# silently skip pages that don't match the expected layout
		pass
	return final_data
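
Example #4 mixes three parsers; the imports it assumes are roughly these (a sketch, with BeautifulSoup aliased as bs4 the same way Example #13 does it):

import requests
from lxml import html, etree
from html2text import html2text
from bs4 import BeautifulSoup as bs4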
Example #5
    def login(self, *arg):
        url = 'https://kyfw.12306.cn/otn/index/init'
        self.request = urllib2.Request(url)
        try:
            response = urllib2.urlopen(self.request).read()
        except Exception:
            # retry on any network error (note: this recurses until the request succeeds)
            return self.login()
        soup = bs4(response)
        data = soup.find('a', {'id': 'login_user'}).text
        print data
Example #6
def get_college_details():
	source = get_source(url)
	soup = bs4(source)
	rows = soup.findAll('tr')
	i = 1
	details = []
	for item in rows:
		country = item.find('td', {'class': 'country'}).img['alt']
		rank = i
		university = item.a.text
		#print (country, rank, university)
		i = i + 1
		details.append((country, rank, university))
	return details
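
get_college_details reads a module-level url and a get_source helper that the snippet does not show. A plausible sketch (the URL is purely a placeholder, not the original ranking page):

import requests
from bs4 import BeautifulSoup as bs4

url = 'http://example.com/university-rankings'  # placeholder

def get_source(url):
    return requests.get(url).text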
Example #7
	def getNFLMatchups(self, path):
		results = []
		soup = bs4(open(path, 'r').read())
		##grid = soup.find("td", {"class" : "viBodyBorderNorm"})
		### the bigger table, but selecting by bgcolor seems to give more specificity
		grid = soup.find("table", {"bgcolor" : "C48F1B"})
		# rows with bgcolor d6bd7b hold info we don't need, so remove them
		extraneous_rows = grid.findAll("tr", {"bgcolor" : "d6bd7b"})
		for row in extraneous_rows:
			row.extract()
		game_rows = grid.findAll("tr")
		# note: game_rows above still carries all the betting information, helpful for future stories
		for game in game_rows:
			dirty_teams = game.findAll("a", {"target":None})
			row = {'home' : dirty_teams[1].text, 'away' : dirty_teams[0].text}
			results.append(row)

		return results
Example #8
    def search_forms(self, txt):

        # Soup object
        self.soup = bs4(txt)

        self.forms = self.soup.findAll('form')
        self.form_len = len(self.forms)

        # no forms on the page
        if self.form_len == 0:
            return False

        self.site = {}

        for i, form in enumerate(self.forms):
            key = str(i)

            # copy the form's own attributes
            self.site[key] = {}
            for attr, value in self.control(form).items():
                self.site[key][attr] = value

            # collect every <input> inside the form
            self.site[key]['inputs'] = []
            for form_input in form.findAll('input'):
                self.site[key]['inputs'].append(dict(self.control(form_input)))

        return self.site
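
search_forms calls a self.control helper that is not part of the snippet. A minimal sketch of what it presumably does, judging from how it is used above (returning a tag's attributes as a dict):

    def control(self, tag):
        # tag.attrs is a list of (name, value) pairs in BeautifulSoup 3 and a
        # dict in bs4; dict() normalises both
        return dict(tag.attrs)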
Example #9
for row in record['eGQueryResult']:
    if row["DbName"] == "pubmed":
        print row["Count"]

handle = Entrez.esearch(db="pubmed", term="Kai Zheng", retmax=200, usehistory="y")
record = Entrez.read(handle)
handle.close()
idlist = record["IdList"]
webenv = record["WebEnv"]
query = record["QueryKey"]

papers = Entrez.efetch(db="pubmed", query_key=query, rettype="abstract", WebEnv=webenv, retmode="html", retmax=50)

for pap in papers:
    try:
        pap = bs4(pap)
        print pap.prettify()
    except Exception:
        print 'Nope'
#     print "title:", record.get("TI")
#     print "authors:", record.get("AU")
#     print "source:", record.get("SO")
    
# hand2=Entrez.efetch(db="pubmed", id=idlist, rettype='abstract', retmode="text", retmax=50)
# records= Medline.parse(handle)

# for record in records:
#     print records
#     print "title:", record.get("TI","?")
#     print "authors:", record.get("AU","?")
#     print "source:", record.get("SO","?")
Example #10
def get_rental_data(neighborhoods):
    """This function loops through all the items in neighborhoods,
    scrapes craigslist for data on that neighborhood, appends it to a list,
    and uploads a json to s3.

    Args:
        neighborhoods: neighborhoods is a dictionary containing
            the names of the neighborhoods as keys and the craigslist URLs as values.
    """

    # Create list to hold all scraped data
    rental_data = []

    # Loop through neighborhoods dict
    for neighborhood, url in neighborhoods.items():

        # Retrieve page with the requests module
        response = requests.get(url)

        # Create BeautifulSoup object; parse with 'lxml'
        soup = bs4(response.text, 'lxml')

        # results are returned as an iterable list
        results = soup.find_all('li', class_="result-row")

        # Loop through returned results
        for result in results:
            # Error handling
            try:
                # Identify and return bedrooms and footage
                raw_br = result.find(
                    'span', class_="housing").text.split("-")[0].strip()
                if regex.search(raw_br):
                    bedrooms = float(regex.search(raw_br).group(1))
                else:
                    continue

                raw_sqft = result.find(
                    'span', class_="housing").text.split("-")[1].strip()
                if regex.search(raw_sqft):
                    sqft = float(regex.search(raw_sqft).group(1))
                else:
                    continue

                # Get datetime of post
                datetime = result.find("time")["datetime"]

                # Identify and return title of listing
                title = result.find('a', class_="result-title").text

                # Identify and return price of listing
                price = float(result.a.span.text.strip("$"))

                # Identify and return link to listing
                link = result.a['href']

                # Create dictionary for result
                data = {
                    "neighborhood": neighborhood,
                    "datetime": datetime,
                    "title": title,
                    "price": price,
                    "bedrooms": bedrooms,
                    "sqft": sqft,
                    "link": link
                }

                # Append data to list
                rental_data.append(data)

            except Exception:
                # skip listings that are missing any of the fields above
                continue

    # Load rental data to s3
    obj = s3.Object(output_bucket, ouput_obj_path)
    obj.put(Body=json.dumps(rental_data, separators=(',', ':')))
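
get_rental_data relies on several module-level names that are not shown. A sketch of plausible definitions (the regex, bucket, and key are assumptions, not the original values; the ouput_obj_path spelling is kept as the snippet uses it):

import re
import json
import boto3
import requests
from bs4 import BeautifulSoup as bs4

regex = re.compile(r"(\d+)")        # pulls the leading number out of e.g. "2br" or "1000ft2"
s3 = boto3.resource("s3")           # S3 resource used for the upload at the end
output_bucket = "my-rental-bucket"  # hypothetical bucket name
ouput_obj_path = "rental_data.json" # hypothetical object key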
Example #11
def strip_tags(html):
	return ''.join(bs4(html).findAll(text=True))
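
A quick check of strip_tags (assuming bs4 is the BeautifulSoup class, as in the other examples):

print(strip_tags('<p>Hello <b>world</b></p>'))  # -> Hello world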
Example #12
def makeSoup(src):
	return bs4(src)
Example #13
import requests
from config import account,password
from BeautifulSoup import BeautifulSoup as bs4

s = requests.session()
# initial request parameters
urlLogin = "******"
header = {
  'Host':'signin.fcu.edu.tw',
  'Content-Type':'application/x-www-form-urlencoded',
  'Referer':'https://signin.fcu.edu.tw/clockin/login.aspx',
  'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36'
}
# get initial page session
r = s.get(urlLogin)
response = bs4(r.text)

# initial login parameters
postdataLogin = {
  '__EVENTTARGET':'',
  '__EVENTARGUMENT':'',
  'LoginLdap$LoginButton':'登入'  # '登入' means "Log in"; the value the ASP.NET form expects
}

# parse form data
for element in response.findAll('input',{'type':'hidden','value':True}):
  postdataLogin[str(element['name'])] = str(element['value'])
postdataLogin['LoginLdap$UserName'] = account
postdataLogin['LoginLdap$Password'] = password
# log in
loginHtml = s.post(urlLogin,data=postdataLogin,headers=header)
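
A minimal follow-up check (an assumption about the page, not part of the original script): after a successful login the response should no longer contain the username field.

loginSoup = bs4(loginHtml.text)
if loginSoup.find('input', {'name': 'LoginLdap$UserName'}) is None:
  print('login ok')
else:
  print('login failed')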