import json
from urllib2 import urlopen as ureq


def ID(title_series):  # find the IMDb ID corresponding to a given TV series
    myurl1 = "http://www.omdbapi.com/?t=%s&apikey=8deaca41" % title_series  # build the OMDb URL from the title
    client1 = ureq(myurl1)
    page = json.load(client1)
    imdb_id = page['imdbID']
    myurl = "https://www.imdb.com/title/%s/" % imdb_id  # IMDb URL to be used for scraping
    return myurl  # return the generated URL
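# A minimal usage sketch for ID(), assuming the OMDb key above is valid.
# "Game of Thrones" is only an illustrative title; quote() percent-encodes
# the spaces so the title is safe inside the query string.
from urllib import quote

imdb_url = ID(quote("Game of Thrones"))
print(imdb_url)  # e.g. https://www.imdb.com/title/tt0944947/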
import sys
from urllib2 import urlopen as ureq
from urllib2 import Request as req
from urllib2 import HTTPError
from bs4 import BeautifulSoup as soup
import lxml

agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19'
headers = {"User-Agent": agent}
url = 'https://www.google.com/search?q=pangolin'

html = req(url, headers=headers)  # a Request carrying the User-Agent header
try:
    gold = ureq(html)
    page_soup = soup(gold, "lxml")
    # each organic result sits in a div with class "rc"
    # (Google changes these class names frequently, so this selector is fragile)
    divs = page_soup.findAll("div", {"class": "rc"})
    for div in divs:
        gold = div.h3.a  # the result title link
        print gold.text
except HTTPError as e:
    print "HTTP error:", e.code
    sys.exit(1)
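# A small sketch of how the hard-coded "pangolin" query could be generalized:
# urllib.quote_plus percent-encodes an arbitrary phrase so it can be dropped
# safely into the q= parameter. The phrase below is only a placeholder.
from urllib import quote_plus

query = "giant anteater habitat"
search_url = 'https://www.google.com/search?q=' + quote_plus(query)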
import sys
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup

# product = input("Please enter the product: ")
product = sys.argv[1]
fileName = product.title().replace(" ", "") + ".json"
product = product.replace(" ", "+").replace(",", "").title()  # plus-joined, title-cased query term
# upci = input("UPCI: ")

############################## Searches mygrocerydeals.com ##############################
# groceryUrlBeginning and groceryUrlEnd are defined elsewhere in the original script
url = groceryUrlBeginning + product + groceryUrlEnd  # page URL
uclient = ureq(url)  # opening connection to website
page_html = uclient.read()  # reading HTML
uclient.close()
content = soup(page_html, "html.parser")

# grabs each product listing
itemContainers = content.findAll("div", {"data-type": "special"})

productName = []
size = []
price = []
dealEnd = []
storeName = []
pictureUrl = []
count = 0  # counter for going row by row
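# Worked example of the two transforms above, using a hypothetical argument.
# title() capitalizes after any non-letter, so "+" acts as a word boundary.
arg = "peanut butter, crunchy"
assert arg.title().replace(" ", "") + ".json" == "PeanutButter,Crunchy.json"
assert arg.replace(" ", "+").replace(",", "").title() == "Peanut+Butter+Crunchy"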
import certifi
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup


def helper(List):  # scrape the airing status of each series in List (uses ID() defined above)
    status = []
    for series in List:
        status1 = []
        myurl = ID(series)
        uclient = ureq(myurl, cafile=certifi.where())
        page_html = uclient.read()  # reading the content from myurl
        uclient.close()
        page_soup = soup(page_html, "html.parser")  # using Beautiful Soup for scraping

        title = page_soup.h1
        status1.append("Title of series: %s" % title.text)  # scraped title of the TV series

        container = page_soup.findAll("div", {"class": "table full-width"})
        contain = container[0]
        sub = contain.findAll("div", {"class": "episode-widget-currentep"})

        if sub == []:
            # no "current episode" widget: read the season number and year
            # from the season links in the episode table
            final = contain.div.findAll("a")
            number = int(final[0].text)  # season number
            date = final[number].text
            DATE = int(date)  # year of release
            if DATE > 2018:
                status1.append("Upcoming season: %d" % number)
                status1.append("Airs in: %s" % date)  # year in which it will be released
            elif DATE == 2018:
                status1.append("Running season: %d" % number)
                status1.append("Aired in: %s" % date)  # year in which the season started streaming
            else:
                status1.append("The show has finished streaming all its episodes.")
                status1.append("Most recent season: %d" % number)  # last aired season
                status1.append("Aired in: %s" % date)  # year in which it was released
        else:
            subs = sub[0]
            dds = subs.findAll("div", {"class": "episode-widget-airdate"})[0]
            ep_title = subs.findAll("div", {"class": "episode-widget-title"})[0].a
            status1.append(contain.h4.text)  # widget heading, e.g. "Next Episode"
            status1.append(dds.text)  # air date of the upcoming episode
            status1.append(ep_title.text)  # title of the upcoming episode

        status.extend(status1)  # list named 'status' holds details for every TV series
    return status
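# A minimal usage sketch for helper(), assuming ID() above is in scope.
# The series names are placeholders; helper() returns a flat list of
# human-readable status strings, one group per series.
if __name__ == "__main__":
    for line in helper(["Westworld", "Sherlock"]):
        print(line)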
# scrapes a results table from a website
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup

my_url = 'http://www.goforevent.com/jntua_pixel18/cse_dashboard.php'

# opening the connection
uclient = ureq(my_url)
pagehtml = uclient.read()
uclient.close()

# html parser
page_soup = soup(pagehtml, 'html.parser')

# grabs all table rows
rows = page_soup.select('tr')

outfile = open('sample.txt', 'w')
total = 0
for i in range(1, len(rows)):  # skip the header row
    rows_i = rows[i].select('td')
    # outfile.write(rows_i[2].text + " " + rows_i[3].text + "\n")
    total = total + float(rows_i[3].text)  # running sum of the fourth column
outfile.write(str(total))
outfile.close()
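# Optional hardening sketch: it is an assumption that every row has a numeric
# fourth column, so a guarded version of the loop can skip header or blank
# rows instead of crashing on float().
total = 0
for row in rows[1:]:
    cells = row.select('td')
    try:
        total += float(cells[3].text)
    except (IndexError, ValueError):
        continue  # skip rows without a parseable fourth column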
from urllib2 import urlopen as ureq
from bs4 import BeautifulSoup as soup
import lxml

url = 'http://deschulz.net/plaincontent.html'
html = ureq(url)
page_soup = soup(html, "lxml")
body1 = page_soup.body
print(body1.text.strip())
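# For comparison (assumption: the page is simple enough that parsers agree),
# the same extraction with the stdlib parser, avoiding the lxml dependency:
page_soup2 = soup(ureq(url).read(), "html.parser")
print(page_soup2.body.text.strip())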
from urllib2 import urlopen as ureq
import bs4
from bs4 import BeautifulSoup
import requests
import io
import json
from inflection import singularize

# Change csrf in security.js to false to use the script. Remember to change it back to true.

# create a set to make sure there are no duplicate recipe entries
recipe_set = set()
prepTimes = []

url2 = 'https://tasty.co/topic/one-pot'
uclient2 = ureq(url2)  # opens up connection, grabs the web page
page_html = uclient2.read()
uclient2.close()  # close the client
parent_page = BeautifulSoup(page_html, 'html.parser')  # html parsing

# Put links to recipe pages in a list
links_container = parent_page.findAll('a', {'class': 'feed-item analyt-unit-tap'})

# The first 20 links are on the page; the rest are loaded after clicking "Show more"
for i in range(20):
    # Follow the href from each <a> tag
    # print(links_container[i].get("href"))
    url_child = links_container[i].get('href')
    ch_client = ureq(url_child)  # opens up connection, grabs the recipe page
    source = ch_client.read()  # read the page
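    # A hedged sketch of how the loop might continue: parse each recipe page
    # and record its title. The <h1> lookup is an assumption about tasty.co's
    # markup, not confirmed by the original, so treat it as illustrative only.
    child_page = BeautifulSoup(source, 'html.parser')
    ch_client.close()
    heading = child_page.find('h1')  # assumed location of the recipe name
    if heading is not None:
        recipe_set.add(heading.get_text(strip=True))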