def getSeasons(show):
    my_url = "https://www.imdb.com/title/{0}/episodes?ref_=tt_ov_epl".format(show)
    # open connection and grab page
    uClient = uReq(my_url)
    # put into a variable
    page_html = uClient.read()
    # close connection
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # finds how many seasons are in this show
    containers = page_soup.findAll("div", {"class": "seasonAndYearNav"})
    container = containers[0]
    # the location of the season information
    seasons = container.div.div.text.split()
    seasons.pop(0)
    ## If errors occur, the following lines may be helpful in determining problems
    # for i in seasons:
    #     print type(i)
    # find the length of the list to determine the number of seasons in a show
    Seasons = len(seasons)
    # print Seasons
    return Seasons  # This is the answer!
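# A minimal usage sketch for getSeasons, kept as comments. It assumes the uReq/soup
# aliases used throughout are imported, and the title id below is only a placeholder
# IMDb id chosen for illustration, not taken from the original code.
#   num_seasons = getSeasons("tt0944947")  # hypothetical IMDb title id
#   print num_seasons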
def get_soup_content(url):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'html.parser')
    return page_soup
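# These helpers all rely on two aliases that are only spelled out in some of the
# standalone scripts further down. A sketch of the assumed Python 2 imports and a
# hypothetical call (the URL is a placeholder, not from the original code):
#   from urllib import urlopen as uReq
#   from bs4 import BeautifulSoup as soup
#   page_soup = get_soup_content('https://example.com')
#   print page_soup.title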
def get_page_number(url):
    pageClient = uReq(url)
    page_html = pageClient.read()
    pageClient.close()
    page_soup = soup(page_html, 'html.parser')
    pages = page_soup.find('div', {'class': 'search_paginator'}).ul.find(
        'li', {'class': 'last-page'}).a.decode_contents(formatter='html')
    return pages
def page_crawler(urls, visited, category_data, page_data, incomplete_categories):
    count = 0
    files_made = 1
    G = nx.DiGraph()
    while len(urls) > 0 and count < 25000:
        if count % 1000 == 0 and count != 0:
            pages_pickle = open(
                STARTING_TOPIC + "_pages0" + str(files_made) + ".pickle", "wb")
            pickle.dump(page_data, pages_pickle, pickle.HIGHEST_PROTOCOL)
            pages_pickle.close()
            category_pickle = open(
                STARTING_TOPIC + "_categories0" + str(files_made) + ".pickle", "wb")
            pickle.dump(category_data, category_pickle, pickle.HIGHEST_PROTOCOL)
            category_pickle.close()
            files_made += 1
        if urls[0] not in visited:
            print urls[0]
            page_list = []
            uClient = uReq(urls[0])
            page_html = uClient.read()
            uClient.close()
            fixed_name = fix_name(urls[0])
            current_category = category(fixed_name)
            page_soup = soup(page_html, "html.parser")
            get_subcategories(current_category, page_soup, urls, BASE_URL,
                              fixed_name, incomplete_categories, G)
            get_supercategories(current_category, page_soup, BASE_URL, G)
            page_list = get_pages(current_category, page_soup, page_list,
                                  fixed_name, BASE_URL, SUBPAGE_WEIGHT_FACTOR, G)
            category_data.append(current_category)
            page_data += page_list
            visited.append(urls[0])
            urls.pop(0)
            count += 1
            print "count is " + str(count) + ", queue is " + str(len(urls))
            print current_category.name + " [pages]: " + str(len(page_list))
            # print "current_weight is " + str(len(current_category.subcategories))
        else:
            urls.pop(0)
    return G
def getCountry(url):
    # returns a randomly chosen country name
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grab the containers and pull the country names out of the usage links
    containerName = page_soup.findAll("span", {"class": "heavy"})
    countryName = findall(r'/names/usage/(.*?)">', str(containerName))
    return countryName[random.randrange(0, len(countryName))]
def info(str1):
    my_link = str1
    uClient = uReq(my_link)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers_link = page_soup.findAll("ul")[2].findAll("li")
    # info_list is assumed to be a module-level list defined elsewhere
    for item in containers_link:
        info_list.append(item.text)
def submissionCount():
    # Opening Connection, Grabbing the Page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # Parse the HTML
    page_soup = soup(page_html, "html.parser")
    submissionCount = page_soup.findAll("span", {"class": "span-6"})[1].text
    print(submissionCount)
    return submissionCount
def get_page_content(url, error_link_array):
    try:
        contentClient = uReq(url)
        content_html = contentClient.read()
        contentClient.close()
        print(url + ' -> getting content from here')
        if content_html is None:
            error_link_array.append(url)
            return None
        return content_html
    except:
        print(url + ' -> error link!')
        # retry once with the IRI converted to a URI and return that result
        return get_page_content(iriToUri(url), error_link_array)
def sample_page_number(url):
    try:
        pageClient = uReq(url)
        page_html = pageClient.read()
        pageClient.close()
        page_soup = soup(page_html, 'html.parser')
        paginator = page_soup.find('div', {'class': 'search_paginator'})
        if paginator and len(paginator.find_all('li', {'class': 'other-page'})) > 1:
            pages = paginator.ul.find_all(
                'li', {'class': 'other-page'})[-1].a.decode_contents(formatter='html')
            return pages
        else:
            return 1
    except:
        # retry with the IRI converted to a URI and return that result
        return sample_page_number(iriToUri(url))
def scraping(my_url, old_url, n, f, t):
    n = n + 1
    print("n: " + str(n))
    navigator = []
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    authors = page_soup.find_all("span", class_="visible-contributors")
    author = authors[0].a.string.encode("utf-8")
    containers = page_soup.findAll("div", {"class": "item-container"})
    # container = containers[0]
    # title_container = container.div.findAll("div", {"class": "notranslate_title"})
    # i = 0
    for container in containers:
        title_container = container.find_all("div", class_="book-detail-line")
        title = str(title_container[0].p.string.encode("utf-8"))
        title_url = container.find_all("a", class_="notranslate_title")
        url = title_url[0]["href"]
        autori = container.find_all("span", class_="contributor-name")
        autore = str(autori[0].string.encode("utf-8"))
        ratings = container.find_all("div", class_="star-rating")
        if ratings:
            rating = ratings[0]["aria-label"]
            splited_rating = rating.split(" ")
            rating = str(splited_rating[1])
        else:
            rating = "null"
        prezzi = container.find_all("p", class_="price")
        try:
            prezzo = str(prezzi[0].span.span.string.encode("utf-8"))
        except:
            prezzo = "gratis"
        header_url = "https://www.kobo.com"
        # print(header_url + url)
        navigator.append(header_url + url)
        full_url = header_url + url
        f.write(my_url + "," + full_url + "\n")
        t.write(full_url + ";" + title + ";" + autore + ";" + rating + ";" + prezzo + "\n")
        # i = i + 1
    if not navigator:
        return old_url, my_url
    old_url = my_url
    rnd = random.randint(0, len(navigator) - 1)
    my_url = navigator[rnd]
    print(my_url)
    return my_url, old_url
def get_file_meta(url):
    try:
        sound_array = {}
        soundClient = uReq(url)
        sound_html = soundClient.read()
        soundClient.close()
        tags = []
        sound_soup = soup(sound_html, 'html.parser')
        sound_name = sound_soup.find('div', {'id': 'single_sample_header'}).getText()
        sound_description = sound_soup.find('div', {'id': 'sound_description'}).p.getText()
        sound_tag = sound_soup.find('ul', {'class': 'tags'}).find_all('li')
        for sound in sound_tag:
            tags.append(sound.getText())
        sound_tags = ','.join(tags)
        sound_download = sound_soup.find('div', {'id': 'download'}).a['href']
        sound_license = sound_soup.find('div', {'id': 'sound_license'}).a.getText()
        # the information box lists type, duration, filesize, bitrate and channels in order
        info_fields = sound_soup.find('dl', {'id': 'sound_information_box'}).find_all('dd')
        sound_type = info_fields[0].getText()
        sound_duration = info_fields[1].getText()
        sound_filesize = info_fields[2].getText()
        sound_bitrate = info_fields[3].getText()
        sound_channels = info_fields[4].getText()
        sound_array['name'] = sound_name.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['description'] = sound_description.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['tags'] = sound_tags.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['license'] = sound_license.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['type'] = sound_type.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['duration'] = sound_duration.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['filesize'] = sound_filesize.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['bitrate'] = sound_bitrate.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['channels'] = sound_channels.encode('utf-8').replace("'", "").replace("\n", "")
        sound_array['download'] = 'https://www.freesound.org' + sound_download.encode('utf-8')
        return sound_array
    except Exception, e:
        print(e)
        # retry with the IRI converted to a URI and return that result
        return get_file_meta(iriToUri(url))
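# Illustrative call for get_file_meta, kept as comments. The sound page URL below is
# a made-up placeholder, and iriToUri plus the uReq/soup aliases are assumed to be
# defined elsewhere, as in the other freesound helpers in this file:
#   meta = get_file_meta('https://www.freesound.org/people/someuser/sounds/12345/')
#   if meta:
#       print meta['name'], meta['duration'], meta['download']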
def jax_enter_artikel_authoren(author_name, anzahl_bisheriger_artikel):
    url_jax_enter = 'https://jaxenter.de/author/' + author_name
    jax_enter_Client = uReq(url_jax_enter)
    jax_enter_page_html = jax_enter_Client.read()
    jax_enter_Client.close()
    jax_enter_page_soup = soup(jax_enter_page_html, "html.parser")
    container_jax_enter = jax_enter_page_soup.findAll("div", {"class": "info"})
    author_name = jax_enter_page_soup.findAll("span", {"class": "author-name"})
    for container in container_jax_enter:
        info_text_container = container.a
        print author_name[0].text.strip() + ": " + info_text_container.text
    if len(container_jax_enter) > anzahl_bisheriger_artikel:
        # i.e. "There is a new article at this URL"
        print "Unter der URL gibt es einen neuen Artikel: " + url_jax_enter
def one_de_prices(notebook_types):
    my_url = 'https://www.one.de/notebooks/' + notebook_types
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "product--info-box"})
    for container in containers:
        info_text_container = container.findAll("a", {"class": "product--title"})
        info_price_container = container.findAll("div", {"class": "price--default-list"})
        print info_text_container[0].text.strip() + ": " + info_price_container[0].text.strip()
def getWords(url):
    letters = ['A-B/', 'C-D/', 'E-G/', 'H-K/', 'L-N/', 'O-P/', 'Q-R/', 'S/', 'T/', 'U-Z/']
    numPages = [5, 7, 6, 4, 4, 5, 3, 5, 3, 3]
    p = 0
    wordList = []
    while p < len(numPages):
        for elem in range(1, numPages[p] + 1):
            uClient = uReq(url + letters[p] + '?page=' + str(elem))
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containerName = page_soup.findAll(
                "ul", {"class": "result-list1 wordlist-oxford3000 list-plain"})
            pageWords = findall(r'definition">(.*?)</a>', str(containerName))
            for item in pageWords:
                wordList.append(item)
        p += 1
    return wordList
def firstNameLst(url):
    # picks a random first name from the page
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grabs each item in the class called browsename,
    # searches out the names and creates a list we can use to draw from randomly
    containersName = page_soup.findAll("div", {"class": "browsename"})
    to_string_name = ''
    if len(containersName) == 0:
        return ''
    else:
        for items in range(0, len(containersName)):
            to_string_name += str(containersName[items].b.a)
        # capture the link text between '">' and '</a>'
        first_name = findall(r'">(.*?)</a>', to_string_name)
        return first_name[random.randrange(0, len(first_name))]
def findNumPages(url):
    # check the url to see if there are multiple pages
    # a reference to the current url to use when finding values for the number of pages
    # e.g. value="/names/usage/english/13">page 13</option></select>
    container_page_ref = url[30:] + '/'
    # open connection, grab the page
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # html parser
    page_soup = soup(page_html, "html.parser")
    # grabs each item and finds the largest number
    containersPage = str(page_soup.find("select", {"name": "page"}))
    page_num = findall(r'{0}(.*?)">page '.format(container_page_ref), containersPage)
    # check the length of the list before converting to integers
    if len(page_num) > 0:
        page_num_max = max(map(int, page_num))
        return page_num_max
    else:
        return ''
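# Hedged example combining findNumPages and firstNameLst, kept as comments. The base
# URL is only a guess inferred from the '/names/usage/...' pattern in the comments
# above (findNumPages slices off the first 30 characters of the url, so the site
# prefix is assumed to be exactly 30 characters long); it is not a verified endpoint:
#   base_url = 'https://www.behindthename.com/names/usage/english'
#   last_page = findNumPages(base_url)
#   random_name = firstNameLst(base_url)
#   print last_page, random_name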
def mainfun(my_url):
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # print page_soup
    product_available = ""
    product_name = page_soup.findAll("h1", {"class": "_3eAQiD"})[0].text
    product_price = page_soup.findAll("div", {"class": "_1vC4OE _37U4_g"})[0].text
    try:
        product_available = page_soup.findAll("div", {"class": "_3xgqrA"})[0].text
    except:
        product_available = "Available"
    update_time = time.asctime(time.localtime(time.time()))
    print "Product Name: " + product_name
    print "Product Cost: " + product_price
    print "Availability: " + product_available
    print "Product Url : " + my_url
    print "Update Time : " + update_time
def recupere():
    my_url = 'https://play.google.com/store/search?q=toutes%20les%20applications%20camerounaise'
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    dts = page_soup.find_all("div", {"class": "details"})
    liste = []
    for detail in dts:
        title_detail = detail.find_all("a", {"class": "title"})
        title_name = title_detail[0].text
        description_detail = detail.find_all("div", {"class": "description"})
        description = description_detail[0].text
        price_detail = detail.find_all("span", {"class": "display-price"})
        price = price_detail[0].text.strip()
        descrip = (title_name, description, price)
        liste.append(descrip)
    return liste
def sample_more_search_result(url):
    try:
        sampleClient = uReq(url)
        sample_html = sampleClient.read()
        sampleClient.close()
        sample_soup = soup(sample_html, 'html.parser')
        sample_result = sample_soup.find('div', {'id': 'wrapper'}).find(
            'div', {'id': 'container'}).find('div', {'id': 'content_full'}).find_all(
            'div', {'class': 'sample_more_search_results'})
        samples = []
        for sample in sample_result:
            if sample is not None:
                samples.append('https://www.freesound.org' + sample.a['href'])
        return samples
    except Exception, e:
        print(e)
        # retry with the IRI converted to a URI and return that result
        return sample_more_search_result(iriToUri(url))
def main():
    # Argument check
    if len(sys.argv) != 2:
        sys.exit("Usage: python web_scrape.py trialID")
    # Define URL
    url = "https://clinicaltrials.gov/ct2/show/" + sys.argv[1]
    # Open connection and grab html
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # Write html to file 'html'
    fp = open("results/html", "w+")
    fp.write(page_html)
    fp.close()
    # Open 'html' for reading and new file 'hits' for writing
    fp = open("results/html", 'r')
    new_fp = open("results/hits", "w+")
    copy = False
    # Keep only the text between "Criteria" and "Locations & Contacts"
    for line in fp:
        if line.strip().lstrip() == "<div class=\"header3\" style=\"margin-top:2ex\">Criteria</div>":
            copy = True
        elif line.strip().lstrip() == "<!-- location_section -->":
            copy = False
        elif copy:
            # Regex to strip all html tags (everything between angle brackets)
            cleanr = re.compile('<.*?>')
            cleantext = re.sub(cleanr, '', line)
            new_fp.write(cleantext.strip().lstrip())
    # Close 'hits' and 'html'
    new_fp.close()
    fp.close()
links = k.get_attribute('href')
print links
driver.get(links)
"""
urls = driver.find_elements_by_css_selector('div.srg a')
# urls = driver.find_element_by_tag_name('h3').findNext('a')
# k = urls.find_element_by_css_selector('a')
for l in urls:
    links.append(l.get_attribute('href'))
print links
my_url = links[0]
my_url2 = links[4]
uClient = uReq(my_url)
uClient2 = uReq(my_url2)
page_html = uClient.read()
page_html2 = uClient2.read()
# for getting all the things on the page
f = uReq(my_url)
g = soup(page_html, "html.parser")
s = g.get_text()
uClient.close()
page_soup = soup(page_html, "html.parser")
page_soup2 = soup(page_html2, "html.parser")
# containers = page_soup.findAll(text="introduction")
# print containers
def retriever_soup(my_url):
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    return page_soup
    'https://www.beatport.com/genre/tech-house/11/top-100',
    'https://www.beatport.com/genre/house/5/top-100',
    'https://www.beatport.com/genre/progressive-house/15/top-100',
    'https://www.beatport.com/genre/funk-soul-disco/40/top-100',
    'https://www.beatport.com/genre/indie-dance-nu-disco/37/top-100',
    'https://www.beatport.com/genre/funky-groove-jackin-house/81/top-100',
    'https://www.beatport.com/genre/leftfield-house-and-techno/80/top-100',
    'https://www.beatport.com/genre/dj-tools/16/top-100',
    'https://www.beatport.com/genre/minimal-deep-tech/14/top-100',
    'https://www.beatport.com/genre/techno/6/top-100'
]

# opening up the connection, grabbing each page
for url in my_url:
    uClient = uReq(url)
    # this will offload our content into a variable
    page_html = uClient.read()
    # closes our client
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("li", {"class": "bucket-item ec-item track"})
    print(url)
    conn = sqlite3.connect('Beatscrape.db')
    cursor = conn.cursor()
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup

if __name__ == '__main__':
    pages = []
    for i in range(1, 100):
        my_url = 'https://www.monster.se/jobb/sok/Data-IT_4?intcid=swoop_BrowseJobs_Data-IT&page={0}'.format(i)
        pages.append(my_url)
    for my_url in pages:
        try:
            uClient = uReq(my_url)
            pageHtml = uClient.read()
            uClient.close()
            page_soup = soup(pageHtml, "html.parser")
            print page_soup.h1.text.strip()
            containers = page_soup.findAll("article", {"class": "js_result_row"})
            for container in containers:
                job_title = container.findAll("div", {"class": "jobTitle"})
                print job_title[0].text.strip()
                company = container.findAll("div", {"class": "company"})
                print company[0].text.strip()
                location = container.findAll("div", {"class": "location"})
                print location[0].text.strip()
                print('-------------------------------')
        except AttributeError:
            break
    pages_stepstone = []
    for i in range(1, 100):
        my_url_stepstone = 'https://www.stepstone.se/lediga-jobb-i-hela-sverige/data-it/sida{0}/'.format(
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re

jobs_url = 'https://www.indeed.com/jobs?q=web+developer&l=Roanoke%2C+TX'

# opening connection and grabbing the page
uClient = uReq(jobs_url)
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grab all divs with a class of result
results = page_soup.findAll("div", {"class": "result"})
# print len(results)

filename = "jobs.csv"
f = open(filename, "w")
headers = "Title, Company, Location, Experience, Link \n"
f.write(headers)

for result in results:
    title = result.a["title"]
    company = result.findAll('span', {'class': 'company'})
    company_name = company[0].text.strip()
    return int(s) / 20

for semt in semts:
    page_url = "https://www.hurriyetemlak.com/" + semt + "-satilik"
    buildings = []
    count = 0
    page_limit = 2
    page = 1
    while page != page_limit:
        pagesUrl = page_url + "?page=" + str(page)
        # "Semt" = district, "Sayfa" = page (Turkish)
        print("Semt: " + semt + " Sayfa: " + str(page))
        # opens the connection and downloads the html page from the paged url
        uClient = uReq(pagesUrl)
        page_soup = soup(uClient.read(), "html.parser")
        uClient.close()
        rows = page_soup.findAll("div", {"class": "list-item timeshare clearfix"})
        if page == 1:
            numberofPost = page_soup.findAll("strong", {"data-ads-count": ""})
            page_limit = strToNum(numberofPost[5].text)
        for r in rows:
            suburl = "https://www.hurriyetemlak.com" + r.a["href"]
            count += 1
            try:
                RuClient = uReq(suburl)
# removed urllib.request per stackoverflow
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup
import sqlite3

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards'

# opening up the connection, grabbing the page
uClient = uReq(my_url)
# this will offload our content into a variable
page_html = uClient.read()
# closes our client
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product
containers = page_soup.findAll("div", {"class": "item-container"})

# ------ commenting this out .... replacing with create database
# filename = "products.csv"
# f = open(filename, "w")
# headers = "brand, product_name, shipping\n"
# f.write(headers)
# for container in containers:
#     brand = container.div.div.a.img["title"]
def getWebsite(url):
    # read and close the connection before parsing, matching the other helpers
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    findToday(page_soup)
all_links = []
country_list = [
    'african', 'america', 'arabic', 'australian', 'christian', 'english',
    'french', 'german', 'indian', 'iranian', 'irish'
]
gender_list = ['boy', 'girl']
letter = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]
my_url = 'https://www.babynamesdirect.com/baby-names'
for i in country_list:
    for j in gender_list:
        for k in letter:
            new_url = my_url + "/" + i + "/" + j + "/" + k
            uClient = uReq(new_url)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            containers_outer_div = page_soup.findAll("small")
            # pages_number = containers_outer_div[-1].string
            if len(containers_outer_div) != 0:
                pages_number = containers_outer_div[-1].string
                pages_number = pages_number.split("of ")
                lastpage = pages_number[-1]
                lastpage = int(lastpage) + 1
                for every in range(1, lastpage):
                    inner_new_url = new_url + "/" + str(every)
                    print(inner_new_url)
                    all_links.append(inner_new_url)
            else:
from urllib import urlopen as uReq
from bs4 import BeautifulSoup as soup

myURL = 'https://www.newegg.ca/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphic%20cards'

# opening connection
uClient = uReq(myURL)
page_HTML = uClient.read()
uClient.close()

# parse the page_HTML using html parser
page_soup = soup(page_HTML, "html.parser")

containers = page_soup.findAll("div", {"class": "item-info"})

filename = "products.csv"
f = open(filename, "w")
headers = "brand, product_name, shipping\n"
f.write(headers)

for container in containers:
    brand = container.div.a.img["title"]
    title_container = container.findAll("a", {"class": "item-title"})
    productName = title_container[0].text
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shippingPrice = shipping_container[0].text.strip()
    print("brand : " + brand)
    print("productName : " + productName)
    print("shippingPrice : " + shippingPrice)
    f.write(brand + "," + productName.replace(",", "| ") + "," + shippingPrice + "\n")