def scrape_links(steamName, gameName):
    """Scrape Steam's pages for data."""
    global allNames
    steam_page = _format_steam_url(steamName, gameName)
    page = urllib2.urlopen(steam_page)
    print page.geturl()
    soup = bs(page.read(), 'html.parser')
    name = soup.find('a', attrs={'class': 'whiteLink'})
    name = str(name.contents[0])
    achievements = soup.find('div', attrs={'id': 'personalAchieve'})
    achievements = str(achievements).split('<br/><br/><br/>')[0]
    achievements = bs(achievements, 'html.parser').find_all('h3')
    allNames[name] = {}
    allNames[name][gameName] = []
    for ach in achievements:
        allNames[name][gameName].append(str(ach.contents[0]).strip())
def get_chapters_list(url, opener=opener):
    # Find chapter URLs and publish times by following the "Chapter Index" navigation page.
    # Returns a list of tuples (chapter url, chapter index, chapter time).
    url_full = show_full_contents(url)
    chapters_list = []
    navigate = ''
    try:
        req = urllib2.Request(url_full)
        page = bs(opener.open(req))
        for link in page.find_all('a'):
            if 'Chapter Index' in link.text and len(link.get('href')) > 1:
                navigate = 'http://archiveofourown.org' + link.get('href')
        if navigate != '':
            req2 = urllib2.Request(navigate)
            page2 = bs(opener.open(req2))
            links = re.findall('<li><a href="(.*?)</span></li>', str(page2))
            for i in links:
                chapter_url = 'http://archiveofourown.org' + i.split('"')[0]
                chapter_index = re.findall('[0-9]+\.', i)[0].replace('.', '')
                chapter_time = re.findall('<span class="datetime">\((.*?)\)', i)[0]
                chapters_list.append((chapter_url, chapter_index, chapter_time))
    except Exception:
        pass
    return chapters_list
def definition(self, SearchWord):
    page = urllib2.urlopen("http://dictionary.reference.com/browse/%s" % SearchWord.strip())
    html = page.read()
    page = bs(html)
    definition = page.find_all("div", attrs={"class": "dndata"})
    defined = str(bs(str(definition)).get_text()).replace("[", "").replace("]", "").replace(":", "")
    return defined
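# A minimal standalone sketch of the same lookup outside the class, assuming the
# `requests` library and that dictionary.reference.com still serves its definitions
# in "dndata" <div> blocks (that markup, and the name lookup_definition, are
# assumptions for illustration, not from the original code):
import requests
from bs4 import BeautifulSoup as bs


def lookup_definition(search_word):
    resp = requests.get("http://dictionary.reference.com/browse/%s" % search_word.strip())
    soup = bs(resp.text, "html.parser")
    blocks = soup.find_all("div", attrs={"class": "dndata"})
    # Join the text of every matched definition block into one cleaned-up string.
    return " ".join(block.get_text(" ", strip=True) for block in blocks)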
def test_nonstandard_youtube_stripped(self): """ Test whether an embedded YouTube video that does not follow the standard options gets stripped as well. """ from bs4 import BeautifulSoup as bs from website.utils.filters import filter_iframes self.maxDiff = None field_value_pre = """<div id="test"> <p>Wit amet interdum dolor felis ut ante. Morbi a facilisis ante, in lobortis urna. Etiam ut nunc quis libero interdum aliquam eu at magna. Nunc vehicula risus eleifend molestie vulputate. Mauris diam odio, congue eget lorem id, finibus imperdiet sem.</p>""" field_value_post = """<p>Vestibulum eget posuere metus, vel finibus leo. Suspendisse congue orci magna, in vestibulum lacus pulvinar a. Donec egestas, felis id feugiat tempus, orci velit ullamcorper risus, et ultricies augue arcu ullamcorper dolor. Mauris eget sollicitudin purus. Aenean a cursus risus, sit amet mattis erat. Curabitur vel venenatis sem. Cras non gravida tellus, eu egestas tellus. Morbi at lorem a turpis blandit vulputate vitae a est.</p></div>""" # First case: embed from a different URL field_value_different_src = field_value_pre + \ """<iframe width="560" height="315" src="//www.youtub.com/embed/-Y6ImGzTF70"></iframe>""" + \ field_value_post self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')), filter_iframes(field_value_different_src)) # Second case: embed using an attribute other than # the ones YouTube sets by default (width, height, src, # frameborders, allowfullscreen) field_value_different_attributes = field_value_pre + \ """<iframe id="nonstandard" width="560" height="315" src="//www.youtube.com/embed/-Y6ImGzTF70"></iframe>""" + \ field_value_post self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')), filter_iframes(field_value_different_attributes)) # Third case: iframe contains information. field_value_iframe_has_content = field_value_pre + \ """<iframe width="560" height="315" src="//www.youtube.com/embed/-Y6ImGzTF70">Test Information</iframe>""" + \ field_value_post self.assertEqual(str(bs(field_value_pre + field_value_post, 'html.parser')), filter_iframes(field_value_iframe_has_content))
def linkExtractor(urltoopen, tag1, attrib1, attrib1value, tag2 ,attrib2, attrib2value, finalAttrib): url = urllib2.urlopen(urltoopen).read() soup = bs(url) lastPageTag = soup.find("span",{"class":"pagnDisabled"}) lastPage = int(lastPageTag.getText()) apple = [] #inside the loop for j in range(0,lastPage): result = soup.findAll(tag1,{attrib1:attrib1value}) for i in range(0,len(result)): resultDetails = result[i].find(tag2,{attrib2:attrib2value}) link = resultDetails[finalAttrib] apple.append(link) nextLinkATag = soup.find("span",{"class":"pagnRA"}) nextLink = "http://www.amazon.com"+nextLinkATag.a['href'] url = urllib2.urlopen(nextLink).read() soup = bs(url) #the loop ends return apple
def GetMyviUrl(url): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0", 'referer': url, } with requests.session() as s: # logging.basicConfig(level=logging.DEBUG) # import time #_startTime = time.time() r = s.get(url) s.headers.update(headers) soup = bs(r.text) #print "Elapsed time: {:.3f} sec".format(time.time() - _startTime) url = soup.find('div', {'class':'player-area'}).find('iframe')['src'] r = s.get(url, allow_redirects=True) UniversalUserID = r.cookies['UniversalUserID'] js = bs(r.text).find('body').find('script', {'type': 'text/javascript'}).encode('utf-8') js = '{%s}' % (js.decode('utf-8').split('{', 1)[1].rsplit('}', 1)[0]) js = re.sub(ur'([\s*{\s*,])([a-z]\w*):', ur'\1"\2":', js) js = js.replace("'", '"') json_data = json.loads(js) api = 'http://myvi.ru' + json_data['dataUrl'] r = s.get(api) data = json.loads(r.text) url = data['sprutoData']['playlist'][0]['video'][0]['url'] r = s.get(url, allow_redirects=False) return r.headers['Location'] + '|Cookie=' + urllib.quote_plus(urllib.urlencode({'UniversalUserID' : UniversalUserID })) return None
def Main(main_url): addDir('Поиск', site_url, mode="SEARCH") if main_url == None : main_url = site_url html = Get(main_url) soup = bs(html) content = soup.find('ul', {'class': 'main_menu'}).find_all('a', attrs={'class': 'main_menu_item_lnk'}) for num in content: if 'news' not in num['href'] and 'deti' not in num['href'] : if 'sport' in num['href'] : addDir(num.text, addUrlParams(site_url + num['href']), mode="CONTENT") else : addDir(num.text, site_url + num['href']) else : print main_url cat = main_url.partition(site_url + '/')[-1].rpartition('?')[0] soup = bs(Get(main_url)) if 'films' in main_url: content = soup.find('ul', attrs={'class': 'main_menu'}).find_all('li', attrs={'class': 'mseries_cont'})[1].find('div', {'class': 'submenu01_cont'}).find_all('a') elif 'series' in main_url: content = soup.find('ul', attrs={'class': 'main_menu'}).find_all('li', attrs={'class': 'mseries_cont'})[0].find('div', {'class': 'submenu01_cont'}).find_all('a') elif (cat in main_url) and (cat in categories): content = soup.find('ul', attrs={'class': 'main_menu'}).find('li', attrs={'class': 'm' + cat + '_cont'}).find('div', {'class': 'submenu01_cont'}).find_all('a') for num in content: label = num.text if label == '': label = 'ТНТ' addDir(label, addUrlParams(site_url + num['href']), mode="CONTENT")
def populateList(): '''first, we get the whole list of pokemon, sorted by national dex number. there is also a regional dex number, which i will preserve later. returns a tuple in the form (name, url_suffix). ''' path = URL + "/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number" page = wget(path) soup = bs(page.read(), 'html.parser') tables = soup.findAll('table') # - tables[1] is the list of kanto (kdex) pokemon. # - tables[2] is the list of jhoto (jdex) pokemon. # - tables[3] is the list of hoenn (hdex) pokemon. # - tables[4] is the list of sinnoh (sdex) pokemon. # - tables[5] is the list of unova (udex) pokemon. # - tables[6] is the list of kalos pokemon. kalos is special because the region is # split into 3 sub regions, central (cekdex), coastal (cokdex), and mountain (mokdex). # - tables[7] is the list of alola (adex) pokemon. it is not populated, as the region # is part of the gen VII game release (not released yet). # get a list of pokemon pokemon = [] for table in tables[:7]: # ignoring alola region for now entries = bs(table.__str__(), 'html.parser').findAll('tr') for entry in entries[1:]: # entries[0] defines column headers. entry = bs(entry.__str__(), 'html.parser') info = entry.findAll('td')[3] poke = (info.a.contents[0], info.a['href']) if poke not in pokemon: # there are duplicate entries. some pokemon have different "states". pokemon.append(poke) # using a dictionary reorders, lets stay in order for debugging's sake. return pokemon
def GetVKUrl(url): http = GetHTML(url) soup = bs(http) soup = bs(GetHTML(soup.find('div', {'class':'b-video_player'}).find('iframe')['src'])) sdata1 = soup.find('div', style="position:absolute; top:50%; text-align:center; right:0pt; left:0pt; font-family:Tahoma; font-size:12px; color:#777;") video = '' if sdata1: return False for rec in soup.find_all('param', {'name':'flashvars'}): for s in rec['value'].split('&'): if s.split('=', 1)[0] == 'url240': url240 = s.split('=', 1)[1] if s.split('=', 1)[0] == 'url360': url360 = s.split('=', 1)[1] if s.split('=', 1)[0] == 'url480': url480 = s.split('=', 1)[1] if s.split('=', 1)[0] == 'url720': url720 = s.split('=', 1)[1] if s.split('=', 1)[0] == 'hd': hd = s.split('=', 1)[1] video = url240 qual = __settings__.getSetting('qual') if int(hd) >= 3 and int(qual) == 3: video = url720 elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3): video = url480 elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2): video = url360 return video
def process_md(md_file, template_text):
    templ = bs(template_text)
    plain_doc = bs(markdown2.main(md_file))
    container = templ.select('#impress')[0]

    def new_step(i):
        new = bs('<div></div>')
        new.div['class'] = 'step'
        new.div['id'] = i
        return new.div

    i = 0
    current = new_step(i)
    for node in plain_doc.body.children:
        if not hasattr(node, 'name'):
            continue
        elif node.name == 'hr':
            i += 1
            container.append(current)
            current = new_step(i)
        else:
            current.append(node)
    container.append(current)
    return templ
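# A minimal usage sketch for process_md, assuming "template.html" is an impress.js
# template containing a <div id="impress"> container and "slides.md" is the Markdown
# source; both file names are illustrative assumptions, not from the original code:
with open('template.html') as tf:
    template_text = tf.read()

deck = process_md('slides.md', template_text)
with open('index.html', 'w') as out:
    out.write(str(deck))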
def getContent(u, p): result = [] driver = webdriver.Firefox() url = u driver.get(url) for i in xrange(1,p+1): element = WebDriverWait(driver, 1).until( EC.presence_of_element_located((By.CSS_SELECTOR, ".js-layout-option.full")) ) res = driver.page_source.encode('utf-8') soup = bs(res, 'html.parser') n = len(soup.select('.middle-header')) time.sleep(0.5) driver.find_element_by_css_selector(".js-layout-option.full").click() res = driver.page_source soup = bs(res, 'html.parser') for t in soup.select('.review-container'): temp = re.sub('<img.*>','',t.text) s = "".join(temp.split()).strip() result.append(s) if i < p: nextpage = driver.find_element_by_css_selector(".pagination-button.next.js-next") nextpage.click() else: driver.quit() return result
def spoj(self):
    """
    Scrapes problems from spoj.com
    (Uses default template)
    """
    url = "http://spoj.com"  # Start with the domain name
    self.problem = str.upper(self.problem)
    url = url + "/problems/" + self.problem + '/'
    print "Pinging up spoj...."
    self.page_as_string = Utilities.get_html(url)
    soup = bs(self.page_as_string)
    p_header = soup.find('h2', {'id': 'problem-name'})
    p_container = soup.find('div', {'id': 'problem-body'})
    self.problem_container = p_container
    self.problem_container_as_string = str(p_container)
    self.page_as_string = StaticScraper.setTemplate(str(p_header), self.problem_container_as_string)
    self.page_as_soup = bs(self.page_as_string)
def GetSRUrl(html):
    soup = bs(html, "html.parser")
    sr_url = 'http:' + soup.find('div', {'class': 'b-video_player'}).find('iframe')['src']
    soup = bs(GetHTML(sr_url), "html.parser")
    source = soup.find('video').find('source')['src']
    return source
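# GetHTML is used by GetSRUrl and the GetVKUrl helpers but is not defined in these
# snippets. A minimal sketch of what such a helper might look like, assuming plain
# `requests` with no special cookies or headers (an assumption, not the original code):
import requests


def GetHTML(url):
    # Fetch the page and return its body as text; callers pass the result to bs().
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp.text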
def scrape_waymarks(url): headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'} response = requests.get(url, headers= headers) soup = bs(response.text, "lxml") links = soup.select('.wmd_namebold > a') links = links[1::2] links = [link['href'] for link in links] for link in links: response = requests.get(link, headers= headers) soup = bs(response.text, "lxml") # Do some messy parsing and decoding to extract coordinates and other landmark details coords = soup.select('#wm_coordinates')[0].get_text().encode('ascii','ignore').replace('.','').split() latitude = float('.'.join([coords[1], coords[2]])) longitude = -(float('.'.join([coords[4], coords[5]]))) title = soup.select('#wm_name')[0].get_text().split(' - ')[0].encode('ascii', 'replace').strip() artist = soup.select('#Table1')[0].get_text('|', strip=True).split('|')[5] # details = soup.select('#Table1')[0].get_text('|', strip=True).split('|')[7] details = soup.select('#wm_quickdesc')[0].get_text().split(': ')[1] image_url = soup.select('.wm_photo > a > img')[0]['src'] print "{}|{}|{}|{}|{}|{}".format(latitude, longitude, title, artist, details, image_url)
def vid_info(url,cookie): opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) data = opener.open(urllib2.Request(url)).read() soup = bs(data) vid_title = soup.findAll("h1",class_="single-pagetitle")[0].string # print vid_title vid_link = soup.findAll("div",class_="video-embed-container")[0].iframe rq = urllib2.Request(vid_link['src']) rq.add_header('Referer',url) if_src = bs(opener.open(rq).read()).prettify() m3u8_link = re.findall("\"hd\":\"(.*?)\"},",if_src)[0] rq2 = urllib2.Request(m3u8_link) rq2.add_header('Referer',vid_link["src"]) new_data = opener.open(rq2).read() # print new_data ind_link = 'http://'+re.findall(r"http://(.*?).m3u8",new_data)[0]+".m3u8" # print ind_link rq3 = urllib2.Request(ind_link) rq3.add_header('Referer',m3u8_link) seg_data = opener.open(rq3).read() # print seg_data segs = re.findall("http://(.*?).ts",seg_data) for i in xrange(len(segs)): segs[i] = 'http://'+segs[i]+'.ts' # sys.exit(1) return [vid_title,segs]
def scrape_zips(zips): all_pages = [] pages_completed = 0 start_time = time.time() path = 'data' try: os.chdir(path) except WindowsError or IOError: pass #connect to sql db conn = sqlite3.connect('housing_data.db') c = conn.cursor() #start parsin some zips #### figure out how many pages for each zip code and make a list of all the pages ### for z in zips: base_page = 'http://www.trulia.com/for_rent/'+str(z)+'_zip/' soup = bs(requests.get(base_page).text,'html5lib') #create list of pages to scrape pages = [base_page] #create soup of area to look for number of pages if len(soup.find_all(class_='srpPagination_list')) == 0: pass else: pages_area = soup.find_all(class_='srpPagination_list') try: number_of_pages= int(bs(str(pages_area)).find_all('a')[-1].text) for i in range(2,number_of_pages+1): pages.append(base_page + str(i)+'_p') except IndexError: number_of_pages = 1 all_pages= all_pages + pages print('zip: ' + str(z) + ' added to job. ~Listings: '+ str(number_of_pages*30)) ##### go through each page and make it into some soup #### print('total pages to scrape: ' + str(len(all_pages))) time.sleep(2) for page in all_pages: soup = bs(requests.get(page).text,'html5lib') mylist = soup.find_all(class_='property-data-elem') ##### add listings for each page to the database ### for listing in mylist: home = Property(listing) if home.type == 'single': c.execute("INSERT INTO rental_data\ (Longitude, Latitude, Address, Zip, Price, RoomType, Bathrooms, Sqft, Date_Scraped)\ VALUES(?,?,?,?,?,?,?,?,?)",home.output()) else: for apt in home.units: c.executemany("INSERT INTO rental_data\ (Longitude, Latitude, Address, Zip, Price,RoomType, Bathrooms, Sqft, Date_Scraped)\ VALUES(?,?,?,?,?,?,?,?,?)",home.output()) print("--- %s seconds ---" % (time.time() - start_time)) pages_completed +=1 pages_remaining = len(all_pages)-pages_completed print('number of pages remaining: ' + str(pages_remaining)\ + ' . ~Minuntes to completion: ' + str(pages_remaining*2/60)) conn.commit() os.chdir(os.path.normpath(os.getcwd() + os.sep + os.pardir)) return ("--- %s seconds ---" % (time.time() - start_time))
def tam_data(): s = requests.Session() d = s.get('http://www.tam.com.br') p = bs(d.text, 'html.parser') cookies = { } form = { 'WDS_CORPORATE_SALES': 'FALSE', 'SITE': 'JJBKJJBK', 'LANGUAGE': 'BR', 'WDS_MARKET': 'BR', 'FROM_PAGE': 'HOME_SEARCH', 'B_DATE_1': '201603110000', 'B_DATE_2': '201603140000', 'B_LOCATION_1': 'POA', 'E_LOCATION_1': 'CGH', 'WDS_FORCE_SITE_UPDATE': 'TRUE', 'FORCE_OVERRIDE': 'TRUE', 'TRIP_TYPE': 'R', 'search_from': 'Porto+Alegre+-+Salgado+Filho+Internacional+(POA)', 'search_to': 'Sao+Paulo+-+Congonhas+(CGH)', 'adults': '2', 'children': '0', 'infants': '0', 'CORPORATE_CODE_INPUT': '', 'SEACH_COOKIE': '"{"bounds":[null,null,null,null,null,null,null,null,null,null,{"bLocation":"POA","eLocation":"CGH","bDate":"201603110000"},{"bDate":"201603140000"}],"roundtripCommon":{"tripType":"R","adults":"2","children":"0","infants":"0","mcabin":null}}"' } d = s.post('http://book.tam.com.br/TAM/dyn/air/booking/upslDispatcher;jsessionid=dh9csky6V5pDct8lcQcV_TZaedKzD6Z2LOj4Gg8GH5qvYoRIRXp_!1618028954!549751287', data=form) p = bs(d.text, 'html.parser') return p
def statigr(self): '''simple extraction in json of statigram url should begin by http://statigr.am/ to work json data: * user: - url - stats : media, follower, following * data (list of every image limit to first load): -type -url - stats: like, comment, favorite pict => Should directly take from instagram API :p ''' user_stats_list = zip(bs(self._content).findAll("span",{"class":"chiffre"}), bs(self._content).findAll("span",{"class":"legende"})) img_details = bs(self._content).findAll("div", {"id":re.compile('^detailPhoto.*?$')}) #je mets à la ligne parce qu'on me dit que c'est illisible self._values['name'] = "instagram" self._values['stats'] = dict((y.get_text(), int(x.get_text())) for x, y in user_stats_list) self._values["details"] =[{"img":{ 'type': 'img', 'url':n.find('img')['src'], 'stats':dict((img.get('class')[0], txt2int(img.get_text())) for img in n.findAll('span')) } } for n in img_details ] return self
def set_reserve_param(tr): param = dict(reserve_param) train_info_list = bs(str(tr), 'html.parser').select("td.trnNo > input") train_info_dict = { bs(str(info), 'html.parser').find()['name'].split('[')[0]: bs(str(info),'html.parser').find()['value'] for info in train_info_list } param['dptDt1'] = train_info_dict['dptDt'] param['runDt1'] = train_info_dict['runDt'] param['arvStnConsOrdr1'] = train_info_dict['arvStnConsOrdr'] param['arvStnRunOrdr1'] = train_info_dict['arvStnRunOrdr'] param['arvRsStnCd1'] = train_info_dict['arvRsStnCd'] param['dirSeatAttCd1'] = '000' param['dptRsStnCd1'] = train_info_dict['dptRsStnCd'] param['dptStnConsOrdr1'] = train_info_dict['dptStnConsOrdr'] param['dptTm1'] = train_info_dict['dptTm'] param['jrnySqno1'] = train_info_dict['jrnySqno'] param['locSeatAttCd1'] = "000" param['reqTime'] = int(time.time()*1000) #현재시간 param['rqSeatAttCd1'] = train_info_dict['seatAttCd'] param['stlbTrnClsfCd1'] = train_info_dict['stlbTrnClsfCd'] param['trnGpCd1'] = train_info_dict['trnGpCd'] param['trnNo1'] = train_info_dict['trnNo'] param['trnOrdrNo1'] = train_info_dict['trnOrdrNo'] #화면에서 몇번째 라인에 있던 열차인지 return param
def search_author_publication(self, author_id, show=True, verbose=False):  # {{{
    # search author's publications using authid
    # TODO: Verbose mode
    '''
    Search author's publications by author id
    returns a list of dictionaries
    '''
    url = self._search_url_base + 'apikey={}&query=au-id({})&start=0&httpAccept=application/xml'.format(self.apikey, author_id)
    soup = bs(urlopen(url).read(), 'lxml')
    total = float(soup.find('opensearch:totalresults').text)
    print 'A total number of ', int(total), ' records for author ', author_id
    starts = np.array([i * 25 for i in range(int(np.ceil(total / 25.)))])
    publication_list = []
    for start in starts:
        search_url = self._search_url_base + 'apikey={}&start={}&query=au-id({})&httpAccept=application/xml'.format(self.apikey, start, author_id)
        results = bs(urlopen(search_url).read(), 'lxml')
        entries = results.find_all('entry')
        for entry in entries:
            publication_list.append(_parse_xml(entry))
    if show:
        # pd.set_printoptions('display.expand_frame_repr', False)
        # print df['title'].to_string(max_rows=10, justify='left')
        df = pd.DataFrame(publication_list)
        titles = np.array(df['title'])
        for i in range(titles.size):
            t = trunc(titles[i])
            print '%d)' % i, t
    # }}}
    return publication_list
def GetVKUrl(html): soup = bs(html, "html.parser") vk_url = 'http:' + soup.find('div', {'class':'b-video_player'}).find('iframe')['src'] soup = bs(GetHTML(vk_url), "html.parser") video = '' js = soup.find_all('script', {'type': 'text/javascript'})[-1].encode('utf-8') p = re.compile('var vars = (.*?);') js = p.findall(js) json_data = json.loads(js[0]) if 'url240' in json_data: url240 = json_data['url240'] if 'url360' in json_data: url360 = json_data['url360'] if 'url480' in json_data: url480 = json_data['url480'] if 'url720' in json_data: url720 = json_data['url720'] if 'hd' in json_data: hd = json_data['hd'] video = url240 qual = __settings__.getSetting('qual') if int(hd) >= 3 and int(qual) == 3: video = url720 elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3): video = url480 elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2): video = url360 return video
def GetMyviUrl(html, url): headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0", "referer": url} with requests.session() as s: # logging.basicConfig(level=logging.DEBUG) # import time # _startTime = time.time() # r = s.get(url) s.headers.update(headers) soup = bs(html, "html.parser") # print "Elapsed time: {:.3f} sec".format(time.time() - _startTime) url = soup.find("div", {"class": "player-area"}).find("iframe")["src"] url = "http:" + url r = s.get(url, allow_redirects=True) UniversalUserID = r.cookies["UniversalUserID"] js = bs(r.text, "html.parser").find("body").find("script", {"type": "text/javascript"}).encode("utf-8") js = "{%s}" % (js.decode("utf-8").split("{", 1)[1].rsplit("}", 1)[0]) js = re.sub(r"([\s*{\s*,])([a-z]\w*):", r'\1"\2":', js) js = js.replace("'", '"') json_data = json.loads(js) api = "http://myvi.ru" + json_data["dataUrl"] r = s.get(api) data = json.loads(r.text) url = data["sprutoData"]["playlist"][0]["video"][0]["url"] r = s.get(url, allow_redirects=False) return ( r.headers["Location"] + "|Cookie=" + urllib.quote_plus(urllib.urlencode({"UniversalUserID": UniversalUserID})) ) return None
def clean_html(html_url):
    html_content = get_html_content(html_url)
    if html_content is None:
        return None
    soup = bs(html_content, 'lxml').find('body')
    if soup is None:
        p1 = html_content.find('<body')
        p2 = html_content.find('</body>')
        if p1 < 0 or p2 < 2:
            return None
        soup = bs(html_content[p1: p2 + 7], 'lxml')
    if soup is None:
        return None
    to_extract = soup.findAll('script')
    for it in to_extract:
        it.extract()
    res = soup.get_text()\
        .replace('\n', '')\
        .replace('\t', '')\
        .replace('\r', '')\
        .replace('百度', '')\
        .strip()
    res = res[160:]
    res = res[:-200]
    return res
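# get_html_content is not defined in this snippet. A minimal sketch under the
# assumption that it simply downloads the page and returns None on any failure
# (an illustrative guess, not the original helper):
import requests


def get_html_content(html_url):
    try:
        resp = requests.get(html_url, timeout=30)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None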
def YeniSurumTespit(self):
    # Detect the architecture and return the matching Firefox download link.
    bit = self.Bilgi_Al()
    if bit == "i686":
        os_class = "os_linux"
    elif bit == "x86_64":
        os_class = "os_linux64"
    else:
        return None
    Kodlar = urllib.urlopen('https://www.mozilla.org/tr/firefox/new/').read()
    KARISTIR = bs(Kodlar)
    Bul1 = KARISTIR.find('li', {"class": os_class})
    KARISTIR = bs(str(Bul1))
    Bul = KARISTIR.find('a', {"class": "download-link"})
    SATIR = str(Bul).split('\n')[0]
    for i in SATIR.split(' '):
        if 'href=' in i:
            Adres = i
            Adres = Adres.replace('href=', '', 1)
            Adres = Adres.replace('>', '', -1)
            # Drop the surrounding quotes and the HTML-escaped 'amp;' from the href.
            Adres = Adres[1:-1].replace('amp;', '')
            return Adres
def scrape_phone_numbers(self): """ Scrape all phone numbers from the currently open page and save them to self.numbers. """ all_numbers = {} try: soup = bs(self.main_driver.page_source) except selenium.common.exceptions.UnexpectedAlertPresentException: try: alert = self.main_driver.switch_to_alert() alert.accept() soup = bs(self.main_driver.page_source) except Exception as e: logger.error("Exception (%s) triggered when extracting source from (%s)" % (e, self.main_driver.current_url) ) return False except Exception as e: logger.error("Exception (%s) triggered when extracting source from (%s)" % (e, self.main_driver.current_url) ) return False extracted_strings = soup.find_all(lambda x: x.name != 'script' and x.name != 'style' and x.name != 'noscript' and x.name != 'iframe', text=lambda x: True) for extracted_string in extracted_strings: for extracted_number in phone_re.findall(extracted_string.text): extracted_number = '-'.join(extracted_number).encode('ascii', 'ignore') extracted_number = re.sub('-{2,}|\A-|-\Z', '', extracted_number ) if len(extracted_number) >= 12: all_numbers[extracted_number] = extracted_number if len(all_numbers): logger.info("Found %s phone numbers at (%s):\n%s" % (len(all_numbers), self.main_driver.current_url, all_numbers.values()) ) return all_numbers.values() else: logger.debug("Found %s phone numbers at (%s)" % (len(all_numbers), self.main_driver.current_url) ) return False
def Percussion (testkey): stafflist = map(lambda x :x.keys()[0],testkey) stafflist = list(set(stafflist)) timesigN = 4 Division = 480 newM = bs('') for stfno in range(0,len(stafflist)): a = filter(lambda a : a if a.keys()[0] in stafflist[stfno] else None ,testkey) no = stafflist[stfno].split('ff')[-1] newM.append(newM.new_tag('Staff',id = no)) mes=bs('');i=1 for x in a: bag =bs('') mes.append(mes.new_tag('Measure',number = str(i))) s = x[stafflist[stfno]] tkno =0 # track number = tkno tkvalue = s.pop('track0') bag.append(PercussionUnit(tkno,tkvalue)) if len(s) > 0: tick = bag.new_tag('tick') # 建構Division tag tick.string= str(Division*timesigN*(i-1)) # Division 480 * timesigN * measure number bag.append(tick) for track in s.keys(): tkno = int(track.split('k')[-1]) tkvalue = x[stafflist[stfno]][track] bag.append(PercussionUnit(tkno,tkvalue)) mes.select('Measure')[i-1].append(bag) i += 1 newM.select('Staff')[stfno].append(mes) return newM
def Tchord(tkno,x): ccc = bs("") if (x[0][0] == str(0) and x[1]=='whole'): TagR = bs("") TagR.append(TagR.new_tag("Rest")) if tkno > 0 : track = TagR.new_tag("track") track.string= str(tkno) TagR.Rest.append(track) durT = TagR.new_tag("durationType") durT.string = "measure" TagR.Rest.append(durT) ccc.append(TagR) elif x[0][0]== str(0): Rtag = bs("") Trest = Rtag.append(Rtag.new_tag("Rest")) if tkno > 0 : track = Rtag.new_tag("track") track.string= str(tkno) Rtag.Rest.append(track) durT = Rtag.new_tag("durationType") durT.string = x[1] Rtag.Rest.append(durT) ccc.append(Rtag) else: Ctag = bs("") Tcho = Ctag.append(Ctag.new_tag("Chord")) if tkno > 0 : track = Ctag.new_tag("track") track.string= str(tkno) Ctag.Chord.append(track) ccc.append(Ctag) if x[2] >0: Tdot = Ctag.new_tag("dots") Tdot.string = str(x[2]) Ctag.Chord.append(Tdot) durT = Ctag.new_tag("durationType") durT.string = x[1] Ctag.Chord.append(durT) ccc.append(Ctag) for i in range(0,len(x[0])): Tnote = Ctag.Chord.append(Ctag.new_tag("Note")) if tkno > 0 : track = Ctag.new_tag("track") track.string= str(tkno) Ctag.Chord.select('Note')[i].append(track) Tpitch = Ctag.new_tag("pitch") Tpitch.string= str(x[0][i]) Ctag.select('Note')[i].append(Tpitch) Ttpc = Ctag.new_tag("tpc") Ttpc.string="22" Ctag.select('Note')[i].append(Ttpc) Tvelo = Ctag.new_tag("velocity") Tvelo.string="100" Ctag.select('Note')[i].append(Tvelo) TvT = Ctag.new_tag("veloType") TvT.string="user" Ctag.select('Note')[i].append(TvT) ccc.append(Ctag) return ccc
def GetVKUrl(html): soup = bs(html, "html.parser") vk_url = "http:" + soup.find("div", {"class": "b-video_player"}).find("iframe")["src"] soup = bs(GetHTML(vk_url), "html.parser") video = "" js = soup.find_all("script", {"type": "text/javascript"})[-1].encode("utf-8") p = re.compile("var vars = (.*?);") js = p.findall(js) json_data = json.loads(js[0]) if "url240" in json_data: url240 = json_data["url240"] if "url360" in json_data: url360 = json_data["url360"] if "url480" in json_data: url480 = json_data["url480"] if "url720" in json_data: url720 = json_data["url720"] if "hd" in json_data: hd = json_data["hd"] video = url240 qual = __settings__.getSetting("qual") if int(hd) >= 3 and int(qual) == 3: video = url720 elif int(hd) >= 2 and (int(qual) == 2 or int(qual) == 3): video = url480 elif int(hd) >= 1 and (int(qual) == 1 or int(qual) == 2): video = url360 return video
def scrapeTweets(matchPattern,fromTime,untilTime): #generator of tweets #definitions for json structure returned from twitter search api: #initialisation of return values: num_tweets_scraped = 0 finished = False while not finished: #print "Query times: " + str(fromTime) + " to " + str(untilTime) print >> sys.stderr, "Query times: " + str(fromTime) + " to " + str(untilTime) query = ts.buildQuery(matchPattern,fromTime,untilTime) response = urllib2.urlopen(query) data = json.load(response) soup = bs(str(data)) tweet_soups = ts.get_tweet_soups(soup) num_tweets_scraped = num_tweets_scraped + len(tweet_soups) for tweet_soup in tweet_soups: if not re.search('Related Searches:',str(bs(str(tweet_soup)))): yield ts.buildTweet(tweet_soup) if len(tweet_soups): untilTime = ts.getTime(tweet_soups[-1]) #get time of last tweet else: print >> sys.stderr, 'Finished getting all tweets, total: ' + str(num_tweets_scraped) finished = True
def mitocheck(gene, screens=('Mitocheck primary screen', 'Mitocheck validation screen'), limit=10, substitute='(),'): """Search Mitocheck database for given gene name (or Ensembl id) and return DataFrame containing download links. """ bsurl = lambda x: bs(urllib2.urlopen(x).read()) request = 'http://mitocheck.org/cgi-bin/mtc?query=%s' % gene x = bsurl(request) y = x.find(title='List all movies/images associated with this gene') if y is None: print 'zero or multiple entries for', gene return None z = bsurl('http://mitocheck.org' + y['href']) df = pd.read_html(str(z.find('table')), header=0)[2].dropna(how='all') df = df[df['Source'].isin(screens)] df = df.groupby('Source').head(10) for ix, movie_id in df['Movie/Image ID'].iteritems(): request = 'http://mitocheck.org/cgi-bin/mtc?action=show_movie;query=%s' % movie_id x = bs(urllib2.urlopen(request).read()) df.loc[ix, 'link'] = x.find_all('a', text=u'Download this movie')[0]['href'] movie_id = int(movie_id) tmp = (df.loc[ix, 'link'].split('/')[-1] .replace('.avi', '.%d.avi' % movie_id)) df.loc[ix, 'avi'] = ''.join([c if c not in substitute else '_' for c in tmp]) return df.drop(df.columns[0], axis=1)
def scrape(): """scrapes everything above""" all_data = [] #MARS NEWS url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' response = req.get(url) soup = bs(response.text, 'html.parser') results = soup.find_all('div', class_="image_and_description_container") counter = 0 for result in results: if counter == 0: news_p = result.find('div', class_="rollover_description_inner").text counter += 1 all_data.append({"news_p": news_p}) #JPL IMAGES executable_path = { "executable_path": 'C:\\Users\\dan.brueckman\\Desktop\\chromedriver.exe' } jpl_link_main = 'www.jpl.nasa.gov' browser = Browser('chrome', **executable_path, headless=True) url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url_jpl) browser.click_link_by_id('full_image') html = browser.html soup = bs(html, 'html.parser') jpl_results = soup.find_all('a', class_="button fancybox") counter = 0 for result in jpl_results: if counter == 0: featured_image_url = jpl_link_main + result['data-fancybox-href'] counter += 1 all_data.append({"featured_image_url": featured_image_url}) #MARS WEATHER weather_url = 'https://twitter.com/marswxreport?lang=en' response = req.get(weather_url) soup = bs(response.text, 'html.parser') weather_results = soup.find_all('p', class_="TweetTextSize") counter = 0 for result in weather_results: if counter == 0: mars_weather = result.text counter += 1 all_data.append({"weather": mars_weather}) #MARS FACTS facts_url = 'https://space-facts.com/mars/' tables = pd.read_html(facts_url) facts_df = tables[0] facts_df.rename(columns={0: "Profile", 1: "Attributes"}) html_table = facts_df.to_html() all_data.append({"html_table": html_table}) #MARS HEMISPHERES astro_link = 'https://astropedia.astrogeology.usgs.gov' hem_links = [ 'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced', 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced' ] browser = Browser('chrome', **executable_path, headless=True) hemisphere_image_urls = [] for link in hem_links: browser.visit(link) img = browser.find_link_by_partial_href('.tif/full.jpg') img_url = img['href'] response = req.get(link) soup = bs(response.text, 'html.parser') result = soup.find('h2', class_='title') img_title = result.text hemisphere_image_urls.append({"title": img_title, "img_url": img_url}) all_data.append({"hemisphere_images": hemisphere_image_urls}) return all_data
import json

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# 2. Fetch the HTML content from the URL -----------------------------------------------------------
url_busca_uf = 'http://www.buscacep.correios.com.br/sistemas/buscacep/resultadoBuscaFaixaCEP.cfm'
ufs = [
    'AC', 'AL', 'AM', 'AP', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MG', 'MS', 'MT', 'PA',
    'PB', 'PE', 'PI', 'PR', 'RJ', 'RN', 'RO', 'RR', 'RS', 'SC', 'SE', 'SP', 'TO'
]
for uf in ufs:
    payload = {'UF': uf}
    pagina = requests.post(url_busca_uf, payload)

    # 3. Parse the HTML content with the BeautifulSoup library ---------------------------------------
    soup = bs(pagina.text, 'html.parser')

    # 4. Structure the content into DataFrames with the pandas library -------------------------------
    table_estados = soup.find_all(name='table')[0]
    df_estados = pd.read_html(str(table_estados))[0]
    df_estados_salvar = df_estados[['UF', 'Faixa de CEP']]

    table_localidades = soup.find_all(name='table')[1]
    df_localidades = pd.read_html(str(table_localidades))[0]
    df_localidades_salvar = df_localidades[[
        'Localidade', 'Faixa de CEP', 'Situação', 'Tipo de Faixa'
    ]]

    # 5. Transform the data into a custom data dictionary --------------------------------------------
    dict_estados_salvar = {}
    dict_estados_salvar['Estados'] = df_estados_salvar.to_dict('records')
def scrape(html):
    soup = bs(html, 'html.parser')
    items = soup.select('.topicsListItem')
    return items
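# A minimal usage sketch for scrape(), assuming the page is fetched with requests;
# the URL below is an illustrative placeholder, not taken from the original code:
import requests

html = requests.get('https://example.com/topics').text
for item in scrape(html):
    print(item.get_text(strip=True))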
import requests
from bs4 import BeautifulSoup as bs

response = requests.get('https://www.hltv.org/matches')
soup = bs(response.text, 'lxml')


class MatchParser():
    days_matches = []

    def get_matches_in_dicts(self):
        for match_day in soup.findAll('div', {'class': 'match-day'}):
            for match in match_day.findAll('div', {'class': 'match'}):
                selected_match = []
                for child in match.recursiveChildGenerator():
                    if child.name == 'td':
                        selected_match.append(child.text.strip())
                self.days_matches.append({
                    match_day.find('span', {
                        'class': 'standard-headline'
                    }).text: selected_match
                })

    def return_matches_in_dicts(self):
        return (self.days_matches)

    def print_matches(self):
        for i in self.days_matches:
            for j, k in i.items():
                print(j, k)
def scrape(): browser = init_browser() # Create a dictionary for all of the scraped data mars_dat = {} # Visit the Mars news page. news_url = "https://mars.nasa.gov/news/" browser.visit(news_url) # Search for news # Scrape page into soup html = browser.html soup = bs(html, 'html.parser') # Find the latest Mars news. article = soup.find("div", class_="list_text") news_content = article.find("div", class_="article_teaser_body").text news_title = article.find("div", class_="content_title").text news_date = article.find("div", class_="list_date").text # Add the news date, title and summary to the dictionary mars_dat["news_date"] = news_date mars_dat["news_title"] = news_title mars_dat["summary"] = news_content # While chromedriver is open go to JPL's Featured Space Image page. JPL_url = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(JPL_url) # Scrape the browser into soup and use soup to find the full resolution image of mars # Save the image url to a variable called `featured_image_url` html = browser.html soup = bs(html, 'html.parser') image = soup.find("img", class_="thumb")["src"] img_url = "https://jpl.nasa.gov" + image featured_image_url = img_url # Add the featured image url to the dictionary mars_dat["featured_image_url"] = featured_image_url # ## Mars Weather twitter_url = "https://twitter.com/marswxreport?lang=en" browser.visit(twitter_url) tweets = browser.html tweets_soup = bs(tweets, 'html.parser') Marsweather = tweets_soup.find("div", class_="js-tweet-text-container") Mars_weat = Marsweather.text marswed2 = Mars_weat.replace('\n', ' ') # Add the weather to the dictionary mars_dat["marswed2"] = marswed2 # ## Mars Facts mars_facts = "http://space-facts.com/mars/" browser.visit(mars_facts) import pandas as pd mars_facts_todf = pd.read_html(mars_facts) mars_data = pd.DataFrame(mars_facts_todf[0]) mars_data.columns = ['Mars', 'Data'] mars_table = mars_data.set_index("Mars") marsdata = mars_table.to_html(classes='marsdata') marsdata = marsdata.replace('\n', ' ') # Add the Mars facts table to the dictionary mars_dat["marsdata"] = marsdata # Visit the USGS Astogeology site and scrape pictures of the hemispheres USGS_link = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(USGS_link) import time html = browser.html soup = bs(html, 'html.parser') mars_pictures = [] for i in range(4): time.sleep(5) images = browser.find_by_tag('h3') images[i].click() html = browser.html soup = bs(html, 'html.parser') partial = soup.find("img", class_="wide-image")["src"] img_title = soup.find("h2", class_="title").text img_url = 'https://astrogeology.usgs.gov' + partial dictionary = {"title": img_title, "img_url": img_url} mars_pictures.append(dictionary) browser.back() mars_dat["mars_pictures"] = mars_pictures print(mars_dat) print("this is the type: ", type(mars_dat)) # Return the dictionary return mars_dat
myCursor = myDB.cursor()


def selectAllDigimon():
    sql = 'select * from digimon'
    myCursor.execute(sql)
    x = myCursor.fetchall()
    print(type(x))
    for data in x:
        print(data)


url = "http://digidb.io/digimon-list/"
dataDigimon = requests.get(url).content
# print(dataultra)
dataDigimon = bs(dataDigimon, 'html.parser')

listDigi = []
temList = []
counter = 0  # column counter used to split the flat list of cells into rows
for i in dataDigimon.find_all('td'):
    cek = str(i.text)
    print(cek, end=' ')
    if cek[0] == ' ':
        print('y')
    temList.append(cek.replace('\xa0', ''))  # replace some noise characters
    counter += 1
    # get each row by counting the columns; nCols is 13
    if (counter % 13 == 0):
        listDigi.append(temList)
        temList = []
import requests from bs4 import BeautifulSoup as bs import re import matplotlib.pyplot as plt from wordcloud import WordCloud #creating an empty review list redmi_reviews = [] for i in range (1,21): ip=[] url ="https://www.amazon.in/Redmi-Pro-Blue-64GB-Storage/product-reviews/B07DJHR5DY/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber="+str(i) response = requests.get(url) soup = bs(response.content,"html.parser") reviews = soup.findAll("span",attrs = {"class","a-size-base review-text review-text-content"}) for i in range(len(reviews)): ip.append(reviews[i].text) redmi_reviews = redmi_reviews+ip ##Writing reviews in a text file with open("redmi.txt","w",encoding = 'utf-8') as output: output.write(str(redmi_reviews)) import os os.getcwd() ## Joining all the reviews into single paragraph red_rev_string = " ".join(redmi_reviews)
import requests
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine
from flask import Flask, jsonify, render_template
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)

## Data scrape: pull latest fire
## commenting out due to trouble with heroku loading beautiful soup
## note: in order to restore, also modify the "/" route & the index page
url_incident = "https://inciweb.nwcg.gov/feeds/rss/incidents/"
r = requests.get(url_incident)
soup = bs(r.text, "lxml-xml")

title = soup.find_all("title")
date = soup.find_all("pubDate")

print("Latest Fire Information")
title = title[1].text
date = date[0].text

## Setting up flask routes
@app.route("/")
def index():
def get_soup(url):
    raw = remove_non_ascii(get(url).content)
    soup = bs(raw)
    return soup.select("#MainTxt")[0].select('.ds-single')[0].text.strip()
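# Neither remove_non_ascii nor get is defined in this snippet; `get` is presumably
# requests.get. A minimal sketch of a remove_non_ascii helper under that assumption
# (illustrative only, not the original implementation):
def remove_non_ascii(raw):
    # Decode the response body if needed, then drop characters outside the ASCII range.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8', errors='ignore')
    return raw.encode('ascii', errors='ignore').decode('ascii')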
from os import getcwd
from os.path import join

from bs4 import BeautifulSoup as bs
import requests as req
from splinter import Browser
import pandas as pd

# MARS NEWS:
url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
response = req.get(url)
soup = bs(response.text, 'html.parser')

results = soup.find_all('div', class_="image_and_description_container")

counter = 0
for result in results:
    if counter == 0:
        try:
            news_p = result.find('div', class_="rollover_description_inner").text
            if (news_p):
                print(news_p)
def post_to_slack(url, r): payload = u'Noe nytt har skjedd på Blank: <' + url + '>' slack_data = {'text': payload} response = r.post( WEBHOOK_URL, data=json.dumps(slack_data), headers={'Content-Type': 'application/json'} ) if response.status_code != 200: raise ValueError( 'Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text) ) page = r.get('https://blank.p3.no') content = page.content soup = bs(content, "html.parser") articles = soup.find_all('article') bylines = [byline.text.replace('\n','') for byline in soup.select('.byline')] link = articles[0].find_all('a', href=True)[0]['href'] cursor.execute("SELECT * FROM updates ORDER BY ID DESC") records = cursor.fetchall() if not records: cursor.execute("INSERT INTO updates (title) VALUES (%s) ON CONFLICT DO NOTHING", ["Blankbot ass."]) conn.commit() print("Måtte lissom legge noe i tabellen a") if records: if records[0][1] != link: post_to_slack(link, r) cursor.execute("INSERT INTO updates (title) VALUES (%s)", [link])
def treatment(): with open('source_code.txt', 'r') as f: data = f.read() cont = bs(data, 'lxml') dias = [] first_content = cont.find( 'p', class_='TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text') dias.append(first_content) contents = cont.findAll('p', class_='TweetTextSize js-tweet-text tweet-text') dias.extend(contents) index = dias[0].text.find("F") lista = [] lista.append(dias[0].text[21:]) for i in dias[1:]: lista.append(i.text[31:]) start = lista[1].find(':') end = lista[1].find('\n') ''' - Flexões - Abdominais - Mergulhos - Agachamentos - Prancha ''' flex = [] abdo = [] merg = [] agac = [] prancha = [] for i in lista: flex.append(int(i[start + 1:end].strip())) start = lista[1].find(':', start + 1) end = lista[1].find('\n', end + 1) for i in lista: abdo.append(int(i[start + 1:end].strip())) start = lista[1].find(':', start + 1) end = lista[1].find('\n', end + 1) for i in lista: merg.append(int(i[start + 1:end].strip())) start = lista[1].find(':', start + 1) end = lista[1].find('\n', end + 1) for i in lista: agac.append(int(i[start + 1:end + 1].strip())) start = lista[1].find(':', start + 1) end = lista[1].find('\n', end + 1) for i in lista: res = str(i[start + 2:].strip()) prancha.append(int(res[:-1])) return flex, abdo, merg, agac, prancha
5. Start Date/Time (enter date and time in different columns)
6. End Date/Time (enter date and time in different columns)

Store the information into a MySQL database on the cloud.
"""
bid_no = []
item_name = []
department_name = []
date = []

import mysql.connector
from bs4 import BeautifulSoup as bs
import requests

url = "https://bidplus.gem.gov.in/bidlists"
source = requests.get(url).text
soup = bs(source, "lxml")
soup.prettify()

html_data = soup.findAll('div', class_='border block')
for row in html_data:
    cells = row.find('div', class_='block_header')  # first row has 7 TH
    detail = row.findAll('div', class_='col-block')
    bid_no.append(cells.p.text.strip(""))
    item_name.append(detail[0].p.text.strip(""))
    department_name.append(detail[1].text.strip("\n"))
    date.append(detail[2].text.strip("\n"))

conn = mysql.connector.connect(user='******', password='******',
def scrape(): final_dict = {} # Finding Most recent title and summary url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' response = requests.get(url) soup = bs(response.text, 'lxml') # list_text = soup.find('div', class_='list_text') # news_title = list_text.find('div', class_='content_title') # final_dict['title'] = news_title # final_dict['text'] = list_text news_title = soup.find('div', class_='content_title').text news_title.replace("\n", "") final_dict['news_title'] = news_title list_text = soup.find('div', class_='rollover_description').text list_text.replace("\n", "") final_dict['list_text'] = list_text # Find the spaceimage using splinter executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) html = browser.html soup = bs(html, 'html.parser') soup feat_img = soup.find("div", class_='carousel_items').find( "article", class_='carousel_item').find("a")['data-fancybox-href'] full_url = f'https://www.jpl.nasa.gov{feat_img}' final_dict['feat_img'] = full_url # close browser browser.quit() # Scraping twitter url = 'https://twitter.com/marswxreport?lang=en' response = requests.get(url) soup = bs(response.text, 'lxml') tweet = soup.find( 'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" ).text final_dict['tweet'] = tweet # Scraping Mars Facts using pandas url = 'https://space-facts.com/mars/' tables = pd.read_html(url) mars_facts = tables[0] mars_facts.to_html('mars_facts.html') final_dict['mars_fact'] = mars_facts.to_html() # Scraping Mars Hemispheres url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' response = requests.get(url) soup = bs(response.text, 'lxml') title = [] hemi = soup.find_all('div', class_='description') hemi = soup.find_all('h3') hemi[0].text.replace('Enhanced', '') for i in hemi: title.append(i.text.replace('Enhanced', '')) new_url = [] for a in soup.find_all('a', class_='itemLink product-item', href=True): new_url.append(a['href']) hemi_url = 'https://astrogeology.usgs.gov/' imgs = [] for i in new_url: response = requests.get(f'{hemi_url}{i}') soup = bs(response.text, 'lxml') finder = soup.find('img', class_='wide-image') full_url_img = f'{hemi_url}{finder["src"]}' imgs.append(full_url_img) hemi_list = [] hemi_dict = {} y = 0 for i in title: hemi_dict = {'title': i, 'images_url': imgs[y]} hemi_list.append(hemi_dict) y += 1 final_dict['hemi_list'] = hemi_list return final_dict
from urllib.request import *
from bs4 import BeautifulSoup as bs

response = urlopen("https://imdb.com")
html = bs(response, 'lxml')
images = html.find_all('img')
# print(len(images))

for image in images:
    # print(image['src'])
    pass

for i in range(len(images)):
    url = images[i]['src']
    fileName = f"{i + 1}.{url[-3:]}"
    urlretrieve(url, fileName)
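# The download loop above assumes every src is an absolute URL ending in a three-letter
# extension. An optional, more defensive sketch (not part of the original) that resolves
# relative or protocol-relative srcs against the page URL with urljoin:
from urllib.parse import urljoin

page_url = "https://imdb.com"
for i, image in enumerate(images, start=1):
    src = image.get('src')
    if not src:
        continue
    absolute = urljoin(page_url, src)
    extension = absolute.rsplit('.', 1)[-1] if '.' in absolute else 'jpg'
    urlretrieve(absolute, f"{i}.{extension}")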
def write_coord_csv(html): seen = set() file = open("out.csv", "w") file.write("text_content, top_left, top_right, bottom_left, bottom_right") file.write("\n") driver = webdriver.Chrome() driver.get("data:text/html;charset=utf-8," + html) # annotate xpos and ypos for images for img_sel_elem in driver.find_elements_by_tag_name("img"): center = (img_sel_elem.location["x"] + img_sel_elem.size["width"] / 2, img_sel_elem.location["y"] + img_sel_elem.size["height"] / 2) driver.execute_script( "arguments[0].setAttribute('xpos','%s')" % str(int(center[0])), img_sel_elem) driver.execute_script( "arguments[0].setAttribute('ypos','%s')" % str(int(center[1])), img_sel_elem) # remove all scripts soup = bs(driver.page_source, "html5lib") [x.extract() for x in soup.findAll('script')] # remove all comments comments = soup.findAll(text=lambda text: isinstance(text, Comment)) [comment.extract() for comment in comments] e = etree.HTML(str(soup)) tree = etree.ElementTree(e) text_elements = [ element for element in e.getiterator() if element.text and len(element.text) > 1 ] # extract all text and annotate xpos and ypos for text for elem in text_elements: xpath = tree.getpath(elem) text_content = elem.text if xpath not in seen: seen.add(xpath) element = driver.find_element_by_xpath(xpath) area = element.size["width"] * element.size["height"] if area > 0: text_content = "%s" % element.text text_content.replace("\n", " ").replace("\r", " ").replace("\"", """) if len(text_content) > 1: top_left = (element.location["x"], element.location["y"]) top_right = (element.location["x"] + element.size["width"], element.location["y"]) bottom_left = (element.location["x"], element.location["y"] + element.size["height"]) bottom_right = (element.location["x"] + element.size["width"], element.location["y"] + element.size["height"]) center = (element.location["x"] + element.size["width"] / 2, element.location["y"] + element.size["height"] / 2) elem.set("xpos", str(int(center[0]))) elem.set("ypos", str(int(center[1]))) line = "\"{0}\",\"{1}\",\"{2}\",\"{3}\",\"{4}\"".format( text_content, top_left, top_right, bottom_left, bottom_right) seen.add(text_content) file.write(line) file.write("\n") new_html = open("out.html", "w") new_html.write(tostring(e)) new_html.close()
time.sleep(delay) while True: try: driver.find_element_by_xpath( "//button[@ng-click='vm.pagginator.showmorepage()']" ).click() time.sleep(2) # if DEBUG: # print("Clicked Successfully") except Exception as e: # if DEBUG: # print(e) break html = driver.execute_script( "return document.documentElement.outerHTML") soup = bs(html, 'html.parser') products = soup.findAll("div", {"qa": "product"}) rel_url = re.sub(r"/?.*", "", url) rel_url = rel_url.lstrip('https://www.bigbasket.com/pc/') ds_img = os.path.join(OUTPUT_DIR, 'images', 'large') dl_img = os.path.join(OUTPUT_DIR, 'images', 'small') if not os.path.exists(ds_img): os.makedirs(ds_img) if not os.path.exists(dl_img): os.makedirs(dl_img) for product in products: get_product_data(product, raw_data_file, url)
c_idpass = list(csv.reader(f_idpass)) minute_limit = 0 for idpass in c_idpass: try: un = idpass[0].strip() pw = idpass[1].strip() # un = l[(2*i)-2] # ps = l[(2*i)-1] with Session() as s: site = s.get("http://10.220.20.12/index.php/home/loginProcess") bs_content = bs(site.content, "html.parser") login_data = {"username": un, "password": pw} s.post("http://10.220.20.12/index.php/home/loginProcess", login_data) home_page = s.get( "http://10.220.20.12/index.php/home/dashboard") soup = bs(home_page.content, "lxml") table = soup.table c = 1 li = [] try: table_rows = table.find_all('tr') for tr in table_rows:
from bs4 import BeautifulSoup as bs from urllib import request as req import pandas as pd # Change the number of pages to the page until you want the data number_of_pages = 1 l_books = { "name": [], "author": [] } for i in range(1, number_of_pages+1): url = req.urlopen('https://archive.org/details/internetarchivebooks?&sort=-downloads&page=%d' % i) soup = bs(url, 'html5lib') titles = [] titles = [title for title in soup.find_all("div", {"class" : "ttl"})] for t in titles: try: fill_text = "" # print(t.parent.parent.find_next_siblings("div")[1].contents[3].text) fill_text = t.parent.parent.find_next_siblings("div")[1].contents[3].text except Exception: fill_text = " " finally: l_books["name"].append(' '.join(t.text.split())) l_books["author"].append(' '.join(fill_text.split())) # l_books.append({str(' '.join(t.text.split())) : str(' '.join(fill_text.split()))}) # books[' '.join(t.text.split())] = ' '.join(fill_text.split())
def scrape_info(): # adding the dictionary from where all the data is going to be shown final_dictionary = {} # calling the URL url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest" # calling the executable path for Chrome extention executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) browser.visit(url) time.sleep(5) # getting the html file of the browser html = browser.html soup = bs(html,'html.parser') # creating a variable to store the article selected one_article = soup.find('li', class_='slide') # getting the text and the information required for the data title_news = one_article.find(class_="content_title").text paragraph = one_article.find(class_="article_teaser_body").text # print(paragraph) # appending to the final dictionary as with the key latest_news final_dictionary['latest_news'] = [title_news,paragraph] browser.quit() # -------------------------------------------- # -------------------------------------------- url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) browser.visit(url) button = browser.find_by_css('.button').first.click() featured_img = browser.find_by_css('.fancybox-image') # html = featured_img.html # soup = bs(html,'html.parser') time.sleep(3) # print(featured_img['src']) # featured_img = browser.find_by_tag('img') img[src]')['src'] # featured_img_2 = featured_img.find('img')['src'] # featured_img_3 = featured_img_2['src'] featured_img_url = featured_img['src'] final_dictionary['Featured_image']=featured_img_url browser.quit() # this part of the code deals with twitter having different versions when oppen in the browser # it will run a loop ultil the rigth version is open. 
# it will scrape what it needs and get out of the loop flag = False while flag == False: try: url = "https://twitter.com/marswxreport?lang=en" executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) browser.visit(url) time.sleep(5) html = browser.html soup = bs(html,'html.parser') mars_weather = soup.find('p',class_='tweet-text').text final_dictionary['Mars_weather']=mars_weather browser.quit() # print(final_dictionary) flag = True except: print('Wrong twitter version trying again') flag = False browser.quit() url = 'https://space-facts.com/mars/' tables = pd.read_html(url) table_1 = tables[0] table_1.set_index(0, inplace=True) table_1_html = table_1.to_html().replace('\n', '') final_dictionary['Facts_table'] = table_1_html url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) browser.visit(url) # ----------------------------------------------------- # ----------------------------------------------------- hemisphere_image_urls = [] for x in range(4): # print(x) button = browser.find_by_css('h3')[x].click() html = browser.html soup = bs(html,'html.parser') image = soup.find('div', class_='downloads') image = image.find('a')['href'] title = soup.find('h2', class_='title').text hemisphere_image_urls.append({'title': title, 'img_url': image}) browser.back() time.sleep(5) final_dictionary['hemisfere_images'] = hemisphere_image_urls browser.quit() # return final_dictionary return final_dictionary # print(scrape_info())
def indices(category): cat = { "market_cap/broad": "1,2", "sector_and_industry": "2,2", "thematics": "3,2", "strategy": "4,2", "sustainability": "5,2", "volatility": "6,1", "composite": "7,1", "government": "8,1", "corporate": "9,1", "money_market": "10,1" } try: ddl_category = cat[category] except KeyError: print(''' ### Invalid category ### Use one of the categories mentioned below: market_cap/broad sector_and_industry thematics strategy sustainability volatility composite government corporate money_market ''') return baseurl = '''https://m.bseindia.com/IndicesView_New.aspx''' res = requests.get(baseurl, headers=headers) c = res.content soup = bs(c, "lxml") options = { '__EVENTTARGET': 'ddl_Category', '__VIEWSTATEENCRYPTED': '', '__EVENTARGUMENT': '', '__LASTFOCUS': '', '__VIEWSTATEGENERATOR': '162C96CD', 'UcHeaderMenu1$txtGetQuote': '' } for input in soup("input"): try: if(input['type'] == "hidden"): if(input['id'] == '__VIEWSTATE'): options['__VIEWSTATE'] = input['value'] elif(input['id'] == '__EVENTVALIDATION'): options['__EVENTVALIDATION'] = input['value'] except KeyError: continue options['ddl_Category'] = ddl_category res = requests.post(url=baseurl, data=options) c = res.content soup = bs(c, "lxml") indices = [] for td in soup('td'): try: if(td['class'][0] == 'TTRow_left'): index = {} index['currentValue'] = td.next_sibling.string.strip() index['change'] = td.next_sibling.next_sibling.string.strip() index['pChange'] = td.next_sibling.next_sibling.next_sibling.string.strip() index['scripFlag'] = td.a['href'].strip().split('=')[1] index['name'] = td.a.string.strip().replace(';', '') indices.append(index) except KeyError: continue results = {} for span in soup("span", id="inddate"): results['updatedOn'] = span.string[6:].strip() results['indices'] = indices return results
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager


def scrape():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_data = {}

    # NASA Mars News
    url = 'https://redplanetscience.com/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_data['news_title'] = soup.find('div', class_='content_title').text
    mars_data['news_p'] = soup.find('div', class_='article_teaser_body').text

    # JPL Mars Space Images
    image_url = 'https://spaceimages-mars.com/'
    browser.visit(image_url)
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = bs(html, 'html.parser')
    featured_image = soup.find('img', class_='headerimage fade-in')
    mars_data['featured_image_url'] = image_url + featured_image['src']

    # Mars Facts
    url = 'https://galaxyfacts-mars.com/'
    tables = pd.read_html(url)
    df = tables[0]
    new_header = df.iloc[0]
    df = df[1:]
    df.columns = new_header
    df.set_index('Mars - Earth Comparison', inplace=True)
    html_table = df.to_html()
    mars_data['table'] = html_table

    # Mars Hemispheres
    url = 'https://marshemispheres.com/'
    browser.visit(url)
    hemisphere_image_urls = []

    # splinter (not Selenium) is driving the browser, so use find_by_css here
    hem_links = browser.find_by_css('a.product-item h3')
    for item in range(len(hem_links)):
        hemisphere = {}
        browser.find_by_css('a.product-item h3')[item].click()
        hemisphere['title'] = browser.find_by_css('h2.title').text
        sample_element = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_element['href']
        # Append Hemisphere Object to List
        hemisphere_image_urls.append(hemisphere)
        # Navigate Backwards
        browser.back()

    mars_data['hemisphere_image_urls'] = hemisphere_image_urls

    # Quit the browser
    browser.quit()

    return mars_data
import requests
import urllib.request
from bs4 import BeautifulSoup as bs
import csv
import pandas

# Set the URL you want to webscrape from
url = 'http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases'

# Connect to the URL
response = requests.get(url)

# Parse HTML and save to a BeautifulSoup object
soup = bs(response.text, "html.parser")

li = soup.findAll('li', 'flex-item')
a = [i.find('a') for i in li]
name = [name.get_text(strip=True) for name in a]
img = ['http://www.agriculture.gov.au' + img.find('img')['src'] for img in a]
href = [href['href'] for href in a]

origin = []
pest_type = []
au_legal = []
for i in href:
    x = i.split('/')
    if '' == x[-1]:
        del x[-1]
    if '/pests/' in i:
        pest_type.append(x[-1])
    else:
        pest_type.append('-')
    origin.append(x[-2])
    if '.au' in i:
        au_legal.append('Yes')
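# Worked example (hypothetical href, not taken from the live page) of the
# path-splitting logic used in the loop above.
example = 'http://www.agriculture.gov.au/pests/khapra-beetle/'
parts = example.split('/')   # ['http:', '', 'www.agriculture.gov.au', 'pests', 'khapra-beetle', '']
if parts[-1] == '':
    del parts[-1]
print(parts[-1], parts[-2])  # -> 'khapra-beetle' 'pests'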
import os

import requests
from bs4 import BeautifulSoup as bs
from db import Database

site_url = 'https://www.edimdoma.ru'
page_no = 1
db = Database()

if not os.path.exists('img'):
    os.mkdir('img')

while True:
    url = '{site_url}/retsepty?page={page_no}'.format(site_url=site_url, page_no=page_no)
    page = requests.get(url)
    html = bs(page.content, 'html.parser')

    for card in html.select('.card'):
        try:
            card_url = card.find('a').attrs['href']
            recipe_url = site_url + card_url
        except:
            continue

        card_page = requests.get(recipe_url)
        card_html = bs(card_page.content, 'html.parser')
        imgs = card_html.findAll("div", {"class": "thumb-slider__image-container"})
        if imgs:
# `s` (a requests session), `keys` and get_cookies() are defined earlier in the script.
# Log in and copy the cookies into the requests session
cookie_jar = get_cookies(keys['id'], keys['pw'])
for cookie in cookie_jar:
    s.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'])

# Collect article URLs from pages 1 through 5
url_list = []
for pageno in range(1, 6):
    print('Page: ', pageno)
    res = s.get(
        'http://cafe.naver.com/ArticleList.nhn?search.clubid={clubid}&search.menuid={menuid}&search.boardtype=L&search.page={pageno}'.format(
            clubid='28385054',
            menuid='53',
            pageno=pageno
        )
    )
    soup = bs(res.text, 'lxml')
    article_link_list = soup.select('td.board-list span > a')
    for article in article_link_list:
        article_url = article['href']
        url_list.append(article_url)
    print('URL counter: ', len(url_list))

# Filter out duplicate URLs
url_list = set(url_list)
print('Total number of URLs: ', len(url_list))

# Fetch the contents (title and body) of the URLs collected above
contents_list = []
for url in url_list:
    url = 'http://cafe.naver.com' + url
    res2 = s.get(url)
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape_info():
    # Set executable path and browser for splinter
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Save url as a variable
    url = "https://mars.nasa.gov/news/"
    # Visit url using splinter
    browser.visit(url)
    # Use sleep function in time module to wait for page to fully load
    time.sleep(10)

    # Save browser contents in html as variable
    page = browser.html
    # Create and parse BeautifulSoup object
    soup = bs(page, "html.parser")

    # Collect latest news title and strip whitespace
    news_title = soup.find("div", class_="content_title").text
    news_title = news_title.strip()

    # Collect corresponding paragraph text and strip whitespace
    news_p = soup.find("div", class_="rollover_description_inner").text
    news_p = news_p.strip()

    # Visit url using splinter
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Find featured image
    featured_img = browser.find_by_css("article.carousel_item").first
    # Set base url to concatenate with image url
    base_url = "https://www.jpl.nasa.gov"
    # Select image reference from featured_img
    featured_image_url = featured_img["style"]
    # Split with " delimiter to remove extra text
    featured_image_url = featured_image_url.split('"')[1]
    # Concatenate with base url to get full url
    featured_image_url = f"{base_url}{featured_image_url}"

    # Save url as a variable
    url = "https://twitter.com/marswxreport?lang=en"
    # Visit url using splinter
    browser.visit(url)
    # Save html contents as a variable
    page = browser.html
    # Create and parse BeautifulSoup object
    soup = bs(page, "html.parser")

    # Find latest mars weather and save as a variable.
    # Link text needs to be removed, so do not include .text in find
    mars_weather = soup.find("p", class_="TweetTextSize")
    # Find unwanted link text
    remove_link = soup.find("a", class_="twitter-timeline-link u-hidden")
    # Extract link text from mars_weather
    remove_link.extract()
    # Convert to text and replace line breaks with spaces
    mars_weather = mars_weather.text
    mars_weather = mars_weather.replace("\n", " ")

    # Quit browser session
    browser.quit()

    # Use pandas to scrape Mars facts website
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)[1]
    # Format dataframe
    tables = tables.rename(columns={0: "Description", 1: "Value"})
    tables = tables.set_index("Description")
    # Convert to html
    html_table = tables.to_html()

    # Save url as a variable
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    # Retrieve page with requests
    response = requests.get(url)
    # Create and parse BeautifulSoup object
    soup = bs(response.text, "html.parser")

    # Find all anchors containing hemisphere image links
    link_list = soup.find_all("a", class_="itemLink")
    # Set base url used to create links
    base_url = "https://astropedia.astrogeology.usgs.gov/download"
    # Extract links from the anchors
    cerberus = f"{base_url}{link_list[0]['href'].replace('/search/map', '')}.tif/full.jpg"
    schiaparelli = f"{base_url}{link_list[1]['href'].replace('/search/map', '')}.tif/full.jpg"
    syrtis = f"{base_url}{link_list[2]['href'].replace('/search/map', '')}.tif/full.jpg"
    valles = f"{base_url}{link_list[3]['href'].replace('/search/map', '')}.tif/full.jpg"
    links = [cerberus, schiaparelli, syrtis, valles]

    titles = []
    # Loop through soup results to get titles for each hemisphere
    for item in link_list:
        title = item.find("img")
        title = title["alt"]
        title = title.replace(" Enhanced thumbnail", "")
        titles.append(title)

    # Create empty list to store dictionaries for each title and url
    hemisphere_image_urls = []
    # Loop through links and titles and add dictionaries to list
    for item in range(len(links)):
        hemisphere_image_urls.append({
            "title": titles[item],
            "url": links[item]
        })

    # Create dictionary for all Mars data
    mars_data = {
        "headline": news_title,
        "subhead": news_p,
        "featured": featured_image_url,
        "weather": mars_weather,
        "table": html_table,
        "hemispheres": hemisphere_image_urls
    }

    # Return results in a single dictionary
    return mars_data
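# Usage sketch: the function above returns a plain dict, so the scraped fields
# can be inspected directly (keys match the mars_data dictionary built above).
if __name__ == "__main__":
    mars_data = scrape_info()
    print(mars_data["headline"])
    print(mars_data["weather"])
    for hemi in mars_data["hemispheres"]:
        print(hemi["title"], "->", hemi["url"])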
def find_folder_table(self, html):
    # Parse the page and return the process-folder table element
    tree = bs(html, "html.parser")
    folders = tree.find('table', {"id": "ctl00_ContentPlaceHolder_ProcessFolderGrid_T"})
    return folders
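# Hypothetical companion method sketching how the table returned above might be
# consumed; the <tr>/<td> row layout is an assumption, not confirmed by the source.
def list_folder_names(self, html):
    table = self.find_folder_table(html)
    names = []
    if table is not None:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if cells:
                # First cell is assumed to hold the folder name
                names.append(cells[0].get_text(strip=True))
    return names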
# Access the website data via soup objects and feed it as the message to our
# notification.
# A string corresponds to a bit of text "within an HTML tag", and the split()
# method splits a string into a list; you can specify the separator, and the
# default separator is any whitespace.
# Beautiful Soup converts the HTML document into a tree data structure and parses it.

# Send the HTTP requests (getdata() is a helper defined elsewhere that returns
# the page source for a URL)
htmldata = getdata("https://covid-19tracker.milkeninstitute.org/")
html = getdata("https://www.worldometers.info/coronavirus/country/india")
html_gov = getdata("https://www.mohfw.gov.in/")

# Create soup objects for web scraping
soup1 = bs(htmldata, "html.parser")
soup2 = bs(html, "html.parser")
soup3 = bs(html_gov, "html.parser")

# WEB SCRAPING: find the vaccine names
# Some prints to check the location in the result string containing the vaccine names
# print(result[46:86])
# printing vaccine names to the console

# WEB SCRAPING: find the number of deaths and new cases
cases = list(soup2.find("li", {"class": "news_li"}))[0].text.split()[0]
deaths = list(soup2.find("li", {"class": "news_li"}))[2].text.split()[0]
# print(deaths)

# WEB SCRAPING: fetch total and recovered cases and the recovery %
tot_cases = list(soup2.find("div", {"class": "maincounter-number"}))[1].text.split()[0]
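# Minimal illustration of the .text.split()[...] pattern used above, run on a
# small assumed markup snippet rather than the live Worldometers page.
from bs4 import BeautifulSoup as bs

sample = bs('<li class="news_li"><strong>12345 new cases</strong> and '
            '<strong>67 deaths</strong></li>', "html.parser")
children = list(sample.find("li", {"class": "news_li"}))
print(children[0].text.split()[0])  # -> '12345'
print(children[2].text.split()[0])  # -> '67'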
import time

import pandas as pd
from bs4 import BeautifulSoup as bs

# init_browser() is a helper defined elsewhere that returns a splinter Browser.


def scrape():
    browser = init_browser()

    # NASA Mars news
    nasa_url = "https://mars.nasa.gov/news/"
    browser.visit(nasa_url)
    time.sleep(1)
    html = browser.html
    news_soup = bs(html, "html.parser")
    title = news_soup.find("div", class_="content_title").text
    article_summary = news_soup.find("div", class_="article_teaser_body").text
    print(f"Article Title: {title}")
    print(f"Summary: {article_summary}")

    # JPL featured image
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(1)
    browser.find_by_css('a.button').click()
    image_soup = bs(browser.html, 'html.parser')
    end = image_soup.find('img', class_='fancybox-image')['src']
    JPL_image = "https://www.jpl.nasa.gov" + end
    print(JPL_image)

    # Mars weather tweet
    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(1)
    html = browser.html
    weather_soup = bs(html, "html.parser")
    tweet = weather_soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    print(tweet)

    # Mars facts table
    fact_url = "http://space-facts.com/mars/"
    browser.visit(fact_url)
    time.sleep(1)
    table = pd.read_html(fact_url)
    df_mars_facts = table[0]
    fact_html = df_mars_facts.to_html()
    fact_html = fact_html.replace("\n", "")

    # Mars hemispheres
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    time.sleep(1)
    html = browser.html
    hemi_soup = bs(html, "html.parser")

    headers = []
    titles = hemi_soup.find_all('h3')
    # Use a separate loop variable so the news `title` above is not overwritten
    for hemi_title in titles:
        headers.append(hemi_title.text)

    images = []
    count = 0
    for thumb in headers:
        browser.find_by_css('img.thumb')[count].click()
        images.append(browser.find_by_text('Sample')['href'])
        browser.back()
        count = count + 1

    hemisphere_image_urls = []
    counter = 0
    for item in images:
        hemisphere_image_urls.append({
            "title": headers[counter],
            "img_url": images[counter]
        })
        counter = counter + 1

    browser.quit()

    data = {
        "News_Header": title,
        "News_Article": article_summary,
        "JPL_Image": JPL_image,
        "Weather": tweet,
        "Facts": fact_html,
        "Hemispheres": hemisphere_image_urls
    }

    return data
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# init_browser() is a helper defined elsewhere that returns a splinter Browser.


def scrape():
    browser = init_browser()

    # Visit the NASA news site
    news_url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(news_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = bs(response.text, 'html.parser')

    # Extract the title of the news article
    title = soup.find('div', class_="content_title").text.strip()
    # Extract the teaser paragraph about the news article
    paragraph = soup.find(
        'div', class_="image_and_description_container").text.strip()

    # Visit the NASA images site
    nasa_images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_images_url)

    # Extract the url of the featured image
    image_html = browser.html
    soup = bs(image_html, 'html.parser')
    article = soup.find('a', class_='button fancybox')
    href = article['data-fancybox-href']
    featured_image_url = "https://www.jpl.nasa.gov" + href

    # Visit the Mars Weather Twitter page with Selenium so the tweets render
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    from selenium import webdriver
    driver = webdriver.Chrome()
    driver.get(weather_url)
    html = driver.page_source
    driver.close()

    # Extract the current weather on Mars
    soup = bs(html, 'html.parser')
    mars_weather = soup.find(
        'div',
        class_="css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"
    ).text.strip()

    # Visit the Space Facts page about Mars
    facts_url = 'https://space-facts.com/mars/'
    browser.visit(facts_url)

    # Extract the Mars facts table as a pandas dataframe
    table = pd.read_html(facts_url)
    profile = table[0]
    profile_df = profile.rename(columns={0: 'Description', 1: 'Value'})
    facts = []
    for index, row in profile_df.iterrows():
        desc = row['Description']
        value = row['Value']
        fact = {'description': desc, 'value': value}
        facts.append(fact)

    # Visit the USGS Astrogeology site
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)

    # Extract the name of each of Mars's hemispheres and the url of the image
    # of that hemisphere, then insert into MongoDB
    hemisphere_html = browser.html
    soup = bs(hemisphere_html, 'html.parser')
    results = soup.find_all('div', class_="item")

    hemisphere_image_urls = []
    for result in results:
        heading = result.find('h3').text.replace('Enhanced', '')
        link = result.find('a')['href']
        url = "https://astrogeology.usgs.gov" + link
        browser.visit(url)
        image_html = browser.html
        soup = bs(image_html, 'html.parser')
        img_url = soup.find('div', class_="downloads").find('a')['href']
        print(heading)
        print(img_url)
        hemisphere = {'title': heading, 'img_url': img_url}
        hemisphere_image_urls.append(hemisphere)

    mars_data = {
        "news_title": title,
        "news_paragraph": paragraph,
        "featured_image": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    browser.quit()
    return mars_data
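# Hedged sketch of the "insert into MongoDB" step mentioned in the comment above;
# the connection string, database name (mars_db) and collection name (mars) are
# assumptions, not taken from the source.
import pymongo

if __name__ == "__main__":
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client.mars_db
    db.mars.update_one({}, {"$set": scrape()}, upsert=True)
    print(db.mars.find_one())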