def run_for_browser(server, frmvars, browser, extra_params=None):
    if extra_params is None:
        extra_params = {}
    url = server['url']
    TestBase.log(1, 'BROWSER', browser)
    br = Browser(browser, **extra_params)
    if TestMode.remote_testdb_on(br, server):
        # default tests
        test_obj = TestConfiguredUsers(br, url, frmvars)
        test_obj.run()
        # user-defined tests from modules/_plugin_splinter_tests
        for TestClass in TESTCLASSES:
            if frmvars['all_tests'] or frmvars.get('test_' + TestClass, False):
                TestBase.log(2, 'TESTCLASS', TestClass)
                test_obj = globals()[TestClass](br, url)  # ** see imports
                test_obj.run()
        # seems not necessary and not good here: TestMode.remote_testdb_off(br, server)
    else:
        TestBase.log(2, 'FATAL', 'Cannot log in.')
    br.quit()
    print()
class WebKit(object):
    '''WebKit engine'''

    def __init__(self):
        self.tag_attr_dict = {'*': 'href', 'embed': 'src', 'frame': 'src',
                              'iframe': 'src', 'object': 'data'}

    def extract_links(self, url):
        '''Extract the links from a page.'''
        self.browser = Browser("phantomjs")
        try:
            self.browser.visit(url)
        except Exception:
            return
        for tag, attr in self.tag_attr_dict.items():
            link_list = self.browser.find_by_xpath('//%s[@%s]' % (tag, attr))
            if not link_list:
                continue
            for link in link_list:
                link = link[attr]
                if not link:
                    continue
                link = link.strip()
                if link == 'about:blank' or link.startswith('javascript:'):
                    continue
                if not link.startswith('http'):
                    link = urlparse.urljoin(url, link)
                yield link
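# A minimal usage sketch for the extractor above (a sketch, assuming PhantomJS
# is on PATH and `urlparse` is imported; extract_links is a generator and
# opens its own browser, which is left on self.browser):
wk = WebKit()
for link in wk.extract_links('http://example.com'):
    print(link)
wk.browser.quit()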
class editcompany(object):
    def __init__(self, browser_type):
        self.browser_type = browser_type
        self.browser = Browser(self.browser_type)
        self.site = 'http://www.kuaixiuagency.com'  # login page
        self.browser.visit(self.site)

    def config(self, name='', type=''):
        self.name = name
        self.type = type

    def verify(self, case):
        if case == 1:  # editing the company with valid changes should save
            if True:  # verify the saved company name and type match the input
                self.browser.quit()
                return 'Passed'
            else:
                self.browser.quit()
                return 'Failed'
        elif case == 2:  # clearing the company name and picking another type should not save
            pass  # verify the form cannot be submitted
            self.browser.quit()
        elif case == 3:  # editing the name without choosing a company type should not save
            pass  # verify the form cannot be submitted
            self.browser.quit()
def chrome_installed():
    try:
        browser = Browser("chrome")
        browser.quit()
    except WebDriverException:
        return False
    return True
class newgroup(object):
    def __init__(self, browser_type):
        self.browser_type = browser_type
        self.site = 'http://www.kuaixiuagency.com'  # login page
        self.url = ''  # "new group" page

    def config(self, name='', admin=''):
        self.name = name
        self.admin = admin

    def verify(self, case):
        self._fill_form()
        if case == 1:  # entering a group name and choosing an admin should create the group
            if True:  # verify the saved name and admin match the input
                self.browser.quit()
                return 'Passed'
            else:
                self.browser.quit()
                return 'Failed'
        elif case == 2:  # entering a group name without choosing an admin should not save
            pass  # verify the form cannot be submitted
            self.browser.quit()
        elif case == 3:  # choosing an admin without a group name should not save
            pass  # verify submission is blocked and an error message is shown
            self.browser.quit()
        elif case == 4:  # an existing group name should not be accepted
            pass  # verify the form cannot be submitted
            self.browser.quit()

    def _login(self):  # log in
        self.browser = Browser(self.browser_type)
        self.browser.visit(self.site)

    def _fill_form(self):  # log in as an administrator
        self._login()
def test_find_working_stream_fail(self):
    browser = Browser("firefox", extensions=["adblock.xpi"])
    browser.visit("http://kinox.to/Stream/The_Big_Bang_Theory.html")
    stream = script.watchSeries("http://kinox.to/Stream/The_Big_Bang_Theory.html", None)
    sleep(5)
    assert not stream.find_working_stream(browser)
    browser.quit()
def test_find_working_stream_success(self):
    browser = Browser("firefox", extensions=["adblock.xpi"])
    browser.visit("https://kinox.to/Stream/Poltergeist-2.html")
    stream = script.watchSeries("https://kinox.to/Stream/Poltergeist-2.html", None)
    sleep(5)
    assert stream.find_working_stream(browser)
    browser.quit()
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_visit_index(self):
        self.browser.visit("http://0.0.0.0:8080/")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/")

    def test_visit_browse(self):
        self.browser.visit("http://0.0.0.0:8080/browse")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/browse")

    def test_visit_about(self):
        self.browser.visit("http://0.0.0.0:8080/about")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/about")

    def test_visit_redirect(self):
        """ When a non-existent song url is requested, the browser
        should be redirected to the browse page """
        self.browser.visit("http://0.0.0.0:8080/songs/lugubrious_lima_beans-love_lichtenstein")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/browse")

    def tearDown(self):
        """ Test teardown """
        # Stop the app server process and close the browser
        self.process.terminate()
        self.browser.quit()
class SeleniumTestCase(LiveServerTestCase):
    """
    A base test case for Selenium, providing helper methods for
    generating clients and logging in profiles.
    """

    def setUp(self):
        """ Base setup actions """
        self.user = User.objects.create_superuser(
            username='******',
            password='******',
            email='*****@*****.**'
        )
        self.browser = Browser()
        super(SeleniumTestCase, self).setUp()

    def open(self, url):
        """ Open a relative URL """
        self.browser.visit("%s%s" % (self.live_server_url, url))

    def tearDown(self):
        """ Base teardown action """
        if hasattr(self, 'browser'):
            self.browser.quit()
        super(SeleniumTestCase, self).tearDown()
def main():
    try:
        site_name = str(sys.argv[1])
    except Exception as ex:
        print('Insert URL to access site.json file')
        print(ex)
        sys.exit(-1)
    site_name_clean = remove_bars(site_name)
    req = requests.get(site_name_clean + "/site.json")
    if req.status_code != 200:
        # status_code is an int, so convert it before concatenating
        print("Expected 200 OK, got " + str(req.status_code))
        sys.exit(-1)
    check_dir("typoshot-output")
    j = json.loads(req.text)
    for i in j["pages"]:
        browser = Browser("firefox")
        browser.visit(i["url"])
        if browser.status_code.is_success():
            str_replaced = i["title"].replace(" ", "-")
            browser.driver.save_screenshot(str_replaced + ".png")
        browser.quit()
class Webkit(object):
    """Link extractor backed by a PhantomJS browser."""

    def __init__(self):
        self.tag_attr = {
            '*': 'href',
            'frame': 'src',
            'iframe': 'src',
            'object': 'src'
        }

    def get_links(self, url):
        links = []
        self.browser = Browser('phantomjs')
        self.browser.visit(url)
        for tag, attr in self.tag_attr.items():
            llinks = self.browser.find_by_xpath('//%s[@%s]' % (tag, attr))
            if not llinks:
                continue
            for link in llinks:
                link = link[attr]
                if not link:
                    continue
                if link == 'about:blank' or link.startswith('javascript:'):
                    continue
                if not link.startswith('http'):
                    link = urlparse.urljoin(url, link)
                links.append(link)
        return links

    def close(self):
        self.browser.quit()
class closeadmin(object):
    def __init__(self, browser_type):
        self.browser_type = browser_type
        self.site = 'http://www.kuaixiuagency.com'  # login page

    def verify(self, case):
        self._fill_form()
        if case == 1:  # an administrator can be disabled from the admin list
            if True:  # a disabled administrator cannot log in
                self.browser.quit()
                return 'Passed'
            else:
                self.browser.quit()
                return 'Failed'
        elif case == 2:  # an already-disabled administrator cannot be disabled again
            pass  # after disabling, the disable button is no longer shown
            self.browser.quit()
        elif case == 3:  # a disabled administrator can be re-enabled
            pass  # after re-enabling, the administrator can log in
            self.browser.quit()

    def _login(self):  # log in
        self.browser = Browser(self.browser_type)
        self.browser.visit(self.site)

    def _fill_form(self):  # fill in the username and password
        self._login()
class searchorder(object):
    def __init__(self, browser_type):
        self.browser_type = browser_type
        self.site = 'http://www.kuaixiuagency.com'  # login page

    def config(self, placeholder=''):
        self.placeholder = placeholder

    def verify(self, case):
        self._fill_form()
        if case == 1:  # searching an existing order number should find the matching order
            if True:  # the matched order is displayed
                self.browser.quit()
                return 'Passed'
            else:
                self.browser.quit()
                return 'Failed'
        elif case == 2:  # a non-existent order number should return no results
            pass  # verify the order number is not displayed
            self.browser.quit()
        elif case == 3:  # a partial order number should find all orders containing it
            pass  # all orders containing the partial number are displayed
            self.browser.quit()
        elif case == 4:  # an empty query should return no orders
            pass  # nothing is displayed; verify no order was found
            self.browser.quit()

    def _login(self):
        self.browser = Browser(self.browser_type)
        self.browser.visit(self.site)

    def _fill_form(self):
        self._login()
class BrowserManager:
    def __init__(self):
        self._lock = False

    def bootup(self):
        self._display = Display(visible=0, size=(1024, 768))
        self._display.start()
        profile = {}
        if 'HTTP_PROXY' in os.environ:
            proxy_url = os.environ['HTTP_PROXY']
            proxy_server = proxy_url.split(':')[1][2:]
            # Firefox expects the port preference as an integer, not a string
            proxy_port = int(proxy_url.split(':')[-1])
            profile['network.proxy.type'] = 1
            profile['network.proxy.http'] = proxy_server
            profile['network.proxy.http_port'] = proxy_port
            profile['network.proxy.https'] = proxy_server
            profile['network.proxy.https_port'] = proxy_port
        self.browser = Browser(profile_preferences=profile)

    def obtain(self, background):
        while self._lock:
            background.wait('Browser lock', 15)
        self._lock = True
        return self.browser

    def release(self, background):
        self._lock = False

    def shutdown(self):
        self.browser.quit()
        self._display.stop()
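# A minimal usage sketch for BrowserManager (a sketch, assuming
# pyvirtualdisplay's Display and splinter's Browser are imported, and that
# `background` is any object exposing a wait(reason, seconds) method):
manager = BrowserManager()
manager.bootup()
browser = manager.obtain(background)
try:
    browser.visit('http://example.com')
finally:
    manager.release(background)
    manager.shutdown()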
class editgroup(object):
    def __init__(self, browser_type):
        self.browser_type = browser_type
        self.browser = Browser(self.browser_type)
        self.site = 'http://www.kuaixiuagency.com'  # login page
        self.browser.visit(self.site)

    def config(self, name='', admin=''):
        self.name = name
        self.admin = admin

    def verify(self, case):
        if case == 1:  # renaming the group and choosing another admin should save
            if True:  # verify the edited values match the input
                # verify the previous admin no longer sees the old group after login
                # verify the new admin can manage the edited group after login
                self.browser.quit()
                return 'Passed'
            else:
                self.browser.quit()
                return 'Failed'
        elif case == 2:  # clearing the group name and choosing another admin should not save
            pass  # verify submission is blocked and an error message is shown
            self.browser.quit()
        elif case == 3:  # renaming the group without choosing an admin should not save
            pass  # verify the form cannot be submitted
            self.browser.quit()
class SplinterThesis(object):
    def __init__(self):
        self.browser = Browser("phantomjs", wait_time=10)

    # visit a page
    def get_html(self, url):
        self.browser.visit(url)
        return self.browser

    # find the download URL
    def get_down_url(self, down_head, browser):
        result = browser.find_link_by_text("下载全文")  # the "download full text" link
        down_url = ""
        if result:
            result = str(result["onclick"]).split("'")
            temp_url = down_head + result[1]
            browser.visit(temp_url)
            result = browser.find_link_by_text("下载地址")  # the "download address" link
            if browser and result["href"]:
                down_url = result["href"]
        return down_url

    def main(self, down_url, url):
        html_url = down_url + url
        browser = self.get_html(html_url)
        down_url = self.get_down_url(down_url, browser)
        return down_url
def webkit2png(url, image_file_path, browser=None, wait_time=0):
    new_browser = False
    try:
        if not browser:
            browser = Browser('phantomjs')
            new_browser = True
        browser.visit(url)
        if browser.status_code.is_success():
            if wait_time > 0:
                time.sleep(wait_time)
            browser.driver.save_screenshot(image_file_path)
            image = Image.open(image_file_path)
            image.load()
            if is_transparent(image) and False:  # transparency handling disabled
                no_alpha = Image.new('L', image.size, (255))
                no_alpha.paste(image, mask=image.split()[-1])
            else:
                no_alpha = image.convert('L')
            # Convert to black and white image.
            bw = no_alpha.convert('L')
            # bw = bw.filter(ImageFilter.MedianFilter)
            # White background.
            bg = Image.new('L', image.size, 255)
            bbox = ImageChops.difference(bw, bg).getbbox()
            if bbox:
                image = image.crop(bbox)
            image.save(image_file_path)
    finally:
        if new_browser:
            browser.quit()
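# A minimal usage sketch for webkit2png (a sketch, assuming PhantomJS plus the
# PIL imports used above; passing in one shared browser avoids a fresh
# PhantomJS startup per screenshot):
shared = Browser('phantomjs')
webkit2png('http://example.com', 'example.png', browser=shared, wait_time=1)
webkit2png('http://example.org', 'example2.png', browser=shared)
shared.quit()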
class LiveServerTestCase(TestCaseMixin, BaseLiveServerTestCase):
    """Base test case for in-browser functional tests."""

    initial_url = None
    login_button_value = 'Log in'

    def login(self):
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.password)
        self.browser.find_by_value(self.login_button_value).click()

    def create_staff_user(self, **kwargs):
        kwargs.setdefault('is_staff', True)
        kwargs.setdefault('is_superuser', True)
        return super(LiveServerTestCase, self).create_user(**kwargs)

    def setUp(self):
        self.browser = Browser('django')
        if self.initial_url is not None:
            self.browser.visit("{}{}".format(
                self.live_server_url,
                self.initial_url,
            ))

    def tearDown(self):
        self.browser.quit()
def test_set_episode(self):
    browser = Browser("firefox", extensions=["adblock.xpi"])
    browser.visit("http://kinox.to/Stream/The_Big_Bang_Theory.html")
    stream = script.watchSeries("http://kinox.to/Stream/The_Big_Bang_Theory.html", 100, 100, None)
    stream.set_episode(browser)
    sleep(5)
    browser.quit()
class GoogleTestCase(unittest.TestCase):
    def setUp(self):
        self.browser = Browser('phantomjs')
        self.browser.visit('http://google.com')

    def test_check_title(self):
        assert self.browser.title == 'Google'
def _parse_article_html(self, objectId, url, industry_press=None):
    # browser.visit("http://www.businesswire.com/news/home/20150409005073/en")
    browser = Browser("phantomjs")
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    # html = requests.get(url).text
    html = BeautifulSoup(html, "html.parser")
    article = html.find("div", {"class": "bw-release-story"})
    company_name = html.find("h3", {"itemprop": "sourceOrganization"})
    company_name = company_name.find("span", {"itemprop": "name"})
    vals = [article, company_name]
    cols = ["article", "company_name"]
    # TODO - itemprop="name" company_name
    # TODO - persist in parse
    vals = [self.remove_non_ascii(i.text) if i else "" for i in vals]
    data = dict(zip(cols, vals))
    # print(data["company_name"])
    print(data)
    # q.enqueue(ClearSpark()._bulk_company_info, data["company_name"])
    if industry_press:
        r = Parse().update("IndustryPress", objectId, data)
    else:
        r = Parse().update("Press", objectId, data)
    print(r.json())
    browser.quit()
class BrowserTest(TestCase):
    def setUp(self):
        self.browser = Browser('phantomjs', wait_time=10)
        self.browser.driver.set_window_size(1024, 768)

    def tearDown(self):
        self.browser.quit()
def test_queue2csvzhconsumer(self):
    container = Queue()
    consumer = QueueConsumer(container)
    store = CSVStore(CSV_FILENAME, CSV_FIELDNAMES)
    zhconsumer = ZHConsumer(consumer, store)
    browser = Browser('firefox')
    browser.visit('http://127.0.0.1:8888')
    zhconsumer.start()
    """How to start a function in a thread:
    thread.start_new_thread(func, (args,))
    """
    data = html.document_fromstring(browser.html)
    for i in range(1000):
        container.put(data)
    while 1:
        if not container.qsize():
            break
        else:
            print("the size of queue : %s" % str(container.qsize()))
            # without a sleep here, the GIL never leaves this loop
            # time.sleep(1)
    zhconsumer.close()
    zhconsumer.stop()
def firefox_installed():
    try:
        browser = Browser("firefox")
        browser.quit()
    except OSError:
        return False
    return True
def run(self):
    browser = Browser('chrome')
    logging.warning('Process:%s start.' % self.name)
    while True:
        url = self.url_queue.get()
        browser.visit(url)
        html = browser.html
        self.output_queue.put((url, html))
def get_pages(portal):
    # gets the last page number
    browser = Browser("phantomjs", service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    browser.visit(portal)
    html = browser.html
    browser.quit()
    soup = BeautifulSoup(html, "lxml")
    last_page = soup.find('li', id="lastPage").text
    return last_page
def test_should_be_able_to_change_user_agent(self):
    from splinter import Browser
    driver_name = self.browser.driver_name.lower()
    browser = Browser(driver_name=driver_name, user_agent="iphone")
    browser.visit(EXAMPLE_APP + "useragent")
    result = 'iphone' in browser.html
    browser.quit()
    self.assertTrue(result)
def browser_can_change_user_agent(self, webdriver):
    from splinter import Browser
    browser = Browser(driver_name=webdriver, user_agent="iphone")
    browser.visit(EXAMPLE_APP + "useragent")
    result = 'iphone' in browser.html
    browser.quit()
    return result
def test_login(self):
    with pyvirtualdisplay.Display():
        browser = Browser()
        browser.visit("http://ui:8080/accounts/login/")
        browser.fill("login", "testuser")
        browser.fill("password", "password")
        browser.find_by_css(".btn-primary").click()
        self.assertTrue(browser.find_by_text("Successfully signed in as testuser."))
def test_full_and_play(self):
    browser = Browser("firefox", extensions=["adblock.xpi"])
    sleep(5)
    browser.visit("http://streamcloud.eu/e9xfkyuqt6ze/Poltergeist.BDScr.LD.German.x264-PsO.mkv.html")
    stream = script.watchSeries(
        "http://streamcloud.eu/e9xfkyuqt6ze/Poltergeist.BDScr.LD.German.x264-PsO.mkv.html",
        None
    )
    stream.full_and_close(browser)
def get_hemis_img(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # Go to website
    browser.visit(url)
    # read website's html
    html = browser.html
    soup = bs(html, 'html.parser')
    # find "a" tags
    result = browser.find_by_tag("a")
    # collect the first link to each full image, skipping duplicates
    hemis_image_path_list = []
    for i in range(8):
        if result[i + 4]["href"] not in hemis_image_path_list:
            hemis_image_path_list.append(result[i + 4]["href"])
    # browser reload
    browser.reload()
    # Close the browser after scraping
    browser.quit()

    final_hemis_img_url_list = []
    for i in range(len(hemis_image_path_list)):
        # Path to chromedriver
        executable_path = {'executable_path': 'chromedriver.exe'}
        browser = Browser('chrome', **executable_path, headless=False)
        # Go to website
        browser.visit(hemis_image_path_list[i])
        # read website's html
        html = browser.html
        soup = bs(html, 'html.parser')
        # get image title
        result_title = soup.find('h2', class_='title').get_text()
        # get image URL
        result = soup.find('img', class_='wide-image')["src"]
        final_url = 'https://astrogeology.usgs.gov' + result
        # concat image URL to get complete URL link
        final_hemis_img_url_list.append({
            "title": result_title,
            "img_url": final_url
        })
        # browser reload
        browser.reload()
        # Close the browser after scraping
        browser.quit()
    return final_hemis_img_url_list
def init_browser():
    executable_path = {'executable_path': r"C:\Drivers\chromedriver\chromedriver.exe"}
    # return the browser so callers can use it
    return Browser("chrome", **executable_path, headless=False)
def setUpClass(cls):
    cls.browser = Browser("phantomjs", wait_time=1.5)
def scrape():
    import time
    import datetime as dt

    mars_dict = {}
    # set chrome driver path
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # visit NASA Mars News url
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find("div", class_='list_text')
    news_title = article.find("div", class_="content_title").text
    mars_dict['title'] = news_title
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict["paragraph"] = news_p

    #################################################
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(3)
    browser.links.find_by_partial_text('FULL IMAGE')[0].click()
    time.sleep(3)
    browser.links.find_by_partial_text('more info').click()
    time.sleep(3)
    html = browser.html
    image_soup = BeautifulSoup(html, 'html.parser')
    feat_img_url = image_soup.find('figure', class_='lede').a['href']
    featured_image_url = f'https://www.jpl.nasa.gov{feat_img_url}'
    mars_dict['featured_image_url'] = featured_image_url

    #################################################
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_weather = soup.find(
        'div',
        class_='css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'
    ).text
    mars_dict['mars_weather'] = mars_weather

    #################################################
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html
    table = pd.read_html(url)
    mars_facts = table[2]
    # Rename columns
    mars_facts.columns = ['Description', 'Value']
    # Reset Index
    mars_facts.set_index('Description', inplace=True)
    # Convert table data to an HTML string
    mars_facts = mars_facts.to_html()
    mars_facts = mars_facts.replace("\n", "")
    mars_dict['mars_facts'] = mars_facts

    #################################################
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    products = soup.find('div', class_='result-list')
    hemispheres = products.find_all('div', class_='item')
    hemisphere_image_urls = []
    for hemisphere in hemispheres:
        title = hemisphere.find("h3").text
        title = title.replace("Enhanced", "")
        end_link = hemisphere.find("a")["href"]
        image_link = "https://astrogeology.usgs.gov/" + end_link
        browser.visit(image_link)
        time.sleep(5)
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        downloads = soup.find("div", class_="downloads")
        image_url = downloads.find("a")["href"]
        hemisphere_image_urls.append({"title": title, "image_url": image_url})

    mars_dict['hemisphere_urls'] = hemisphere_image_urls
    mars_dict["TimeStamp"] = dt.datetime.now()
    browser.quit()
    return mars_dict

# if __name__ == '__main__':
#     scrape()
# test
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import datetime as dt

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

def mars_facts():
    # Add try/except for error handling
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('http://space-facts.com/mars/')[0]
    except BaseException:
        # bail out so the code below never touches an undefined df
        print("None Available")
        return None

    # Assign columns and set index of dataframe
    df.columns = ['Description', 'Mars']
    df.set_index('Description', inplace=True)

    # Convert dataframe into HTML format, add bootstrap
    return df.to_html(classes="table table-striped")
def scrape():
    from splinter import Browser
    from splinter.exceptions import ElementDoesNotExist
    import numpy as np
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests

    executable_path = {'executable_path': 'Resources/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1.1 Scraping News Title and Paragraphs
    url1 = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url1)
    news_title = []
    news_para = []
    for pages in range(10):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='content_title')
        paragraphs = soup.find_all(class_='article_teaser_body')
        for title in titles:
            news_title.append(title.a.text)
        for paragraph in paragraphs:
            news_para.append(paragraph.text)
        try:
            browser.click_link_by_partial_text('MORE')
        except Exception:
            print("Scraping Complete")
    np_news_title = np.unique(np.array(news_title))
    np_news_para = np.unique(np.array(news_para))

    # 1.2 Get Images JPL Mars Space Images - Featured Image
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url2)
    featured_image_url = []
    for pages in range(5):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        url_imgs = soup.find_all(class_='img')
        for url_img in url_imgs:
            image = url_img.img['src']
            featured_image_url.append('https://www.jpl.nasa.gov' + image)
        try:
            browser.click_link_by_partial_text('Next')
        except Exception:
            print("Scraping Complete")
    np_featured_image_url = np.unique(np.array(featured_image_url))

    # 1.3 Mars Weather
    url3 = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url3)
    soup = BeautifulSoup(response.text, 'html.parser')
    mars_weather = []
    results = soup.find_all('div', class_="js-tweet-text-container")
    for result in results:
        try:
            weather = result.p.text
            mars_weather.append(weather)
        except AttributeError as e:
            print(e)
    mars_weather = mars_weather[1]

    # 1.4 Mars Facts
    url4 = 'https://space-facts.com/mars/'
    marsFacts = pd.read_html(url4)[0]
    marsFacts.drop(columns='Earth', inplace=True)
    marsFacts.columns = ['MarsFacts', 'Value']
    marsFacts.head()

    # 1.5 Mars Hemispheres
    url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url5)
    image_url = []
    title = []
    href_container = []
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    href_url_divs = soup.find_all('div', class_='item')
    for div in href_url_divs:
        href_container.append('https://astrogeology.usgs.gov' + div.a['href'])
    for links in href_container:
        try:
            browser.visit(links)
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            browser.click_link_by_partial_text('Open')
            img = soup.find('img', class_='wide-image')
            title = soup.find('h2', class_='title')
            print(img)
            image_url.append({
                'title': title.text.replace(' Enhanced', ''),
                'img_url': 'https://astrogeology.usgs.gov/' + img['src']
            })
        except Exception:
            print('scraping complete')

    scraped = {
        'NewsTitle': np_news_title,
        'NewsParagraphs': np_news_para,
        'FeaturedImages': np_featured_image_url,
        'Facts': marsFacts,
        'Weather': mars_weather,
        'Hemispheres': image_url
    }
    return scraped
#!/usr/bin/env python # coding: utf-8 # In[1]: # Import Splinter and BeautifulSoup from splinter import Browser from bs4 import BeautifulSoup as soup from webdriver_manager.chrome import ChromeDriverManager import pandas as pd # In[2]: executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) # In[3]: # Visit the mars nasa news site url = 'https://redplanetscience.com' browser.visit(url) # Optional delay for loading the page browser.is_element_present_by_css('div.list_text', wait_time=1) # In[4]: html = browser.html news_soup = soup(html, 'html.parser') slide_elem = news_soup.select_one('div.list_text')
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=True)
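# A minimal usage sketch for the init_browser() pattern above (a sketch,
# assuming splinter's Browser is imported and the chromedriver path is valid):
browser = init_browser()
try:
    browser.visit('https://mars.nasa.gov/news/')
    html = browser.html  # hand the html to BeautifulSoup or similar
finally:
    browser.quit()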
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path, headless=False)
def setUpClass(cls):
    custom_headers = {
        'X-Splinter-Customheaders-1': 'Hello',
        'X-Splinter-Customheaders-2': 'Bye'
    }
    cls.browser = Browser("phantomjs", custom_headers=custom_headers)
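# A hedged check that the headers were actually sent (a sketch using
# httpbin.org, which echoes request headers back as JSON; network access
# and a header-capable driver such as PhantomJS are assumed):
browser = Browser("phantomjs", custom_headers={'X-Splinter-Customheaders-1': 'Hello'})
browser.visit('https://httpbin.org/headers')
assert 'Hello' in browser.html
browser.quit()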
def init_browser():
    # Windows: make sure chromedriver sits next to this notebook/script
    executable_path = {"executable_path": "chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=False)
def get_featured_img_func(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # Go to website
    browser.visit(url)
    # find the "Full Image" button and click it to reach the next page
    full_img = browser.find_by_id("full_image")
    full_img.click()
    # find the "More Info" button and click it to reach the next page
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()
    # read the website's html
    html = browser.html
    soup = bs(html, 'html.parser')
    # find "a" tags to locate the href containing the image URL
    result = browser.find_by_tag("a")
    relative_image_path = result[58]["href"]
    # get image title
    relative_image_title = soup.find('h1', class_='article_title')
    relative_image_title = relative_image_title.get_text()
    relative_image_title = relative_image_title.split('\t')

    final_title_feature_img = []
    final_title_feature_img.append({
        'Title': relative_image_title[4],
        'URL': relative_image_path
    })
    # Close the browser after scraping
    browser.quit()
    # return scraped object
    return final_title_feature_img
def init_browser():
    # Setting the chromedriver path
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
def fetch(url):
    executable_path = {'executable_path': GeckoDriverManager().install()}
    browser = Browser('firefox', **executable_path, headless=True)
    browser.visit(url)
    html = browser.html
    # close the browser before returning so each call cleans up after itself
    browser.quit()
    return bs(html, 'html.parser')
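# A minimal usage sketch for fetch() (a sketch, assuming webdriver_manager's
# GeckoDriverManager, splinter's Browser, and bs4-as-bs are imported as above):
doc = fetch('https://redplanetscience.com')
print(doc.select_one('div.list_text'))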
def init_browser():
    executable_path = {"executable_path": r"C:\chromedriver_win32\chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
def scrape():
    import pandas as pd
    from bs4 import BeautifulSoup as bs
    import requests
    from splinter import Browser
    from webdriver_manager.chrome import ChromeDriverManager
    import pymongo

    # scrape news
    url = 'https://mars.nasa.gov/news'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    title_results = soup.find('div', class_="content_title")
    news_title = title_results.get_text(strip=True)
    para_results = soup.find('div', class_="rollover_description_inner")
    news_p = para_results.get_text(strip=True)

    # scrape for image
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.find_by_css('div[class="NavDesktopDropdown -active"]')[0].click()
    browser.click_link_by_partial_text('Featured Image')
    html = browser.html
    soup = bs(html, 'html.parser')
    main = soup.find('main')
    img = main.find('img')
    featured_image_url = img['src']

    # scrape for facts
    url = 'https://space-facts.com/mars/'
    mars_data_table = pd.read_html(url, header=None)
    mars_data_table = mars_data_table[0]
    mars_data_table = mars_data_table.rename(columns={
        0: 'Description',
        1: 'Mars'
    })
    mars_data_html = mars_data_table.to_html()

    # scrape hemisphere info
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    url_list = []
    hemi_list = []
    items = soup.find_all('div', class_='description')
    for item in items:
        a = item.find('a', class_='itemLink')
        hemi = a.text.strip()
        hemi_list.append(hemi)
        url = item.find('a')['href']
        url_list.append(url)
    hemi_list = [i.split(' Enhanced', 1)[0] for i in hemi_list]

    original_img_list = []
    for url in url_list:
        browser.visit(f"https://astrogeology.usgs.gov/{url}")
        html = browser.html
        soup = bs(html, 'html.parser')
        downloads = soup.find_all('li')
        temp_list = []
        for download in downloads:
            original_img = download.find('a')['href']
            temp_list.append(original_img)
        original_img_list.append(temp_list[1])

    hemisphere_image_urls = []
    browser.quit()
    for i in range(0, 4, 1):
        temp_dict = {'title': hemi_list[i], 'img_url': original_img_list[i]}
        hemisphere_image_urls.append(temp_dict)

    # put all results in a list of dicts
    mars_dict = [{
        'Recent_News': news_title,
        'News_Story': news_p
    }, {
        'Featured_Image': featured_image_url
    }, {
        'Mars_Data': mars_data_html
    }, {
        'Mars_Hemispheres': hemisphere_image_urls
    }]

    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_collection = db.mars_collection
    mars_collection.insert_many(mars_dict)
    print('Mars Data Uploaded')
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

# In[2]:

# Set the executable path and initialize Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# ### Visit the NASA Mars News Site

# In[3]:

# Visit the mars nasa news site
url = 'https://redplanetscience.com/'
browser.visit(url)

# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

# In[4]:

# Convert the browser html to a soup object and then quit the browser
html = browser.html
news_soup = soup(html, 'html.parser')
browser.quit()
#!/usr/bin/env python # coding: utf-8 # # Module: 10.3.3 Scrape Mars Data: The News # Import Splinter and BeautifulSoup from splinter import Browser from bs4 import BeautifulSoup as soup from webdriver_manager.chrome import ChromeDriverManager import pandas as pd executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) # Visit the mars nasa news site url = 'https://redplanetscience.com' browser.visit(url) # Optional delay for loading the page browser.is_element_present_by_css('div.list_text', wait_time=1) #set up the HTML parser html = browser.html news_soup = soup(html, 'html.parser') slide_elem = news_soup.select_one('div.list_text') slide_elem.find('div', class_='content_title') # Use the parent element to find the first `a` tag and save it as `news_title` news_title = slide_elem.find('div', class_='content_title').get_text() news_title
def scrapey_mars():
    mars_dict = {}

    ## Part 1 ##
    news = 'https://mars.nasa.gov/news/'
    response = requests.get(news)
    soup = bs(response.text, 'html.parser')
    title = soup.find('div', class_="content_title")
    news_title = title.a.text
    summary = soup.find('div', class_="rollover_description_inner")
    news_sum = summary.text
    mars_dict["News_Title"] = news_title
    mars_dict["News_Summary"] = news_sum

    ## Part 2 ##
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image = browser.find_by_id('full_image')
    time.sleep(1)
    full_image.click()
    time.sleep(1)
    m_info = browser.find_link_by_partial_text('more info')
    m_info.click()
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    full_image = new_soup.select_one('figure.lede a img').get("src")
    # full text of the description block
    full_image_summary = new_soup.find('div', class_="wysiwyg_content").get_text()
    jpl_url = 'https://www.jpl.nasa.gov'
    featured_image_url = jpl_url + full_image
    mars_dict["Featured_Image_Link"] = featured_image_url
    mars_dict["Image_Summary"] = full_image_summary
    full_image_title = new_soup.find('h1', class_="article_title")
    mars_dict["Featured_Image_Title"] = full_image_title.text.strip('\n\t": ')

    ## Part 3 ##
    marsweather = 'https://twitter.com/marswxreport'
    response2 = requests.get(marsweather)
    soup2 = bs(response2.text, 'html.parser')
    weather = []
    for w_info in soup2.find_all(
            'p',
            class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"):
        weather.append(w_info.text.strip())
    for tweet in reversed(weather):
        if tweet[:3] == "InS":
            mars_weather = tweet
    mars_dict["weather_info"] = mars_weather

    ## Part 4 ##
    pandaurl = 'https://space-facts.com/mars/'
    tables = pd.read_html(pandaurl)
    df = tables[0]
    df.columns = ['Mars Facts', 'Values']
    df.set_index("Mars Facts")
    mars_facts_html = df.to_html(index=False, classes="table-hover table-dark table-sm")
    mars_dict["facts_table"] = mars_facts_html

    ## Part 5 ##
    hemispheres = [
        'Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris'
    ]
    hem_title = []
    img_urls = []
    for i in hemispheres:
        my_url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/' + i
        browser.visit(my_url)
        time.sleep(1)
        url_html = browser.html
        page_soup = bs(url_html, "html.parser")
        hem_images = page_soup.find('div', class_='downloads').find('li').a['href']
        img_urls.append(hem_images)
        hem_name = i + ' Hemisphere'
        hem_title.append(hem_name)
        print(hem_name + ' is a great success!')

    hemisphere_image_urls = [{'image_url': v} for k, v in zip(hem_title, img_urls)]
    hemisphere_image_titles = [{'hem_title': b} for b, l in zip(hem_title, img_urls)]
    mars_dict["Hemisphere_image_data"] = hemisphere_image_urls
    mars_dict["Hemisphere_title_data"] = hemisphere_image_titles

    browser.quit()
    return mars_dict
def scrape():
    # Dependencies
    from bs4 import BeautifulSoup as bs
    import pandas as pd
    from splinter import Browser

    executable_path = {'executable_path': 'resources/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Create Mission to Mars global dictionary that can be imported into MongoDB
    mars_info = {}

    ### NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest news title and paragraph text.
    url_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url_news)
    soup_news = bs(browser.html, 'html.parser')
    # print(soup_news.prettify())

    # Find latest news title
    news_title = soup_news.find('div', class_='content_title').text
    # Dictionary entry for news title
    mars_info['news_title'] = news_title

    # Find latest news paragraph
    news_paragraph = soup_news.find('div', class_='article_teaser_body').text
    # Dictionary entry for news paragraph
    mars_info['news_paragraph'] = news_paragraph

    ### JPL Mars Space Images - Featured Image
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_image)
    soup_image = bs(browser.html, 'html.parser')
    # print(soup_image.prettify())
    image = soup_image.find_all('a', class_='fancybox')[1]['data-fancybox-href']
    # print(image)

    # Concatenate website url with the scraped route
    featured_image_url = 'https://www.jpl.nasa.gov' + image
    # Dictionary entry for Mars featured image
    mars_info['featured_image_url'] = featured_image_url

    ### Mars Weather from Twitter
    # Scrape the latest Mars weather tweet from the Mars Weather twitter account.
    url_twitter = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url_twitter)
    soup_weather = bs(browser.html, 'html.parser')
    # print(soup_weather.prettify())

    # Display mars weather details
    mars_weather = soup_weather.find_all('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')[0].text.split("pic")[0]
    # Dictionary entry for Mars weather from twitter
    mars_info['mars_weather'] = mars_weather

    ### Mars Facts from Space Facts
    # Visit the Space Facts webpage, mars facts page.
    # Use Pandas to scrape the table containing facts about Mars including Diameter, Mass, etc.
    # Use Pandas to convert the data to an HTML table string
    url_facts = 'https://space-facts.com/mars/'
    browser.visit(url_facts)

    # Use Pandas' `read_html` to parse the url
    facts_df = pd.read_html(url_facts)[0]
    # Rename columns
    facts_df.columns = ['Description', 'Value']
    # Set description column as index
    facts_df.set_index('Description', inplace=True)
    # Dictionary entry for Mars Facts from Space Facts
    mars_info['mars_facts'] = facts_df.to_html()

    ### Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high resolution images for each of Mars' hemispheres.
    url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemispheres)
    soup_hemisphere = bs(browser.html, 'html.parser')
    # print(soup_hemisphere.prettify())

    results = soup_hemisphere.find_all('div', class_='description')
    hemisphere_image_urls = []
    for result in results:
        # Get hemisphere name and save in variable called title
        title = result.find('h3').text
        # Get the link to the hemisphere page and save in variable called url
        partial_url = result.find('a', class_="itemLink product-item")['href']
        url = 'https://astrogeology.usgs.gov/' + partial_url

        # Visit each url to find the full resolution hemisphere image (saved in img_url).
        browser.visit(url)
        soup_imgs = bs(browser.html, 'html.parser')
        img_url = soup_imgs.find('div', class_='downloads').li.a['href']

        # Use a Python dictionary to store the data using the keys img_url and title.
        # Append the dictionary with the hemisphere title and image url string to a list.
        # This list will contain one dictionary for each hemisphere.
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})

    mars_info['hemisphere_image_urls'] = hemisphere_image_urls

    browser.quit()
    return mars_info
def browser(self):
    browser = Browser('chrome')
    yield browser
    browser.quit()
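# The yield/quit generator above matches the pytest fixture pattern; a hedged,
# self-contained version (a sketch, assuming pytest is in use) would be:
import pytest
from splinter import Browser

@pytest.fixture
def browser():
    browser = Browser('chrome')
    yield browser
    browser.quit()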
#!/usr/bin/env python
# coding: utf-8
from bs4 import BeautifulSoup
from splinter import Browser
from selenium import webdriver
import pandas as pd
import time
import datetime as dt
import re

################################################
executable_path = {"executable_path": "./chromedriver.exe"}
browser = Browser("chrome", **executable_path)
#################################################

def scrape():
    mars_dict = {}
    # set chrome driver path
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # visit NASA Mars News url
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
def scrape():
    # Mars News
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_title = soup.find("div", class_="content_title").get_text()
    news_p = soup.find("div", class_="article_teaser_body").get_text()

    # JPL Mars Space Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    image_url = soup.footer.find("a", class_="button fancybox")["data-fancybox-href"]
    featured_image_url = "https://www.jpl.nasa.gov" + image_url

    # Mars weather
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    tweets = soup.find_all("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    for tweet in tweets:
        tweet_parent = tweet.find_parent("div", class_="content")
        tweet_id = tweet_parent.find(
            "a",
            class_="account-group js-account-group js-action-profile js-user-profile-link js-nav"
        )["href"]
        if tweet_id == '/MarsWxReport':
            mars_weather = tweet_parent.find(
                "p",
                class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            ).get_text()
            break

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ["Description", "Value"]
    df.set_index(df["Description"], inplace=True)
    df = df[["Value"]]
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')

    # Mars Hemispheres
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    h3s = soup.find_all("h3")
    titles = []
    for h3 in h3s:
        # strip the "<h3>" prefix and the " Enhanced</h3>" suffix
        h3 = str(h3)
        h3 = h3[4:-14]
        titles.append(h3)
    img_urls = []
    for title in titles:
        browser.click_link_by_partial_text(title)
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        img_urls.append(soup.find("div", class_="downloads").find("a")["href"])
    hemisphere_image_urls = []
    for title, img_url in zip(titles, img_urls):
        hemisphere_image_urls.append({"title": title, "img_url": img_url})

    data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    return data
def init_browser():
    # Launch splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
def scrape_info():
    mars = {}
    # get_ipython().system('which chromedriver')
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    article = soup.find("div", class_="list_text")
    news_title = article.find("div", class_="content_title").text
    news_p = article.find("div", class_="article_teaser_body").text
    print(news_title)
    print(news_p)
    mars["news_title"] = news_title
    mars["news_p"] = news_p

    # Featured Images
    base_url = 'https://www.jpl.nasa.gov'
    url = base_url + '/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    image_url = soup.find("a", class_="button fancybox")["data-fancybox-href"]
    featured_image_url = base_url + image_url
    print(featured_image_url)
    mars["featured_image"] = featured_image_url

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_facts_df = tables[0]
    mars_facts_df.columns = ['Fact', 'Value']
    mars_facts_df['Fact'] = mars_facts_df['Fact'].str.replace(':', '')
    mars_facts_html = mars_facts_df.to_html()
    print(mars_facts_html)
    mars["facts"] = mars_facts_html

    # Mars Hemispheres
    base_url = 'https://astrogeology.usgs.gov'
    url = base_url + '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    items = soup.find_all('div', class_='item')
    urls = []
    titles = []
    for item in items:
        urls.append(base_url + item.find('a')['href'])
        titles.append(item.find('h3').text.strip())
    print(urls)

    image_urls = []
    for oneurl in urls:
        browser.visit(oneurl)
        html = browser.html
        soup = bs(html, 'html.parser')
        oneurl = base_url + soup.find('img', class_='wide-image')['src']
        image_urls.append(oneurl)

    hemisphere_images_urls = []
    for i in range(len(titles)):
        hemisphere_images_urls.append({
            'title': titles[i],
            'image_url': image_urls[i]
        })
    mars["hemispheres"] = hemisphere_images_urls

    # for i in range(len(hemisphere_images_urls)):
    #     print(hemisphere_images_urls[i]['title'])
    #     print(hemisphere_images_urls[i]['image_url'] + '\n')

    return mars
# -*- coding: utf-8 -*-
from splinter import Browser
import time
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter

tic = time.perf_counter()  # time.clock() was removed in Python 3.8
browser = Browser('chrome')
url = "file:///C:/Users/servadmin/Documents/Atom%20Projects/formfill/index.html"
browser.visit(url)

im = Image.open("abre.png")
# clean up the scan before OCR: denoise, boost contrast, binarize
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('temp2.png')

pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract'
text = pytesseract.image_to_string(Image.open('temp2.png'))
out = text.split("*")

browser.find_by_id("1").fill(out[0])
browser.find_by_id("2").fill(out[1])
browser.find_by_id("3").fill(out[2])
browser.find_by_id("4").fill(out[3])
browser.find_by_id("5").fill(out[4])
browser.find_by_id("6").fill(out[5])
# browser.find_by_id("__tab_ctl00_ContentPlaceHolder1_tabForm_tabTransactionDetail").click()  # TRANSACTION DETAILS
browser.find_by_id("7").fill(out[6])
browser.find_by_id("8").fill(out[7])
def init_browser():
    # webdriver_manager downloads and caches the matching chromedriver automatically
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser("chrome", **executable_path, headless=False)
def init_browser():
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
# Import Splinter and BeautifulSoup
import requests
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

# Path to chromedriver
get_ipython().system('which chromedriver')

# In[3]:

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path)

# ## Visit the NASA Mars News site

# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

# Convert the browser html to a soup object and then quit the browser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('ul.item_list li.slide')
slide_elem.find("div", class_='content_title')