Example #1
def run_for_browser(server, frmvars, browser, extra_params=None):
    if extra_params is None:
        extra_params = {}
    url = server['url']

    TestBase.log(1, 'BROWSER', browser)

    br = Browser(browser, **extra_params)

    if TestMode.remote_testdb_on(br, server):
        # default tests
        test_obj = TestConfiguredUsers(br, url, frmvars)
        test_obj.run()

        # user defined tests from modules/_plugin_splinter_tests
        for TestClass in TESTCLASSES:
            if frmvars['all_tests'] or frmvars.get('test_' + TestClass, False):
                TestBase.log(2, 'TESTCLASS', TestClass)

                test_obj = globals()[TestClass](br, url)  #** see imports
                test_obj.run()
        # seems not necessary and not good here: TestMode.remote_testdb_off(br, server)
    else:
        TestBase.log(2, 'FATAL', 'Cannot log in.')

    br.quit()
    print()
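
A minimal, self-contained sketch of the name-based registry pattern that the `globals()[TestClass]` lookup above relies on; the class names and the `run()` signature here are hypothetical, not from the source:

TESTCLASSES = ['TestLogin', 'TestSearch']  # hypothetical names

class TestLogin:
    def __init__(self, br, url):
        self.br, self.url = br, url
    def run(self):
        print('running TestLogin against', self.url)

class TestSearch(TestLogin):
    def run(self):
        print('running TestSearch against', self.url)

def run_selected(br, url, frmvars):
    # Resolve each class by name in module globals, as the example does.
    for name in TESTCLASSES:
        if frmvars.get('all_tests') or frmvars.get('test_' + name):
            globals()[name](br, url).run()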
Example #2
class WebKit(object):
    '''WebKit engine'''
    def __init__(self):
        self.tag_attr_dict = {'*':'href',
                              'embed':'src',
                              'frame':'src',
                              'iframe':'src',
                              'object':'data'}

    def extract_links(self, url):
        '''
        Crawl the links on a page.
        '''
        self.browser = Browser("phantomjs")
        try:
            self.browser.visit(url)
        except Exception:
            return
        for tag, attr in self.tag_attr_dict.items():
            link_list = self.browser.find_by_xpath('//%s[@%s]' % (tag, attr))
            if not link_list:
                continue
            for link in link_list:
                link = link[attr]
                if not link:
                    continue
                link = link.strip()
                if link == 'about:blank' or link.startswith('javascript:'):
                    continue
                if not link.startswith('http'):
                    # assumes: import urllib.parse as urlparse
                    link = urlparse.urljoin(url, link)
                yield link
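
A short usage sketch for the generator above, assuming PhantomJS is installed and on PATH; note the class never quits the browser it creates, so the caller has to:

wk = WebKit()
for link in wk.extract_links('http://example.com/'):
    print(link)
wk.browser.quit()  # extract_links leaves the PhantomJS session open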
Example #3
class editcompany(object):
	def __init__(self, browser_type):
		self.browser_type = browser_type
		self.browser = Browser(self.browser_type)
		self.site = 'http://www.kuaixiuagency.com'  # login page
		self.browser.visit(self.site)
	def config(self, name='', type=''):
		self.name = name
		self.type = type
	def verify(self, case):
		if case == 1:  # valid edits to company details should save
			if True:  # verify the company name and type match the input
				self.browser.quit()
				return u'pass'
			else:
				self.browser.quit()
				return u'fail'
		elif case == 2:  # clearing the name and picking another type should not save
			pass  # verify the form cannot be submitted
			self.browser.quit()
		elif case == 3:  # editing the name without picking a type should not save
			pass  # verify the form cannot be submitted
			self.browser.quit()
Example #4
def chrome_installed():
    try:
        browser = Browser("chrome")
        browser.quit()
    except WebDriverException:
        return False
    return True
Example #5
class newgroup(object):
	def __init__(self,browser_type):
		self.browser_type=browser_type
		self.site = 'http://www.kuaixiuagency.com'  # login page
		self.url = ''  # "new group" page
	def config(self, name='', admin=''):
		self.name = name
		self.admin = admin
	def verify(self, case):
		self._fill_form()
		if case == 1:  # entering a group name and selecting an admin should create the group
			if True:  # verify the saved name and selected admin match the input
				self.browser.quit()
				return u'pass'
			else:
				self.browser.quit()
				return u'fail'
		elif case == 2:  # entering a name without selecting an admin should not save
			pass  # verify the form cannot be submitted
			self.browser.quit()
		elif case == 3:  # selecting an admin without entering a name should not save
			pass  # verify the form cannot be submitted and an error is shown
			self.browser.quit()
		elif case == 4:  # an existing group name should not be accepted
			pass  # verify the form cannot be submitted
			self.browser.quit()
	def _login(self):  # log in
		self.browser = Browser(self.browser_type)
		self.browser.visit(self.site)
	def _fill_form(self):  # log in as administrator
		self._login()
Example #6
File: test.py Project: pafi1/stream
 def test_find_working_stream_fail(self):
     browser = Browser("firefox", extensions=["adblock.xpi"])
     browser.visit("http://kinox.to/Stream/The_Big_Bang_Theory.html")
     stream = script.watchSeries("http://kinox.to/Stream/The_Big_Bang_Theory.html", None)
     sleep(5)
     assert not stream.find_working_stream(browser)
     browser.quit()
Example #7
File: test.py Project: pafi1/stream
 def test_find_working_stream_success(self):
     browser = Browser("firefox", extensions=["adblock.xpi"])
     browser.visit("https://kinox.to/Stream/Poltergeist-2.html")
     stream = script.watchSeries("https://kinox.to/Stream/Poltergeist-2.html", None)
     sleep(5)
     assert stream.find_working_stream(browser)
     browser.quit()
Example #8
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_visit_index(self):
        self.browser.visit("http://0.0.0.0:8080/")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/")
       
    def test_visit_browse(self):
        self.browser.visit("http://0.0.0.0:8080/browse")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/browse")

    def test_visit_about(self):
        self.browser.visit("http://0.0.0.0:8080/about")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/about")

    def test_visit_redirect(self):
        """
        When a non-existent song url is requested, the browser should be
        redirected to the browse page
        """
        self.browser.visit("http://0.0.0.0:8080/songs/lugubrious_lima_beans-love_lichtenstein")
        self.assertEqual(self.browser.url, "http://0.0.0.0:8080/browse")

    def tearDown(self):
        """ Test teardown """
        # Stop the app process and close the browser
        self.process.terminate()
        self.browser.quit()
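
The fixed `time.sleep(1)` in `setUp` is a race; a hedged alternative is to poll until the port accepts connections. A sketch, assuming the app listens on localhost:8080:

import socket
import time

def wait_for_port(port, host='127.0.0.1', timeout=10.0):
    # Poll until a TCP connection succeeds or the deadline passes.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return
        except OSError:
            time.sleep(0.1)
    raise RuntimeError('server on port %d never came up' % port)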
Example #9
class SeleniumTestCase(LiveServerTestCase):
    """
    A base test case for Selenium, providing helper methods for generating
    clients and logging in profiles.
    """

    def setUp(self):
        """
        Base setup actions
        """

        self.user = User.objects.create_superuser(
            username='******',
            password='******',
            email='*****@*****.**'
        )
        self.browser = Browser()
        super(SeleniumTestCase, self).setUp()

    def open(self, url):
        """
        Open a relative URL
        """

        self.browser.visit("%s%s" % (self.live_server_url, url))

    def tearDown(self):
        """
        Base teardown action
        """

        if hasattr(self, 'browser'):
            self.browser.quit()
        super(SeleniumTestCase, self).tearDown()
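
A hedged usage sketch of the base class above; the URL and assertion are hypothetical:

class AdminSmokeTest(SeleniumTestCase):
    def test_admin_loads(self):
        self.open('/admin/')  # resolved against live_server_url
        self.assertIn('/admin/', self.browser.url)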
Example #10
def main():

    try:
        site_name = str(sys.argv[1])
    except Exception as ex:
        print('Insert URL to access site.json file')
        print(ex)
        sys.exit(-1)

    site_name_clean = remove_bars(site_name)

    req = requests.get(site_name_clean + "/site.json")
    if req.status_code != 200:
        print("Only OK! This response is " + req.status_code)
        sys.exit(-1)

    check_dir("typoshot-output")

    j = json.loads(req.text)
    for i in j["pages"]:
        browser = Browser("firefox")
        browser.visit(i["url"])
        if browser.status_code.is_success():
            str_replaced = i["title"].replace(" ", "-")
            browser.driver.save_screenshot(str_replaced + ".png")
        browser.quit()
Example #11
class Webkit(object):
	""
	def __init__(self):
		self.tag_attr = {
			'*': 'href',
			'frame': 'src',
			'iframe': 'src',
			'object': 'src'
		}

	def get_links(self, url):
		links = []
		self.browser = Browser('phantomjs')
		self.browser.visit(url)
		for tag, attr in self.tag_attr.items():
			llinks = self.browser.find_by_xpath('//%s[@%s]' % (tag, attr))
			if not llinks:
				continue
			for link in llinks:
				link = link[attr]
				if not link:
					continue
				if link == 'about:blank' or link.startswith('javascript:'):
					continue
				if not link.startswith('http'):
					# assumes: import urllib.parse as urlparse
					link = urlparse.urljoin(url, link)
				links.append(link)

		return links
			
	def close(self):
		self.browser.quit()
Example #12
class closeadmin(object):
	def __init__(self, browser_type):
		self.browser_type = browser_type
		self.site = 'http://www.kuaixiuagency.com'  # login page
	def verify(self, case):
		self._fill_form()
		if case == 1:  # an administrator can be closed from the admin list
			if True:  # a closed administrator cannot log in
				self.browser.quit()
				return u'pass'
			else:
				self.browser.quit()
				return u'fail'
		elif case == 2:  # a closed administrator cannot be closed again
			pass  # after closing, the close button is gone
			self.browser.quit()
		elif case == 3:  # a closed administrator can be re-enabled
			pass  # after re-enabling, the administrator can log in
			self.browser.quit()
	def _login(self):  # log in
		self.browser = Browser(self.browser_type)
		self.browser.visit(self.site)
	def _fill_form(self):  # fill in username and password
		self._login()
Example #13
class searchorder(object):
	def __init__(self, browser_type):
		self.browser_type = browser_type
		self.site = 'http://www.kuaixiuagency.com'  # login page
	def config(self, placeholder=''):
		self.placeholder = placeholder
	def verify(self, case):
		self._fill_form()
		if case == 1:  # an existing order number should find the matching order
			if True:  # the matching order is displayed
				self.browser.quit()
				return u'pass'
			else:
				self.browser.quit()
				return u'fail'
		elif case == 2:  # a non-existent order number should return no results
			pass  # verify the order number is not displayed
			self.browser.quit()
		elif case == 3:  # a partial order number should find every order containing it
			pass  # all orders containing the fragment are displayed
			self.browser.quit()
		elif case == 4:  # an empty query should find no orders
			pass  # nothing is displayed; verify no orders were found
			self.browser.quit()
	def _login(self):
		self.browser = Browser(self.browser_type)
		self.browser.visit(self.site)
	def _fill_form(self):
		self._login()
Example #14
class BrowserManager:
	def __init__(self):
		self._lock = False
	def bootup(self):
		self._display = Display(visible=0, size=(1024, 768))
		self._display.start()
		profile = {}
		if 'HTTP_PROXY' in os.environ:
			# Parse host and port out of e.g. http://proxy.example.com:8080
			proxy_url = os.environ['HTTP_PROXY']
			proxy_server = proxy_url.split(':')[1][2:]
			proxy_port = proxy_url.split(':')[-1]
			profile['network.proxy.type'] = 1
			profile['network.proxy.http'] = proxy_server
			profile['network.proxy.http_port'] = proxy_port
			profile['network.proxy.https'] = proxy_server
			profile['network.proxy.https_port'] = proxy_port
		self.browser = Browser(profile_preferences=profile)
	def obtain(self,background):
		while self._lock:
			background.wait('Browser lock', 15)
		self._lock = True
		return self.browser
	def release(self,background):
		self._lock = False
	def shutdown(self):
		self.browser.quit()
		self._display.stop()
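
A sketch of a sturdier way to split HTTP_PROXY than the chained split(':') calls above, using the standard library; the proxy URL here is a hypothetical value:

from urllib.parse import urlparse

proxy = urlparse('http://proxy.example.com:8080')  # hypothetical
print(proxy.hostname)  # proxy.example.com
print(proxy.port)      # 8080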
Example #15
class editgroup(object):
	def __init__(self,browser_type):
		self.browser_type=browser_type
		self.browser=Browser(self.browser_type)
		self.site = 'http://www.kuaixiuagency.com'  # login page
		self.browser.visit(self.site)
	def config(self, name='', admin=''):
		self.name = name
		self.admin = admin
	def verify(self, case):
		if case == 1:  # changing the group name and selecting another admin should save
			if True:  # verify the edited values match the input
				# verify the previous admin no longer sees the old group
				# verify the new admin can manage the edited group
				self.browser.quit()
				return u'pass'
			else:
				self.browser.quit()
				return u'fail'
		elif case == 2:  # clearing the group name should not save, even with an admin selected
			pass  # verify the form cannot be submitted and an error is shown
			self.browser.quit()
		elif case == 3:  # changing the name without selecting an admin should not save
			pass  # verify the form cannot be submitted
			self.browser.quit()
Example #16
class SplinterThesis(object):
    def __init__(self):
        self.browser = Browser("phantomjs", wait_time=10)
    
    # visit a page
    def get_html(self, url):
        self.browser.visit(url)
        return self.browser
    
    # find the download link
    def get_down_url(self, down_head, browser):
        result = browser.find_link_by_text("下载全文")  # link text: "download full text"
        down_url = ""
        if result:
            result = str(result["onclick"]).split("'")
            temp_url = down_head + result[1]
            browser.visit(temp_url)
            result = browser.find_link_by_text("下载地址")  # link text: "download address"
        
            if browser and result["href"]:
                down_url = result["href"]
        
        return down_url

    def main(self, down_url, url):
        html_url = down_url + url
        browser = self.get_html(html_url)
        down_url = self.get_down_url(down_url, browser)
        return down_url
Example #17
def webkit2png(url, image_file_path, browser=None, wait_time=0):
    new_browser = False
    try:
        if not browser:
            browser = Browser('phantomjs')
            new_browser = True
        browser.visit(url)
        if browser.status_code.is_success():
            if wait_time > 0:
                time.sleep(wait_time)
            browser.driver.save_screenshot(image_file_path)
            image = Image.open(image_file_path)
            image.load()
            if is_transparent(image) and False:  # note: 'and False' disables this branch as written
                no_alpha = Image.new('L', image.size, (255))
                no_alpha.paste(image, mask=image.split()[-1])
            else:
                no_alpha = image.convert('L')
            # Convert to black and white image.
            bw = no_alpha.convert('L')
            # bw = bw.filter(ImageFilter.MedianFilter)
            # White background.
            bg = Image.new('L', image.size, 255)
            bbox = ImageChops.difference(bw, bg).getbbox()
            if bbox:
                image = image.crop(bbox)
            image.save(image_file_path)
    finally:
        if new_browser:
            browser.quit()
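
A usage sketch for the function above, reusing one browser across calls so a new PhantomJS process is not spawned per screenshot (assumes PhantomJS and Pillow are installed):

browser = Browser('phantomjs')
try:
    webkit2png('http://example.com/', 'example.png',
               browser=browser, wait_time=2)
finally:
    browser.quit()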
Example #18
class LiveServerTestCase(TestCaseMixin, BaseLiveServerTestCase):

    """Base test case for in-browser functional tests."""

    initial_url = None
    login_button_value = 'Log in'

    def login(self):
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.password)
        self.browser.find_by_value(self.login_button_value).click()

    def create_staff_user(self, **kwargs):
        kwargs.setdefault('is_staff', True)
        kwargs.setdefault('is_superuser', True)
        return super(LiveServerTestCase, self).create_user(**kwargs)

    def setUp(self):
        self.browser = Browser('django')
        if self.initial_url is not None:
            self.browser.visit("{}{}".format(
                self.live_server_url,
                self.initial_url,
            ))

    def tearDown(self):
        self.browser.quit()
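
A hedged sketch of a subclass using the helpers above; it assumes TestCaseMixin supplies create_user plus username/password attributes, which the excerpt implies but does not show:

class DashboardTests(LiveServerTestCase):
    initial_url = '/accounts/login/'  # hypothetical login route

    def test_staff_can_log_in(self):
        self.create_staff_user()
        self.login()
        self.assertNotIn('login', self.browser.url)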
Example #19
File: test.py Project: pafi1/stream
 def test_set_episode(self):
     browser = Browser("firefox", extensions=["adblock.xpi"])
     browser.visit("http://kinox.to/Stream/The_Big_Bang_Theory.html")
     stream = script.watchSeries("http://kinox.to/Stream/The_Big_Bang_Theory.html", 100, 100, None)
     stream.set_episode(browser)
     sleep(5)
     browser.quit()
Example #20
class GoogleTestCase(unittest.TestCase):
    def setUp(self):
        self.browser = Browser('phantomjs')
        self.browser.visit('http://google.com')

    def test_check_title(self):
        assert self.browser.title == 'Google'

    def tearDown(self):
        self.browser.quit()
Example #21
    def _parse_article_html(self, objectId, url, industry_press=None):
        #browser.visit("http://www.businesswire.com/news/home/20150409005073/en")
        browser = Browser("phantomjs")
        browser.visit(url)
        time.sleep(2)
        html = browser.html

        #html = requests.get(url).text
        html = BeautifulSoup(html, "html.parser")
        article = html.find("div", {"class":"bw-release-story"})
        company_name = html.find("h3", {"itemprop":"sourceOrganization"})
        company_name = company_name.find("span", {"itemprop":"name"})
        vals = [article, company_name]
        cols = ["article", "company_name"]
        #TODO - itemprop="name" company_name
        #TODO - persist in parse
        vals = [self.remove_non_ascii(i.text) if i else "" for i in vals]
        data = dict(zip(cols, vals))
        #print data["company_name"]
        print(data)
        #q.enqueue(ClearSpark()._bulk_company_info, data["company_name"])
        if industry_press:
          r = Parse().update("IndustryPress", objectId, data)
        else:
          r = Parse().update("Press", objectId, data)
        print(r.json())
        browser.quit()
Example #22
class BrowserTest(TestCase):
    def setUp(self):
        self.browser = Browser('phantomjs', wait_time=10)
        self.browser.driver.set_window_size(1024, 768)

    def tearDown(self):
        self.browser.quit()
Example #23
    def test_queue2csvzhconsumer(self):

        container = Queue()
        consumer =  QueueConsumer(container)

        store = CSVStore(CSV_FILENAME, CSV_FIELDNAMES)

        zhconsumer = ZHConsumer(consumer, store)

        browser = Browser('firefox')
        browser.visit('http://127.0.0.1:8888')       


        zhconsumer.start()
        """函数的启动方式
        thread.start_new_thread(函数, (参数列表))
        """        
        data = html.document_fromstring(browser.html)
        for i in range(1000):
            container.put(data)

        while True:
            if not container.qsize():
                break
            else:
                print("the size of queue : %s" % str(container.qsize()))
                # without a sleep here, this busy-wait loop hogs the GIL
                # time.sleep(1)

        zhconsumer.close()
        zhconsumer.stop()
Example #24
def firefox_installed():
    try:
        browser = Browser("firefox")
        browser.quit()
    except OSError:
        return False
    return True
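
A sketch combining the two probes (chrome_installed above and this firefox_installed) to pick whichever driver is available; the preference order is an assumption, not from the source:

def pick_browser():
    if chrome_installed():
        return Browser("chrome")
    if firefox_installed():
        return Browser("firefox")
    raise RuntimeError("no supported webdriver found")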
Example #25
 def run(self):
     browser = Browser('chrome')
     logging.warning('Process:%s start.' % self.name)
     while True:
         url = self.url_queue.get()
         browser.visit(url)
         html = browser.html
         self.output_queue.put((url, html))
Example #26
def get_pages(portal):  # returns the last page number
    browser = Browser("phantomjs", service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    browser.visit(portal)
    html = browser.html
    browser.quit()
    soup = BeautifulSoup(html, "lxml")
    last_page = soup.find('li', id="lastPage").text
    return last_page
Example #27
 def test_should_be_able_to_change_user_agent(self):
     from splinter import Browser
     driver_name = self.browser.driver_name.lower()
     browser = Browser(driver_name=driver_name, user_agent="iphone")
     browser.visit(EXAMPLE_APP + "useragent")
     result = 'iphone' in browser.html
     browser.quit()
     self.assertTrue(result)
Example #28
    def browser_can_change_user_agent(self, webdriver):
        from splinter import Browser
        browser = Browser(driver_name=webdriver, user_agent="iphone")
        browser.visit(EXAMPLE_APP + "useragent")
        result = 'iphone' in browser.html
        browser.quit()

        return result
Example #29
 def test_login(self):
     with pyvirtualdisplay.Display():
         browser = Browser()
         browser.visit("http://ui:8080/accounts/login/")
         browser.fill("login", "testuser")
         browser.fill("password", "password")
         browser.find_by_css(".btn-primary").click()
         self.assertTrue(browser.find_by_text("Successfully signed in as testuser."))
Example #30
File: test.py Project: pafi1/stream
 def test_full_and_play(self):
     browser = Browser("firefox", extensions=["adblock.xpi"])
     sleep(5)
     browser.visit("http://streamcloud.eu/e9xfkyuqt6ze/Poltergeist.BDScr.LD.German.x264-PsO.mkv.html")
     stream = script.watchSeries(
         "http://streamcloud.eu/e9xfkyuqt6ze/Poltergeist.BDScr.LD.German.x264-PsO.mkv.html", None
     )
     stream.full_and_close(browser)
Example #31
def get_hemis_img(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Go to website
    browser.visit(url)

    # read website's html
    html = browser.html
    soup = bs(html, 'html.parser')

    # find "a" tag
    result = browser.find_by_tag("a")

    # define a list to hold 1st link to full images
    hemis_image_path_list = []
    for i in range(8):
        # skip links already saved to the list
        if result[i + 4]["href"] not in hemis_image_path_list:
            hemis_image_path_list.append(result[i + 4]["href"])

    # browser reload
    browser.reload()

    # Close the browser after scraping
    browser.quit()

    final_hemis_img_url_list = []
    for i in range(len(hemis_image_path_list)):
        # Path to chromedriver
        executable_path = {'executable_path': 'chromedriver.exe'}
        browser = Browser('chrome', **executable_path, headless=False)

        # Go to website
        browser.visit(hemis_image_path_list[i])

        # read website's html
        html = browser.html
        soup = bs(html, 'html.parser')

        # get image title
        result_title = soup.find('h2', class_='title').get_text()

        # get image URL
        result = soup.find('img', class_='wide-image')["src"]
        final_url = 'https://astrogeology.usgs.gov' + result

        # concat image URL to get complete URL link
        final_hemis_img_url_list.append({
            "title": result_title,
            "img_url": final_url
        })

        # browser reload
        browser.reload()

        # Close the browser after scraping
        browser.quit()

    return final_hemis_img_url_list
Example #32
def init_browser():
    executable_path = {'executable_path': r"C:\Drivers\chromedriver\chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=False)
Example #33
 def setUpClass(cls):
     cls.browser = Browser("phantomjs", wait_time=1.5)
Example #34
def scrape():
    import time
    import datetime as dt

    mars_dict = {}
    # set chrome driver path
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # visit NASA Mars News url
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    article = soup.find("div", class_='list_text')

    news_title = article.find("div", class_="content_title").text
    mars_dict['title'] = news_title

    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict["paragraph"] = news_p
    #################################################
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(3)

    browser.links.find_by_partial_text('FULL IMAGE')[0].click()
    time.sleep(3)

    browser.links.find_by_partial_text('more info').click()
    time.sleep(3)

    html = browser.html
    image_soup = BeautifulSoup(html, 'html.parser')

    feat_img_url = image_soup.find('figure', class_='lede').a['href']
    featured_image_url = f'https://www.jpl.nasa.gov{feat_img_url}'

    mars_dict['featured_image_url'] = featured_image_url

    #################################################

    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(3)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    mars_weather = soup.find(
        'div',
        class_=
        'css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'
    ).text
    mars_dict['mars_weather'] = mars_weather

    #################################################

    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html

    table = pd.read_html(url)
    mars_facts = table[2]
    # Rename columns
    mars_facts.columns = ['Description', 'Value']
    # Reset Index
    mars_facts.set_index('Description', inplace=True)
    # Converting table data to Html string
    mars_facts = mars_facts.to_html()
    mars_facts = mars_facts.replace("\n", "")

    mars_dict['mars_facts'] = mars_facts

    #################################################

    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(5)
    html = browser.html

    soup = BeautifulSoup(html, 'html.parser')

    products = soup.find('div', class_='result-list')
    hemispheres = products.find_all('div', class_='item')

    hemisphere_image_urls = []

    for hemisphere in hemispheres:
        title = hemisphere.find("h3").text
        title = title.replace("Enhanced", "")
        end_link = hemisphere.find("a")["href"]
        image_link = "https://astrogeology.usgs.gov/" + end_link
        browser.visit(image_link)
        time.sleep(5)
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        downloads = soup.find("div", class_="downloads")
        image_url = downloads.find("a")["href"]
        hemisphere_image_urls.append({"title": title, "image_url": image_url})

    mars_dict['hemisphere_urls'] = hemisphere_image_urls
    mars_dict["TimeStamp"] = dt.datetime.now()

    browser.quit()
    return mars_dict


#if __name__ == '__main__':
#      scrape()

Example #35
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import datetime as dt

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path)

def mars_facts():
    # Add try/except for error handling
    try:
        # Use 'read_html' to scrape the facts table into a dataframe
        df = pd.read_html('http://space-facts.com/mars/')[0]

    except BaseException:
        print("None Available")
        return None

    # Assign columns and set index of dataframe
    df.columns=['Description', 'Mars']
    df.set_index('Description', inplace=True)

    # Convert dataframe into HTML format, add bootstrap
    return df.to_html(classes="table table-striped")
Example #36
def scrape():
    from splinter import Browser
    from splinter.exceptions import ElementDoesNotExist
    import numpy as np
    from bs4 import BeautifulSoup
    import pandas as pd
    import requests
    executable_path = {'executable_path': 'Resources/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1.1 Scraping News Title and Paragraphs
    url1 = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url1)
    news_title = []
    news_para = []
    for pages in range(10):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        titles = soup.find_all(class_='content_title')
        paragraphs = soup.find_all(class_='article_teaser_body')
        for title in titles:
            news_title.append(title.a.text)
        for paragraph in paragraphs:
            news_para.append(paragraph.text)
        try:
            browser.click_link_by_partial_text('MORE')
        except ElementDoesNotExist:
            print("Scraping Complete")
    np_news_title = np.unique(np.array(news_title))
    np_news_para = np.unique(np.array(news_para))

    # 1.2 Get Images JPL Mars Space Images - Featured Image
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url2)
    featured_image_url = []
    for pages in range(5):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        url_imgs = soup.find_all(class_='img')
        for url_img in url_imgs:
            image = url_img.img['src']
            featured_image_url.append('https://www.jpl.nasa.gov' + image)
        try:
            browser.click_link_by_partial_text('Next')
        except ElementDoesNotExist:
            print("Scraping Complete")
    np_featured_image_url = np.unique(np.array(featured_image_url))

    # 1.3 Mars Weather
    url3 = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url3)
    soup = BeautifulSoup(response.text, 'html.parser')
    mars_weather = []
    results = soup.find_all('div', class_="js-tweet-text-container")
    for result in results:
        try:
            weather = result.p.text
            mars_weather.append(weather)
        except AttributeError as e:
            print(e)
    mars_weather = mars_weather[1]

    # 1.4 Mars Facts
    url4 = 'https://space-facts.com/mars/'
    marsFacts = pd.read_html(url4)[0]
    marsFacts.drop(columns='Earth', inplace=True)
    marsFacts.columns = ['MarsFacts', 'Value']
    marsFacts.head()

    # 1.5 Mars Hemispheres
    url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url5)
    image_url = []
    title = []
    href_container = []

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    href_url_divs = soup.find_all('div', class_='item')
    for div in href_url_divs:
        href_container.append('https://astrogeology.usgs.gov' + div.a['href'])
    for links in href_container:
        try:
            browser.visit(links)
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            browser.click_link_by_partial_text('Open')
            img = soup.find('img', class_='wide-image')
            title = soup.find('h2', class_='title')
            print(img)
            image_url.append({
                'title':
                title.text.replace(' Enhanced', ''),
                'img_url':
                'https://astrogeology.usgs.gov/' + img['src']
            })
        except Exception:
            print('scraping complete')

    scrapped = {
        'NewsTitle': np_news_title,
        'NewsParagraps': np_news_para,
        'FeaturedImages': np_featured_image_url,
        'Facts': marsFacts,
        'Weather': mars_weather,
        'Hemispheres': image_url
    }
    return (scrapped)
Example #37
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

# In[2]:

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# In[3]:

# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

# In[4]:

html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')
Example #38
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=True)
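
A usage sketch for the init_browser pattern that several of these examples share; the target URL is hypothetical:

browser = init_browser()
try:
    browser.visit('https://example.com/')  # hypothetical target
    html = browser.html
finally:
    browser.quit()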
Example #39
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path, headless=False)
Example #40
 def setUpClass(cls):
     custom_headers = {
         'X-Splinter-Customheaders-1': 'Hello',
         'X-Splinter-Customheaders-2': 'Bye'
     }
     cls.browser = Browser("phantomjs", custom_headers=custom_headers)
Example #41
def init_browser():
    # Windows: make sure chromedriver sits in the same folder as the notebook
    executable_path = {"executable_path": "chromedriver.exe"}
    return Browser("chrome", **executable_path, headless=False)
Example #42
def get_featured_img_func(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Go to website
    browser.visit(url)

    # find "Full Image" button to click on it to get to next webpage
    full_img = browser.find_by_id("full_image")
    full_img.click()

    # find "More Info" button to click on it to get to next webpage
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # read website's html
    html = browser.html
    soup = bs(html, 'html.parser')

    # find "a" tag to find href containing the URL
    result = browser.find_by_tag("a")
    relative_image_path = result[58]["href"]

    # get image title
    relative_image_title = soup.find('h1', class_='article_title')
    relative_image_title = relative_image_title.get_text()
    relative_image_title = relative_image_title.split('\t')
    final_title_feature_img = []
    final_title_feature_img.append({
        'Title': relative_image_title[4],
        'URL': relative_image_path
    })

    # Close the browser after scraping
    browser.quit()

    #return scraped object
    return final_title_feature_img
Example #43
def init_browser():
    # Setting the chromedriver path
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
Example #44
def init_browser():

    #@NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}

    return Browser("chrome", **executable_path, headless=False)
Example #45
def fetch(url):
    executable_path = {'executable_path': GeckoDriverManager().install()}
    browser = Browser('firefox', **executable_path, headless=True)
    browser.visit(url)
    html = browser.html
    browser.quit()  # close the driver before returning the parsed page
    return bs(html, 'html.parser')
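
A usage sketch for fetch above; it assumes the bs alias for BeautifulSoup used in the function body:

doc = fetch('https://example.com/')  # hypothetical target
print(doc.title.get_text())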
Example #46
def init_browser():
    executable_path = {"executable_path": "C:\chromedriver_win32\chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
Example #47
def scrape():
    import pandas as pd
    from bs4 import BeautifulSoup as bs
    import requests
    from splinter import Browser
    from webdriver_manager.chrome import ChromeDriverManager
    import pymongo
    #scrape news
    url = 'https://mars.nasa.gov/news'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')

    title_results = soup.find('div', class_="content_title")
    news_title = title_results.get_text(strip=True)

    para_results = soup.find('div', class_="rollover_description_inner")
    news_p = para_results.get_text(strip=True)

    #scrape for image

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    browser.find_by_css('div[class="NavDesktopDropdown -active"]')[0].click()
    browser.click_link_by_partial_text('Featured Image')

    html = browser.html
    soup = bs(html, 'html.parser')
    main = soup.find('main')
    img = main.find('img')
    featured_image_url = img['src']

    #scrape for facts
    url = 'https://space-facts.com/mars/'
    mars_data_table = pd.read_html(url, header=None)
    mars_data_table = mars_data_table[0]
    mars_data_table = mars_data_table.rename(columns={
        0: 'Description',
        1: 'Mars'
    })
    mars_data_html = mars_data_table.to_html()

    # scrape hemi-info
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    url_list = []
    hemi_list = []

    items = soup.find_all('div', class_='description')

    for item in items:
        a = item.find('a', class_='itemLink')
        hemi = a.text.strip()
        hemi_list.append(hemi)
        url = item.find('a')['href']
        url_list.append(url)

    hemi_list = [i.split(' Enhanced', 1)[0] for i in hemi_list]

    original_img_list = []

    for url in url_list:
        browser.visit(f"https://astrogeology.usgs.gov/{url}")
        html = browser.html
        soup = bs(html, 'html.parser')
        downloads = soup.find_all('li')
        temp_list = []
        for download in downloads:
            original_img = download.find('a')['href']
            temp_list.append(original_img)

        original_img_list.append(temp_list[1])

    hemisphere_image_urls = []

    browser.quit()

    for i in range(4):
        temp_dict = {'title': hemi_list[i], 'img_url': original_img_list[i]}
        hemisphere_image_urls.append(temp_dict)

    #put all results in list of dict
    mars_dict = [{
        'Recent_News': news_title,
        'News_Story': news_p
    }, {
        'Featured_Image': featured_image_url
    }, {
        'Mars_Data': mars_data_html
    }, {
        'Mars_Hemispheres': hemisphere_image_urls
    }]

    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_collection = db.mars_collection

    mars_collection.insert_many(mars_dict)

    print('Mars Data Uploaded')
Example #48
#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager

# In[2]:

# Set the executable path and initialize Splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

# ### Visit the NASA Mars News Site

# In[3]:

# Visit the mars nasa news site
url = 'https://redplanetscience.com/'
browser.visit(url)

# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

# In[4]:

# Convert the browser html to a soup object and then quit the browser
Example #49
#!/usr/bin/env python
# coding: utf-8

# # Module: 10.3.3 Scrape Mars Data: The News

# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

executable_path = {'executable_path': ChromeDriverManager().install()}

browser = Browser('chrome', **executable_path, headless=False)

# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)

#set up the HTML parser
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')

slide_elem.find('div', class_='content_title')

# Use the parent element to find the first `a` tag and save it as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title
Example #50
def scrapey_mars():

    mars_dict = {}

    ## Part 1 ##
    news = 'https://mars.nasa.gov/news/'
    response = requests.get(news)
    soup = bs(response.text, 'html.parser')
    title = soup.find('div', class_="content_title")
    news_title = title.a.text
    summary = soup.find('div', class_="rollover_description_inner")
    news_sum = summary.text
    mars_dict["News_Title"] = news_title
    mars_dict["News_Summary"] = news_sum

    ## Part 2 ##
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image = browser.find_by_id('full_image')
    time.sleep(1)
    full_image.click()
    time.sleep(1)
    m_info = browser.find_link_by_partial_text('more info')
    m_info.click()
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    full_image = new_soup.select_one('figure.lede a img').get("src")
    full_image_summary = new_soup.find(
        'div', class_="wysiwyg_content").find("p").get_text()
    jpl_url = 'https://www.jpl.nasa.gov'
    featured_image_url = jpl_url + full_image
    mars_dict["Featured_Image_Link"] = featured_image_url
    mars_dict["Image_Summary"] = full_image_summary
    full_image_title = new_soup.find('h1', class_="article_title")
    mars_dict["Featured_Image_Title"] = full_image_title.text.strip('\n\t": ')

    ## Part 3 ##
    marsweather = 'https://twitter.com/marswxreport'
    response2 = requests.get(marsweather)
    soup2 = bs(response2.text, 'html.parser')
    weather = []

    for w_info in soup2.find_all(
            'p',
            class_=
            "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"):
        weather.append(w_info.text.strip())

    for tweet in reversed(weather):
        if tweet[:3] == "InS":
            mars_weather = tweet

            mars_dict["weather_info"] = mars_weather

    ## Part 4 ##
    pandaurl = 'https://space-facts.com/mars/'
    tables = pd.read_html(pandaurl)
    df = tables[0]
    df.columns = ['Mars Facts', 'Values']
    df.set_index("Mars Facts")
    mars_facts_html = df.to_html(index=False,
                                 classes="table-hover table-dark table-sm")
    mars_dict["facts_table"] = mars_facts_html

    ## Part 5 ##
    hemispheres = [
        'Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris'
    ]

    hem_title = []
    img_urls = []

    for i in hemispheres:
        my_url = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/' + i
        browser.visit(my_url)
        time.sleep(1)
        url_html = browser.html
        page_soup = bs(url_html, "html.parser")

        hem_images = page_soup.find('div',
                                    class_='downloads').find('li').a['href']
        img_urls.append(hem_images)

        hem_name = i + ' Hemisphere'
        hem_title.append(hem_name)

        print(hem_name + ' is a great success!')

    hemisphere_image_urls = [{'image_url': v} for v in img_urls]
    hemisphere_image_titles = [{'hem_title': t} for t in hem_title]
    mars_dict["Hemisphere_image_data"] = hemisphere_image_urls
    mars_dict["Hemisphere_title_data"] = hemisphere_image_titles
    browser.quit()

    return mars_dict
Example #51
def scrape():

	# Dependencies
	from bs4 import BeautifulSoup as bs
	import pandas as pd
	from splinter import Browser

	executable_path = {'executable_path': 'resources/chromedriver.exe'}
	browser = Browser('chrome', **executable_path, headless=False)

	# Create Mission to Mars global dictionary that can be imported into MongoDB
	mars_info = {}

	### NASA Mars News
	# Scrape the NASA Mars News Site and collect the latest news title and paragraph text.
	
	url_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
	
	browser.visit(url_news)

	soup_news = bs(browser.html, 'html.parser')

	#print(soup_news.prettify())

	# Find latest news title
	news_title = soup_news.find('div', class_='content_title').text

	# Dictionary entry for news title
	mars_info['news_title'] = news_title

	# Find latest news paragraph
	news_paragraph = soup_news.find('div', class_='article_teaser_body').text
	
	# Dictionary entry for news paragraph
	mars_info['news_paragraph'] = news_paragraph


	### JPL Mars Space Images - Featured Image
	# Use splinter to navigate the site and find the image url for the current Featured Mars Image

	url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
	
	browser.visit(url_image)

	soup_image = bs(browser.html, 'html.parser')

	#print(soup_image.prettify())

	image = soup_image.find_all('a', class_ ="fancybox")[1]['data-fancybox-href']
	#print(image)

	# Concatenate website url with scraped route
	featured_image_url = 'https://www.jpl.nasa.gov' + image

	# Dictionary entry for Mars featured image
	mars_info['featured_image_url'] = featured_image_url
	
	
	### Mars Weather from Twitter
	# Scrape the latest Mars weather tweet from Mars Weather twitter account.

	url_twitter = 'https://twitter.com/marswxreport?lang=en'
	
	browser.visit(url_twitter)

	soup_weather = bs(browser.html, 'html.parser')

	#print(soup_weather.prettify())

	# Display mars weather details
	mars_weather = soup_weather.find_all('p', class_ = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')[0].text.split("pic")[0]
	
	# Dictionary entry for Mars weather from twitter
	mars_info['mars_weather'] = mars_weather

	### Mars Facts from Space Facts
	# Visit the Space Facts webpage, mars facts page. 
	# Use Pandas to scrape the table containing facts about Mars including Diameter, Mass, etc.
	# Use Pandas to convert the data to a HTML table string

	url_facts = 'https://space-facts.com/mars/'
	
	browser.visit(url_facts)

	# Use Pandas' `read_html` to parse the url
	facts_df = pd.read_html(url_facts)[0]

	# Rename columns
	facts_df.columns = ['Description', 'Value']

	# Set description column as index
	facts_df.set_index('Description', inplace=True)

	# Dictionary entry for Mars Facts from Space Facts
	mars_info['mars_facts'] = facts_df.to_html()

	### Mars Hemispheres
	# Visit the USGS Astrogeology site to obtain high resolution images for each of Mars' hemispheres.

	url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
	
	browser.visit(url_hemispheres)

	soup_hemisphere = bs(browser.html, 'html.parser')

	#print(soup_hemisphere.prettify())

	results = soup_hemisphere.find_all('div', class_ = 'description')

	hemisphere_image_urls = []

	for result in results:
		
		# Get hemisphere name and save in variable called title
		title = result.find('h3').text
		
		# Get links to the hemispheres and save in variable called url
		partial_url = result.find('a', class_="itemLink product-item")['href']
		url = 'https://astrogeology.usgs.gov/' + partial_url
		
		# Click each url to find the full-resolution hemisphere image; save it in a variable called img_url.
		browser.visit(url)
		soup_imgs = bs(browser.html, 'html.parser')
		img_url = soup_imgs.find('div', class_='downloads').li.a['href']
		
		# Use a Python dictionary to store the data using the keys img_url and title. 
		# Append the dictionary with the hemisphere title and image url string to a list. 
		# This list will contain one dictionary for each hemisphere.
		hemisphere_image_urls.append({'title':title, 'img_url':img_url})


	mars_info['hemisphere_image_urls'] = hemisphere_image_urls
	
	browser.quit()

	return mars_info
Example #52
 def browser(self):
     browser = Browser('chrome')
     yield browser
     browser.quit()
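
The browser method above is a yield-style pytest fixture missing its decorator in the excerpt; a self-contained sketch of the same pattern (assumes pytest and chromedriver are installed):

import pytest
from splinter import Browser

@pytest.fixture
def browser():
    b = Browser('chrome')
    yield b          # hand the browser to the test
    b.quit()         # teardown runs after the test finishes

def test_title(browser):
    browser.visit('http://example.com/')  # hypothetical target
    assert browser.title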
Example #53
#!/usr/bin/env python
#coding: utf-8

from bs4 import BeautifulSoup
from splinter import Browser
from selenium import webdriver
import pandas as pd
import time
import datetime as dt
import re

################################################
executable_path = {"executable_path": "./chromedriver.exe"}
browser = Browser("chrome", **executable_path)

#################################################


def scrape():
    mars_dict = {}
    # set chrome driver path
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # visit NASA Mars News url
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
Example #54
def scrape():

    # Mars News
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    news_title = soup.find("div", class_="content_title").get_text()
    news_p = soup.find("div", class_="article_teaser_body").get_text()

    # JPL Mars space Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    image_url = soup.footer.find(
        "a", class_="button fancybox")["data-fancybox-href"]
    featured_image_url = "https://www.jpl.nasa.gov" + image_url

    # Mars weather
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    tweets = soup.find_all(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")

    for tweet in tweets:
        tweet_parent = tweet.find_parent("div", class_="content")
        tweet_id = tweet_parent.find(
            "a",
            class_=
            "account-group js-account-group js-action-profile js-user-profile-link js-nav"
        )["href"]

        if tweet_id == '/MarsWxReport':
            mars_weather = tweet_parent.find(
                "p",
                class_=
                "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            ).get_text()
            break

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ["Description", "Value"]
    df.set_index(df["Description"], inplace=True)
    df = df[["Value"]]
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')

    # Mars Hemisphers
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    h3s = soup.find_all("h3")

    titles = []
    for h3 in h3s:
        h3 = str(h3)
        h3 = h3[4:-14]  # strip the '<h3>' prefix and ' Enhanced</h3>' suffix
        titles.append(h3)

    img_urls = []
    for title in titles:
        browser.click_link_by_partial_text(title)

        html = browser.html
        soup = BeautifulSoup(html, "html.parser")

        img_urls.append(soup.find("div", class_="downloads").find("a")["href"])

    hemisphere_image_urls = []
    for title, img_url in zip(titles, img_urls):
        hemisphere_image_urls.append({"title": title, "img_url": img_url})

    data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    return data
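
A hedged sketch of how a scrape() returning a dict like the one above is typically wired into Flask and MongoDB in these projects; the route, database, and collection names are assumptions:

from flask import Flask, redirect
import pymongo

app = Flask(__name__)
client = pymongo.MongoClient('mongodb://localhost:27017')

@app.route('/scrape')
def run_scrape():
    # Upsert the freshly scraped document into a single-record collection.
    client.mars_db.mars.update_one({}, {'$set': scrape()}, upsert=True)
    return redirect('/')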
Example #55
def init_browser():
    # Launch splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
Example #56
def scrape_info():

    mars = {}

    #get_ipython().system('which chromedriver')

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    article = soup.find("div", class_="list_text")
    news_title = article.find("div", class_="content_title").text
    news_p = article.find("div", class_="article_teaser_body").text
    print(news_title)
    print(news_p)
    mars["news_title"] = news_title
    mars["news_p"] = news_p
    #print(news_title)

    # Featured Images

    # url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    base_url = 'https://www.jpl.nasa.gov'
    url = base_url + '/spaceimages/?search=&category=Mars'

    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    image_url = soup.find("a", class_="button fancybox")["data-fancybox-href"]
    featured_image_url = base_url + image_url
    print(featured_image_url)
    mars["featured_image"] = featured_image_url

    # Mars Facts

    url = 'https://space-facts.com/mars/'

    tables = pd.read_html(url)

    mars_facts_df = tables[0]
    mars_facts_df.columns = ['Fact', 'Value']
    mars_facts_df['Fact'] = mars_facts_df['Fact'].str.replace(':', '')
    mars_facts_html = mars_facts_df.to_html()
    print(mars_facts_html)
    mars["facts"] = mars_facts_html

    # Mars Hemispheres

    # In[14]:

    base_url = 'https://astrogeology.usgs.gov'
    url = base_url + '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    items = soup.find_all('div', class_='item')

    urls = []
    titles = []
    for item in items:
        urls.append(base_url + item.find('a')['href'])
        titles.append(item.find('h3').text.strip())
    print(urls)
    print(titles)

    image_urls = []
    for oneurl in urls:
        browser.visit(oneurl)
        html = browser.html
        soup = bs(html, 'html.parser')
        oneurl = base_url + soup.find('img', class_='wide-image')['src']
        image_urls.append(oneurl)

    hemisphere_images_urls = []

    for i in range(len(titles)):
        hemisphere_images_urls.append({
            'title': titles[i],
            'image_url': image_urls[i]
        })

    mars["hemispheres"] = hemisphere_images_urls

    #for i in range(len(hemisphere_images_urls)):
    # print(hemisphere_images_urls[i]['title'])
    #print(hemisphere_images_urls[i]['image_url'] + '\n')

    return mars
Example #57
# -*- coding: utf-8 -*-

from splinter import Browser
import time
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter

tic = time.perf_counter()  # time.clock() was removed in Python 3.8
browser = Browser('chrome')
url = "file:///C:/Users/servadmin/Documents/Atom%20Projects/formfill/index.html"
browser.visit(url)
im = Image.open("abre.png")  # the second one
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('temp2.png')
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files (x86)/Tesseract-OCR/tesseract'
text = pytesseract.image_to_string(Image.open('temp2.png'))
out = text.split("*")

browser.find_by_id("1").fill(out[0])                                                               #
browser.find_by_id("2").fill(out[1])                                                               #
browser.find_by_id("3").fill(out[2])                                                               #
browser.find_by_id("4").fill(out[3])                                                               #
browser.find_by_id("5").fill(out[4])                                                               #
browser.find_by_id("6").fill(out[5])                                                               #
# browser.find_by_id("__tab_ctl00_ContentPlaceHolder1_tabForm_tabTransactionDetail").click()   #TRANSACTION DETAILS
browser.find_by_id("7").fill(out[6])                                                               #
browser.find_by_id("8").fill(out[7])                                                               #
Example #58
def init_browser():
    # webdriver_manager resolves the chromedriver path automatically
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser("chrome", **executable_path, headless=False)
Example #59
def init_browser():
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
Example #60
# Import Splinter and BeautifulSoup
import requests
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

# Path to chromedriver
get_ipython().system('which chromedriver')

# In[3]:

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path)

# ##  Visit the NASA Mars News site

# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

# Convert the browser html to a soup object and then quit the browser
html = browser.html
news_soup = soup(html, 'html.parser')

slide_elem = news_soup.select_one('ul.item_list li.slide')

slide_elem.find("div", class_='content_title')