Example #1
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = User(name="Alice",
                         email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_2_add_post(self):
        self.browser.visit("http://127.0.0.1:8080")
        print("current url = ", self.browser.url)

        self.browser.driver.set_window_size(1920, 1080)
        self.browser.click_link_by_text('login')
        print("current url = ", self.browser.url)

        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        add_link = self.browser.find_link_by_partial_text('add')
        add_link.click()
        print(self.browser.url)

        title = "test_acceptance_add_post"
        self.browser.fill("title", title)
        now = datetime.datetime.now()
        now = str(now)
        self.browser.fill("content", now)
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        new_post_appears = self.browser.is_text_present(
            title) and self.browser.is_text_present(now)
        print("new_post_appears = ", new_post_appears)
        self.assertEqual(new_post_appears, True)
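
Most of the snippets on this page drive navigation through click_link_by_text / find_link_by_partial_text and, in several cases, the PhantomJS driver; newer Splinter releases deprecate these link helpers in favor of the browser.links namespace and have dropped PhantomJS support. Below is a minimal sketch of the equivalent navigation, assuming a recent Splinter release with a local Chrome/chromedriver; the URL and link texts are reused from Example #1:

from splinter import Browser

# Sketch only: assumes a recent Splinter and a working Chrome/chromedriver setup.
browser = Browser("chrome", headless=True)
browser.visit("http://127.0.0.1:8080")

# browser.links.find_by_text / find_by_partial_text return element lists;
# they replace the click_link_by_text / find_link_by_partial_text helpers used above.
browser.links.find_by_text("login").first.click()
browser.links.find_by_partial_text("add").first.click()

browser.quit()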
Example #2
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = User(name="Alice",
                         email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_add_entry(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.browser.click_link_by_text("Add Entry")
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "Test Title")
        self.browser.fill("content", "Test Content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
Example #3
class BaseWebTestCase(LiveServerTestCase):
    """
    Abstract class to handle logic for web tests
    """
    username = '******'
    password = '******'
    wait_seconds = 3.0

    def setUp(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--no-sandbox")
        self.browser = Browser('chrome',
                               headless=True,
                               wait_time=10,
                               options=chrome_options)
        super(BaseWebTestCase, self).setUp()

    def tearDown(self):
        self.browser.quit()
        try:
            super(BaseWebTestCase, self).tearDown()
        except IndexError:
            print("Ignoring IndexError in tearDown...")

    def _login(self):
        self._visit("")
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.password)
        self.browser.find_by_text('Sign in').first.click()

        assert self.browser.is_text_present('Home')
        assert not self.browser.is_text_present('Sign in')

    def _go_home(self):
        self.browser.click_link_by_text('Home')
        time.sleep(self.wait_seconds)

    def _setup_confirm(self):
        """
        First part of work-around to let phantomjs accept confirmation dialogs
        http://stackoverflow.com/questions/19903146/confirm-alert-window-in-phantom-js
        """
        js_confirm = 'window.confirm = function() { return true }'
        self.browser.execute_script(js_confirm)

    def _accept_confirm(self):
        """
        Second part of work-around to let phantomjs accept confirmation dialogs
        MUST call self._setup_confirm() for this to work
        """
        self.browser.execute_script('return window.confirm')

    def _visit(self, path):
        path = self.live_server_url + path
        self.browser.visit(path)
        time.sleep(self.wait_seconds)
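
The _setup_confirm / _accept_confirm helpers above are meant to be used as a pair around any click that triggers a JavaScript confirm() dialog. A hypothetical usage sketch follows; the subclass, test name, and the '.delete' selector are made up for illustration only:

class DeleteItemWebTestCase(BaseWebTestCase):

    def test_delete_first_item(self):
        self._login()
        # Stub window.confirm before the click that opens the dialog...
        self._setup_confirm()
        self.browser.find_by_css('.delete').first.click()  # hypothetical selector
        # ...then evaluate the stubbed confirm afterwards.
        self._accept_confirm()
        time.sleep(self.wait_seconds)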
Example #4
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)


    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
    
    def test_2_add_post(self):
        self.browser.visit("http://127.0.0.1:8080")
        print("current url = ", self.browser.url)

        self.browser.driver.set_window_size(1920, 1080)
        self.browser.click_link_by_text('login')
        print("current url = ", self.browser.url)

        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        add_link = self.browser.find_link_by_partial_text('add')
        add_link.click()
        print(self.browser.url)

        title = "test_acceptance_add_post"
        self.browser.fill("title", title)
        now = datetime.datetime.now()
        now = str(now)
        self.browser.fill("content", now)
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        new_post_appears = self.browser.is_text_present(title) and self.browser.is_text_present(now)
        print("new_post_appears = ", new_post_appears)
        self.assertEqual(new_post_appears, True)
Example #5
class TestViews(unittest.TestCase):
    def setUp(self):
        # Setup client
        self.browser = Browser('phantomjs')
        self.browser.driver.set_window_size(1024, 768)

        # Setup DB
        db.create_all()
        # Create User
        self.user = User(name='Alice', email='*****@*****.**',
                         password=generate_password_hash('test'))
        db.session.add(self.user)
        db.session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        self.process.terminate()
        db.session.close()
        db.drop_all()
        self.browser.quit()

    def test_login_correct(self):
        self.browser.visit('http://127.0.0.1:5000/login')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password', 'test')
        self.browser.find_by_css('button[type=submit]').click()
        self.assertEqual(self.browser.url, 'http://127.0.0.1:5000/')

    def test_authenticated_add_entry(self):
        # do the login
        self.browser.visit('http://127.0.0.1:5000/login')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password', 'test')
        self.browser.find_by_css('button[type=submit]').click()

        # navigate to the entry add form
        self.browser.click_link_by_text('Add Entry')

        # create a new entry
        self.browser.fill('title', 'The Title')
        self.browser.fill('content', 'The Content')
        self.browser.find_by_css('button[type=submit]').click()

        # check for entry title in home
        self.assertIn('The Title', [e.text for e in self.browser.find_by_css('.row h1')])
Example #6
	def get_job_information(self, keyword, writer):
		count = 0
		browser = Browser('phantomjs', service_args= self.proxy)
		soup = self.__visit_url(browser, keyword)
		jobs = soup.find_all('tr', attrs = {'class':'aJobS'})
		count += self.__parse_data(jobs, keyword, writer)
		
		while soup.find('li', attrs = {'class':'next'}):
			browser.click_link_by_text('Next')
			data = browser.html
			soup = BeautifulSoup(data)
			jobs = soup.find_all('tr', attrs = {'class':'aJobS'})
			count += self.__parse_data(jobs, keyword, writer)
		print count
		print keyword
			
		browser.quit()
Example #7
def scrape(username, password):
    appartments = []

    browser = Browser('chrome', headless=True)
    login(browser, username, password)

    browser.click_link_by_text('Lgh')
    links = browser.find_link_by_partial_href(
        'https://nya.boplats.se/objekt/1hand/')

    for l in links:
        appartments.append(extract_table_info(browser, l))

    for a in appartments:
        add_details(browser, a)

    return appartments
Example #8
 def login(self, name, passwd):
     browser = Browser(driver_name="chrome")
     url = 'https://www.jd.com/'
     browser.visit(url)
     browser.click_link_by_text("你好,请登录")
     # Gotcha 1: switch to the "账户登录" (account login) tab first
     browser.click_link_by_text("账户登录")
     browser.fill("loginname", name)  # fill in the account name and password
     browser.fill("nloginpwd", passwd)
     try:
         self.JdVerfy.get_jd_verfy_code()
         content = self.ocr.get_image_verfy_code(contant.img_path)
         print 'jd verfy code is %s' % content
     except Exception:
         raise VerificationError('Failed to fetch the verification code')
     browser.fill('authcode', content)
     time.sleep(3)
     browser.find_by_id("loginsubmit").click()
Example #9
def find_mars_hemisphere_images():
    """Returns image urls of Mars Hemispheres"""
    #!which chromedriver
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    description_class = soup.find_all('div', class_='description')

    hemisphere_names = []

    for hemispheres in description_class:
        hemisphere_names.append(hemispheres.find('h3').text)

    start_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    hemisphere_images_windows = []

    for hemispheres_image in hemisphere_names:
        browser.click_link_by_partial_text(hemispheres_image)
        hemispheres_url = browser.url
        new_page = soup.body.find_all('div', class_='container')
        for sample in new_page:
            browser.click_link_by_text('Sample')
        hemisphere_images_windows.append(browser.windows)
        browser.visit(start_url)

    full_hemisphere_images = []

    for full_images in hemisphere_images_windows[3]:
        full_hemisphere_images.append(full_images.url)

    full_hemisphere_image_urls = [
        {"title": hemisphere_names[3], "img_url": full_hemisphere_images[1]},
        {"title": hemisphere_names[2], "img_url": full_hemisphere_images[2]},
        {"title": hemisphere_names[1], "img_url": full_hemisphere_images[3]},
        {"title": hemisphere_names[0], "img_url": full_hemisphere_images[4]},
    ]
    
    return full_hemisphere_image_urls
Example #10
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit(
        'http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml'
    )
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(
                        id=info_list[0].text[6:],
                        path='~',
                        title=info_list[4].text[6:],
                        abstract='~',
                        inventor=info_list[7].text[5:].split(';')[:-1],
                        applicant=info_list[6].text[10:].split(';')[:-1],
                        category=info_list[5].text[8:].split('; '),
                        update_time=time.strftime('%Y-%m-%dT%XZ',
                                                  time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'  # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        print 'err adding patent'
    finally:
        b.quit()
Example #11
def patent_parser(search_exp):
    """@todo: Docstring for patent_parser.
    """
    patent_list = []
    b = Browser("phantomjs")
    b.reload()
    b.visit('http://www.pss-system.gov.cn/sipopublicsearch/search/searchHome-searchIndex.shtml')
    b.fill('searchInfo', search_exp)
    b.click_link_by_text(u'检索')
    b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
    for _ in xrange(10):
        item_list = b.find_by_css('.s_c_conter')
        for item in item_list:
            info_list = item.find_by_tag('td')
            if not urlset.has_url('patent', info_list[0].text[6:]):
                try:
                    patent = Patent(id=info_list[0].text[6:],
                                    path='~',
                                    title=info_list[4].text[6:],
                                    abstract='~',
                                    inventor=info_list[7].text[5:].split(';')[:-1],
                                    applicant=info_list[6].text[10:].split(';')[:-1],
                                    category=info_list[5].text[8:].split('; '),
                                    update_time=time.strftime('%Y-%m-%dT%XZ', time.gmtime()))
                    patent_list.append(patent)
                    print patent.id, 'new'    # @todo logs
                except:
                    print 'error patent'
        if b.is_text_present(u'下一页'):
            b.click_link_by_text(u'下一页')
            b.is_element_not_present_by_css('.s_c_conter', wait_time=8)
        else:
            break
    try:
        solr.add('patent', patent_list)
    except:
        print 'err adding patent'
    finally:
        b.quit()
Example #12
def traverse(url):
    browser = Browser()
    browser.visit(url)
    file = open("student_list.txt")
    enrollment_no = file.read().split()

    for line in enrollment_no:
        time.sleep(1)
        browser.fill("eno", line)
        button = browser.find_by_value("Submit").click()
        time.sleep(3)
        capture()
        button = browser.click_link_by_text("Back ")
    file.close()
Example #13
def download_art(title):

    browser = Browser()
    # Visit URL
    url = "http://gen.lib.rus.ec/scimag/index.php"
    browser.visit(url)

    article_title = browser.find_by_name('s')
    article_title.fill(title)

    button = browser.find_by_value('Search!')
    # Interact with elements
    button.click()

    # sleep is used at each step to pace the script against network speed

    time.sleep(10)
    browser.click_link_by_text('Libgen')
    time.sleep(15)
    browser.click_link_by_partial_href('http://gen.lib.rus.ec/scimag/get.php')

    time.sleep(5)
    browser.quit()
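
The fixed time.sleep pauses above are a blunt way to wait for slow pages. Splinter's is_text_present / is_element_present_by_css helpers accept a wait_time argument (used by several other snippets on this page) that polls until the condition holds or the timeout expires. A hedged variant of the same flow using explicit waits instead of sleeps; the 30-second timeout is an arbitrary choice:

from splinter import Browser


def download_art_with_waits(title):
    # Sketch only: same selectors and URLs as Example #13, with polling waits.
    browser = Browser()
    browser.visit("http://gen.lib.rus.ec/scimag/index.php")

    browser.find_by_name('s').fill(title)
    browser.find_by_value('Search!').click()

    # Poll for up to 30 seconds until the results page renders the link text.
    if browser.is_text_present('Libgen', wait_time=30):
        browser.click_link_by_text('Libgen')

    # Wait for the download link to appear before clicking it.
    if browser.is_element_present_by_css('a[href*="get.php"]', wait_time=30):
        browser.click_link_by_partial_href('http://gen.lib.rus.ec/scimag/get.php')

    browser.quit()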
Example #14
class AutoSearch(object):
    def __init__(self):
        self.browser = None
        self.base_url = 'https://www.baidu.com/'
        self.keywords = ['高压线下也敢飞','珍惜现在的好天气吧']
        self.keywords_pngs = []

    def send_png(self):
        '''
        Send the screenshots by email; implement this function yourself.
        :return:
        '''
        sendHtmlMail(self.keywords_pngs)

    def search(self,time_freq = "一天内"):
        self.browser = Browser(driver_name='chrome', executable_path='chromedriver.exe')
        self.browser.visit(self.base_url)
        for word in self.keywords:
            kw = f'+"{word}" site:bbs.wjdaily.com'
            if self.browser.is_element_present_by_id("kw"):
                self.browser.find_by_id("kw").fill(kw)
                time.sleep(1)
                self.browser.find_by_id("su").click()
                # self.browser.find_by_xpath('//*[@id="container"]/div[2]/div/div[1]/span[2]').first.click()
                time.sleep(1)
                if self.browser.is_element_present_by_css(".search_tool_tf",wait_time=10):
                    self.browser.find_by_css(".search_tool_tf").first.click()
                if self.browser.is_element_present_by_text(time_freq):
                    self.browser.click_link_by_text(time_freq)
                time.sleep(3)
                soup = BeautifulSoup(self.browser.html,"html.parser")
                no_result = soup.find('div',{'class':'nors'})
                if no_result is None:
                    print("查到结果,截图")
                    screenshot_path = self.browser.screenshot(rf"E:\GitHub\somenzz\bbsMonitor\{word}",suffix=".png")
                    self.keywords_pngs.append(screenshot_path)
Example #15
def scrape():
    # Dependencies
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import pymongo
    import time
    import ctypes  # Standard-library module, used here for a Windows message box.
    
    def Mbox(title, text, style):
        return ctypes.windll.user32.MessageBoxW(0, text, title, style)

    
    mars_data_dict = {}
    
    ## (1) NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
    # Assign the text to variables that you can reference later.
       
    # URL of page to be scraped
    url_nz = 'https://mars.nasa.gov/news/'

    # Retrieve page with the requests module
    response_nz = requests.get(url_nz)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_nz = BeautifulSoup(response_nz.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_nz.prettify())
    
    #time.sleep(2)
    
    # Find the latest News Title
    news_title = soup_nz.find("div", class_="content_title").a.text[1:-1]
    #print(news_title)
    
    # Find the latest News Paragraph Text
    news_p = soup_nz.find("div", class_="image_and_description_container").a.text[3:-7]
    #print(news_p)
    
    mars_data_dict["news_title"] = news_title
    mars_data_dict["news_p"] = news_p
        
        
    
    ## (2) JPL Mars Space Images - Featured Image
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image 
    # and assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.
    
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # URL of page to be scraped
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    
    time.sleep(2)
    
    #dir(browser)
    
    browser.click_link_by_id('full_image')
    
    time.sleep(2)
    
    browser.click_link_by_partial_href("/spaceimages/details.")
    
    time.sleep(2)
    
    browser.click_link_by_partial_href("/spaceimages/images/largesize")
    
    time.sleep(2)
    
    featured_image_url = browser.url
    #print(featured_image_url)
    
    mars_data_dict["feat_img"] = featured_image_url
    
    browser.quit()
    
           
    
    ## (3) Mars Weather
    # Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report as a variable called mars_weather.
        
    # URL of page to be scraped
    url_tweet = 'https://twitter.com/marswxreport?lang=en'

    # Retrieve page with the requests module
    response_tweet = requests.get(url_tweet)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_tweet = BeautifulSoup(response_tweet.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_tweet.prettify())
    
    #time.sleep(2)
    
    # scrape the latest Mars weather tweet from the page
    tweets = soup_tweet.find_all("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    for tweet in tweets:
        find_text = tweet.text.find("InSight sol")
        if find_text == 0:
            mars_weather = tweet.text
            #print(mars_weather)
            break
    
    mars_data_dict["weather"] = mars_weather
    
    
    
    ## (4) Mars Facts
    # URL of page to be scraped
    url_mfacts = 'https://space-facts.com/mars/'

    # Retrieve page with the requests module
    response_mfacts = requests.get(url_mfacts)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_mfacts = BeautifulSoup(response_mfacts.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_mfacts.prettify())
    
    #time.sleep(2)
    
    tables = pd.read_html(url_mfacts)[1]
    #tables
    
    mars_data_dict["mfacts"] = tables
    
    tables.to_html("../html/mars_facts.html")
    
    
    
    ## (5) Mars Hemispheres
    # Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image, 
    #     and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the 
    #     keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list. 
    #     This list will contain one dictionary for each hemisphere
    
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    # URL of page to be scraped
    url_mhemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_mhemi)
    
    time.sleep(2)
    
    # Image 1
    browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced")
    
    time.sleep(2) 
    
    title1 = browser.title.split("|")[0]
    #print(title1)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img1_url = browser.windows[1].url
    #print(img1_url)
    
    time.sleep(2) 
    
    browser.windows[1].close()
    browser.back()
    
    hemi1_dict = {}
    hemi1_dict["title"] = title1
    hemi1_dict["img_url"] = img1_url
    #hemi1_dict
    
    # Image 2
    
    browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced")
    
    time.sleep(2)
    
    title2 = browser.title.split("|")[0]
    #print(title2)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img2_url = browser.windows[1].url
    #print(img2_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi2_dict = {}
    hemi2_dict["title"] = title2
    hemi2_dict["img_url"] = img2_url
    #hemi2_dict
    
    # Image 3
    
    browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced")
    
    time.sleep(2)
    
    title3 = browser.title.split("|")[0]
    #print(title3)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img3_url = browser.windows[1].url
    #print(img3_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi3_dict = {}
    hemi3_dict["title"] = title3
    hemi3_dict["img_url"] = img3_url
    #hemi3_dict
    
    # Image 4
    browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced")
    
    time.sleep(2)
    
    title4 = browser.title.split("|")[0]
    #print(title4)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img4_url = browser.windows[1].url
    #print(img4_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi4_dict = {}
    hemi4_dict["title"] = title4
    hemi4_dict["img_url"] = img4_url
    #hemi4_dict
    
    hemisphere_image_urls = [hemi1_dict, hemi2_dict, hemi3_dict, hemi4_dict]
    #hemisphere_image_urls
    
    mars_data_dict["hemi_img"] = hemisphere_image_urls
    mars_data_dict  
    
    browser.quit()
    

    
    Mbox("Mission to Mars Completed", "Congratulations!!! You've mined Mars!", 1)   
Example #16
class UploadTestCase(unittest.TestCase):

  def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    self.browser = Browser('chrome')

  def tearDown(self):
    self.testbed.deactivate()

  def test_when_create_task_upload_file(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]', wait_time=10)

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()


    self.assertEqual(1, len(self.browser.find_by_css('.template-download.fade.in')))
    self.assertEqual(4, len(self.browser.find_by_css('.template-download.fade.in td')))

  def test_when_create_task_upload_many_files(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]')

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))

    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()
    sleep(3)

    self.assertEqual(3, len(self.browser.find_by_css('.files tr.template-download')))
Example #17
def Scraper():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)


    url="https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

    browser.visit(url)
    html=browser.html
    soup=bs(html,'html.parser')

    print(soup.prettify())



    news_title=soup.find('div',class_='content_title').get_text()
    news_paragraph=soup.find('div',class_='article_teaser_body').get_text()


    news_title

    news_paragraph


    # **JPL FEATURED IMAGE**

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    url="https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

    browser.visit(url)
    html=browser.html
    soup=bs(html,'html.parser')

    images=soup.footer.find('a',class_='button fancybox')['data-fancybox-href']
    url2='https://www.jpl.nasa.gov'
    actual_url=url2+images
    


    # **MARS WEATHER TWEETS**

    executable_path={'executable_path':'chromedriver.exe'}
    browser=Browser('chrome',**executable_path,headless=False)

    url='https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html=browser.html
    soup=bs(html,'html.parser')


    relevant_tweets=soup.find_all('p',class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')

    relevant_tweets

    weather_tweet=relevant_tweets[7].get_text()
    weather_tweet


    # **MARS FACTS**

    url='https://space-facts.com/mars/'
    tables=pd.read_html(url)
    tables


    df=tables[0]
    df


    df=df.rename(columns={0:'Characteristic',1:'Value'})

    df=df.set_index('Characteristic')

    final_fact_table=df.to_html(classes='Striped-table')
    final_fact_table


    # **MARS HEMISPHERES**


    executable_path={'executable_path':'chromedriver.exe'}
    browser=Browser('chrome',**executable_path,headless=False)

    url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html=browser.html
    soup=bs(html,'html.parser')


    titles=[]
    hemi_titles=soup.find_all('h3')

    for i in hemi_titles:
        x=i.get_text()
        titles.append(x)

    titles


    links=[]

    for j in titles:
        browser.visit(url)
        browser.click_link_by_partial_text(j)
        browser.click_link_by_text('Sample')
        html=browser.html
        soup=bs(html,'html.parser')

        link=soup.find('div',class_='downloads').find('a')['href']

        links.append(link)

    links


    # mars_hemis={}
    # for m,k in zip(titles,links):
    #     mars_hemis[m]=k
    mars_hemis=[]
    for m,k in zip(titles,links):
        mars_hemis.append({"title":m,"link":k})


    data={"news_title":news_title,
    "news_paragraph":news_paragraph,
    "actual_url":actual_url,
    "weather_tweet":weather_tweet,
    "final_fact_table":final_fact_table,
    "mars_hemis":mars_hemis}


    # data={"Latest Mars News Headline":news_title,
    # "Latest Mars News":news_paragraph,
    # "Featured Image":image_url,
    # "Latest Mars Weather Update":weather_tweet,
    # "Mars Fun Facts":final_fact_table,
    # "Mars Hemispheres":mars_hemis}

    return data
Example #18
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                        password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        
        self.process = multiprocessing.Process(target=app.run,
                                                kwargs={"port": 8080})
                                                
        self.process.start()
        time.sleep(1)
        
    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")
        
    def test_add_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def test_view_single_entry(self):
        # Login to blog
        self.test_login_correct()
        # Click on top entry title
        self.browser.visit("http://127.0.0.1:8080/entry/1/")
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/1/")
        
    def test_edit_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Click edit link on top entry
        self.browser.click_link_by_partial_href('edit')
        # Enter new title and contents
        self.browser.fill("title", "edited test post")
        self.browser.fill("content", "edited acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
    
    def test_delete_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Delete entry
        self.browser.click_link_by_partial_href('delete')
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        # Make sure browser puts you back on home 
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
    
    def test_logout(self):
        # Login to blog
        self.test_login_correct()
        # Click on 'Logout' link
        self.browser.click_link_by_text('Logout')
        # Check to see if 'Logout' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Logout'), False)
        # Check to see if 'Login' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Login'), True)
        
    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
Example #19

# In[26]:


#finding mars images of hemispheres


# In[27]:


#high res image of cerberus hem
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
browser.click_link_by_text('Sample')
img1 = browser.windows.current = browser.windows[1]
img1


# In[28]:


#high res image of Schiaparelli hem
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
browser.click_link_by_text('Sample')
img2 = browser.windows.current = browser.windows[1]
img2
Example #20
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def testLoginCorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")

    def testLoginIncorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/login")
    
    
    def testAddEditPost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Edit Post')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/edit")
        self.browser.fill("title", "Edited First Post")
        self.browser.fill("content", "Hello Universe!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(self.browser.find_by_tag('h1').first.value, "Edited First Post")
        #divs = self.browser.find_by_tag("div")
        #myList = []
        #if "Hello Universe!" in divs:
            #myList.append("Hello Universe!")
        #self.assertEqual(myList[0], "Hello Universe!")
    
    def testAddDeletePost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Delete Post')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/delete")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(len(self.browser.find_by_tag('h1')),0)
        #divs = self.browser.find_by_tag("div")
        #myList = []
        #if "Hello Universe!" in divs:
            #myList.append("Hello Universe!")
        #self.assertEqual(myList[0], "Hello Universe!")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
Example #21
class Compass:

    def __init__(self, username='', password='', outdir=''):
        self._username = username
        self._password = password
        self._outdir = outdir

        self._browser = None
        self._record = None

    def quit(self):
        if self._browser:
            self._browser.quit()
            self._browser = None

    def loggin(self):
        prefs = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": self._outdir,
            "browser.helperApps.neverAsk.saveToDisk": "application/octet-stream,application/msexcel,application/csv"}

        self._browser = Browser('firefox', profile_preferences=prefs)

        self._browser.visit('https://compass.scouts.org.uk/login/User/Login')

        self._browser.fill('EM', self._username)
        self._browser.fill('PW', self._password)
        time.sleep(2)
        self._browser.find_by_value('Submit').first.click()

        # Look for the Role selection menu and select my Group Admin role.
        self._browser.is_element_present_by_name(
            'ctl00$UserTitleMenu$cboUCRoles',
            wait_time=30)
        self._browser.select('ctl00$UserTitleMenu$cboUCRoles', '1253644')

    def export(self, section):
        # Select the My Scouting link.
        self._browser.is_text_present('My Scouting', wait_time=30)
        self._browser.click_link_by_text('My Scouting')

        def wait_then_click_xpath(xpath, wait_time=30):
            self._browser.is_element_present_by_xpath(
                xpath, wait_time=wait_time)
            self._browser.find_by_xpath(xpath).click()

        # Click the "Group Sections" hotspot.
        wait_then_click_xpath('//*[@id="TR_HIER7"]/h2')

        # Clink the link that shows the number of members in the section.
        # This is the one bit that is section specific.
        # We might be able to match on the Section name in the list,
        # which would make it more robust, but at present we just
        # hard-code the position in the list.
        section_map = {
            'garrick': 2,
            'paget': 3,
            'swinfen': 4,
            'brown': 4,
            'maclean': 5,
            'rowallan': 6,
            'somers': 7,
            'boswell': 8,
            'erasmus': 9,
            'johnson': 10
        }
        wait_then_click_xpath(
            '//*[@id="TR_HIER7_TBL"]/tbody/tr[{}]/td[4]/a'.format(
                section_map[section.lower()]
            ))

        # Click on the Export button.
        wait_then_click_xpath('//*[@id="bnExport"]')

        # Click to say that we want a CSV output.
        wait_then_click_xpath(
            '//*[@id="tbl_hdv"]/div/table/tbody/tr[2]/td[2]/input')
        time.sleep(2)

        # Click to say that we want all fields.
        wait_then_click_xpath('//*[@id="bnOK"]')

        download_path = os.path.join(self._outdir, 'CompassExport.csv')

        if os.path.exists(download_path):
            log.warn("Removing stale download file.")
            os.remove(download_path)

        # Click the warning.
        wait_then_click_xpath('//*[@id="bnAlertOK"]')

        # Browser will now download the csv file into outdir. It will be called
        # CompassExport.

        # Wait for file.
        timeout = 30
        while not os.path.exists(download_path):
            time.sleep(1)
            timeout -= 1
            if timeout <= 0:
                log.warn("Timeout waiting for {} export to download.".fomat(
                    section
                ))
                break

        # rename download file.
        os.rename(download_path,
                  os.path.join(self._outdir, '{}.csv'.format(section)))

        log.info("Completed download for {}.".format(section))

        # Draw breath
        time.sleep(1)

    def load_from_dir(self):
        # Load the records form the set of files in self._outdir.

        log.debug('Loading from {}'.format(self._outdir))

        def get_section(path, section):
            df = pd.read_csv(path, dtype=object, sep=',')
            df['section'] = section
            df['forenames_l'] = [_.lower().strip() for _ in df['forenames']]
            df['surname_l'] = [_.lower().strip() for _ in df['surname']]
            return df

        self._records = pd.DataFrame().append(
            [get_section(os.path.join(self._outdir, section),
                         os.path.splitext(section)[0])
             for section in os.listdir(self._outdir)], ignore_index=True)

    def find_by_name(self, firstname, lastname, section_wanted=None,
                     ignore_second_name=True):
        """Return list of matching records."""

        recs = self._records
        
        if ignore_second_name:
            df = recs[
                (recs.forenames_l.str.lower().str.match(
                        '^{}.*$'.format(firstname.strip(' ')[0].lower().strip()))) &
                  (recs.surname_l == lastname.lower().strip())]
            
        else:
            df = recs[(recs.forenames_l == firstname.lower().strip()) &
                      (recs.surname_l == lastname.lower().strip())]

        if section_wanted is not None:
            df = df[(df['section'] == section_wanted)]

        return df

    def sections(self):
        "Return a list of the sections for which we have data."
        return self._records['section'].unique()

    def all_yp_members_dict(self):
        return {s: members for s, members in self._records.groupby('section')}

    def section_all_members(self, section):
        return [m for i, m in self._records[
            self._records['section'] == section].iterrows()]

    def section_yp_members_without_leaders(self, section):
        return [m for i, m in self._records[
            (self._records['section'] == section) &
            (self._records['role'].isin(
                ['Beaver Scout', 'Cub Scout', 'Scout']))].iterrows()]

    def members_with_multiple_membership_numbers(self):
        return [member for s, member in self._records.groupby(
            ['forenames', 'surname']).filter(
                lambda x: len(x['membership_number'].unique()) > 1).groupby(
                    ['forenames', 'surname', 'membership_number'])]
Example #22
def scrape():
    executable_path = {"executable_path": "users\spitc\anaconda3\lib\site"}
    browser = Browser("chrome", **executable_path, headless=False)

    # title
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")
    title = soup.find("div", class_="content_title").text
    # paragraph
    browser.click_link_by_text(title)
    html = browser.html
    soup = bs(html, "html.parser")
    paragraph = soup.find("div", class_="wysiwyg_content")
    para = paragraph.find('p').text

    # picture
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    home_url = 'https://www.jpl.nasa.gov'
    browser.visit(url_image)
    html = browser.html
    soup = bs(html, "html.parser")
    mars_img = soup.find("li", class_="slide")
    mars_src = mars_img.find("a")
    mars_src["data-fancybox-href"]
    featured_image_url = home_url + mars_src["data-fancybox-href"]
    # weathr
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html = browser.html
    soup = bs(html, "html.parser")
    mars_w = soup.find("div", class_="js-tweet-text-container").text.rstrip()
    # facts
    facts_url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(facts_url)
    df = mars_table[0]
    df.columns = ["Measurements", "Values"]
    df_facts = df.set_index('Measurements')
    mars_facts = df_facts.to_html()
    mars_facts_final = mars_facts.replace('\n', '')
    # hemi
    cerberus_link = mars_hemi("Cerberus", browser)
    schiaparelli_link = mars_hemi("Schiaparelli", browser)
    syrtis_major_link = mars_hemi("Syrtis Major", browser)
    valles_marineris_link = mars_hemi("Valles Marineris", browser)
    hemisphere_image_urls = [{
        "title": "Valles_Marineris_Hemisphere",
        "img_url": valles_marineris_link
    }, {
        "title": "Cerberus_Hemisphere",
        "img_url": cerberus_link
    }, {
        "title": "Shiaparelli_Hemisphere",
        "img_url": schiaparelli_link
    }, {
        "title": "Syrtis_Major_Hemisphere",
        "img_url": syrtis_major_link
    }]

    mars_data = {
        "title": title,
        "paragraph": para,
        "mars_pic": featured_image_url,
        "mars_weather": mars_w,
        "mars_facts": mars_facts_final,
        hemisphere_image_urls[0]["title"]: hemisphere_image_urls[0]["img_url"],
        hemisphere_image_urls[1]["title"]: hemisphere_image_urls[1]["img_url"],
        hemisphere_image_urls[2]["title"]: hemisphere_image_urls[2]["img_url"],
        hemisphere_image_urls[3]["title"]: hemisphere_image_urls[3]["img_url"]
    }

    return mars_data
Example #23
#!/usr/bin/python
from splinter import Browser
b = Browser()
url = 'http://google.com'
b.visit(url)
b.click_link_by_text('Sign up')
b.select("rateplanid", "spn")
b.fill('spn_postal', '11223')
b.fill('spn_email', '*****@*****.**')
b.check('spn_terms')
b.find_by_value('submit').first.click()
b.find_by_value('submit').first.click()
url = 'http://google.com'
Example #24
from splinter import Browser

# Your ID Password
user_email = "nishnik"
user_pass = "******"
browser= Browser('firefox')
browser.visit('http://www.facebook.com')

browser.fill('email', user_email)
browser.fill('pass', user_pass)

button = browser.find_by_id('loginbutton')
button.click()

# Paste the url you need to download from. Note: It must be from mobile site
browser.visit('https://m.facebook.com/photo.php?fbid=780845462017409&id=100002758879147&set=oa.876940942416747&relevant_count=1&source=48&refid=18&_ft_=qid.6274517251577062760%3Amf_story_key.876940939083414%3Atl_objid.876940939083414')
# The number of consecutive pics you have to download
NUM_PICS = 56

i = 0
while i < NUM_PICS:
    i = i + 1
    browser.click_link_by_text('View full size')
    browser.screenshot("screenshot" + str(i) + ".png")
    browser.back()
    browser.click_link_by_text('Next')

browser.quit()
Example #25
def scrape():
    print('INITIALIZING DATA SCRAPE FOR YOSEMITE NATIONAL PARK')
    print('-------------------------------------------------------')

    # initialize browser
    # executable_path = {'executable_path': 'chromedriver.exe'}
    # use executable path below for mac
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    # dictionary to hold final scraped data
    yosemite_data = {}

    print('COMMENCING DATA SCRAPE FOR ECONOMIC BENEFITS INFO')

    # URL of yosemite articles page to be scraped
    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    time.sleep(2)
    # empty lists to hold raw scraped data
    article_links = []
    headlines = []
    article_contents = []
    # empty lists that will hold cleaned scraped data
    years = []
    amounts = []
    job_counts = []
    visitor_counts = []
    # empty list to hold final scraped data
    economic_benefits = []

    # go through pages 1-33 and find links of targeted articles
    for x in range(1, 34):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        article_snippets = soup.find_all('li', class_='ListingList-item')
        substring = 'Economic Benefit'
        for article_snippet in article_snippets:
            snippet_headline = article_snippet.find(
                'h3', class_='ListingResults-title').text
            if substring in snippet_headline:
                end_link = article_snippet.find('a')['href']
                article_link = 'https://www.nps.gov' + end_link
                article_links.append(article_link)
        browser.click_link_by_text('Next ')
        time.sleep(1)

    # visit each article link and extract content
    for article_link in article_links:
        browser.visit(article_link)
        article_html = browser.html
        article_soup = BeautifulSoup(article_html, 'html.parser')
        headline = article_soup.find('div', class_='ContentHeader').text
        headline = headline.replace('\n', '')
        headlines.append(headline)
        article_content = article_soup.find('div',
                                            class_='ArticleTextGroup').text
        article_contents.append(article_content)
    # loop through headlines and extract economic benefit $ amount (in millions)
    for headline in headlines:
        headline_split = headline.split('$')[1]
        amount = headline_split[:3]
        amounts.append(amount)
    # loop through article contents and extract year, job count, and visitor count
    for article_content in article_contents:
        year_split = article_content.split('Park in ')[1]
        year = year_split[:4]
        years.append(year)
        job_split = article_content.split('supported ')[1]
        job_count = job_split[:5]
        if ',' in job_count:
            job_count = job_count.replace(',', '')
            job_counts.append(job_count)
        elif ' ' in job_count:
            job_count = job_count.replace(' ', '')
            job_counts.append(job_count)
        else:
            job_counts.append(job_count)
        visitor_split = article_content.split('shows that')[1]
        visitor_count = visitor_split[:10]
        visitor_count = visitor_count.replace(',',
                                              '').replace('\xa0',
                                                          '').replace(' ', '')
        visitor_counts.append(visitor_count)

    # append extract information into economic_benefits dictionary
    economic_benefits.append({
        'years': years,
        'amounts': amounts,
        'job_counts': job_counts,
        'visitor_counts': visitor_counts
    })
    # append missing 2015 data
    economic_benefits[0]['years'].insert(2, '2015')
    economic_benefits[0]['amounts'].insert(2, '594')
    economic_benefits[0]['job_counts'].insert(2, '6890')
    economic_benefits[0]['visitor_counts'].insert(2, '4150217')
    # append to yosemite_data dictionary
    yosemite_data['economic_benefits'] = economic_benefits

    print('OBTAINED ECONOMIC BENEFITS')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TRAIL HEAD POSTS')

    # URL of page to be scraped
    url = 'https://www.hikespeak.com/sierras/yosemite/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Examine the results, then determine element that contains sought info
    # results are returned as an iterable list
    results = soup.find_all("tr")
    post = []
    ## Probably need a loop here for all 20 rows
    # Loop through returned results
    for result in results:
        # Error handling
        try:
            # Identify and return title of listing
            trail = result.find("td", class_="column-2").text
            distance = result.find("td", class_="column-3").text
            coordinates = result.find("td", class_="column-4").text
            # Run only if title, price, and link are available
            if (trail and distance and coordinates):
                # Print results
                print('-------------')
                print(trail)
                print(distance)
                print(coordinates)
                post.append({
                    'trail': trail,
                    'distance': distance,
                    'coordinates': coordinates
                })
        except Exception as e:
            print(e)
    yosemite_data['post'] = post

    print('OBTAINED TRAIL HEAD POSTS')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TRAIL TABLE')

    # URL of page to be scraped
    trail_table_url = 'https://www.yosemitehikes.com/hikes.htm'
    # Retrieve page with the requests module
    response = requests.get(trail_table_url)
    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')
    # Examine the results, then determine element that contains sought info
    # results are returned as an iterable list
    trail_table_results = soup.find_all('tr')

    trail_table_post = []

    # Loop through returned results
    for trail_table_result in trail_table_results:
        # Error handling
        try:
            # Identify and return trail name
            trail_name = trail_table_result.find('td', column='Trail').text
            # Identify and return trail's distance
            raw_distance = trail_table_result.find('td', column="Distance (miles/km)").text
            if ' (' in raw_distance:
                distance = str(raw_distance[:raw_distance.find(" (")])
            else:
                distance = raw_distance
            # Identify and return trail's elevation
            try:
                raw_elevation = trail_table_result.find(
                    'td', column="Elevation Gain (feet/meters)").text
                elevation = str(raw_elevation[:raw_elevation.find(" (")])
                if ',' in elevation:
                    elevation = elevation.replace(',', '')
            except Exception:
                elevation = trail_table_result.find(
                    'td', column="Elevation Gain (feet/meters)").text
            # Identify and return trail's crowd rating
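            # (the [44] / [-14] offsets below pull a single rating digit out of the raw
            # str() of the <td> tag, so they assume a fixed markup layout on the page)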
            crowd = str(trail_table_result.find('td',
                                                column="Crowd Factor"))[44]
            # Identify and return trail's scenery rating
            scenery = str(
                trail_table_result.find('td', column="Scenery Factor"))[-14]
            # Identify and return trail's difficulty rating
            difficulty = str(trail_table_result.find('td',
                                                     column="Difficulty"))[-14]

            #Dictionary to be inserted as a MongoDB document
            trail_table_post.append({
                'trail_name': trail_name,
                'distance': distance,
                'elevation': elevation,
                'crowd': crowd,
                'scenery': scenery,
                'difficulty': difficulty
            })

        except Exception as e:
            print(e)
    yosemite_data['trail_table_post'] = trail_table_post

    print('OBTAINED TRAIL TABLE')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR WEATHER')

    current_weather = []
    apikey = api_key.api_key
    location = "Yosemite Valley"
    url = "http://api.openweathermap.org/data/2.5/weather?units=Imperial&appid=" + apikey + "&q=" + location
    weather = requests.get(url).json()
    todays_temp = weather["main"]["temp"]
    todays_humid = weather["main"]["humidity"]
    todays_cloud = weather["clouds"]["all"]
    todays_wind = weather["wind"]["speed"]
    converted = datetime.utcfromtimestamp(weather["dt"])
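    # convert UTC to local park time; the fixed -7h offset assumes Pacific Daylight Time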
    local_time = converted - timedelta(hours=7, minutes=0)
    weather_date = local_time.strftime("%B %d, %Y")
    current_weather.append({
        'todays_temp': todays_temp,
        'todays_humid': todays_humid,
        'todays_cloud': todays_cloud,
        'todays_wind': todays_wind,
        'weather_date': weather_date
    })
    yosemite_data['weather'] = current_weather

    print('OBTAINED WEATHER')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR TWITTER')

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
    target_user = "******"
    user_tweets_only = api.user_timeline(target_user,
                                         count=1,
                                         result_type="recent")
    user_tweets = user_tweets_only[0]["text"]
    yosemite_data['tweet'] = user_tweets

    print('OBTAINED TWEET')
    print('-------------------------------------------------------')

    print('COMMENCING DATA SCRAPE FOR MOST RECENT NEWS')

    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    recent_news = []
    # Article Title
    news_title = soup.find("h3", class_="ListingResults-title").text
    # Article Date
    article_date = soup.find("div", class_="ListingMeta").text
    # Link to full article
    results = soup.find("li", class_="ListingList-item ListingResults-item")
    news_link = results.find("a")["href"]
    split_url = urlsplit(url)
    full_news_link = split_url.scheme + "://" + split_url.netloc + news_link
    # Article summary
    article_text = soup.find("p", class_="ListingResults-description").text

    recent_news.append({
        'news_title': news_title,
        'article_date': article_date,
        'full_news_link': full_news_link,
        'article_text': article_text
    })

    yosemite_data['recent_news'] = recent_news

    print('OBTAINED MOST RECENT NEWS')

    print('SCRAPING COMPLETED')
    print('-------------------------------------------------------')
    print(yosemite_data)

    return yosemite_data
Exemple #26
0
hemisphere2 = soup.find_all('a', class_="itemLink product-item")[3]['href']
hemisphere3 = soup.find_all('a', class_="itemLink product-item")[5]['href']
hemisphere4 = soup.find_all('a', class_="itemLink product-item")[7]['href']

# Create a single list called 'mars4hemis'
mars4hemis = [hemisphere1, hemisphere2, hemisphere3, hemisphere4]
mars4hemis

# Go to each of the 4 hemisphere websites and scrape the link for the Sample Image
usgs_url = "https://astrogeology.usgs.gov"
image_url = []
titles = []
for x in range(len(mars4hemis)):
    # Go to the hemisphere website
    browser.visit(usgs_url + mars4hemis[x])
    browser.click_link_by_text("Open")
    time.sleep(2)
    # CLick Sample link to get the image
    sample = browser.find_by_text('Sample')
    image = sample['href']
    image_url.append(image)
    # Search the h2 tags to get the title
    headers = browser.find_by_tag('h2')
    full_title = headers.text
    title = full_title.replace(' Enhanced', '').strip()
    titles.append(title)
    #    print(browser.url)
    print(title, image)

# Show the two newly created lists: titles and image_url
print(titles)
print(image_url)
class InstaLiker():

	# constructor
	def __init__(self):
		self.mUrl = "https://www.instagram.com/"
		self.cycles = 4
		self.browser = Browser()
		self.username = "******"
		self.pw = 'xxxxxxxxxxxxxxxx\r'
		self.totalLikes = 0
		self.blackList = ["make a list of users to exclude", "including your own username" ]

	# scroll the page and
	# do the liking
	def launchPage(self):
		self.browser.visit(self.mUrl)
		self.login()

		self.scrollBy()
		for i in range(0, self.cycles):
			self.likePosts()

		print("just liked " + str(self.totalLikes) + " pix...Yay!")		

	def login(self):
		print("login")
		print("logging in as " + self.username)
		self.browser.click_link_by_text('Log in')
		self.browser.fill('username', self.username)
		self.browser.fill('password', self.pw)
		
		form = self.browser.find_by_tag('form')
		inputs = form.find_by_tag('button')
		inputs[0].click()

		# need to sleep a few seconds here
		time.sleep(5)

	def likePosts(self):
		print("liking posts")
		likeList = self.browser.find_by_text("Like")
		
		if len(likeList) == 0:
			print("nothing left to like. attempt to scroll farther to load more posts.")
			self.scrollBy()
			time.sleep(3)
			likeList = self.browser.find_by_text("Like")
			print("likeList is now: " + str(len(likeList)))

		if (len(likeList) > 0):
			print("found " + str(len(likeList)) + " posts to like")
			
			for foo in likeList:
				tmpParentNode = foo.find_by_xpath("ancestor::article/header")
				print(tmpParentNode["innerText"])
				if self.checkBlackList(tmpParentNode["innerText"]) == 0:
					foo.click()
					self.totalLikes += 1
					time.sleep(1)

	def checkBlackList(self, pString):
		for foo in self.blackList:
			if foo in pString:
				print("found blacklisted item '" + foo + "'")
				return 1		
		return 0

	def scrollBy(self):
		print("scrolling down.")
		self.browser.execute_script( "window.scrollBy(0,30000);" )
		time.sleep(1) 

	def boneyard(self):
		print('boneyard')
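
# Minimal usage sketch for the class above, assuming the username/password placeholders
# in __init__ have been replaced with real credentials:
# liker = InstaLiker()
# liker.launchPage()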
Exemple #28
0
def scrape():
    url = 'https://mars.nasa.gov/news/'
    browser = webdriver.Chrome('chromedriver.exe')

    browser.get(url)
    html = browser.page_source

    soup = bs(html, 'html.parser')
    browser.close()

    soup_li = soup.find_all('li', class_='slide')
    list_of_titles = []
    list_of_paragraphs = []

    for eachslide in soup_li:
        one_title = eachslide.find('div', class_='content_title').text
        one_paragraph = eachslide.find('div',
                                       class_='article_teaser_body').text
        list_of_titles.append(one_title)
        list_of_paragraphs.append(one_paragraph)

    ######## NEWS TITLE AND PARAGRAPHS LOCATION #########
    news_title = list_of_titles[0]
    news_p = list_of_paragraphs[0]
    #####################################################

    splint_browser = Browser('chrome',
                             executable_path='chromedriver.exe',
                             headless=False)

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    splint_browser.visit(url)

    splint_browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    splint_browser.click_link_by_partial_text('more info')
    html = splint_browser.html
    soup = bs(html, 'html.parser')
    splint_browser.quit()

    image_src = soup.find_all('figure', class_='lede')
    for each in image_src:
        ######### FEATURED IMAGE URL #########################
        featured_image_url = 'https://www.jpl.nasa.gov' + each.a['href']

    #######################################################

    # Twitter API Keys
    consumer_key = apikeys.TWITTER_CONSUMER_KEY
    consumer_secret = apikeys.TWITTER_CONSUMER_SECRET
    access_token = apikeys.TWITTER_ACCESS_TOKEN
    access_token_secret = apikeys.TWITTER_ACCESS_TOKEN_SECRET

    # Setup Tweepy API Authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

    public_tweets = api.user_timeline('marswxreport', count=5)
    for tweet in public_tweets:
        ########## MARS WEATHER TWEET ##########################
        if (("hPa" in tweet['text']) and ("Sol" in tweet['text'])):
            mars_weather = tweet['text']
            break

    #########################################################

    url_tables = 'https://space-facts.com/mars/'
    tables = pd.read_html(url_tables)
    table_df = pd.DataFrame(tables[0])
    table_df = table_df.rename(columns={0: "planet_profile", 1: "mars_data"})
    table_df = table_df.set_index('planet_profile')
    ############### TABLE WITH MARS INFORMATION ###############
    table_html = pd.DataFrame.to_html(table_df)

    ###########################################################

    splint_browser = Browser('chrome',
                             executable_path='chromedriver.exe',
                             headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    splint_browser.visit(url)

    hemisphere_list = []
    html = splint_browser.html
    soup = bs(html, 'html.parser')
    findHemisphere = soup.find_all('div', class_='item')

    for each in findHemisphere:
        hemisphere_list.append(each.h3.text)

    splint_browser.quit()

    hemisphere_image = []

    for eachHemi in hemisphere_list:

        splint_browser = Browser('chrome',
                                 executable_path='chromedriver.exe',
                                 headless=False)
        splint_browser.visit(url)
        time.sleep(2)
        splint_browser.click_link_by_partial_text(eachHemi)

        time.sleep(2)
        splint_browser.click_link_by_text('Sample')
        splint_browser.windows.current = splint_browser.windows[1]
        html = splint_browser.html
        soup = bs(html, 'html.parser')
        splint_browser.quit()
        hemi_image = soup.body.find('img')['src']

        hemisphere_image.append(hemi_image)

    ################ HEMISPHERES IMAGES - URL ########################
    title_image_url = []
    title_image_tuple = zip(hemisphere_list, hemisphere_image)

    for each in title_image_tuple:
        temp_dict = {}
        temp_dict['title'] = each[0]
        temp_dict['img_url'] = each[1]
        title_image_url.append(temp_dict)
    ###################################################################

    mars_dict = {
        'News_Title': news_title,
        'News_Paragraph': news_p,
        'Featured_Image': featured_image_url,
        'Mars_Weather': mars_weather,
        'Mars_Info': table_html,
        'Hemisphere_Images': title_image_url
    }

    return mars_dict
Exemple #29
0
#!/usr/bin/python
import random
from splinter import Browser

email = str(random.randrange(10000000, 99999999)) + '@comcast.com'
zip_code = random.randrange(10000, 99999)
url = 'http://captive.apple.com'
browser = Browser('firefox')

browser.visit(url)
browser.click_link_by_text('Sign up')
browser.select("rateplanid", "spn")
browser.fill('spn_postal', zip_code)
browser.fill('spn_email', email)
browser.check('spn_terms')
browser.find_by_value('submit').first.click()
browser.find_by_value('submit').first.click()
browser.quit()
def scrape():
    # assumed chromedriver location (adjust to the local environment)
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser("chrome", **executable_path, headless=False)

    #NASA Mars News
    url='https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news=soup.find('ul',class_='item_list').find_all('li',class_='slide')
    for i in news[:1]:
        news_title=i.find('h3').text
        news_p=i.find('div',class_='article_teaser_body').text
        print(news_title,'\n',news_p)

    #JPL Mars Space Images - Featured Image
    url='https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(1)
    html=browser.html
    soup=BeautifulSoup(html,'html.parser')
    photos=soup.find('ul',class_='articles').find_all('li',class_='slide')
    for i in photos[:1]:
        partial_link=i.find('div',class_='img').find('img')['src']
        featured_image_url='https://www.jpl.nasa.gov/'+partial_link
        print(featured_image_url)
    
    #Mars Facts
    url='https://space-facts.com/mars/'
    browser.visit(url)
    time.sleep(1)
    html=browser.html
    soup=BeautifulSoup(html,'html.parser')
    tables=pd.read_html(html)
    table0=tables[0]
    table0.columns=['Feature','Value']
    table0['Feature']=table0['Feature'].apply(lambda x:x.replace(":",''))
    print(table0)
    table0.set_index('Feature',inplace=True)
    table0.to_html('Mars_Facts.html')
    table_data=table0.reset_index().values.tolist()

    #Mars Hemispheres
    url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(1)
    html=browser.html
    soup=BeautifulSoup(html,'html.parser')
    results=soup.find('div',id='product-section').find_all('div',class_='item')

    names_list=[]
    img_link_list=[]

    for result in results:
        result_link='https://astrogeology.usgs.gov/'+result.find('a')['href']
        name=result.find('h3').text
        names_list.append(name)
        browser.visit(result_link)
        time.sleep(1)
    
        html=browser.html
        soup=BeautifulSoup(html,'html.parser')
        browser.click_link_by_text('Open') 
        time.sleep(1)
    
        html=browser.html
        soup=BeautifulSoup(html,'html.parser')
        img_link=soup.find('div',class_='downloads').find('a')['href']
        img_link_list.append(img_link)
    
    hemisphere_image_urls = []
    for name,link in zip(names_list,img_link_list):
        hemisphere_image_urls.append({'title':name,'img_url':link})

    browser.quit()

    mars_dict={'News_Title':news_title,'News_Summary':news_p,"Mars_img":featured_image_url,\
                "Mars_Facts":table_data,"Mars_Hemispheres":hemisphere_image_urls}

    
    return mars_dict
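
# A minimal sketch of how a scrape() result like mars_dict is typically persisted
# (assumes a local MongoDB instance and the pymongo package; the database and
# collection names below are placeholders):
import pymongo

def store_mars_data(result_dict):
    client = pymongo.MongoClient('mongodb://localhost:27017')
    # keep a single cached document and overwrite it on every scrape
    client.mars_db.mars_data.replace_one({}, result_dict, upsert=True)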
Exemple #31
0
def scrape():

    mars_all = {}

    #MARS NEWS
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    #Prepare empty list for headlines and paragraphs
    news_info = []

    html = browser.html
    soup = bs(html, 'lxml')
    news_title = soup.find("div", class_='content_title').find('a').text
    news_paragraph = soup.find("div", class_='article_teaser_body').text

    news_info.append({"Headline": news_title, "Paragraph": news_paragraph})

    mars_all['news_title'] = news_title
    mars_all['news_paragraph'] = news_paragraph

    #MARS IMAGE
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')

    #Define the base image URL of high-res image
    base_imgurl = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/'

    #Locate the image, strip into components, and get only the 8-digit image name
    image_name = soup.find('div', class_='img').find('img')['src']
    image_name = image_name.split("/")[-1:][0][0:8]

    #Concatenate the image URL components
    featured_image_url = base_imgurl + image_name + '_hires.jpg'

    mars_all['featured_image_url'] = featured_image_url

    #MARS TWEET
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')

    mars_weather = soup.find('div',
                             class_='js-tweet-text-container').find('p').text

    mars_all['mars_weather'] = mars_weather

    #MARS FACTS
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)

    #Specify column titles for fact table
    mars_facts = tables[0]
    mars_facts.columns = ['Statistic', 'Detail']

    #Convert DataFrame to HTML
    mars_facts = mars_facts.to_html()

    mars_all['mars_facts'] = mars_facts

    #MARS HEMISPHERES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(url)

    #Prepare empty list to store dictionary of image links and titles
    mars_hemispheres = []

    #Loop through 4 hemispheres
    for i in range(4):

        browser.find_by_css("a.product-item h3")[i].click()

        #Get the current HTML page structure
        html = browser.html
        soup = bs(html, 'lxml')

        #Identified from the enhanced image, this is the base URL...
        base_url = 'https://astrogeology.usgs.gov'

        #Each hemisphere image location found here...store it in a variable
        hemisphere_image = soup.find_all('img', class_='wide-image')[0]['src']
        image_link = base_url + hemisphere_image

        #Store the title in a variable...need to remove ' Enhanced'
        hemisphere_title = soup.find('h2', class_='title').text.replace(
            ' Enhanced', '')

        #Append image and title to a dictionary and append to list
        mars_hemispheres.append({
            'Hemisphere': hemisphere_title,
            'ImageURL': image_link
        })

        mars_all['hemispheres'] = mars_hemispheres

        #Back to previous page to loop through other hemispheres.
        browser.click_link_by_text('Back')

    return (mars_all)
class TestRoutes(unittest.TestCase):

	@classmethod
	def setUpClass(cls):
		# socketio.run(flapp, port = flapp.config['PORT'])
		pass
		
	# @init_db
	def setUp(self):
		self.browser = Browser()
		self.username = uuid4().hex
		self.userpassword = uuid4().hex

	def tearDown(self):
		self.browser.quit()
		
	def go_home(self):
		self.browser.visit( 'http://localhost:%s'%flapp.config['PORT'])

	def login(self, user):
		self.go_home()
		self.browser.fill_form({'username': self.username, 'password':self.userpassword})
		self.browser.find_by_value('Sign in').click()


	def test_login_success_with_confirmed_user(self):
		self.login(UserFactory.seed_confirmed_user(self.username, self.userpassword))
		assert self.browser.is_text_present('Signed in as %s'%self.username)


	def test_login_failure_with_nonconfirmed_user(self):
		user = UserFactory.seed_nonconfirmed_user(self.username, self.userpassword)
		self.login(user)
		assert self.browser.is_text_not_present('Signed in as %s'%self.username)
		assert self.browser.is_text_present('Sign in')

	def test_login_failure_with_nonexisting_user(self):
		self.go_home()
		fake_username = uuid4().hex
		self.browser.fill_form({'username': fake_username, 'password':uuid4().hex})
		self.browser.find_by_value('Sign in').click()
		assert self.browser.is_text_not_present('Signed in as %s'%fake_username)
		assert self.browser.is_text_present('Sign in')

	def test_logout(self):
		self.login(UserFactory.seed_confirmed_user(self.username, self.userpassword))
		self.browser.click_link_by_text('Sign out')
		assert self.browser.is_text_not_present('Signed in as %s'%self.username)
		assert self.browser.is_text_present('Sign in')

	# def test_index(self):
	# 	r = flapp.get('/')
	# 	assert r.okbr

	# def test_login_with_id(self):
	# 	self.login()
	# 	print self.browser.html
	# 	# print browser.html

# class TestRoutesHeadless(unittest.TestCase):

# 	def setUp(self):
# 		self.browser 
Exemple #33
0
class Compass:

    def __init__(self, username='', password='', outdir=''):
        self._username = username
        self._password = password
        self._outdir = outdir

        self._browser = None
        self._record = None


    def quit(self):
        if self._browser:
            self._browser.quit()
            self._browser = None

    def loggin(self):
        prefs = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": self._outdir,
            "browser.helperApps.neverAsk.saveToDisk": "application/octet-stream,application/msexcel,application/csv"}

        self._browser = Browser('chrome') #, profile_preferences=prefs)

        self._browser.visit('https://compass.scouts.org.uk/login/User/Login')

        self._browser.fill('EM', self._username)
        self._browser.fill('PW', self._password)
        time.sleep(1)
        self._browser.find_by_text('Log in').first.click()

        # Look for the Role selection menu and select my Group Admin role.
        self._browser.is_element_present_by_name(
            'ctl00$UserTitleMenu$cboUCRoles',
            wait_time=30)
        self._browser.select('ctl00$UserTitleMenu$cboUCRoles', '1253644')
        time.sleep(1)

    def wait_then_click_xpath(self, xpath, wait_time=30, frame=None):
        frame = self._browser if frame is None else frame
        while True:
            try:
                if frame.is_element_present_by_xpath(xpath, wait_time=wait_time):
                    frame.find_by_xpath(xpath).click()
                    break
                else:
                    log.warning("Timeout expired waiting for {}".format(xpath))
                    time.sleep(1)
            except:
                log.warning("Caught exception: ", exc_info=True)

    def wait_then_click_text(self, text, wait_time=30, frame=None):
        frame = self._browser if frame is None else frame
        while True:
            if frame.is_text_present(text, wait_time=wait_time):
                frame.click_link_by_text(text)
                break
            else:
                log.warning("Timeout expired waiting for {}".format(text))

    def adult_training(self):
        self.home()

        # Navigate to training page a show all records.
        self.wait_then_click_text('Training')
        time.sleep(1)
        self.wait_then_click_text('Adult Training')
        time.sleep(1)
        self.wait_then_click_xpath('//*[@id="bn_p1_search"]')

    def home(self):
        # Click the logo to take us to the top
        self.wait_then_click_xpath('//*[@alt="Compass Logo"]')
        time.sleep(1)

    def search(self):
        self.home()

        # Click search button
        self.wait_then_click_xpath('//*[@id="mn_SB"]')
        time.sleep(1)

        # Click "Find Member(s)"
        self.wait_then_click_xpath('//*[@id="mn_MS"]')
        time.sleep(1)

        # Navigate to training page a show all records.
        with self._browser.get_iframe('popup_iframe') as i:
            self.wait_then_click_xpath('//*[@id="LBTN2"]', frame=i)
            time.sleep(1)
            self.wait_then_click_xpath('//*[@class="popup_footer_right_div"]/a', frame=i)
            time.sleep(1)

    def lookup_member(self, member_number):
        self.home()

        # Click search button
        self.wait_then_click_xpath('//*[@id="mn_SB"]')
        time.sleep(1)

        xpath = '//*[@id="CNLookup2"]'
        while True:
            try:
                if self._browser.is_element_present_by_xpath(xpath, wait_time=30):
                    self._browser.find_by_xpath(xpath).fill(member_number)
                    break
                else:
                    log.warning("Timeout expired waiting for {}".format(xpath))
                    time.sleep(1)
            except:
                log.warning("Caught exception: ", exc_info=True)

        self.wait_then_click_xpath('//*[@id="mn_QS"]')

    def fetch_table(self, table_id):
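        # Scrape an on-page HTML table (selected by element id) into a pandas DataFrame:
        # headings come from the <thead> row, records from the visible <tbody> rows.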
        parser = etree.HTMLParser()

        def columns(row):
            return ["".join(_.itertext()) for _ in
                    etree.parse(StringIO(row.html), parser).findall('/*/td')]

        def headers(row):
            return ["".join(_.itertext()) for _ in
                    etree.parse(StringIO(row.html), parser).findall('/*/td')]

        headers_xpath = '//*[@id ="{}"]/thead/*'.format(table_id)
        table_xpath = '//*[@id ="{}"]/tbody/tr[not(@style="display: none;")]'.format(table_id)

        if self._browser.is_element_present_by_xpath(table_xpath, wait_time=5):

            headings = [headers(row) for row
                        in self._browser.find_by_xpath(headers_xpath)][0]

            records = [columns(row) for row
                       in self._browser.find_by_xpath(table_xpath)]

            # Extend the length of each row to the same length as the columns
            records = [row+([None] * (len(headings)-len(row))) for row in records]

            # And add dummy columns if we do not have enough headings
            headings = headings + ["dummy{}".format(_) for _ in range(0,len(records[0]) - len(headings))]

            return pd.DataFrame.from_records(records, columns=headings)

        log.warning("Failed to find table {}".format(table_id))
        return None

    def member_training_record(self, member_number, member_name):
        self.lookup_member(member_number)

        # Select Training record
        self.wait_then_click_xpath('//*[@id="LBTN5"]')

        personal_learning_plans = self.fetch_table('tbl_p5_TrainModules')
        personal_learning_plans['member'] = member_number
        personal_learning_plans['name'] = member_name

        training_record = self.fetch_table('tbl_p5_AllTrainModules')
        training_record['member'] = member_number
        training_record['name'] = member_name

        mandatory_learning = self.fetch_table('tbl_p5_TrainOGL')
        mandatory_learning['member'] = member_number
        mandatory_learning['name'] = member_name

        return personal_learning_plans, training_record, mandatory_learning

    def member_permits(self, member_number, member_name):
        self.lookup_member(member_number)

        # Select Permits
        self.wait_then_click_xpath('//*[@id="LBTN4"]')

        permits = self.fetch_table('tbl_p4_permits')
        if permits is not None:
            permits['member'] = member_number
            permits['name'] = member_name

        return permits

    @lru_cache()
    def get_all_adult_trainers(self):

        self.adult_training()

        return self.fetch_table('tbl_p1_results')

    @lru_cache()
    def get_all_group_members(self):

        self.search()

        self._browser.is_element_present_by_xpath('//*[@id = "MemberSearch"]/tbody', wait_time=10)
        time.sleep(1)

        # Hack to ensure that all of the search results loaded.
        for i in range(0, 5):
            self._browser.execute_script(
                'document.getElementById("ctl00_main_working_panel_scrollarea").scrollTop = 100000')
            time.sleep(1)

        return self.fetch_table('MemberSearch')

    def export(self, section):
        # Select the My Scouting link.
        self._browser.is_text_present('My Scouting', wait_time=30)
        self._browser.click_link_by_text('My Scouting')

        # Click the "Group Sections" hotspot.
        self.wait_then_click_xpath('//*[@id="TR_HIER7"]/h2')

        # Clink the link that shows the number of members in the section.
        # This is the one bit that is section specific.
        # We might be able to match on the Section name in the list,
        # which would make it more robust but at present we just hard
        # the location in the list.
        section_map = {
            'garrick': 2,
            'paget': 3,
            'swinfen': 4,
            'brown': 4,
            'maclean': 5,
            'rowallan': 6,
            'somers': 7,
            'boswell': 8,
            'erasmus': 9,
            'johnson': 10
        }
        self.wait_then_click_xpath(
            '//*[@id="TR_HIER7_TBL"]/tbody/tr[{}]/td[4]/a'.format(
                section_map[section.lower()]
            ))

        # Click on the Export button.
        self.wait_then_click_xpath('//*[@id="bnExport"]')

        # Click to say that we want a CSV output.
        self.wait_then_click_xpath(
            '//*[@id="tbl_hdv"]/div/table/tbody/tr[2]/td[2]/input')
        time.sleep(2)

        # Click to say that we want all fields.
        self.wait_then_click_xpath('//*[@id="bnOK"]')

        download_path = os.path.join(self._outdir, 'CompassExport.csv')

        if os.path.exists(download_path):
            log.warn("Removing stale download file.")
            os.remove(download_path)

        # Click the warning.
        self.wait_then_click_xpath('//*[@id="bnAlertOK"]')

        # Browser will now download the csv file into outdir. It will be called
        # CompassExport.

        # Wait for file.
        timeout = 30
        while not os.path.exists(download_path):
            time.sleep(1)
            timeout -= 1
            if timeout <= 0:
                log.warn("Timeout waiting for {} export to download.".fomat(
                    section
                ))
                break

        # rename download file.
        os.rename(download_path,
                  os.path.join(self._outdir, '{}.csv'.format(section)))

        log.info("Completed download for {}.".format(section))

        # Draw breath
        time.sleep(1)

    def load_from_dir(self):
        # Load the records form the set of files in self._outdir.

        log.debug('Loading from {}'.format(self._outdir))

        def get_section(path, section):
            df = pd.read_csv(path, dtype=object, sep=',')
            df['section'] = section
            df['forenames_l'] = [_.lower().strip() for _ in df['forenames']]
            df['surname_l'] = [_.lower().strip() for _ in df['surname']]
            return df

        self._records = pd.DataFrame().append(
            [get_section(os.path.join(self._outdir, section),
                         os.path.splitext(section)[0])
             for section in os.listdir(self._outdir)], ignore_index=True)

    def find_by_name(self, firstname, lastname, section_wanted=None,
                     ignore_second_name=True):
        """Return list of matching records."""

        recs = self._records
        
        if ignore_second_name:
            df = recs[
                (recs.forenames_l.str.lower().str.match(
                        '^{}.*$'.format(firstname.strip(' ')[0].lower().strip()))) &
                  (recs.surname_l == lastname.lower().strip())]
            
        else:
            df = recs[(recs.forenames_l == firstname.lower().strip()) &
                      (recs.surname_l == lastname.lower().strip())]

        if section_wanted is not None:
            df = df[(df['section'] == section_wanted)]

        return df

    def sections(self):
        "Return a list of the sections for which we have data."
        return self._records['section'].unique()

    def all_yp_members_dict(self):
        return {s: members for s, members in self._records.groupby('section')}

    def section_all_members(self, section):
        return [m for i, m in self._records[
            self._records['section'] == section].iterrows()]

    def section_yp_members_without_leaders(self, section):
        return [m for i, m in self._records[
            (self._records['section'] == section) &
            (self._records['role'].isin(
                ['Beaver Scout', 'Cub Scout', 'Scout']))].iterrows()]

    def members_with_multiple_membership_numbers(self):
        return [member for s, member in self._records.groupby(
            ['forenames', 'surname']).filter(
                lambda x: len(x['membership_number'].unique()) > 1).groupby(
                    ['forenames', 'surname', 'membership_number'])]
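
# Minimal usage sketch for the Compass class above (credentials, output directory and
# section name are placeholders):
# compass = Compass(username='someone@example.org', password='secret', outdir='/tmp/compass')
# compass.loggin()
# compass.export('garrick')
# compass.load_from_dir()
# print(compass.sections())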
Exemple #34
0
def download(link):
    browser = Browser()
    browser.visit('https://www.ssyoutube.com' + link)
    time.sleep(22)
    print("OPENED")
    browser.click_link_by_text('Download')
browser.find_by_name('name')
browser.find_by_text('Hello World!')
browser.find_by_id('firstheader')
browser.find_by_value('query')
# get element
first_found = browser.find_by_name('name').first
last_found = browser.find_by_name('name').last
second_found = browser.find_by_name('name')[1]

# Get value of an element
browser.find_by_css('h1').first.value

# Clicking links,return the first link
browser.click_link_by_href('http://www.the_site.com/my_link')
browser.click_link_by_partial_href('my_link')
browser.click_link_by_text('my link')
browser.click_link_by_partial_text('part of link text')
browser.click_link_by_id('link_id')

# element is visible or invisible
browser.find_by_css('h1').first.visible

#fill content
browser.find_by_id('productName').fill(
    'splinter - python acceptance testing for web applications')
browser.fill('q', 'splinter - python acceptance testing for web applications')

# Verifying if element has a className
browser.find_by_css('.content').first.has_class('content')
# click button
browser.find_by_name('send').first.click()
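
# A minimal end-to-end sketch tying the calls above together (the URL and field names
# here are placeholders, and chromedriver is assumed to be available). In newer
# splinter releases the click_link_by_* helpers are replaced by
# browser.links.find_by_text(...).click().
from splinter import Browser

with Browser('chrome') as demo_browser:
    demo_browser.visit('https://example.com/login')
    demo_browser.fill('username', 'alice')
    demo_browser.fill('password', 'secret')
    demo_browser.find_by_css('button[type=submit]').first.click()
    print(demo_browser.is_text_present('Welcome'))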
Exemple #36
0
class DownPatent(object):
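    # Downloads Chinese patent documents from an online patent service: fills in the
    # patent number, solves the image captcha with the CodeHandler / NeuralWork helpers
    # (defined elsewhere in the project), and returns the download link if one is found.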
    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        self.browser = Browser("phantomjs", wait_time=10)
        #self.browser = Browser()

    # download a patent by its number
    def download(self, patentno):
        # visit the page
        # the page load may time out
        # down_flag: 0 = not downloaded, 1 = patent does not exist, 2 = download failed
        download_link = ""
        down_flag = 0
        if True:
            print "打开网页"
            self.browser.visit(self.down_url)
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):
                # fill in the patent number
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print "填写专利号"
                # connection timed out / 404
                if self.browser:
                    print "打开验证码网页"
                    # try at most 20 times per patent
                    code_handler = CodeHandler()
                    # captcha text that was filled in
                    list_fill_text = []
                    # captcha image paths
                    list_code_path = []
                    # captcha segmentation flags
                    list_split_flag = []
                    # captcha recognition flags
                    list_reg_flag = []
                    for code_num in xrange(20):
                        print code_num
                        # look for the captcha
                        if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                            print "查找验证码"
                            # take a screenshot
                            #self.browser.driver.maximize_window()
                            self.browser.driver.save_screenshot("screenshot.png")
                            # grab the captcha image from the screenshot
                            image = Image.open("screenshot.png")
                            image_location = self.find_location(image)
                            image_code = image.crop((image_location[0], image_location[1], image_location[0]+52, image_location[1]+21))
                            save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                            save_path_temp = "../%s" % save_path
                            image_code.save(save_path_temp)
                            list_code_path.append(save_path)

                            # split the captcha image into characters
                            list_split_image = self.deal_split(code_handler, image_code)
                            
                            # if the captcha splits into 4 characters, recognize it; otherwise fetch a new one
                            if len(list_split_image) == 4:
                                print "正确分割"
                                list_split_flag.append(1)
                                reg_plain_text = self.reg_code(list_split_image)
                                fill_text = "".join(reg_plain_text)
                                list_fill_text.append(fill_text)
                                # fill in the captcha
                                #hand_fill_text = raw_input("Enter fill text:")
                                self.browser.fill("ValidCode", fill_text)
                                self.browser.find_by_value("确定").first.click()

                                print self.browser.html.encode("utf-8").find("验证码输入错误") 
                                if self.browser.html.encode("utf-8").find("验证码输入错误") == -1:
                                    list_reg_flag.append(1)
                                    if self.browser.html.encode("utf-8").find("没有找到该专利") == -1:
                                        down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                        down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                        if down_link_one or down_link_two:
                                            print "查找说明书图形下载链接"
                                            list_reg_flag.append(1)
                                            if down_link_one:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                            else:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")
                                            
                                            print "查找下载链接"
                                            #查找下载链接
                                            download_a = self.browser.find_link_by_text("下载专利")
                                            if download_a:
                                                download_link = download_a["href"]
                                            
                                                # download link found
                                                down_flag = 3
                                                break
                                            else:
                                                print "下载失败"
                                                # download failed
                                                down_flag = 2
                                                break
                                        '''
                                        else:
                                            print "识别正确,未找到链接"
                                            list_reg_flag.append(0)
                                            self.browser.back()
                                            self.browser.reload()
                                        '''
                                    else:
                                        print "不存在专利"
                                        # patent not found
                                        down_flag = 1
                                        break
                                else:
                                    print "识别错误,重新加载"
                                    list_reg_flag.append(0)
                                    self.browser.back()
                                    self.browser.reload()
                            else:
                                print "不能分割"
                                list_fill_text.append("")
                                list_split_flag.append(0)
                                list_reg_flag.append(0)
                                self.browser.reload()
                    
                    # store into the onlinecode collection: patent number, captcha path, recognized text, split flag, recognition flag, time
                    
                    for code_path, fill_text, split_flag, reg_flag in zip(list_code_path,list_fill_text, list_split_flag, list_reg_flag):
                        try:
                            self.db.onlinecode.insert({"indexflag": patentno, "codepath": code_path, "filltext": fill_text, \
                                                      "splitflag": split_flag, "regflag": reg_flag, "time": time.ctime()})
                        except: pass
        return download_link

    # process the captcha (segmentation)
    def deal_split(self, code_handler, image):
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # recognize the captcha characters
    def reg_code(self, list_split_image):
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in xrange(each_split_image.size[1]):
                for y in xrange(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
        for each in out:
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # locate the captcha image within the screenshot
    def find_location(self, image):
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        
        flag = image_width
        location = [0, 0]
        for y in xrange(image_width):
            for x in xrange(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break

        return location
Exemple #37
0
def scrape():
    mars_library = {}
    # assumed source URL for the news title / teaser scrape below
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.find_all(
        'div', class_='content_title')[0].find('a').text.strip()
    news_p = soup.find_all(
        'div', class_='rollover_description_inner')[0].text.strip()
    mars_library['news_title'] = news_title
    mars_library['news_p'] = news_p

    url1 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    response1 = requests.get(url1)
    soup1 = BeautifulSoup(response1.text, 'html.parser')
    half_addy = soup1.find_all(
        'a', class_='fancybox')[0].get('data-fancybox-href').strip()

    Big_Pic = "https://www.jpl.nasa.gov" + half_addy
    mars_library['featured_image_url'] = Big_Pic

    url2 = "https://twitter.com/marswxreport?lang=en"
    response2 = requests.get(url2)
    soup2 = BeautifulSoup(response2.text, 'html.parser')

    weather = soup2.find_all(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    )[0].text

    mars_library['mars_weather'] = weather

    url3 = 'https://space-facts.com/mars/'

    tables = pd.read_html(url3)
    df = tables[0]
    df.columns = ['Description', 'Values']

    mars_facts = df.to_html(justify='left')
    mars_library['mars_facts'] = mars_facts

    url4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response4 = requests.get(url4)
    soup4 = BeautifulSoup(response4.text, 'html.parser')

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url4)

    html = browser.html

    soup4 = BeautifulSoup(html, "html.parser")
    results = soup4.find_all('h3')

    hemisphere_image_urls = []
    tempdict = {}

    for result in results:
        item = result.text
        browser.click_link_by_partial_text(item)
        html1 = browser.html
        soup5 = BeautifulSoup(html1, "html.parser")
        image = soup5.find_all(
            'div', class_="downloads")[0].find_all('a')[0].get("href")
        tempdict["title"] = item
        tempdict["img_url"] = image
        hemisphere_image_urls.append(tempdict)

        tempdict = {}
        browser.click_link_by_text('Back')

    mars_library['hemisphere_image_urls'] = hemisphere_image_urls

    return mars_library
Exemple #38
0
def download(link):
    browser = Browser()
    browser.visit('https://www.ssyoutube.com' + link)
    time.sleep(22)
    print("OPENED")
    browser.click_link_by_text('Download')
Exemple #39
0
    browser.visit(url)

    # tell the browser to look for form data and fill in with the information we provided earlier
    browser.fill('username', user_name)
    browser.fill('password', password)

    # find the button and click it to submit
    browser.find_by_tag('button').click()

    # find the navbar table where e-subro is housed. when we find it, click it
    nav_bar = browser.find_by_id('td4')
    drop_down = nav_bar.click()

    # search for text demand search and click it.

    browser.click_link_by_text('Demand Search')

    # fill in the form 'fileno' with the claim number from earlier
    browser.fill('fileNo', claim_number)
    # find the button element and submit
    browser.find_by_name('btnSearch').first.click()

    # note: this only works for TRS files; OLF files use different HTML to navigate.
    # Use try/except to catch index errors and report them to the console.
    try:
        browser.click_link_by_partial_text('click here')
    except:
        print(f"This docket cannot be found, {claim_number}")
        continue

    # otherwise, for OLF cases, use this instead:
    # browser.click_link_by_partial_text('Docket Records')
class TwitterLiker():

	# constructor
	def __init__(self):
		self.mUrl = "https://www.twitter.com/"
		self.cycles = 2
		self.browser = Browser()
		self.username = "******"
		self.pw = 'XXXXXXXXXX\r'
		self.totalLikes = 0
		self.userNameField = 'session[username_or_email]'
		self.passwordField = 'session[password]'
		self.loginButtonId = 'submit btn primary-btn js-submit'

	# scroll the page and
	# do the liking
	def launchPage(self):
		self.browser.visit(self.mUrl)
		self.login()


		# self.scrollBy()
		for i in range(0, self.cycles):
			self.likePosts()

		print(str(self.totalLikes) + " total likes this session...Yay!")		

	def login(self):
		print("login")
		print("logging in as " + self.username)
		self.browser.click_link_by_text('Log in')
		
		# time.sleep(1)

		assert self.browser.find_by_name(self.userNameField)
		self.browser.fill(self.userNameField, self.username)
		self.browser.fill(self.passwordField, self.pw)

		inputs = self.browser.find_by_tag('input')
		for foo in inputs:
			if foo['class'] == self.loginButtonId:
				foo.click()
				print('clicked the log in button')

		# need to sleep a few seconds here
		time.sleep(3)

	def likePosts(self):
		print("liking posts")
		buttonList = self.browser.find_by_tag('button')

		time.sleep(2)

		buttonList = self.browser.find_by_tag('button')
		likeList = 0

		time.sleep(1)
		
		for b in buttonList:			
			if 'title="Like"' in b['innerHTML']:
				#check if it's visible, if not move on
				if b.visible:
					b.click()
					self.totalLikes += 1
					likeList += 1
		print("just liked " + str(likeList) + " tweets.")
		
		self.scrollBy()

		time.sleep(1)

	def scrollBy(self):
		print("scrolling down.")
		# print( self.browser.execute_script( "window.scrollY" ))
		self.browser.execute_script( "window.scrollBy(0,30000);" )
		time.sleep(2) 

	def boneyard(self):
		print('boneyard')	
def economic_benefits():
    # initialize browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    # use executable path below for mac
    # executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    print('COMMENCING DATA SCRAPE FOR ECONOMIC BENEFITS INFO')
    client.yosemite_db.economic_benefits.drop()

    # URL of yosemite articles page to be scraped
    url = 'https://www.nps.gov/yose/learn/news/newsreleases.htm'
    browser.visit(url)
    time.sleep(2)
    # empty lists to hold raw scraped data
    article_links = []
    headlines = []
    article_contents = []
    # empty lists that will hold cleaned scraped data
    years = []
    amounts = []
    job_counts = []
    visitor_counts = []
    # empty list to hold final scraped data
    economic_benefits = []

    # go through pages 1-33 and find links of targeted articles
    for x in range(1, 34):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        article_snippets = soup.find_all('li', class_='ListingList-item')
        substring = 'Economic Benefit'
        for article_snippet in article_snippets:
            snippet_headline = article_snippet.find(
                'h3', class_='ListingResults-title').text
            if substring in snippet_headline:
                end_link = article_snippet.find('a')['href']
                article_link = 'https://www.nps.gov' + end_link
                article_links.append(article_link)
        browser.click_link_by_text('Next ')
        time.sleep(1)

    # visit each article link and extract content
    for article_link in article_links:
        browser.visit(article_link)
        article_html = browser.html
        article_soup = BeautifulSoup(article_html, 'html.parser')
        headline = article_soup.find('div', class_='ContentHeader').text
        headline = headline.replace('\n', '')
        headlines.append(headline)
        article_content = article_soup.find('div',
                                            class_='ArticleTextGroup').text
        article_contents.append(article_content)
    # loop through headlines and extract economic benefit $ amount (in millions)
    for headline in headlines:
        headline_split = headline.split('$')[1]
        amount = headline_split[:3]
        amounts.append(amount)
    # loop through article contents and extract year, job count, and visitor count
    for article_content in article_contents:
        year_split = article_content.split('Park in ')[1]
        year = year_split[:4]
        years.append(year)
        job_split = article_content.split('supported ')[1]
        job_count = job_split[:5]
        if ',' in job_count:
            job_count = job_count.replace(',', '')
            job_counts.append(job_count)
        elif ' ' in job_count:
            job_count = job_count.replace(' ', '')
            job_counts.append(job_count)
        else:
            job_counts.append(job_count)
        visitor_split = article_content.split('shows that')[1]
        visitor_count = visitor_split[:10]
        visitor_count = visitor_count.replace(',',
                                              '').replace('\xa0',
                                                          '').replace(' ', '')
        visitor_counts.append(visitor_count)

    # append extract information into economic_benefits dictionary
    economic_benefits.append({
        'years': years,
        'amounts': amounts,
        'job_counts': job_counts,
        'visitor_counts': visitor_counts
    })
    # append missing 2015 data
    economic_benefits[0]['years'].insert(2, '2015')
    economic_benefits[0]['amounts'].insert(2, '594')
    economic_benefits[0]['job_counts'].insert(2, '6890')
    economic_benefits[0]['visitor_counts'].insert(2, '4150217')

    economic_benefits_collection = client.yosemite_db.economic_benefits
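    # Collection.update() with upsert=True replaces the single cached document;
    # newer pymongo releases use replace_one({}, doc, upsert=True) for the same effect.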
    economic_benefits_collection.update({}, economic_benefits[0], upsert=True)

    print('OBTAINED ECONOMIC BENEFITS')
    print('-------------------------------------------------------')
    browser.quit()
    return economic_benefits
Exemple #42
0
import config
from splinter import Browser

browser = Browser()

browser.visit('https://studentemployment.neu.edu/tsx_studentjobs.aspx')

browser.fill('Skin$ctl08$LoginNameText', config.username)
browser.fill('Skin$ctl08$LoginPasswordText', config.password)

browser.find_by_name('Skin$ctl08$ctl14').click()
browser.click_link_by_text(config.jobTitle)

browser.find_link_by_text('Go to time sheet').first.click()

# browser.find_link_by_text('Start time sheet').first.click()

# alert = browser.get_alert()
# alert.accept()


def addShift(shift):
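    # shift is expected to expose .day (an index into the weekday dropdown) plus
    # .start and .end (option values for the start/end time selects)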
    browser.click_link_by_text('Add New Entry')
    browser.find_by_id('Skin_body_ctl01_WDL').find_by_css('option')[
        shift.day].click()
    browser.find_by_id('Skin_body_ctl01_StartDateTime1').select(shift.start)
    browser.find_by_id('Skin_body_ctl01_EndDateTime1').select(shift.end)
    browser.find_by_value('Add').first.click()


for shift in config.shifts:
    addShift(shift)

def scrape():
    # A webscraping function for the latest news on mars
    # Python dictionary of the results
    scrape_rsult = {}

    # ### NASA Mars News

    # In[2]:

    # *** Scrape the [NASA Mars News Site] ***
    url_NASA = "https://mars.nasa.gov/news"
    r = requests.get(url_NASA)  # sends a request to the url
    time.sleep(1)
    data = r.text  # turns response into texts
    soup = BeautifulSoup(
        data, "html.parser")  # changes the response from text to html

    # In[3]:

    # collect the latest News Title and Paragragh Text. Assign the text to variables that you can reference later.
    soup_div = soup.find(
        class_="slide")  # within div in body, within <ul>, <li class=slide>.
    soup_news = soup_div.find_all('a')  # search by anchor

    # In[4]:

    #getting the title
    NASA_latest_t = soup_news[1].get_text().strip()
    # ^^^Latest News Title
    scrape_rsult["Nasa_latest_title"] = NASA_latest_t

    # In[5]:

    #getting the paragraph
    # getting the paragraph url
    soup_p = soup_div.find_all('a', href=True)
    soup_p_url = soup_p[0]['href']
    # only the url of latest news article's paragraph

    # In[6]:

    #    Scrape the href of the first news article
    url = "https://mars.nasa.gov/"
    news_url = url + soup_p_url
    # request url
    r = requests.get(news_url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

    soup_para = soup.find(class_='wysiwyg_content')
    soup_para = soup_para.find_all('p')

    # In[7]:

    #    save the text of the paragraphs to a list
    NASA_latest_p = []
    for entry in soup_para:
        paragraph = entry.get_text().strip()
        NASA_latest_p.append(paragraph)
        # ^^^ NASA_latest_p is list of paragraphs from the latest news article

    scrape_rsult["Nasa_latest_paragraph"] = NASA_latest_p

    # ### JPL Mars Space Images - Featured Image

    # In[8]:

    # Visit the url for JPL's Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)

    # In[9]:

    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    #     the mars featured images are under a list element of the slide class. '>' signifies a child element.
    browser.find_by_css('li.slide>a.fancybox').first.click()
    time.sleep(1)

    # clicks the 'more info' button (caution!: the 'share' button is under a similar but different class)
    browser.find_by_css('div.buttons>a.button').first.click()
    time.sleep(1)
    # In[10]:

    # assign the url string to a variable called `featured_image_url`.
    #     Here, I decide to get both the full-size .jpg and an 800x600 size image for the webpage
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    # full-size jpg (to be linked if image is clicked)
    feat_full_img_soup = soup.find(class_="main_image")
    feat_full_img = feat_full_img_soup.get('src')

    # smaller size jpg (to be displayed on the webpage)
    #     uses splinter instead of beautiful soup
    browser.click_link_by_partial_href('800x600.jpg')
    #     switch over to the newly opened browser window (window no. 2),
    #     save its url, then close the 2nd window
    browser.windows.current = browser.windows[1]
    featured_image_url = browser.url
    browser.windows[1].close()
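    # A hedged alternative sketch (assuming the 800x600 link appears in the
    # parsed page source): read the href straight out of the soup instead of
    # opening and closing a second browser window.
    #
    #     link_800 = soup.find('a', href=lambda h: h and '800x600.jpg' in h)
    #     if link_800 is not None:
    #         featured_image_url = link_800['href']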

    # save the two urls
    ori_url = 'https://www.jpl.nasa.gov'
    feat_full_img = ori_url + feat_full_img
    # ^^^ feat_full_img is https://www.jpl.nasa.gov + url of the full-sized featured image
    #     featured_image_url is the smaller 800x600 image that will be featured on the webpage

    scrape_rsult["featured_image_url"] = featured_image_url
    scrape_rsult['feat_full_img'] = feat_full_img

    # ### Mars Weather

    # In[11]:
    ''' 
    *** Visit the Mars Weather twitter account (https://twitter.com/marswxreport?lang=en) and scrape the latest 
    Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`. ***
    '''
    url = 'https://twitter.com/marswxreport?lang=en'
    r = requests.get(url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')

    mars_tweets = soup.find(class_='stream-items js-navigable-stream')
    mars_tweets = mars_tweets.find(class_="js-tweet-text-container")

    mars_weather = mars_tweets.p.text
    # ^^^ mars_weather is the paragraph <p> text of the latest tweet from the Mars weather handle

    scrape_rsult["mars_weather_tweet"] = mars_weather

    # ### Mars Facts

    # In[12]:
    ''' 
    *** Visit the Mars Facts webpage (http://space-facts.com/mars/) and use Pandas to scrape the table containing 
    facts about the planet including Diameter, Mass, etc. ***
    '''
    facts_url = 'http://space-facts.com/mars/'
    all_facts_df = pd.read_html(
        facts_url)  # searches for html tables & returns list of dataframes
    all_facts_df = all_facts_df[0]

    # In[14]:

    # Use Pandas to convert the data to a HTML table string.
    facts_html = all_facts_df.to_html(header=False,
                                      index=False,
                                      justify='left')

    # ^^^ facts_html is the html table of the mars facts table
    scrape_rsult["mars_facts_table"] = facts_html

    # ### Mars Hemispheres

    # In[114]:
    ''' 
    *** Visit the USGS Astrogeology site 
    (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) 
    to obtain high resolution images for each of Mar's hemispheres.
    '''
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)

    # In[115]:

    # click each of the links to the hemispheres to find the image url to the full resolution image.
    # old code, may be useful later
    '''
    #    get list of <a href links> 
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup.find_all(class_='itemLink product-item')
    hemi_href_ls = []
    for item in hemi_soup:
        url_index = 'https://astrogeology.usgs.gov'
        href = item['href']
        link = url_index + href
        hemi_href_ls.append(link)
    '''
    # Get unique hrefs
    '''     I could just go to these urls separately using browser.visit(url). But I interpret the instructions 
            as saying that I need to use splinter to click on the link in the browser.     '''
    # hemi_href_ls = np.unique(hemi_href_ls)
    # hemi_href_ls

    # In[116]:
    ''' Caution!: It seems splinter can only click a link based on the exact wording of its text,
    e.g. browser.click_link_by_partial_text('Cerberus Hemisphere') will fail to find lower-case 'cerberus'.
    '''
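    # A hedged alternative sketch: instead of clicking by (case-sensitive) link
    # text, the hrefs gathered by the commented-out block above could be visited
    # directly, which sidesteps the exact-wording requirement:
    #
    #     for link in hemi_href_ls:
    #         browser.visit(link)
    #         time.sleep(2)
    #         # ...then scrape the 'Sample' image url from each page...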

    # In[117]:

    # Beautiful soup to search browser html for headers (these contain the hemisphere names)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    headers_soup = soup.find_all('h3')
    #test = headers_soup[2].text.replace(" Enhanced", "")
    #test

    # In[128]:

    # For each header in the beautiful soup, click link associated with it and get img_url
    hemisphere_image_urls = []
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    for header in headers_soup:
        #start at origin url for the Mars hemisphere section
        window = browser.windows[0]  # current window, the first window
        browser.visit(url)
        time.sleep(2)  # wait 2 secs for browser to load
        # get the hemisphere title, dropping " Enhanced" so the dict entry
        # appended below holds the clean name
        title = header.text.replace(" Enhanced", "")
        browser.click_link_by_partial_text(title)
        time.sleep(2)  # again, wait 2 secs for browser to load
        browser.click_link_by_text('Sample')
        browser.windows.current = browser.windows[1]  # switch to the window that just opened
        img_url = browser.url
        browser.windows.current = window  # switch the current window back
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})
        window.close_others()  # close all the other windows to keep the browser tidy

    # ^^^ hemisphere_image_urls is list of dicts of img_url and title of hemisphere
    scrape_rsult["hemispheres"] = hemisphere_image_urls

    return scrape_rsult
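    # A hedged usage sketch (not in the original snippet): call scrape() and
    # inspect a couple of the keys populated above.
    #
    #     if __name__ == '__main__':
    #         result = scrape()
    #         print(result['Nasa_latest_title'])
    #         print(result['mars_weather_tweet'])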
Exemple #44
-1
from splinter import Browser


def getRoutes(start, end):
    browser = Browser(driver_name="firefox")
    browser.visit('https://www.hopstop.com/search?xfr=cityscape')
    print(browser.url)
    browser.fill('address1', str(start))
    browser.fill('address2', str(end))
    browser.find_by_name('get_dirs').click()
    print(browser.url)
    if browser.is_text_present('Did you mean?'):
        print("better at least get here")
        # browser.click_link_by_href("#")
        for link in browser.find_link_by_href("#"):
            print("Okay")
            if link.visible:
                print(link.text)
                browser.click_link_by_text(link.text)
                break
    browser.click_link_by_href("#")
    links = browser.find_link_by_partial_href("/station?stid")
    results = []
    for link in links:
        results.append(link.value)
    browser.quit()
    return results
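# A hedged usage sketch (the start/end addresses are made up): print the
# station links HopStop returns for a pair of addresses.
#
#     stations = getRoutes('350 5th Ave, New York, NY', 'Times Square, New York, NY')
#     for station in stations:
#         print(station)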