Example #1
def scrape():
    # Scrape the Sacramento Craigslist free-stuff listings and return them in a DataFrame

    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # Go to sacramento cl and navigate to 1st item of free stuff

    url = "https://sacramento.craigslist.org/d/free-stuff/search/zip"
    browser.visit(url)
    browser.click_link_by_partial_href('https://sacramento.craigslist.org')
    time.sleep(2)

    # Use Splinter to scrape Craigslist and store the data in the stuff DataFrame

    stuff = pd.DataFrame(columns=['lat', 'long', 'age', 'title'])

    for x in range(1, 4):
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        title = soup.find(id="titletextonly").text
        age = soup.find('time', class_="date timeago")["datetime"]
        loc = soup.find(id='map')
        lat = loc["data-latitude"]
        long = loc["data-longitude"]

        stuff.loc[x] = [lat, long, age, title]
        time.sleep(2)
        browser.click_link_by_partial_text('next')

    return stuff
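Note: the click_link_by_partial_href helper used above was deprecated and later removed from newer Splinter releases. A minimal equivalent, assuming a recent Splinter version with the browser.links finder API, would be:

# Find the first link whose href contains the fragment, then click it (newer Splinter, assumed)
link = browser.links.find_by_partial_href('https://sacramento.craigslist.org').first
link.click()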
Example #2
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice",
                                email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def test_add_post(self):
        log = logging.getLogger("unittest.TestCase")

        ################################## Login as Alice
        #self.browser.visit("http://0.0.0.0:8080/login") # original line
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        #self.assertEqual(self.browser.url, "http://0.0.0.0:8080/") # original line
        # self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") # ask sam about this line

        ############################################ add a test post #####################
        self.browser.visit("http://127.0.0.1:5000")
        self.browser.click_link_by_partial_href('add')
        self.browser.fill("title", "post test1 title")
        self.browser.fill("content", "post test1 content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        post_found = self.browser.find_by_tag(
            'h1'
        ).value  #cheated here - made template title h2. how do we access? index?
        #post_found = self.browser.find_by_text('post test1 title').value - didnt work

        log.debug("FIRSTH1= %r", post_found)

        self.assertEqual(post_found, "post test1 title")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
Example #4
class UserTest(StaticLiveServerTestCase):
    def setUp(self):
        check_permissions()
        self.username = "******"
        create_user(self.username)

        self.browser = Browser()
        self.browser.visit(self.live_server_url)

    def test_signup(self):
        signup_url = settings.SIGNUP_URL
        self.browser.click_link_by_partial_href(signup_url)

        username = "******"
        password = "******"
        email = "*****@*****.**"
        signup(self.browser, username, password, email)

        user_exists = exists_user(username)
        self.assertTrue(user_exists)

        user = get_user(username)
        self.assertEquals(user.username, username)
        # self.assertEquals(user.password, password)
        self.assertEquals(user.email, email)

        document_list_url = self.live_server_url + reverse("documents.views.list_documents")
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = "/html/body/div/div[1]/div/ul[2]/li[4]/a"
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, "@{}".format(username))

        #        import time; time.sleep(3)
        self.browser.quit()

    def test_signin(self):
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)

        username = self.username
        password = self.username
        login(self.browser, username, password)

        document_list_url = self.live_server_url + reverse("documents.views.list_documents")
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = "/html/body/div/div[1]/div/ul[2]/li[4]/a"
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, "@{}".format(username))

        #        import time; time.sleep(3)
        self.browser.quit()
Example #5
def get_url_code(auth_url, username, password, login='******'):
    b = Browser(driver_name='chrome')
    b.visit(auth_url)
    b.click_link_by_partial_href("/en/login")
    if login == 'facebook':
        b.click_link_by_partial_href("https://www.facebook.com")
        b.fill_form({'email': username, 'pass': password})
        b.click_link_by_id('loginbutton')
    elif login == 'spotify':
        b.fill_form({'username': username, 'password': password})
        loginbutton = b.find_by_text('Log In')[0]
        loginbutton.click()
    b.visit(auth_url)
    codeurl = b.url
    code = codeurl.split("?code=")[1].split('&')[0]
    b.quit()

    return code
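The query-string handling above splits the URL by hand; a sketch of a more robust extraction using the standard library (not part of the original code) could look like this:

from urllib.parse import urlparse, parse_qs

def extract_code(redirect_url):
    # Return the 'code' query parameter from the redirect URL, or None if it is missing
    params = parse_qs(urlparse(redirect_url).query)
    return params.get('code', [None])[0]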
Example #6
def download_art(title):

    browser = Browser()
    # Visit URL
    url = "http://gen.lib.rus.ec/scimag/index.php"
    browser.visit(url)

    article_title = browser.find_by_name('s')
    article_title.fill(title)

    button = browser.find_by_value('Search!')
    # Interact with elements
    button.click()

    # sleep is used at each step to give the page time to load before the next interaction

    time.sleep(10)
    browser.click_link_by_text('Libgen')
    time.sleep(15)
    browser.click_link_by_partial_href('http://gen.lib.rus.ec/scimag/get.php')

    time.sleep(5)
    browser.quit()
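The fixed sleeps above can be replaced by Splinter's built-in polling; a sketch, assuming the results page exposes a 'Libgen' link once it has loaded:

# Wait up to 30 seconds for the results page to render the 'Libgen' link before clicking it
if browser.is_element_present_by_text('Libgen', wait_time=30):
    browser.click_link_by_text('Libgen')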
Example #7
def get_pass():
    return check_output("gpg -dq ~/rsc_password.gpg", shell=True).strip("\n")

rsc_password_plaintext = get_pass()

b = Browser('chrome', headless=True)
time.sleep(wait_delay)
b.visit('https://mc.manuscriptcentral.com/ee/')
time.sleep(wait_delay)
b.fill('USERID', '*****@*****.**')
time.sleep(wait_delay)
b.fill('PASSWORD',rsc_password_plaintext)
time.sleep(wait_delay)
b.click_link_by_id('logInButton')
time.sleep(wait_delay)
b.click_link_by_partial_href("AUTHOR")
time.sleep(wait_delay)
html_obj = b.html
soup = BeautifulSoup(html_obj,"lxml")
#  soup = BeautifulSoup(html_obj)
table = soup.find("table", attrs={"class":"table table-striped rt cf"})
row = table.tbody.findAll('tr')[1]
first_column_html = str(row.findAll('td')[1].contents[0])
current_manuscript_status = BeautifulSoup(first_column_html,"lxml").text
# current_manuscript_status = 'demo'
# print current_status_msg
time.sleep(wait_delay)
b.quit()

if current_manuscript_status == previous_manuscript_status:
    print 'Your manuscript status remains unchanged ....'
Example #8
class DocTest(StaticLiveServerTestCase):
    def setUp(self):
        fss.remove_tree(settings.MEDIA_ROOT)
        check_permissions()
        set_site(self.live_server_url)

        self.browser = Browser()
        self.browser.visit(self.live_server_url)

        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)

        username = '******'
        password = '******'
        create_user(username)
        login(
            self.browser,
            username,
            password,
        )

        upload_url = reverse('documents.views.add_document')
        self.browser.click_link_by_partial_href(upload_url)

        source = 'local'
        docfile = get_abs_path('doctest.pdf')
        language = 'eng'
        public = True
        title = 'test'
        notes = 'test notes'
        upload(
            self.browser,
            source,
            docfile,
            language,
            public,
            title,
            notes,
        )

        self.browser.is_element_not_present_by_value('ready', 10)

        self.public = public
        self.title = title
        self.notes = notes
        self.document = get_document(title)

    def test_upload_doc_local(self):  #Create
        document_exists = exists_document(self.title)
        self.assertTrue(document_exists)
        self.assertEquals(self.document.public, self.public)
        self.assertEquals(self.document.title, self.title)
        self.assertEquals(self.document.notes, self.notes)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        document_xpath = '/html/body/div/div[2]/table/tbody/tr[1]'
        document_tr = self.browser.find_by_xpath(document_xpath)
        document_id = document_tr['data-id']
        self.assertEquals(int(document_id), self.document.id)

        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, self.title)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        owner_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[4]/a'
        owner_link = self.browser.find_by_xpath(owner_xpath)
        self.assertEquals(profile_link.value, owner_link.value)

        status_xpath = '/html/body/div/div[2]/table/tbody/tr/td[5]/div'
        status_div = self.browser.find_by_xpath(status_xpath)
        self.assertEquals(status_div.value, self.document.status)

        numpages_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[6]/div'
        numpages_div = self.browser.find_by_xpath(numpages_xpath)
        self.assertEquals(int(numpages_div.value), self.document.page_count)

        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-open'))

        structure = create_structure(self.document)
        root_path = self.document.get_root_path()
        dirs = fss.listdir(root_path)[0]
        files = fss.listdir(root_path)[1]
        for d in dirs:
            dir_path = os.path.join(root_path, d)
            for f in structure['dirs'][d]:
                self.assertIn(f, fss.listdir(dir_path)[1])
        for f in structure['files']:
            self.assertIn(f, fss.listdir(root_path)[1])

#        import time; time.sleep(3)
        self.browser.quit()


#
#    def test_upload_doc_dropbox(self): #Create
#        pass

    def test_view_doc(self):  #Read
        link_title_xpath = '//*[@id="documents_cell"]/span[1]/a'
        self.browser.find_by_xpath(link_title_xpath).click()
        viewer_title_xpath = ('//*[@id="documentviewer-container"]'
                              '/div/div[1]/div[1]/div[1]/div[2]/h4/a')
        viewer_title = self.browser.find_by_xpath(viewer_title_xpath)
        self.assertEquals(viewer_title.value, self.title)

        #        import time; time.sleep(3)
        self.browser.quit()

    def test_edit_doc(self):  #Update
        edit_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[7]/a[3]/i'
        self.browser.find_by_xpath(edit_xpath).click()

        public = False
        title = 'new title'
        notes = 'new notes'
        edit(
            self.browser,
            public,
            title,
            notes,
        )

        document = get_document(title)
        self.assertEquals(document.public, public)
        self.assertEquals(document.title, title)
        self.assertEquals(document.notes, notes)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, title)

        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-close'))

        #        import time; time.sleep(3)
        self.browser.quit()

    def test_remove_doc(self):  #Delete
        old_doc_num = len(self.browser.find_by_css('tr.document-row'))

        remove_xpath = '//*[@id="remove"]/i'
        self.browser.find_by_xpath(remove_xpath).click()
        confirm_xpath = '//*[@id="confirm-remove"]/i'
        self.browser.find_by_xpath(confirm_xpath).click()

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        new_doc_num = len(self.browser.find_by_css('tr.document-row'))
        self.assertEquals(new_doc_num, old_doc_num - 1)

        #        import time; time.sleep(3)
        self.browser.quit()
Example #9
def scrape():
    executable_path = {'executable_path': 'C:/Users/osafi/Desktop/BOOT CAMP/12 WEB SCRAPING/Web_Scrapping_Challenge_OS/Missions_to_Mars/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    html = browser.html
    time.sleep(5)
    soup = bs(html, 'html.parser')
    mars = soup.find('div', class_ = "list_text")
    news_title = mars.find('div', class_ = "content_title").text
    news_p = mars.find('div', class_ = "article_teaser_body").text
    mars_news = [news_title, news_p]

    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_id("full_image")
    time.sleep(5)
    html = browser.html
    soup = bs(html, 'html.parser')
    time.sleep(15)
    more_info = soup.find('div', class_="addthis_toolbox addthis_default_style")['addthis:url']
    browser.click_link_by_partial_href(more_info)
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image = soup.find('img', class_="main_image")['src']
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image

    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find('div', class_="css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0")
    mars_weather = results.find('span').text


    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    facts = pd.read_html(facts_url)
    mars_facts = pd.DataFrame(facts[0])
    mars_facts_string = mars_facts.to_html(header = False, index = False)

    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    time.sleep(15)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemisphere_image_urls = []
    results = soup.find('div', class_ = 'result-list')
    hemi_pics = results.find_all('div', class_ = 'item')
    print(hemi_pics)
    for i in hemi_pics:
        title = i.find('h3').text
        title = title.replace("Enhanced", "")
        href = i.find('a')['href']
        image_url = "https://astrogeology.usgs.gov/" + href
        browser.visit(image_url)
        time.sleep(15)
        html = browser.html
        soup = bs(html, 'html.parser')
        full_size = soup.find('div', class_ = 'downloads')
        img_url = full_size.find('a')['href']
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})

    mars_data = {
        "mars_title": mars_news[0],
        "mars_news": mars_news[1],
        "featured_image": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts_string,
        "mars_hemis": hemisphere_image_urls
    }

    browser.quit()
    
    return(mars_data)  
Example #10
def scrape():
    # dependencies
    import pandas as pd
    import os
    from splinter import Browser
    from bs4 import BeautifulSoup
    from webdriver_manager.chrome import ChromeDriverManager
    from random import randint
    from time import sleep

    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    #------------------------------------------------------------------------
    # ## NASA News latest headline and p text

    # visit the site
    nasa_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(nasa_news_url)
    sleep(randint(3,10))

    # preparing soup object
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # get news title
    news_title = soup.find('div', class_='image_and_description_container').find('div', class_='content_title').a.text.strip()

    # get news teaser
    news_p = soup.find(
        'div', class_='list_text').find(
        'div', class_='article_teaser_body').text

    #------------------------------------------------------------------------
    # ## JPL Featured image

    # visit the site
    jpl_img_url = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
    browser.visit(jpl_img_url)
    sleep(randint(3,10))

    # preparing the soup obj
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # finding the most recent image to click into 
    # clicking into the image
    try:
        partial_href = soup.find('div', class_='SearchResultCard').find('a', class_='group')['href']
        browser.click_link_by_partial_href(partial_href)
        sleep(randint(3,10))
    except AttributeError as e:
        print(e)

    # preparing the soup obj
    # including sleeps to allow browser and soup to catch up to new page
    sleep(randint(3,10))
    new_html = browser.html
    sleep(randint(3,10))
    soup=BeautifulSoup(new_html, 'html.parser')

    # getting the featured image url
    featured_image_url = soup.find('aside').find('a')['href']

    #------------------------------------------------------------------------
    # ## Mars Facts table

    # url for the mars facts website
    mars_facts_url = 'https://space-facts.com/mars/'

    # reading the table to html
    tables = pd.read_html(mars_facts_url)

    # saving the table to a df
    mars_facts_df = tables[0]

    # formatting the df - col names, and setting index
    mars_facts_df.columns = [' ', 'Mars facts']
    mars_facts_df = mars_facts_df.set_index(' ')

    # # removing the \n breaks
    mars_table = mars_facts_df.to_html(
        justify='center', header=False,
        classes=["table-striped", "table-responsive"]).replace(
            'dataframe ', '').replace('border="1"', '').replace(
            '\n', '')
    
    #------------------------------------------------------------------------
    # ## Hemisphere images

    # visit the site
    hemisphere_img_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_img_url)
    sleep(randint(3,10))

    # preparing the soup obj
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # creating a list of links for the 4 hemispheres
    link_list = []
    for item in soup.find_all('div', class_='description'):
        list_item = item.find('a', class_='itemLink').text
        link_list.append(list_item)

    # appending name of hemisphere and img link
    ## empty list to append dicts of name/img to
    hemisphere_image_urls = []
    ## for loop over the list of links
    for x in range(0, len(link_list)):
        nav_url = link_list[x]
        #print(nav_url)
        # visiting the link for to the hemisphere images
        browser.click_link_by_partial_text(nav_url)

        # prepping the soup object
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # finding image url and title
        img_url = soup.find('div', class_='downloads').find('a')['href']
        # removing last 9 characters to remove the 'enhanced' end word
        img_title = soup.find('section', class_='block metadata').h2.text[:-9]
        # appending to list of dicts
        hemisphere_image_urls.append({'title': img_title, 'img_url': img_url})

        sleep(randint(1,3))

        # returning to page from which images come to apply next link
        hemisphere_img_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(hemisphere_img_url)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        sleep(randint(1,3))


    # quit the browser
    browser.quit()

    return_dict = {
        'nasa_news': {'news_title': news_title, 'news_pp': news_p},
        'jpl_featured_image': featured_image_url,
        'mars_facts_table': mars_table,
        'hemisphere_image_urls': hemisphere_image_urls
    }

    return return_dict
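The slice [:-9] above drops the trailing " Enhanced" from each title by character count; a slightly more explicit sketch (assuming Python 3.9+ for str.removesuffix):

# Strip the trailing ' Enhanced' from the hemisphere title regardless of its length
img_title = soup.find('section', class_='block metadata').h2.text.removesuffix(' Enhanced').strip()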
Example #11
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        # Resize browser window to make sure all elements are visible for tests
        self.browser.driver.set_window_size(1920, 1080)

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_register_new_user(self):
        self.browser.visit("http://127.0.0.1:8080/create_user")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_register_user_exists(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/create_user")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/create_user")

    def test_login_correct(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        login_link = self.browser.is_element_present_by_text('Login')
        self.assertFalse(login_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/fight")

    def test_login_incorrect(self):
        self.test_register_new_user()
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "testpass")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        login_link = self.browser.is_element_present_by_text('Login')
        self.assertTrue(login_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")

    def test_logout(self):
        self.test_login_correct()
        self.browser.click_link_by_partial_href('logout')
        logout_link = self.browser.is_element_present_by_text('Logout')
        self.assertFalse(logout_link)
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
Example #12
def scrape():
    # browser = Browser("chrome")
    browser = init_browser()
    mars_data = {}

    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text

    mars_data["news_title"] = news_title
    mars_data["summary"] = news_p

    # url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # browser.visit(url2)

    # html = browser.html
    # soup = BeautifulSoup(html,"html.parser")

    # browser.click_link_by_partial_text("FULL IMAGE")
    # time.sleep(1)
    # browser.click_link_by_partial_text('more info')
    # time.sleep(1)

    # featured_image_url = soup.find("article")
    # featured_image_url = featured_image_url.find("a")

    # mars_data["featured_image_url"] = featured_image_url

    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)

    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(1)
    browser.click_link_by_partial_text('more info')
    time.sleep(1)
    browser.click_link_by_partial_href('/spaceimages/images')
    featured_image_url = (str(browser.url))
    mars_data["featured_image_url"] = featured_image_url

    # # Twitter API Keys
    # consumer_key = consumer_key
    # consumer_secret = consumer_secret
    # access_token = access_token
    # access_token_secret = access_token_secret

    # # Setup Tweepy API Authentication
    # auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # auth.set_access_token(access_token, access_token_secret)
    # api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    mars_data["mars_weather"] = mars_weather

    url3 = 'https://space-facts.com/mars/'

    mars_facts = pd.read_html(url3)

    mars_df = mars_facts[0]
    mars_df.columns = ['Measure', 'Values']
    mars_df = mars_df.set_index('Measure')
    mars_df.head()
    mars_table = mars_df.to_html(classes='marstable')
    mars_table = mars_table.replace('\n', ' ')

    mars_data["mars_table"] = mars_table

    # hemisphere_image_urls = []

    # url3 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    # browser.visit(url3)
    # browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    # html = browser.html
    # soup = BeautifulSoup(html,"html.parser")
    # hem1title= soup.find(class_='title').text
    # browser.click_link_by_partial_href('http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg')
    # hem1 = (str(browser.url))
    # hemisphere_image_urls.append({"title": hem1title, "img_url": hem1})

    # browser.visit(url3)
    # browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    # html = browser.html
    # soup = BeautifulSoup(html,"html.parser")
    # hem2title= soup.find(class_='title').text
    # browser.click_link_by_partial_href("http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg")
    # hem2 = (str(browser.url))
    # hemisphere_image_urls.append({"title": hem2title, "img_url": hem2})

    # browser.visit(url3)
    # browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    # html = browser.html
    # soup = BeautifulSoup(html,"html.parser")
    # hem3title= soup.find(class_='title').text
    # browser.click_link_by_partial_href("http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg")
    # hem3 = (str(browser.url))
    # hemisphere_image_urls.append({"title": hem3title, "img_url": hem3})

    # browser.visit(url3)
    # browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    # html = browser.html
    # soup = BeautifulSoup(html,"html.parser")
    # hem4title= soup.find(class_='title').text
    # browser.click_link_by_partial_href("http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg")
    # hem4 = (str(browser.url))
    # hemisphere_image_urls.append({"title": hem4title, "img_url": hem4})

    # # hemisphere_image_urls.append({"title": hem1title, "img_url": hem1})
    # # hemisphere_image_urls.append({"title": hem2title, "img_url": hem2})
    # # hemisphere_image_urls.append({"title": hem3title, "img_url": hem3})
    # # hemisphere_image_urls.append({"title": hem4title, "img_url": hem4})
    # mars_data["mars_hemis"] = hemisphere_image_urls
    return mars_data
Example #13
def scrape_mars():
    #setup browser
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    
    #get title information from the first site
    url = "https://mars.nasa.gov/news/"
    response = requests.get(url)
    
    #create the soup object
    soup = bs(response.text, 'html.parser')

    m_titles = soup.find_all('div', class_ = 'content_title')
    #title1 is the variable that contains the desired news headline
    title1 = m_titles[0].text

    #get paragraph information from the first site
    m_para = soup.find_all('div', class_ = 'rollover_description_inner')

    #para1 is the variable that contains the desired blurb
    para1 = m_para[0].text

    #get the image desired from the second page
    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

    browser.visit(url2)

    # Click the first link on the page
    browser.click_link_by_partial_href('images')

    html = browser.html

    ibisque = bs(html, 'html.parser')

    #return all the img tags and store in a variable
    img_url = ibisque.find_all("img")

    #url1 is the url for the desired image
    url1 = img_url[2]['src']

    #get tables from site number 3
    table_url = 'https://space-facts.com/mars/'

    mt_tables = pd.read_html(table_url)

    #save the desired table to mars_df
    mars_df = mt_tables[0]

    #write the table out to html, while getting rid of line break characters
    mars_html_t = mars_df.to_html(index = False)
    mars_html_t = mars_html_t.replace('\n', '')

    #get the martian hemisphere picture urls
    pics_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    p_urls = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']

    pic_urls = []

    pic_titles = []

    for p in p_urls:
        #visit the website with the pictures
        browser.visit(pics_url)
    
        # Click the first link on the page
        browser.click_link_by_partial_text(p)
    
        #Use the html in the open browser
        html3 = browser.html
    
        #create a beautiful soup object
        chilli = bs(html3, 'html.parser')
    
        #find the image url by searching for <a> tags
        im_a = chilli.find_all('a')
    
        #append the image url into the correct list
        pic_urls.append(im_a[5]['href'])
    
        #find the image title
        im_title = chilli.find('h2', class_ = 'title').text
    
        #append the title to the correct list
        pic_titles.append(im_title)
    
    #package results in a list of dictionaries 
    # pics list is the output list for the function   
    pics_list = []

    for q in range(4):
        u_dict = {}
        u_dict['title'] = pic_titles[q]
        u_dict['url'] = pic_urls[q]
        pics_list.append(u_dict)
    #close browser
    browser.quit()
    #store data in a dict
    mars_data = {
    'news_headline': title1,
    'news_blurb': para1,
    'daily_img': url1,
    'html_d_table': mars_html_t,
    'hemi_pics': pics_list   
    }
    return(mars_data)
Example #14
# browser.find_by_id('product-section')
# browser.click_link_by_class('link_id')
# browser.click_link_by_id("results")
# Selenium fallback: locate the thumbnail element and click it via JavaScript
elem = driver.find_element(By.XPATH, '//*[@id="product-section"]/div[2]/div[1]/a/img')
driver.execute_script("arguments[0].click();", elem)


#%%
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(url)
# from selenium.webdriver.common.keys import Keys  # needed to send keystrokes
# WebDriverWait wait2 = new WebDriverWait(driver, 10);
# wait2.until(ExpectedConditions.elementToBeClickable(By.href("")));
browser.click_link_by_partial_href("/search/map/Mars/Viking/cerberus_enhanced")


#%%
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.events import EventFiringWebDriver as EwC
driver = webdriver.Chrome("C://Users/lcc25/repos/utexas_hmwk_python/marsmission/chromedriver.exe")
driver.get(url)


# element = WebDriverWait(driver,30).until(EwC.find_elements_by_partial_link_text('https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'))
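# A working explicit-wait version of the commented line above (a sketch, not the original
# author's code): wait until the hemisphere link is clickable, then click it. The partial
# link text used here is an assumption.
element = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, 'Cerberus Hemisphere Enhanced'))
)
element.click()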

Example #15
class control_google():
    def init_browser(self, email, passwd):
        self.state = 'good'
        self.passwd = passwd
        self.login = email

        param = {
            'chrome.noWebsiteTestingDefaults': True,
            'chrome.prefs': {
                'profile.default_content_settings': {
                    'images': 2
                },
            }
        }
        from selenium.webdriver.chrome.options import Options
        options = Options()

        #options.add_argument('--allow-running-insecure-content')
        #options.add_argument('--disable-web-security')
        #options.add_argument('--disk-cache-dir=/var/www/cake2.2.4/app/tmp/cache/selenium-chrome-cache')
        #options.add_argument('--no-referrers')
        #options.add_argument('--window-size=1003,719')
        #options.add_argument('--proxy-server=localhost:8118')
        options.add_argument(
            "'chrome.prefs': {'profile.managed_default_content_settings.images': 2}"
        )

        CHROME = {
            "browserName": "chrome",
            "chrome.prefs": {
                "profile.managed_default_content_settings.images": 2
            },
            "chrome.switches": ["disable-images"],
        }

        self.browser = Browser('chrome', user_agent=useragent)
        #self.browser = Browser('chrome', user_agent=useragent, desired_capabilities=CHROME)

        load_page = 'https://accounts.google.com/ServiceLogin?btmpl=mobile_tier2&hl=ru&service=mobile'
        self.browser.visit(load_page)

        self.browser.find_by_id('Email').first.fill(email + '@gmail.com')
        self.browser.find_by_id('Passwd').first.fill(passwd)

        self.browser.find_by_id('signIn').first.click()

    def _google_hook(self):
        if self.browser.is_element_present_by_id('358'):
            self.browser.find_by_id('358').first.click()

        if self.browser.is_element_present_by_id('link_dismiss'):
            try:
                self.browser.find_by_id('link_dismiss').first.click()
            except:
                pass

        if 'getstarted' in self.browser.url:
            self.browser.back()

        if self.browser.is_element_present_by_id('link_dismiss'):
            self.browser.find_by_id('link_dismiss').first.click()

    def open_profile(self):
        print 'Open light version profile'
        load_page = 'https://plus.google.com/app/basic/%s/about' % self.profile_id
        self.browser.visit(load_page)

    def save_profile(self):
        self.browser.find_by_id('177').first.click()

    def register_google_plus(self, firstName, lastName):
        load_page = 'https://plus.google.com/u/0/?gpsrc=ogpy0&tab=XX'
        self.browser.visit(load_page)

        self.browser.fill('firstName', firstName)
        self.browser.fill('lastName', lastName)

        self.browser.find_by_name('buttonPressed').first.click()

        self.browser.find_by_id('357').first.click()

    def get_profile_id(self):
        load_page = 'https://www.google.com/settings/general-light?ref=/settings/account'
        self.browser.visit(load_page)
        if self.browser.is_element_present_by_xpath('//a[@class="CS"]'):
            profile_link = self.browser.find_by_xpath('//a[@class="CS"]').first
            link_path = profile_link['href']

            return link_path.split('/')[3]
        else:
            return False

    def profile_edit(self, vals):
        self.open_profile()

        print 'Click change profile'
        self.browser.find_by_id('59').first.click()

        #Confirm mobile rules
        self._google_hook()

        self.browser.find_by_name('peWork0').first.fill(vals['company'])
        self.browser.find_by_name('peWorkTitle0').first.fill(vals['position'])
        self.browser.find_by_name('peWorkStartYear0').first.fill(
            vals['year_start'])
        self.browser.find_by_name('peWorkEndYear0').first.fill(
            vals['year_stop'])

        self.browser.find_by_name('peSchool0').first.fill(
            vals['university_name'])
        self.browser.find_by_name('peSchoolMajor0').first.fill(
            vals['field_education_name'])
        self.browser.find_by_name('peSchoolStartYear0').first.fill(
            vals['going_to_college_year'])
        self.browser.find_by_name('peSchoolEndYear0').first.fill(
            vals['after_graduation_year'])

        self.browser.find_by_name('pePlaceLived0').first.fill(
            vals['place_lived'])

        self.browser.find_by_name('pePlaceLivedIsCurrent').first.check()

        self.browser.find_by_name('peGender').first.select("1")

        print 'Done profile_edit'

        self.save_profile()

    def change_photo(self, photo_path):
        self.open_profile()

        print 'Click change profile'
        self.browser.find_by_id('59').first.click()

        print 'Click change photo'
        self.browser.find_by_id('375').first.click()

        self.browser.attach_file('photo_upload_file_name', photo_path)
        print 'Done profile_edit'

        self.browser.find_by_id('314').first.click()

        self.save_profile()

    def change_pass(self, old_pass, new_pass):
        print 'Open password  change page'
        load_page = 'https://accounts.google.com/b/0/EditPasswd?hl=ru'
        self.browser.visit(load_page)

        self.browser.find_by_id('OldPasswd').first.fill(old_pass)
        self.browser.find_by_id('Passwd').first.fill(new_pass)
        self.browser.find_by_id('PasswdAgain').first.fill(new_pass)

        self.browser.find_by_id('save').first.click()
        print 'Done change pass'

    def open_full_plus(self):
        'Print open full Google+'
        load_page = 'https://plus.google.com/u/0/'
        self.browser.visit(load_page)

    def open_full_profile(self):
        self.open_full_plus()
        self._google_hook()

        print 'Click user icon'
        self.browser.find_by_id('gbi4i').first.click()

        print 'Click show profile'
        #self.browser.find_by_id('gbmplp').first.click()
        self.browser.find_by_xpath(
            '//a[@class="gbqfb gbiba gbp1"]').first.click()

    def change_name(self, firstName, lastName):
        self.open_full_plus()
        self.open_full_profile()

        print 'Click change name'
        time.sleep(5)
        self.browser.find_by_xpath(
            '//div[@guidedhelpid="profile_name"]').first.click()

        print 'Fill values'
        time.sleep(5)
        self.browser.find_by_xpath(
            '//input[@class="l-pR osa g-A-G"]').first.fill(firstName)
        self.browser.find_by_xpath(
            '//input[@class="l-oR Ika g-A-G"]').first.fill(lastName)

        print 'Save results'
        self.browser.find_by_xpath(
            '//*[starts-with(@class,"a-f-e c-b c-b-M nVrMHf nZQKMd h019o")]'
        ).first.click()

        print 'Confirm'
        self.browser.find_by_name('ok').first.click()

    def youtube_hoock(self):
        if 'ServiceLogin?' in self.browser.url:
            print 'ServiceLogin? Hook'
            self.browser.fill('Passwd', self.passwd)
            self.browser.find_by_name('signIn').first.click()
            #self.browser.back()

        if 'create_channel?' in self.browser.url:
            print 'create_channel? Hook'
            self.browser.click_link_by_partial_href('create_channel')
            self.browser.fill('username', self.login)
            self.browser.find_by_id('channel_submit').click()
            self.browser.back()
            self.browser.back()
            self.browser.back()

        if 'select_site?' in self.browser.url:
            print 'select_site? Hook'
            self.browser.find_by_xpath('//input[@type="submit"]').click()
            self.browser.back()
            self.browser.back()

        if 'switch-profile.g?' in self.browser.url:
            print 'switch-profile.g? Hook'
            self.browser.find_by_id('switchButton').click()

    def youtube_like(self, url):
        self.browser.visit(url)
        self.browser.click_link_by_partial_href('action_like=1')

        self.youtube_hoock()

        self.browser.find_by_name('action_rate').click()

    def youtube_dislike(self, url):
        self.browser.visit(url)
        self.browser.click_link_by_partial_href('action_dislike=1')

        self.youtube_hoock()

        self.browser.find_by_name('action_rate').click()

    def youtube_comment(self, url, comment):
        self.browser.visit(url)

        self.browser.click_link_by_partial_href('post_comment')

        self.youtube_hoock()
        try:
            self.browser.click_link_by_partial_href('post_comment')
        except:
            pass

        self.youtube_hoock()
        self.browser.fill('comment', comment)
        self.browser.find_by_name('action_comment').click()

        self.youtube_hoock()

    def youtube_subscribe(self, chane_name):
        load_page = 'http://m.youtube.com/user/%s' % chane_name
        self.browser.visit(load_page)

        self.browser.find_by_name('submit')[1].click()

        self.youtube_hoock()

        try:
            self.browser.find_by_name('submit')[1].click()
        except:
            pass

    def google_friend_connector(self):
        #self.browser.click_link_by_partial_href('post_comment')
        pass

    def blogspot_follow(self, url):
        pass

    def get_capture(self):
        cap_element = self.browser.find_by_xpath('//img[@width="300"]').first
        cap_code = recognize_captcha(cap_element['src'])
        self.browser.fill('recaptcha_response_field', cap_code)

    def blogspot_post_plus(self, url):
        self.browser.visit(url)
        frame_name = self.browser.find_by_xpath(
            '//*[starts-with(@name,"I0_")]')[0]['name']
        print frame_name
        with self.browser.get_iframe(frame_name) as iframe:
            #	#self.browser.find_by_xpath('//span[@class="hAa Qo Bg"]').first.click()
            iframe.find_by_xpath('//span[@class="hAa Qo Bg"]').first.click()

    def blogspot_post(self, url, comment):
        self.browser.visit(url)

        with self.browser.get_iframe('comment-editor') as iframe:
            self.browser.fill('commentBody', comment)
            iframe.find_by_id('postCommentSubmit').click()
            self.youtube_hoock()

        with self.browser.get_iframe('comment-editor') as iframe:
            if iframe.is_element_present_by_id('recaptcha_image'):
                self.get_capture()
                iframe.find_by_id('postCommentSubmit').click()

        if 'showComment=' in self.browser.url:
            return True
        else:
            return False

    def google_post_like(self, url):
        self.browser.visit(url)

        if not self.browser.is_element_present_by_name('stupop'):
            self.browser.find_by_id('162').click()
            return True
        else:
            return False

    def google_post_dislike(self, url):
        self.browser.visit(url)

        if self.browser.is_element_present_by_name('stupop'):
            self.browser.find_by_id('162').click()
            return True
        else:
            return False

    def google_post_comment(self, url, comment):
        self.browser.visit(url)

        self.browser.fill('adcp', comment)
        self.browser.find_by_id('110').click()

    def google_post_share(self, url, comment):
        self.browser.visit(url)

        self.browser.find_by_id('396').click()
        self.browser.fill('rpPostMsg', comment)
        self.browser.find_by_id('253').click()

    def google_profile_join(self, id):
        self.browser.visit('https://plus.google.com/app/basic/%s/' % id)

        self.browser.find_by_id('59').click()
        self.circle_join()

    def circle_join(self):
        self.browser.find_by_name('chcccp')[3].click()
        self.browser.find_by_id('49').click()
        self.browser.reload()

    def google_communities_enter(self, id):
        self.browser.visit('https://plus.google.com/u/0/communities/%s/' % id)
        self._google_hook()

    def google_communities_join(self, id):
        self.google_communities_enter(id)

        if self.browser.is_element_present_by_xpath(
                '//*[starts-with(@class,"a-f-e c-b c-b-La")]'):
            self.browser.find_by_xpath(
                '//*[starts-with(@class,"a-f-e c-b c-b-La")]').first.click()

    def google_communities_post(self, id, mess):
        print 'Start  communities post'
        self.google_communities_join(id)
        time.sleep(60)
        #for i in self.browser.find_by_xpath('//a[@class="FW9qdb Wk"]'):
        #`	print i['oid']

        #self.browser.reload()
        self.browser.find_by_xpath(
            '//div[@guidedhelpid="sharebox_textarea"]').first.click()
        self.browser.find_by_xpath('//div[@class="yd editable"]').first.fill(
            mess)
        self.browser.find_by_xpath(
            '//div[@guidedhelpid="sharebutton"]').click()
        time.sleep(60)
        self.browser.find_by_xpath('//div[@class="a-n Ph Hw"]').first.click()
        print '-' * 30
        for i in self.browser.find_by_xpath('//a[@class="FW9qdb Wk"]'):
            print i['oid']

    def google_people_suggested(self):
        self.browser.visit(
            'https://plus.google.com/app/basic/people/suggested?')
        for i in range(10):
            try:
                self.browser.find_by_xpath('//a[@class="vfc"]').first.click()
                self.circle_join()
            except:
                self.browser.visit(
                    'https://plus.google.com/app/basic/people/suggested?')

    def google_grab_comm_members(self, id, qty):
        irr_qty = int((qty - 64) / 20.00) + 3
        print 'Irr qty= %d' % irr_qty

        self.browser.visit(
            'https://plus.google.com/u/0/communities/%s/members' % id)
        ret_arr = []

        js_del_all_img = """
		var images = document.getElementsByTagName('img');
			while(images.length > 0) 
			{
    		images[0].parentNode.removeChild(images[0]);
			}
		"""

        for i in range(irr_qty):
            elem_arr = self.browser.find_by_xpath('//div[@class="ib31if"]')
            print 'Array len %d' % len(elem_arr)
            print i
            print ''
            elem_arr[len(elem_arr) - 2].right_click()
            #self.browser.execute_script(js_del_all_img)
            for elem in elem_arr:
                oid = elem['oid']
                img = self.browser.find_by_xpath('//img[@oid="%s"]' % oid)[0]
                #print img['src']

                if not oid in ret_arr:
                    ret_arr.append(oid)
                    print oid

        f = open('/tmp/google_oid.txt', 'w')
        for s in ret_arr:
            f.write('<item>' + s + '</item>\n')
        f.close()
        print 'Grab done'

    def quit(self):
        self.browser.quit()
Example #16
def add_album_to_rym(args, config_file):
    br = Browser()

    br.visit('https://rateyourmusic.com/account/login')
    time.sleep(3)

    # Login
    br.fill('username', credentials.username)
    br.fill('password', credentials.password)
    br.find_by_id('login_submit').click()
    time.sleep(5)

    (title, artist, tracklist, release, cover) = config.read_config(config_file)
   

    """
    if args.update_album:

        br.visit(args.rym_album)

    else:
    """

    if args.add_artist:
        br.visit('https://rateyourmusic.com/artist_add')

        #br.fill('lastname', unicode(artist))
        br.fill('lastname', artist)
        br.fill('comments', args.url)

        br.find_by_id('submitbtn').click()

        time.sleep(3)
        
        br.find_by_text(artist).click()
    
    else:
        br.visit(args.rym_profile)
    
    time.sleep(3)
    
    br.click_link_by_partial_href('/releases/ac?artist_id=')
    
    # Add data
    #br.fill('title', unicode(title))
    br.fill('title', title)
    
    br.find_by_id('format58').click()

    br.find_by_id('goAdvancedBtn').click()
    tracks_div = br.find_by_id('tracks_adv')
    tracks_text_area = tracks_div.find_by_id('track_advanced')
    #tracks_text_area.fill(unicode(tracklist)) 
    tracks_text_area.fill(tracklist) 
    br.find_by_id('goSimpleBtn').click()

    br.fill('notes', args.url)
 
    (year, month, day)      = parse_release_date(release)

    release_month_selector  = br.find_by_id('month')
    release_month_selector.select(month)
    
    release_day_selector    = br.find_by_id('day')
    release_day_selector.select(day)
    
    release_year_selector   = br.find_by_id('year')
    release_year_selector.select(year)

    br.find_by_id('previewbtn').click()
    br.find_by_id('submitbtn').click()

    # Add cover art

    """
    coverart_img_element = br.find_by_xpath("//img[@class='coverart_img']")
    print(coverart_img_element)
    sys.exit(0)
    """

    br.click_link_by_partial_href('/images/upload?type=l&assoc_id=')
    br.attach_file('upload_file', cover)

    br.fill('source', args.url)
    br.find_by_id('uploadbutton').click()
    time.sleep(5)

    br.click_link_by_partial_href('javascript:setStatus')


    # Vote for genre
    br.click_link_by_partial_href('/release/')
    time.sleep(3)

    br.click_link_by_partial_href('/rgenre/set?')

    prigen_text_area = br.find_by_xpath("//input[@id='prigen']")
    prigen_text_area.fill('vaporwave')

    prigen_vote_button = br.find_by_xpath("//input[@value='+ propose']").first
    prigen_vote_button.click()

    # Done
    br.click_link_by_partial_href('/release/')
    print("Finished")
Example #17
class WOS(object):
    """ A little module for exporting Web of Science search results into a txt file """
    def __init__(self, **kwargs):
        """
        Construct a new WOS object given a query, an export file (without ".txt")
        a username and a password for authentication
        eg :
            WOS(query="TS=(epigenetic*)", outfile="epigenetic", user="******", passw="mypassw")
        """
        #defining params
        self.query = kwargs["query"]
        self.outfile = kwargs["outfile"] + ".tsv"
        """
        try:
            self.user=kwargs["user"]
            self.passw = kwargs["passw"]
        except:
            self.user, self.passw = private
        """

        try:
            self.browser_app = kwargs["browser"]
        except:
            self.browser_app = "splinter"
        #using MLV Auth Server
        #self.auth_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/WOS_AdvancedSearch_input.do?&product=WOS&search_mode=AdvancedSearch"

        self.auth_url = "http://apps.webofknowledge.com/UA_AdvancedSearch_input.do?&product=UA&search_mode=AdvancedSearch"

        #Firefox Browser
        if self.browser_app == "splinter":
            self.browser = Browser("firefox")
        else:
            self.browser = spynner.Browser()
            self.browser.set_html_parser(PyQuery)

        #self.browser = Browser('zope.testbrowser', ignore_robots=True)
        #Session params
        self.session = None
        self.cookies = {}

        if self.query is None:
            sys.exit("No query provided")
        if "=" not in self.query:
            #or "(" not in self.query

            logging.warning("Syntax is not WOS compliant. Check Query Syntax")
            sys.exit("Query Syntax Error")
        if self.outfile is None:
            self.outfile = str(
                re.sub(re.compile("[^0-9a-zA-Z]+"), "_", self.query)) + ".txt"


#        if self.user is None and self.passw is None:
#            self.user, self.passw = private
#        logging.info("WOS search parameters:\n\t- query: %s\n\t- outfile: %s\n\t- user: %s\n\t- password: %s" %(self.query, self.outfile, self.user, self.passw))

        self.run()

    def auth(self):
        """ authentification throught auth_url to get the session id SID """
        #Loading url
        if self.browser_app == "splinter":
            self.browser.visit(self.auth_url)
            #            self.browser.fill('username', self.user)
            #            self.browser.fill('password', self.passw)
            #            self.browser.find_by_name("submit").click()
            self.cookies = self.browser.cookies.all()

        else:
            self.browser = self.browser.load(self.url)
            #            self.browser.wk_fill('input[id="username"]',self.username)
            #            self.browser.wk_fill('input[id="password"]',self.password)
            self.browser.click('input[name="submit"]')

            #~ if self.debug is True:
            #~ print "Proceding to authentication..."

            if "SessionError" in self.session.url:
                self.session.click('a[target="_top"]')
                self.session.wait(random.uniform(1, 3))

        p_url = urlparse(self.browser.url)

        if p_url.netloc == "apps.webofknowledge.com":
            #print p_url.scheme+"//"+p_url.netloc+"/WOS_GeneralSearch_input.do?"+p_url.query
            match = re.match(
                re.compile(
                    "product\=(?P<product>.*?)\&search_mode\=(?P<search_mode>.*?)\&SID=(?P<ssid>.*?)\&preferencesSaved\="
                ), str(p_url.query))
            if match is not None:
                self.product = match.group("product")
                self.ssid = match.group("ssid")
                self.search_mode = re.sub("General", "Advanced",
                                          match.group("search_mode"))
                #self.search_mode = match.group("search_mode")
                self.search_url = "%s://%s/%s_%s_input.do?product=%s&search_mode=%s&SID=%s" % (
                    p_url.scheme, p_url.netloc, self.product, self.search_mode,
                    self.product, self.search_mode, self.ssid)
                if self.browser_app == "splinter":
                    self.browser.visit(self.search_url)
                    print self.browser.url
                else:
                    self.browser.load(self.search_url)
                    print self.browser.url
                return self
            else:
                return sys.exit("Session Id could not be found")
        else:
            logging.info("No redirection to service")
            return sys.exit("Invalid credentials")

    def launch_search(self):
        """ Filling the query form found into advanced search page """
        logging.info("Launching search")

        if self.browser_app == "splinter":
            self.browser.fill("value(input1)", self.query)
            self.browser.find_by_xpath(
                "/html/body/div[1]/form/div[1]/table/tbody/tr/td[1]/div[2]/div[1]/table/tbody/tr/td[1]/span[1]/input"
            ).click()
            bs = BeautifulSoup(self.browser.html)

        else:
            self.session.wk_fill('textarea[id="value(input1)"]', self.query)
            self.session.click('input[title="Search"]')
            self.session.wait(random.randint(2, 5))

            bs = BeautifulSoup(self.browser.html.encode("utf-8"))

        query_history = bs.find_all("div", {"class": "historyResults"})
        self.nb_search = len(query_history)
        try:
            self.nb_results = int(re.sub(",", "", query_history[0].text))
        except IndexError:
            self.nb_results = int(re.sub(",", "", query_history.text))
            print self.nb_results

        logging.warning("Your search \"%s\" gave %i results" %
                        (self.query, self.nb_results))

        logging.info("Your SSID is : %s" % self.ssid)

        if self.nb_results > 0:
            if self.browser_app == "splinter":
                self.browser.click_link_by_partial_href('/summary.do?')
            else:
                self.session.click('a[title="Click to view the results"]',
                                   wait_load=True)

            print urlparse(self.browser.url).query

            match = re.search(
                re.compile(
                    "product=UA&doc\=(?P<doc>.*?)\&qid\=(?P<qid>.*?)&SID"),
                urlparse(self.browser.url).query)

            if match is not None:
                print match.group()
                self.doc, self.qid = match.group("doc"), match.group('qid')
                print self.doc, self.qid
                return self
            else:
                self.doc, self.qid = self.parse_params()
                return self
        else:
            return self

    def load_results(self, markFrom, markTo):
        """ Load_results(markFrom, markTo) 500 by 500 given the nb of results """
        logging.info("loading results")
        #print "exporting"
        #p_url0= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=output" %self.ssid
        #r0 = requests.post(p_url0, headers= headers, cookies=self.cookies)
        # print p_url0
        #print r0
        #p_url1= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=results" %self.ssid
        # print p_url1
        #r1 = requests.post(p_url1, headers= headers, cookies=self.cookies)
        #print r1
        r_url = "https://apps.webofknowledge.com/summary.do?product=UA&doc=1&qid=" + self.qid + "&SID=" + self.ssid + "&search_mode=AdvancedSearch"
        post_url = "https://apps.webofknowledge.com/OutboundService.do?action=go&&"
        #r2 = requests.post()

        header = {
            'Host':
            'apps.webofknowledge.com',
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3',
            'Accept-Encoding':
            'gzip, deflate',
            'DNT':
            1,
            'Referer':
            'https://apps.webofknowledge.com/summary.do?product=UA&doc=1&qid=%s&SID=%s&search_mode=AdvancedSearch'
            % (self.qid, self.ssid),
            'Connection':
            'keep-alive'
        }

        # markTo = 500
        # markFrom = 1
        data = {
            'SID': self.ssid,
            'colName': 'WOS',
            'count_new_items_marked': 0,
            'displayCitedRefs': 'true',
            'displayTimesCited': 'true',
            'fields_selection':
            'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'filters':
            'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
            'format': 'saveToFile',
            'locale': 'en_US',
            'markFrom': 1,
            'markTo': markTo,
            'mark_from': markFrom,
            'mark_id': 'WOS',
            'mark_to': markTo,
            'mode': 'OpenOutputService',
            'product': 'UA',
            'qid': self.qid,
            #rurl:'http%3A%2F%2Fapps.webofknowledge.com%2Fsummary.do%3FSID%3DT1WYtnvIngPkHzI4ShI%26product%3DWOS%26doc%3D1%26qid%3D1%26search_mode%3DAd
            'rurl': urllib.quote_plus(r_url),
            'save_options': 'tabMacUnicode',
            'search_mode': 'AdvancedSearch',
            'selectedIds': '',
            'sortBy': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A',
            'value(record_select_type)': 'range',
            'viewType': 'summary',
            'view_name': 'WOS-summary',
        }

        r = requests.get(post_url,
                         params=data,
                         headers=header,
                         cookies=self.cookies)
        #redirects to #url = "http://ets.webofknowledge.com/ETS/ets.do?"

        final_r = requests.get(r.url, cookies=self.cookies, stream=True)
        with open(self.outfile, 'a') as f:
            f.write(final_r.text.encode('utf-8'))
        return self.outfile

    def export(self):
        """Writing results into outfile (defaut is normalized query)"""
        start_time = time.time()
        open(self.outfile, 'w').close()
        l = list(range(0, self.nb_results, 500))
        l.append(self.nb_results)

        logging.info("Exporting %s 500 by 500..." % self.nb_results)
        for i, n in enumerate(l):
            if l[i] + 1 < self.nb_results:
                self.load_results(l[i] + 1, l[i + 1])

        total = time.time() - start_time, "seconds"
        raw_file = open(self.outfile, 'r')
        raw_file_data = raw_file.read().decode("utf-8-sig").encode("utf-8")
        nb_occurence = len(raw_file_data.split("\r")) - 2
        logging.info("Query \"%s\" had %d results: %d has been exported" %
                     (self.query, self.nb_results, nb_occurence))
        logging.info("Sucessfully stored in file : %s\n" % (self.outfile))
        #logging.info("Execution total time:"+str(" ".join(total)))
        return

    def run(self):
        """ Generic method that encapsulates the WOS extract process """
        self.auth()
        self.launch_search()
        self.export()
        self.browser.quit()
        return
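
# A minimal usage sketch: the constructor runs the whole auth/search/export
# pipeline, so instantiating the class is enough. The query and outfile
# values below are placeholders.
if __name__ == "__main__":
    WOS(query='TS=(epigenetic*)', outfile="epigenetic", browser="splinter")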
Exemple #18
0
class DocTest(StaticLiveServerTestCase):
    def setUp(self):
        fss.remove_tree(settings.MEDIA_ROOT)
        check_permissions()
        set_site(self.live_server_url)
        
        self.browser = Browser()
        self.browser.visit(self.live_server_url)
        
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)
        
        username = '******'
        password = '******'
        create_user(username)
        login(
            self.browser,
            username,
            password,
        )
        
        upload_url = reverse('documents.views.add_document')
        self.browser.click_link_by_partial_href(upload_url)
        
        source = 'local'
        docfile = get_abs_path('doctest.pdf')
        language = 'eng'
        public = True
        title = 'test'
        notes = 'test notes'
        upload(
            self.browser,
            source,
            docfile,
            language,
            public,
            title,
            notes,
        )
        
        self.browser.is_element_not_present_by_value('ready', 10)
        
        self.public = public
        self.title = title
        self.notes = notes
        self.document = get_document(title)
    
    def test_upload_doc_local(self): #Create
        document_exists = exists_document(self.title)
        self.assertTrue(document_exists)
        self.assertEquals(self.document.public, self.public)
        self.assertEquals(self.document.title, self.title)
        self.assertEquals(self.document.notes, self.notes)
        
        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        
        document_xpath = '/html/body/div/div[2]/table/tbody/tr[1]'
        document_tr = self.browser.find_by_xpath(document_xpath)
        document_id = document_tr['data-id']
        self.assertEquals(int(document_id), self.document.id)
        
        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, self.title)
        
        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        owner_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[4]/a'
        owner_link = self.browser.find_by_xpath(owner_xpath)
        self.assertEquals(profile_link.value, owner_link.value)
        
        status_xpath = '/html/body/div/div[2]/table/tbody/tr/td[5]/div'
        status_div = self.browser.find_by_xpath(status_xpath)
        self.assertEquals(status_div.value, self.document.status)
        
        numpages_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[6]/div'
        numpages_div = self.browser.find_by_xpath(numpages_xpath)
        self.assertEquals(int(numpages_div.value), self.document.page_count)
        
        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-open'))
        
        structure = create_structure(self.document)
        root_path = self.document.get_root_path()
        dirs = fss.listdir(root_path)[0]
        files = fss.listdir(root_path)[1]
        for d in dirs:
            dir_path = os.path.join(root_path, d)
            for f in structure['dirs'][d]:
                self.assertIn(f, fss.listdir(dir_path)[1])
        for f in structure['files']:
            self.assertIn(f, fss.listdir(root_path)[1])
        
#        import time; time.sleep(3)
        self.browser.quit()
#    
#    def test_upload_doc_dropbox(self): #Create
#        pass
    
    def test_view_doc(self): #Read
        link_title_xpath = '//*[@id="documents_cell"]/span[1]/a'
        self.browser.find_by_xpath(link_title_xpath).click()
        viewer_title_xpath = (
            '//*[@id="documentviewer-container"]'
            '/div/div[1]/div[1]/div[1]/div[2]/h4/a'
        )
        viewer_title = self.browser.find_by_xpath(viewer_title_xpath)
        self.assertEquals(viewer_title.value, self.title)
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_edit_doc(self): #Update
        edit_xpath = '/html/body/div/div[2]/table/tbody/tr[1]/td[7]/a[3]/i'
        self.browser.find_by_xpath(edit_xpath).click()
        
        public = False
        title = 'new title'
        notes = 'new notes'
        edit(
            self.browser,
            public,
            title,
            notes,
        )
        
        document = get_document(title)
        self.assertEquals(document.public, public)
        self.assertEquals(document.title, title)
        self.assertEquals(document.notes, notes)
        
        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        
        document_title_xpath = '//*[@id="documents_cell"]/span[1]'
        document_title = self.browser.find_by_xpath(document_title_xpath)
        self.assertEquals(document_title.value, title)
        
        privacy_icon_xpath = '//*[@id="privacy"]/i'
        privacy_icon = self.browser.find_by_xpath(privacy_icon_xpath)
        self.assertTrue(privacy_icon.has_class('icon-eye-close'))
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_remove_doc(self): #Delete
        old_doc_num = len(self.browser.find_by_css('tr.document-row'))
        
        remove_xpath = '//*[@id="remove"]/i'
        self.browser.find_by_xpath(remove_xpath).click()
        confirm_xpath = '//*[@id="confirm-remove"]/i'
        self.browser.find_by_xpath(confirm_xpath).click()
        
        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        
        new_doc_num = len(self.browser.find_by_css('tr.document-row'))
        self.assertEquals(new_doc_num, old_doc_num - 1)
        
#        import time; time.sleep(3)
        self.browser.quit()
Exemple #19
0
class SearchTest(StaticLiveServerTestCase):
    def setUp(self):
        fss.remove_tree(settings.MEDIA_ROOT)
        check_permissions()
        set_site(self.live_server_url)
        
        self.browser = Browser()
        self.browser.visit(self.live_server_url)
        
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)
        
        username = '******'
        password = '******'
        create_user(username)
        login(
            self.browser,
            username,
            password,
        )
        
        upload_url = reverse('documents.views.add_document')
        self.browser.click_link_by_partial_href(upload_url)
        
        source = 'local'
        docfile = get_abs_path('doctest.pdf')
        language = 'eng'
        public = True
        title = 'test'
        notes = 'test notes'
        upload(
            self.browser,
            source,
            docfile,
            language,
            public,
            title,
            notes,
        )
        
        self.browser.is_element_not_present_by_value('ready', 10)
        
        self.title = title
        import time; time.sleep(1)
    
    def test_search_title(self):
        self.browser.visit(self.live_server_url)
        
        title = 'test'
        
        driver = self.browser.driver
        actions = ActionChains(driver)
        searchbar_xpath = '//*[@id="search"]/div/div/div[2]'
        searchbar_div = driver.find_element_by_xpath(searchbar_xpath)
        actions.move_to_element(searchbar_div)
        actions.click()
        actions.perform()
        
        menu_title_xpath = '/html/body/ul/li[4]/a'
        menu_title = self.browser.find_by_xpath(menu_title_xpath)
        menu_title.click()
        input_title_xpath = \
            '//*[@id="search"]/div/div/div[2]/div[2]/div[2]/input'
        input_title = self.browser.find_by_xpath(input_title_xpath)
        input_title.type(title + '\r')
        
        search_list_url = \
            self.live_server_url + '/?title=' + title + '&'
        self.assertEquals(self.browser.url, search_list_url)
        
        summary_xpath = '/html/body/div/div[2]/p/small'
        summary = self.browser.find_by_xpath(summary_xpath)
        self.assertEquals(summary.value, '1 documents found')
        
        document_img_xpath = '/html/body/div/div[2]/ul/li/a/img'
        document_img = self.browser.find_by_xpath(document_img_xpath).click()
        viewer_title_xpath = (
            '//*[@id="documentviewer-container"]'
            '/div/div[1]/div[1]/div[1]/div[2]/h4/a'
        )
        viewer_title = self.browser.find_by_xpath(viewer_title_xpath)
        self.assertEquals(viewer_title.value, self.title)
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_search_text(self):
        self.browser.visit(self.live_server_url)
        
        text = 'download'
        
        driver = self.browser.driver
        actions = ActionChains(driver)
        searchbar_xpath = '//*[@id="search"]/div/div/div[2]'
        searchbar_div = driver.find_element_by_xpath(searchbar_xpath)
        actions.move_to_element(searchbar_div)
        actions.click()
        actions.perform()
        
        menu_text_xpath = '/html/body/ul/li[3]/a'
        menu_text = self.browser.find_by_xpath(menu_text_xpath)
        menu_text.click()
        input_text_xpath = \
            '//*[@id="search"]/div/div/div[2]/div[2]/div[2]/input'
        input_text = self.browser.find_by_xpath(input_text_xpath)
        input_text.type(text + '\r')
        
        search_list_url = \
            self.live_server_url + '/?q=' + text + '&'
        self.assertEquals(self.browser.url, search_list_url)
        
        summary_xpath = '/html/body/div/div[2]/p/small'
        summary = self.browser.find_by_xpath(summary_xpath)
        self.assertEquals(summary.value, '1 documents found')
        
        page_xpath = '/html/body/div/div[2]/ul/li[1]/div[2]/div/div[2]/a/div'
        page_div = self.browser.find_by_xpath(page_xpath)
        self.assertIn(text, page_div.value)
        
        document_img_xpath = '/html/body/div/div[2]/ul/li/a/img'
        document_img = self.browser.find_by_xpath(document_img_xpath).click()
        viewer_title_xpath = (
            '//*[@id="documentviewer-container"]'
            '/div/div[1]/div[1]/div[1]/div[2]/h4/a'
        )
        viewer_title = self.browser.find_by_xpath(viewer_title_xpath)
        self.assertEquals(viewer_title.value, self.title)
        
#        import time; time.sleep(3)
        self.browser.quit()
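
# These StaticLiveServerTestCase suites are meant to be run through Django's
# test runner (for example "python manage.py test"); the surrounding project
# layout and the fss/login/upload helpers are assumed to be importable.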
Exemple #20
0
class WOS(object):
    """ A little module for exporting Web of Science search results into a txt file """
    def __init__(self, **kwargs):
        """
        Construct a new WOS object given a query, an export file (without ".isi")
        a username and a password for authentication
        eg :
            WOS(query="TS=(epigenetic*", outfile="epigenetic", user="******", passw="mypassw")
        """
        #defining params
        self.query = kwargs["query"]
        self.outfile = kwargs["outfile"]+".isi"
        
        try:
            self.user=kwargs["user"]
            self.passw = kwargs["passw"]
        except:
            self.user, self.passw = private
        try:
            self.browser_app = kwargs["browser"]
        except:
            self.browser_app = "splinter"
        #using MLV Auth Server
        self.auth_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/WOS_AdvancedSearch_input.do?&product=WOS&search_mode=AdvancedSearch"
        #Firefox Browser
        if self.browser_app == "splinter":
            self.browser = Browser("firefox")
        else:
            self.browser = spynner.Browser()
            self.browser.set_html_parser(PyQuery)
        
        #self.browser = Browser('zope.testbrowser', ignore_robots=True)
        #Session params
        self.session = None
        self.cookies = {}
        
        
        
        
        if self.query is None:
            sys.exit("No query provided")
        if "=" not in self.query:
            #or "(" not in self.query
            
            logging.warning("Syntax is not WOS compliant. Check Query Syntax")
            sys.exit("Query Syntax Error")
        if self.outfile is None:
            self.outfile = str(re.sub(re.compile("[^0-9a-zA-Z]+"),"_", self.query))+".isi"
            
        if self.user is None and self.passw is None:
            self.user, self.passw = private
        logging.info("WOS search parameters:\n\t- query: %s\n\t- outfile: %s\n\t- user: %s\n\t- password: %s" %(self.query, self.outfile, self.user, self.passw))
        self.run()
        
    def auth(self):
        """ authentification throught auth_url to get the session id SID """
        #Loading url
        if self.browser_app == "splinter":
            self.browser.visit(self.auth_url)
            self.browser.fill('username', self.user)
            self.browser.fill('password', self.passw)
            self.browser.find_by_name("submit").click()
            self.cookies =  self.browser.cookies.all()
            
        else:
            self.browser = self.browser.load(self.url)
            self.browser.wk_fill('input[id="username"]',self.username)
            self.browser.wk_fill('input[id="password"]',self.password)
            self.browser.click('input[name="submit"]')
        
        #~ if self.debug is True:
            #~ print "Proceding to authentication..."
        
            if "SessionError" in self.session.url :
                self.session.click('a[target="_top"]')
                self.session.wait(random.uniform(1, 3))
        
        p_url = urlparse(self.browser.url)
        
        if p_url.netloc == "apps-webofknowledge-com.fennec.u-pem.fr":
            #print p_url.scheme+"//"+p_url.netloc+"/WOS_GeneralSearch_input.do?"+p_url.query
            match = re.match(re.compile("product\=(?P<product>.*?)\&search_mode\=(?P<search_mode>.*?)\&SID=(?P<ssid>.*?)\&preferencesSaved\="), str(p_url.query))
            if match is not None:
                self.product = match.group("product")
                self.ssid = match.group("ssid")
                self.search_mode = re.sub("General", "Advanced", match.group("search_mode"))
                #self.search_mode = match.group("search_mode")
                self.search_url = "%s://%s/%s_%s_input.do?product=%s&search_mode=%s&SID=%s" %(p_url.scheme, p_url.netloc, self.product,self.search_mode,self.product,self.search_mode,self.ssid)        
                if self.browser_app == "splinter":
                    self.browser.visit(self.search_url)
                    print self.browser.url
                else:
                    self.browser.load(self.search_url)
                    print self.browser.url
                return self
            else:
                return sys.exit("Session Id could not be found")    
        else:
            logging.info("No redirection to service")
            return sys.exit("Invalid credentials")
        
    def launch_search(self):
        """ Filling the query form found into advanced search page """
        logging.info("Launching search")
        
        if self.browser_app == "splinter":
            self.browser.fill("value(input1)", self.query)
            self.browser.find_by_xpath("/html/body/div[1]/form/div[1]/table/tbody/tr/td[1]/div[2]/div[1]/table/tbody/tr/td[1]/span[1]/input").click()
            bs = BeautifulSoup(self.browser.html)
            
        else:
            self.session.wk_fill('textarea[id="value(input1)"]', self.query)
            self.session.click('input[title="Search"]')
            self.session.wait(random.randint(2,5))
            
            bs = BeautifulSoup(self.browser.html.encode("utf-8"))
        
        query_history = bs.find_all("div", {"class":"historyResults"})
        self.nb_search = len(query_history)
        try:
            self.nb_results = int(re.sub(",", "", query_history[0].text))
        except IndexError:
            self.nb_results = int(re.sub(",", "", query_history.text))
            print self.nb_results
            
        logging.warning("Your search \"%s\" gave %i results"%(self.query, self.nb_results))
        logging.info("Your SSID is : %s" %self.ssid)
        if self.browser_app == "splinter":
            self.browser.click_link_by_partial_href('/summary.do?')
        else:
            self.session.click('a[title="Click to view the results"]',wait_load=True)
            
        print urlparse(self.browser.url).query
        match = re.search(re.compile("product=WOS&doc\=(?P<doc>.*?)\&qid\=(?P<qid>.*?)&SID"), urlparse(self.browser.url).query)        
        if match is not None:
            print match.group()
            self.doc, self.qid = match.group("doc"), match.group('qid')
            print self.doc, self.qid
            return self
        else:
            
            self.doc, self.qid = self.parse_params()
            return self
            
    
    def load_results(self, markFrom, markTo, i):
        """ Load_results(markFrom, markTo) 500 by 500 given the nb of results """
        logging.info("loading results")
        #print "exporting"
        #p_url0= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=output" %self.ssid
        #r0 = requests.post(p_url0, headers= headers, cookies=self.cookies)
        # print p_url0
        #print r0
        #p_url1= "http://apps.webofknowledge.com/AutoSave_UA_output.do?action=saveForm&SID=%s&product=UA&search_mode=results" %self.ssid
        # print p_url1
        #r1 = requests.post(p_url1, headers= headers, cookies=self.cookies)
        #print r1
        r_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid="+self.qid+"&SID="+self.ssid+"&search_mode=AdvancedSearch"
        post_url = "https://apps-webofknowledge-com.fennec.u-pem.fr/OutboundService.do?action=go&&"
        #r2 = requests.post()

        header={
                'Host': 'apps-webofknowledge-com.fennec.u-pem.fr',
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate',
                'DNT': 1,
                'Referer': 'https://apps-webofknowledge-com.fennec.u-pem.fr/summary.do?product=WOS&doc=1&qid=%s&SID=%s&search_mode=AdvancedSearch'%(self.qid, self.ssid),
                'Connection': 'keep-alive'
                }
        # markTo = 500
        # markFrom = 1
        data = {
                'SID': self.ssid,
                'colName':'WOS',
                'count_new_items_marked':0,
                'displayCitedRefs':'true',
                'displayTimesCited':'true',
                'fields_selection':'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
                'filters':'USAGEIND AUTHORSIDENTIFIERS ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE CITREF ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
                'format':'saveToFile',
                'locale':'en_US',
                'markFrom':1,
                'markTo':markTo,
                'mark_from':markFrom,
                'mark_to':markTo,
                'mode':'OpenOutputService',
                'product':'WOS',
                'qid':self.qid,
                'startYear':'2015',
                'endYear':'2014',
                #rurl:'http%3A%2F%2Fapps.webofknowledge.com%2Fsummary.do%3FSID%3DT1WYtnvIngPkHzI4ShI%26product%3DWOS%26doc%3D1%26qid%3D1%26search_mode%3DAd
                'rurl':urllib.quote_plus(r_url),
                'save_options':'othersoftware',
                'search_mode':'AdvancedSearch',
                'selectedIds':'',
                'sortBy':'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A',
                'value(record_select_type)':'range',
                'viewType':'summary',
                'view_name':'WOS-summary',
                }
        
        
        r = requests.get(post_url, params=data,headers=header, cookies=self.cookies)
        #redirects to #url = "http://ets.webofknowledge.com/ETS/ets.do?"
        
        data_directory  = self.outfile.split('.isi')[0]
        try:
            os.mkdir("exported_data")
            print "creating directory exported_data"
        except:
            print "exported_data already exists"
            pass
        try:
            os.mkdir("exported_data/"+data_directory)
            print "creating directory "+data_directory
        except:
            print data_directory +" already exists"
            pass
        final_r = requests.get(r.url, cookies=self.cookies, stream=True)
        with open( "exported_data/"+data_directory+'/'+data_directory+'_'+str(i) +'.isi' , 'w') as f:
            final_r.text
            f.write(final_r.text.encode('utf-8'))
        return self.outfile
    
    def export(self):
        """Writing results into outfile (defaut is normalized query)"""
        start_time = time.time()
        #open(self.outfile, 'w').close()
        l = list(range(0, self.nb_results, 500))
        l.append(self.nb_results)
    
        logging.info("Exporting %s 500 by 500..." %self.nb_results)
        for i,n in enumerate(l):
            if l[i]+1 < self.nb_results:
                self.load_results(l[i]+1, l[i+1],str(l[i]+1)+'-'+str(l[i+1]))
        
        total = time.time() - start_time, "seconds"
        # raw_file = open(self.outfile, 'r')
        # raw_file_data = raw_file.read().decode("utf-8-sig").encode("utf-8")
        # nb_occurence = len(raw_file_data.split("\n\n"))-1
        logging.info("Query \"%s\" had %d results: %d has been exported" %(self.query, self.nb_results))
        logging.info("Sucessfully stored in directory : %s\n" %(self.outfile))
        #logging.info("Execution total time:"+str(" ".join(total)))
        return 
        
    def run(self):
        """ Generic method that encapsulates the WOS extract process """
        self.auth()
        self.launch_search()
        self.export()
        self.browser.close()
        return
Exemple #21
0
def scrape():
    #Executable path
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)

    #dictionary to store data
    mars_data = {}

    #############Nasa News##################
    #Use splinter module to visit Nasa news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    #html object
    html = browser.html
    #Parse through HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    #Latest headline and blurb
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find("div", class_="article_teaser_body").text
    #enter into mars_data
    mars_data["news_title"] = (news_title)
    mars_data["news_paragraph"] = (news_p)
    #Print values
    print(news_title)
    print(news_p)

    ############### Featured Image ################

    #Url #2
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)
    #Click on appropriate link
    button = browser.find_by_id("full_image")
    button.click()
    #Click on second appropriate link
    time.sleep(5)
    browser.click_link_by_partial_text("more info")

    #html and Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #Obtain image source
    image = soup.find("img", class_='main_image').get('src')
    #Put it all together
    featured_image_url = f'https://www.jpl.nasa.gov{image}'
    #enter into mars_data
    mars_data["featured_image"] = (featured_image_url)
    #print url
    print(featured_image_url)

    ############ Mars Weather #####################
    #Url 3
    url_3 = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_3)
    #Beautiful soup object
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #scraping what I need
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    #enter into mars_data
    mars_data["mars_weather"] = (mars_weather)
    #print weather
    print(mars_weather)

    ######### Mars Facts ############

    #url 4
    url_4 = "https://space-facts.com/mars/"
    browser.visit(url_4)

    #obtain table data
    table = pd.read_html(url_4)[0]
    #Rename columns
    renamed_table = table.rename(columns={0: "Mars Profile", 1: "Value"})
    #Make an HTML object
    mars_html = renamed_table.to_html()
    #removed /n
    mars_html = mars_html.replace('\n', ' ')
    #save to mars_data
    mars_data["mars_facts"] = (mars_html)

    ############ Mars Hemispheres ##########
    #url_5
    url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_5)
    #Cerberus
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    cerberus_title = soup.find("h2", class_='title').text
    time.sleep(2)
    download = browser.find_link_by_partial_text('Sample').first
    cerberus_url = download['href']

    #back to main page
    url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_5)
    #Schiaparelli
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    schiaparelli_title = soup.find("h2", class_='title').text
    browser.click_link_by_partial_href(
        'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'
    )
    schiaparelli_url = (str(browser.url))

    #back to main page
    url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_5)
    #Syrtis Major
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    syrtis_title = soup.find("h2", class_='title').text
    browser.click_link_by_partial_href(
        'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'
    )
    syrtis_url = (str(browser.url))

    #back to main page
    url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_5)
    #Valles Marineris
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    valles_title = soup.find("h2", class_='title').text
    browser.click_link_by_partial_href(
        'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'
    )
    valles_url = (str(browser.url))

    #hemisphere_image_urls
    hemisphere_image_urls = [{
        "title": cerberus_title,
        "img_url": cerberus_url
    }, {
        "title": schiaparelli_title,
        "img_url": schiaparelli_url
    }, {
        "title": syrtis_title,
        "img_url": syrtis_url
    }, {
        "title": valles_title,
        "img_url": valles_url
    }]

    #put into mars_data
    mars_data["mars_hemispheres"] = (hemisphere_image_urls)

    return mars_data
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import sys, os
from splinter import Browser

reload(sys)
sys.setdefaultencoding('utf-8')

br = Browser()

file_prefix = 'file://'
dir_name = os.path.dirname(os.path.realpath(__file__))
file_name = "output.html"

full_path = file_prefix + dir_name + '/' + file_name

br.visit(full_path)
br.click_link_by_partial_href('/releases/ac?artist_id=')
browser.find_by_tag('h1')
browser.find_by_name('name')
browser.find_by_text('Hello World!')
browser.find_by_id('firstheader')
browser.find_by_value('query')
# get element
first_found = browser.find_by_name('name').first
last_found = browser.find_by_name('name').last
second_found = browser.find_by_name('name')[1]

# Get value of an element
browser.find_by_css('h1').first.value

# Clicking links; each clicks the first matching link
browser.click_link_by_href('http://www.the_site.com/my_link')
browser.click_link_by_partial_href('my_link')
browser.click_link_by_text('my link')
browser.click_link_by_partial_text('part of link text')
browser.click_link_by_id('link_id')

# element is visible or invisible
browser.find_by_css('h1').first.visible

# Fill in content
browser.find_by_id('productName').fill(
    'splinter - python acceptance testing for web applications')
browser.fill('q', 'splinter - python acceptance testing for web applications')

# Verifying if element has a className
browser.find_by_css('.content').first.has_class('content')
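
# Presence checks (with an optional explicit wait) are another common pattern
browser.is_element_present_by_css('h1', wait_time=10)
browser.is_element_not_present_by_id('firstheader')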
# Click a button (find the element first, e.g. by name, then call click)
browser.find_by_name('send').first.click()
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        
        # Set up the tables in the database
        Base.metadata.create_all(engine)
        
        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                        password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        
        self.process = multiprocessing.Process(target=app.run,
                                                kwargs={"port": 8080})
                                                
        self.process.start()
        time.sleep(1)
        
    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")
        
    def test_add_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def test_view_single_entry(self):
        # Login to blog
        self.test_login_correct()
        # Click on top entry title
        self.browser.visit("http://127.0.0.1:8080/entry/1/")
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/entry/1/")
        
    def test_edit_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Click edit link on top entry
        self.browser.click_link_by_partial_href('edit')
        # Enter new title and contents
        self.browser.fill("title", "edited test post")
        self.browser.fill("content", "edited acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
    
    def test_delete_entry(self):
        # Login to blog
        self.test_login_correct()
        # Add new entry
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "test post")
        self.browser.fill("content", "acceptance testing post")
        self.browser.find_by_css("button[type=submit]").first.click()
        # Delete entry
        self.browser.click_link_by_partial_href('delete')
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        # Make sure browser puts you back on home 
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
    
    def test_logout(self):
        # Login to blog
        self.test_login_correct()
        # Click on 'Logout' link
        self.browser.click_link_by_text('Logout')
        # Check to see if 'Logout' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Logout'), False)
        # Check to see if 'Login' link is visible
        self.assertEqual(self.browser.is_element_present_by_text('Login'), True)
        
    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
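
# A minimal sketch of running the test cases above (assuming this module also
# defines/imports app, session, engine and Base):
if __name__ == "__main__":
    unittest.main()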
Exemple #25
0
def scrape():
    # Dependencies
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import pymongo
    import time
    import ctypes  # An included library with Python install.
    
    def Mbox(title, text, style):
        return ctypes.windll.user32.MessageBoxW(0, text, title, style)
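    # NOTE: ctypes.windll (and therefore Mbox) is only available on Windows.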

    
    mars_data_dict = {}
    
    ## (1) NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. 
    # Assign the text to variables that you can reference later.
       
    # URL of page to be scraped
    url_nz = 'https://mars.nasa.gov/news/'

    # Retrieve page with the requests module
    response_nz = requests.get(url_nz)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_nz = BeautifulSoup(response_nz.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_nz.prettify())
    
    #time.sleep(2)
    
    # Find the latest News Title
    news_title = soup_nz.find("div", class_="content_title").a.text[1:-1]
    #print(news_title)
    
    # Find the latest News Paragraph Text
    news_p = soup_nz.find("div", class_="image_and_description_container").a.text[3:-7]
    #print(news_p)
    
    mars_data_dict["news_title"] = news_title
    mars_data_dict["news_p"] = news_p
        
        
    
    ## (2) JPL Mars Space Images - Featured Image
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image 
    # and assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.
    
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # URL of page to be scraped
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    
    time.sleep(2)
    
    #dir(browser)
    
    browser.click_link_by_id('full_image')
    
    time.sleep(2)
    
    browser.click_link_by_partial_href("/spaceimages/details.")
    
    time.sleep(2)
    
    browser.click_link_by_partial_href("/spaceimages/images/largesize")
    
    time.sleep(2)
    
    featured_image_url = browser.url
    #print(featured_image_url)
    
    mars_data_dict["feat_img"] = featured_image_url
    
    browser.quit()
    
           
    
    ## (3) Mars Weather
    # Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report as a variable called mars_weather.
        
    # URL of page to be scraped
    url_tweet = 'https://twitter.com/marswxreport?lang=en'

    # Retrieve page with the requests module
    response_tweet = requests.get(url_tweet)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_tweet = BeautifulSoup(response_tweet.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_tweet.prettify())
    
    #time.sleep(2)
    
    # scrape the latest Mars weather tweet from the page
    tweets = soup_tweet.find_all("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    for tweet in tweets:
        find_text = tweet.text.find("InSight sol")
        if find_text == 0:
            mars_weather = tweet.text
            #print(mars_weather)
            break
    
    mars_data_dict["weather"] = mars_weather
    
    
    
    ## (4) Mars Facts
    # URL of page to be scraped
    url_mfacts = 'https://space-facts.com/mars/'

    # Retrieve page with the requests module
    response_mfacts = requests.get(url_mfacts)

    # Create BeautifulSoup object; parse with 'lxml'
    soup_mfacts = BeautifulSoup(response_mfacts.text, 'lxml')

    # Examine the results, then determine element that contains sought info
    #print(soup_mfacts.prettify())
    
    #time.sleep(2)
    
    tables = pd.read_html(url_mfacts)[1]
    #tables
    
    mars_data_dict["mfacts"] = tables
    
    tables.to_html("../html/mars_facts.html")
    
    
    
    ## (5) Mars Hemispheres
    # Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image, 
    #     and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the 
    #     keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list. 
    #     This list will contain one dictionary for each hemisphere
    
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    # URL of page to be scraped
    url_mhemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_mhemi)
    
    time.sleep(2)
    
    # Image 1
    browser.click_link_by_partial_text("Cerberus Hemisphere Enhanced")
    
    time.sleep(2) 
    
    title1 = browser.title.split("|")[0]
    #print(title1)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img1_url = browser.windows[1].url
    #print(img1_url)
    
    time.sleep(2) 
    
    browser.windows[1].close()
    browser.back()
    
    hemi1_dict = {}
    hemi1_dict["title"] = title1
    hemi1_dict["img_url"] = img1_url
    #hemi1_dict
    
    # Image 2
    
    browser.click_link_by_partial_text("Schiaparelli Hemisphere Enhanced")
    
    time.sleep(2)
    
    title2 = browser.title.split("|")[0]
    #print(title2)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img2_url = browser.windows[1].url
    #print(img2_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi2_dict = {}
    hemi2_dict["title"] = title2
    hemi2_dict["img_url"] = img2_url
    #hemi2_dict
    
    # Image 3
    
    browser.click_link_by_partial_text("Syrtis Major Hemisphere Enhanced")
    
    time.sleep(2)
    
    title3 = browser.title.split("|")[0]
    #print(title3)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img3_url = browser.windows[1].url
    #print(img3_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi3_dict = {}
    hemi3_dict["title"] = title3
    hemi3_dict["img_url"] = img3_url
    #hemi3_dict
    
    # Image 4
    browser.click_link_by_partial_text("Valles Marineris Hemisphere Enhanced")
    
    time.sleep(2)
    
    title4 = browser.title.split("|")[0]
    #print(title4)
    
    browser.click_link_by_text("Sample")
    
    time.sleep(2)
    
    img4_url = browser.windows[1].url
    #print(img4_url)
    
    time.sleep(2)
    
    browser.windows[1].close()
    browser.back()
    
    hemi4_dict = {}
    hemi4_dict["title"] = title4
    hemi4_dict["img_url"] = img4_url
    #hemi4_dict
    
    hemisphere_image_urls = [hemi1_dict, hemi2_dict, hemi3_dict, hemi4_dict]
    #hemisphere_image_urls
    
    mars_data_dict["hemi_img"] = hemisphere_image_urls
    #mars_data_dict
    
    browser.quit()
    

    
    Mbox("Mission to Mars Completed", "Congratulations!!! You've mined Mars!", 1)   
Exemple #26
0
def scrape():

    import pandas as pd
    from bs4 import BeautifulSoup as bs
    import requests
    from selenium import webdriver
    from splinter import Browser

    #we visit the first site and get our title/paragraph text
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    driver = webdriver.Chrome()
    driver.get(url)
    data = driver.page_source
    driver.quit()
    soup = bs(data, 'html.parser')
    news_title = soup.find('div', 'content_title').text
    news_p = soup.find('div', 'article_teaser_body').text

    #we visit the second site and navigate to our image page and save the url
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser = Browser('chrome', 'chromedriver.exe', headless=False)
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    browser.click_link_by_partial_href('/spaceimages/images/largesize')
    featured_image_url = browser.url
    browser.quit()

    #we visit our third site and retrieve the text of the latest tweet
    url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    tweet_text = soup.find('p', 'tweet-text').text
    #don't forget to remove the image text! If it exists, this will remove it.
    img_text = soup.find('a', 'u-hidden').text
    mars_weather = tweet_text.replace(img_text, '')

    #we visit our fourth site and retrieve the needed table
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)
    df = tables[0].rename(columns={0: 'Metric', 1: 'Value'})
    df = df.set_index('Metric')
    html_table = df.to_html()

    #we visit our final site to retrieve our image urls
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser('chrome', 'chromedriver.exe', headless=False)
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    divs = soup.find_all('div', 'description')
    hemisphere_image_urls = []
    for div in divs:
        link = div.find('a')
        browser.visit("https://astrogeology.usgs.gov/" + link['href'])
        title = link.text.replace(" Enhanced", "")
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = soup.find('a', text="Sample")['href']
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    browser.quit()

    mars_dict = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'html_table': html_table,
        'hemisphere_image_urls': hemisphere_image_urls
    }
    return mars_dict
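
# A minimal sketch of persisting the dictionary returned by scrape(); the
# connection string and the mars_db / mars_data names are assumptions.
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
client.mars_db.mars_data.update_one({}, {"$set": scrape()}, upsert=True)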
Exemple #27
0
def add_album_to_rym(args, config_file):
    br = Browser()

    br.visit('https://rateyourmusic.com/account/login')
    time.sleep(3)

    # Login
    br.fill('username', credentials.username)
    br.fill('password', credentials.password)
    br.find_by_id('login_submit').click()
    time.sleep(5)

    (title, artist, tracklist, release,
     cover) = config.read_config(config_file)
    """
    if args.update_album:

        br.visit(args.rym_album)

    else:
    """

    if args.add_artist:
        br.visit('https://rateyourmusic.com/artist_add')

        #br.fill('lastname', unicode(artist))
        br.fill('lastname', artist)
        br.fill('comments', args.url)

        br.find_by_id('submitbtn').click()

        time.sleep(3)

        br.find_by_text(artist).click()

    else:
        br.visit(args.rym_profile)

    time.sleep(3)

    br.click_link_by_partial_href('/releases/ac?artist_id=')

    # Add data
    #br.fill('title', unicode(title))
    br.fill('title', title)

    br.find_by_id('format58').click()

    br.find_by_id('goAdvancedBtn').click()
    tracks_div = br.find_by_id('tracks_adv')
    tracks_text_area = tracks_div.find_by_id('track_advanced')
    #tracks_text_area.fill(unicode(tracklist))
    tracks_text_area.fill(tracklist)
    br.find_by_id('goSimpleBtn').click()

    br.fill('notes', args.url)

    (year, month, day) = parse_release_date(release)

    release_month_selector = br.find_by_id('month')
    release_month_selector.select(month)

    release_day_selector = br.find_by_id('day')
    release_day_selector.select(day)

    release_year_selector = br.find_by_id('year')
    release_year_selector.select(year)

    br.find_by_id('previewbtn').click()
    br.find_by_id('submitbtn').click()

    # Add cover art
    """
    coverart_img_element = br.find_by_xpath("//img[@class='coverart_img']")
    print(coverart_im_element)
    sys.exit(0)
    """

    br.click_link_by_partial_href('/images/upload?type=l&assoc_id=')
    br.attach_file('upload_file', cover)

    br.fill('source', args.url)
    br.find_by_id('uploadbutton').click()
    time.sleep(5)

    br.click_link_by_partial_href('javascript:setStatus')

    # Vote for genre
    br.click_link_by_partial_href('/release/')
    time.sleep(3)

    br.click_link_by_partial_href('/rgenre/set?')

    prigen_text_area = br.find_by_xpath("//input[@id='prigen']")
    prigen_text_area.fill('vaporwave')

    prigen_vote_button = br.find_by_xpath("//input[@value='+ propose']").first
    prigen_vote_button.click()

    # Done
    br.click_link_by_partial_href('/release/')
    print("Finished")
def scrape():
    scrape_dict = {}

    # Update dictionary with scrape time
    scrape_dict["scrape_time"] = str(datetime.datetime.now())

    # Get most current news story from NASA's mars site
    nasa_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    nasa_html = requests.get(nasa_news_url).text
    nasa_soup = bs(nasa_html, 'lxml')
    # Get first title
    title_results = nasa_soup.find_all('div', class_="content_title")
    title_list = []
    for result in title_results:
        try:
            title = result.find('a').text.strip()
            if title:
                title_list.append(title)
        except Exception as e:
            return e
    news_title = title_list[0]
    # Get first paragraph
    p_results = nasa_soup.find_all('div', class_="rollover_description_inner")
    p_list = []
    for p in p_results:
        try:
            par = p.text.strip()
            if par:
                p_list.append(par)
        except Exception as e:
            return e
    news_p = p_list[0]
    # Update dictionary
    scrape_dict["mars_news_title"] = news_title
    scrape_dict["mars_news_p"] = news_p

    # Create splinter browser instance
    executable_path = {
        'executable_path':
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    }
    browser = Browser('chrome', **executable_path)
    # Scrape NASA images page for featured image
    nasa_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(nasa_images_url)
    browser.find_by_css('.button').first.click()
    time.sleep(3)
    browser.find_by_css('.button').last.click()
    partial_link = browser.find_by_css('.download_tiff').last.value.split(
        " ")[2]
    browser.click_link_by_partial_href(partial_link)
    featured_image_url = browser.url
    # Update dictionary
    scrape_dict["featured_image"] = featured_image_url

    # Use Splinter to scrape USGS for hemisphere images and urls
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(usgs_url)
    link_objects = browser.find_by_css('h3')
    hemisphere_list = [
        link.value.replace(" Enhanced", "") for link in link_objects
    ]
    url_list = []
    for hemisphere in hemisphere_list:
        browser.click_link_by_partial_text(hemisphere)
        image_object = browser.find_by_css('img.wide-image')
        img_url = image_object['src']
        url_list.append(img_url)
        browser.back()
    browser.quit()
    hemisphere_image_urls = []
    for hemisphere, url in zip(hemisphere_list, url_list):
        hemisphere_dict = {"title": hemisphere, "url": url}
        hemisphere_image_urls.append(hemisphere_dict)
    # Update dictionary
    scrape_dict["hemisphere_images"] = hemisphere_image_urls

    # Scrape weather conditions from Mars Weather Twitter
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    twitter_html = requests.get(twitter_url).text
    twitter_soup = bs(twitter_html, 'lxml')
    tweets = twitter_soup.find_all('div', class_="content")
    weather_only_tweets = []
    for tweet in tweets:
        username = tweet.find('span', class_="username u-dir u-textTruncate")
        pic_link = tweet.find('a', class_="twitter-timeline-link u-hidden")
        if username.text == "@MarsWxReport":
            tweet_content = tweet.find(
                'p',
                class_=
                "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            ).text.strip()
            # Eliminate non-weather tweets
            report_test = tweet_content.split(" ")
            if report_test[0] == "Sol":
                if pic_link is not None:
                    weather_only_tweets.append(
                        tweet_content.replace(pic_link.text, ""))
                else:
                    weather_only_tweets.append(tweet_content)
    mars_weather = weather_only_tweets[0]
    # Update dictionary
    scrape_dict["mars_weather"] = mars_weather

    # Scrape facts table
    facts_url = "https://space-facts.com/mars/"
    facts_table = pd.read_html(facts_url)
    facts_df = facts_table[0]
    facts_df = facts_df.set_index(0)
    facts_html = facts_df.to_html(classes="table table-format",
                                  border=0,
                                  header=False,
                                  index_names=False).replace("\n", "")
    # Update dictionary
    scrape_dict["mars_facts"] = facts_html

    return scrape_dict
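# Hedged usage sketch (not part of the original snippet): the dictionary
# returned by scrape() is typically written to MongoDB so a web app can
# render it later. The connection string, database, and collection names
# below are assumptions for illustration only.
if __name__ == "__main__":
    import pymongo

    client = pymongo.MongoClient("mongodb://localhost:27017")
    client.mars_db.mars_data.replace_one({}, scrape(), upsert=True)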
def scrape():

    #set up Browser

    executable_path = {'executable_path': "chromedriver"}
    browser = Browser('chrome', **executable_path, headless=False)
    
    #Get Nasa News
    nasa_news = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_news)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find_all('li', class_="slide")

    # Use only the first (latest) slide
    result = results[0]
    news_title = result.find('div', class_="content_title").text
    news_description = result.find('div', class_="article_teaser_body").text
    news_url = nasa_news + result.a['href']
        
    time.sleep(1)
    
    #Collect JPL Image
    jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl)
    html = browser.html
    soup = bs(html, 'html.parser')

    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_href('/spaceimages/details')
    soup = bs(browser.html, 'html.parser')
    results = soup.find('figure', class_ = 'lede')
    base_url = browser.url[:24]
    img = results.a.img['src']

    featured_img_url =  base_url + img

    
    time.sleep(1)
    
    
    #Mars Weather
    weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather)
    html = browser.html
    soup = bs(html, 'html.parser')
    
    results = soup.find('div', class_="js-tweet-text-container")
    try:
        results.a.decompose()
    except:
        pass
    
    mars_weather = results.find('p').text
    
    time.sleep(1)
    
    #Mars Facts
    space_facts = 'https://space-facts.com/mars/'

    mars_facts = pd.read_html(space_facts)[1].rename(columns = {0:'Fact',1:'Data'}).to_html(index=False).replace('\n','')

    time.sleep(1)
    
    
    #Mars Hemispheres
    hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres)
    html = browser.html
    soup = bs(html, 'html.parser')

    #Find list of image tags
    base_url = browser.url[:29]
    results = soup.find_all('div',attrs={'class':'collapsible results'})[0]
    images = results.find_all('div')[:]

    #iterate through length of tags and collect hrefs, navigate to page and collect full image link 
    hemisphere_image_urls = []

    for image in range(0,len(images)):
        if image == 0 or image % 2 == 0:
            url = base_url+images[image].a['href']
            title = (images[image].h3.text)
            browser.visit(url)
            time.sleep(1)
            soup = bs(browser.html,'html.parser')
            results = soup.find_all('ul')[0]
            result = results.find_all('li')[0]
            hemi_url = (result.a['href'])
            hemisphere_image_urls.append({'title':title,
                                          'img_url':hemi_url})
    facts = {'news_title':news_title,
             'news_description':news_description,
             'news_url':news_url,
             'featured_img_url':featured_img_url,
             'mars_weather':mars_weather,
             'mars_facts':mars_facts,
             'hemi_img_url':hemisphere_image_urls
            }
    
    browser.visit('https://i.pinimg.com/originals/49/78/3e/49783e18b9ac11c560362029ba1f3328.jpg')


    return facts
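# Compatibility note: click_link_by_id() and click_link_by_partial_href()
# were deprecated and later removed from Splinter. If the snippet above is
# run against a newer Splinter release, the rough equivalents (an assumption
# based on the newer links/element API) are:
#     browser.find_by_id('full_image').first.click()
#     browser.links.find_by_partial_href('/spaceimages/details').first.click()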
def scrape():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Mars News
    url_1 = "https://mars.nasa.gov/news/"

    browser.visit(url_1)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    news_title = soup.find('div', class_="content_title").a.text
    news_p = soup.find('div', class_="article_teaser_body").text

    # JPL Mars Space Images
    url_2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

    browser.visit(url_2)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    browser.click_link_by_partial_href('/spaceimages/images/largesize/')

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    featured_image_url = soup.find('img')["src"]

    # Mars Weather
    url_3 = "https://twitter.com/marswxreport?lang=en"

    browser.visit(url_3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    mars_weather = soup.find("p", class_="js-tweet-text").text

    # Mars Facts
    url_4 = "https://space-facts.com/mars/"

    tables = pd.read_html(url_4)
    df = tables[0]
    df = df.rename(columns={0: "Category", 1: "Value"})
    df = df.set_index("Category", drop=True)
    df.index.name = None
    table_data = df.to_html()
    print(table_data)

    # Mars Hemispheres
    Mars_Hem = []

    url_5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    browser.visit(url_5)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    Hemis = soup.findAll("div", class_="description")

    for hemi in Hemis:
        Name = hemi.a.h3.text
        print(Name)
        browser.click_link_by_partial_text(Name)
        time.sleep(3)
        browser.click_link_by_partial_text('Open')
        time.sleep(2)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        img_src = soup.find('img', class_="wide-image")['src']

        img_src_full = "https://astrogeology.usgs.gov" + img_src
        print(img_src_full)
        Name = Name[:-9]
        post = {"title": Name, "img_url": img_src_full}
        Mars_Hem.append(post)
        print(Mars_Hem)
        browser.click_link_by_partial_text('Close')
        time.sleep(3)
        browser.click_link_by_partial_text('Back')

    return news_title, news_p, featured_image_url, mars_weather, table_data, Mars_Hem
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")
        self.browser.driver.set_window_size(1280, 800)

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = User(name="Alice", email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def test_login_correct(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")

    def test_login_incorrect(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")
        
    def test_logout(self):
        self.test_login_correct()
        self.browser.find_by_css("button[type=logout]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")
        
    def testAddEntryNotLoggedIn(self):
        self.test_login_incorrect()
        #tries to visit entry page
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        #redirects to login page
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login?next=%2Fentry%2Fadd")    
        
    def testAddEntryLoggedIn(self):
        self.test_login_correct()
        #visit the add entry page by clicking on button - how to make this work?
        #self.browser.find_by_css("button[type=add]").first.click()
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "Add Entry Logged In Test Title")
        self.browser.fill("content", "Test content for add entry logged in")
        #find button for add entry and click it 
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        #browser should return to homepage after test entry added
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def testEditEntryLoggedIn(self):
        self.test_login_correct()
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "Edit Entry Logged In Title")
        self.browser.fill("content", "Edit Entry Logged in content")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.browser.click_link_by_partial_href('edit')
        self.browser.fill("title", "edited title")
        self.browser.fill("content", "edited content")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def testDeleteEntryLoggedIn(self):
        self.test_login_correct()
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        self.browser.fill("title", "Test Delete Entry")
        self.browser.fill("content", "Test content for delete entry")
        self.browser.find_by_css("button[type=submit]").first.click()
        self.browser.click_link_by_partial_href('delete')
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        #browser should return to homepage after delete
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")
        
    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
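# Standard unittest entry point so the module can be run directly
# (not present in the original snippet):
if __name__ == "__main__":
    unittest.main()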
Exemple #32
0
class TagTest(StaticLiveServerTestCase):
    def setUp(self):
        fss.remove_tree(settings.MEDIA_ROOT)
        check_permissions()
        set_site(self.live_server_url)
        
        self.browser = Browser()
        self.browser.visit(self.live_server_url)
        
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)
        
        username = '******'
        password = '******'
        create_user(username)
        login(
            self.browser,
            username,
            password,
        )
        
        upload_url = reverse('documents.views.add_document')
        self.browser.click_link_by_partial_href(upload_url)
        
        source = 'local'
        docfile = get_abs_path('doctest.pdf')
        language = 'eng'
        public = True
        title = 'test'
        notes = 'test notes'
        upload(
            self.browser,
            source,
            docfile,
            language,
            public,
            title,
            notes,
        )
        
        self.browser.is_element_not_present_by_value('ready', 10)
        
        tag = 'tag'
        add_tag(
            self.browser,
            tag,
        )
        
        self.tag = tag
        self.tag_obj = get_tag(tag)
    
    def test_add_tag(self):
        tag_exists = exists_tag(self.tag)
        self.assertTrue(tag_exists)
        self.assertEquals(self.tag_obj.name, self.tag)
        
        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        
        tag_span = self.browser.find_by_css('span.taggit_tag')
        self.assertEquals(tag_span.value, self.tag)
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_add_different_tag(self):
        old_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        
        tag = 'other'
        add_tag(
            self.browser,
            tag,
        )
        
        new_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        self.assertEquals(new_tag_num, old_tag_num + 1)
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_add_same_tag(self):
        old_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        
        tag = self.tag
        add_tag(
            self.browser,
            tag,
        )
        
        new_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        self.assertEquals(new_tag_num, old_tag_num)
        
#        import time; time.sleep(3)
        self.browser.quit()
    
    def test_remove_tag(self):
        old_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        
        driver = self.browser.driver
        actions = ActionChains(driver)
        tag_link = driver.find_element_by_css_selector('#taggit_tags a')
        actions.move_to_element(tag_link)
        actions.move_by_offset(25, 10)
        actions.click()
        actions.perform()
        
        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)
        
        new_tag_num = len(self.browser.find_by_css('span.taggit_tag'))
        self.assertEquals(new_tag_num, old_tag_num - 1)
        
#        import time; time.sleep(3)
        self.browser.quit()
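# Note: each test above ends by calling self.browser.quit(), which is skipped
# whenever an assertion fails first and leaves a browser running. A hedged
# alternative sketch is to do the cleanup in tearDown instead:
#
#     def tearDown(self):
#         self.browser.quit()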
def scrape():
    # dependencies
    from bs4 import BeautifulSoup as bs
    import splinter
    import requests
    from splinter import Browser
    import time
    import pandas as pd
    from selenium import webdriver
    import os
    import pymongo
    import json

    #The dictionary
    mars_facts_data={}

    #1
    #emulate the browser and get the html
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    #url to visit
    url='https://mars.nasa.gov/news/'
    #we need to use the browser to visit the page because there are many elements that do not load until the page is loaded.
    #requests would only get the raw html.
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_p =soup.select_one("div.rollover_description_inner")
    news_title = soup.select_one("div.content_title")
    news_p = news_p.text
    news_title = news_title.text
    mars_facts_data['news_title'] = news_title
    mars_facts_data['news_paragraph'] = news_p

    #2
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(url)

    browser.click_link_by_id('full_image')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)

    time.sleep(3)
    browser.click_link_by_partial_href('/spaceimages/images/')
    #Download the image and Store
    response = requests.get(browser.url)
    if response.status_code == 200:
        linkname= (browser.url.rsplit('/', 1)[-1])
        SaveFile = (f'Resources/{linkname}')
        with open(SaveFile, 'wb') as f:
            f.write(response.content)
    print(browser.url)
    Space_image_dict = {}
    Space_image_dict['Url'] = browser.url
    mars_facts_data['featured_image'] = browser.url
    #collection.insert_one(Space_image_dict)

    #3
    mars_weather_dict = {}
    url='https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    mars_weather = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text
    mars_weather =mars_weather.strip()
    mars_facts_data['weather'] = mars_weather
    #collection.insert_one(mars_weather_dict)

    #4
    url = 'https://space-facts.com/mars/'
    df = pd.read_html(url)
    #df = pd.DataFrame(df)
    df= df[0]
    df.columns = ['Category', 'Measure']
    df.set_index('Category',inplace = True)
    mars_html_table = df.to_html()
    mars_html_table = mars_html_table.replace("\n","")
    mars_facts_data['mars_facts_table'] = mars_html_table
    return mars_facts_data
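# Note: the snippet above launches two Chrome instances and never quits them.
# A hedged cleanup sketch: Splinter's Browser also works as a context manager,
# which calls quit() on exit, so each block could be written as
#
#     with Browser('chrome', **executable_path, headless=True) as browser:
#         browser.visit(url)
#         ...  # scrape as above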
Exemple #34
0
def scrape():

    #Dependencies
    from bs4 import BeautifulSoup
    import requests
    from splinter import Browser
    from splinter.exceptions import ElementDoesNotExist
    import pandas as pd

    # First URL of page to be scraped
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    # Retrieve page with the requests module
    response = requests.get(url)

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    #Retrieve latest news' title and paragraph; store into variables
    results = soup.find('div', class_='image_and_description_container')

    news_title = results.find_all('img')
    news_title = news_title[1]['alt']
    news_p = results.find('div', class_='rollover_description_inner').text
    news_p = news_p.replace('\n', '')

    #Set up Chrome.exe
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #Connect to URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    #Prepare to use Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    #Push FULL IMAGE button to retrieve the image URL
    browser.click_link_by_partial_text('FULL IMAGE')

    #Retrieve image URL
    results = soup.find('a', class_='button fancybox')
    feature_image_url = results['data-fancybox-href']
    feature_image_url = feature_image_url.replace('medium', 'large')
    feature_image_url = feature_image_url.replace('ip', 'hires')
    url_short = url.rsplit('/spaceimages', 1)[0]
    feature_image_url = url_short + feature_image_url

    #Now let's retrieve Mars weather
    url = 'https://twitter.com/marswxreport?lang=en'

    # Retrieve page with the requests module
    response = requests.get(url)

    # Create BeautifulSoup object; parse with 'lxml'
    soup = BeautifulSoup(response.text, 'lxml')

    #Find all the tags that contain tweets
    results = soup.find_all('div', class_='content')

    for result in results:
        texto = result.find(
            'p',
            class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
        ).text
        x = texto.find("InSight sol")

        if x == 0:
            mars_weather = texto.rsplit('pic.twitter', 1)[0]
            break

    #Go for the FACTS table!
    url = 'https://space-facts.com/mars/'

    #Start retrieving the data from the table
    table = pd.read_html(url)

    #Organize pandas df
    df = table[0]
    df.columns = ['Description', 'Value']
    df.set_index('Description', inplace=True)

    #Transform to HTML string
    html_table = df.to_html()

    #Set up Chrome.exe
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #Connect to URL to find photos of Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives'
    browser.visit(url)

    Hemispheres = [
        'valles_marineris', 'syrtis_major', 'schiaparelli', 'cerberus'
    ]
    hemisphere_image_urls = []

    for Hemisphere in Hemispheres:

        try:
            browser.click_link_by_partial_href(Hemisphere + '_enhanced')
            #Prepare to use Beautiful Soup
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.find('h2', class_='title').text
            #title = title.rsplit(' Enhanced',1)[0]
            image = soup.find('img', class_='wide-image')
            image_link = 'https://astrogeology.usgs.gov' + image['src']
            d = {'title': title, 'image_url': image_link}
            hemisphere_image_urls.append(d)

        except:
            browser.find_link_by_text('2').first.click()
            browser.click_link_by_partial_href(Hemisphere + '_enhanced')
            #Prepare to use Beautiful Soup
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            title = soup.find('h2', class_='title').text
            #title = title.rsplit(' Enhanced',1)[0]
            image = soup.find('img', class_='wide-image')
            image_link = 'https://astrogeology.usgs.gov' + image['src']
            d = {'title': title, 'image_url': image_link}
            hemisphere_image_urls.append(d)

    results_dict = {'news_title' : news_title, 'news_p' : news_p, 'feature_image_url' : feature_image_url,\
                    'mars_weather' : mars_weather, 'html_table' : html_table, 'hemisphere_image_urls' : hemisphere_image_urls}
    return results_dict
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import sys, os
from splinter import Browser

reload(sys)
sys.setdefaultencoding('utf-8')

br = Browser()

file_prefix = 'file://'
dir_name    = os.path.dirname(os.path.realpath(__file__))
file_name   = "output.html"

full_path   = file_prefix + dir_name + '/' + file_name

br.visit(full_path)
br.click_link_by_partial_href('/releases/ac?artist_id=')
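# A hedged Python 3 sketch of the same idea: the reload()/setdefaultencoding
# dance is unnecessary on Python 3, and pathlib can build the file:// URL.
# The newer Splinter links API is assumed for the final click.
#
#     from pathlib import Path
#     from splinter import Browser
#
#     br = Browser()
#     br.visit(Path(__file__).resolve().parent.joinpath("output.html").as_uri())
#     br.links.find_by_partial_href('/releases/ac?artist_id=').first.click()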
def scrape():

    #NEWS
    nasaUrl = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    html = requests.get(nasaUrl)

    soup = bs(html.text, 'html5lib')

    news_title = soup.find_all(class_='content_title')[0].text
    news_p = soup.find_all(class_='rollover_description_inner')[0].text

    #Featured Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False, wait_time=5)

    marsUrl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(marsUrl)
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    browser.click_link_by_partial_href('/jpeg')
    featuredImageUrl = browser.find_by_css('img')['src']

    #Mars Weather
    twitterUrl = 'https://twitter.com/marswxreport?lang=en'
    html = requests.get(twitterUrl)

    soup = bs(html.text, 'html5lib')

    marsWeather = soup.find_all(class_='TweetTextSize')[0].text

    #Mars Facts
    marsFactsUrl = 'https://space-facts.com/mars/'

    df = pd.read_html(marsFactsUrl)[0]

    df = df.rename(columns={0: 'Description', 1: 'Value'})
    df = df.set_index('Description')

    df = df.to_dict()
    marsfactsdict = df['Value']

    #Hemispheres
    marsHemispheresUrl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    hemiList = []

    # Visit the search results page once per hemisphere, click through by
    # partial link text, and collect the full-size image URL and page title.
    hemisphereNames = [
        'Cerberus Hemisphere',
        'Schiaparelli Hemisphere',
        'Syrtis Major Hemisphere',
        'Valles Marineris Hemisphere',
    ]
    for hemisphereName in hemisphereNames:
        browser.visit(marsHemispheresUrl)
        browser.click_link_by_partial_text(hemisphereName)
        img_url = browser.find_by_css('img[class = wide-image]')['src']
        title = browser.find_by_css('h2[class = title]').text
        hemiList.append({'title': title, 'img_url': img_url})

    scrapedDict = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image': featuredImageUrl,
        'weather': marsWeather,
        'facts': marsfactsdict,
        'hemispheres': hemiList
    }

    return scrapedDict
def scrape():
    # A webscraping function for the latest news on mars
    # Python dictionary of the results
    scrape_rsult = {}

    # ### NASA Mars News

    # In[2]:

    # *** Scrape the [NASA Mars News Site] ***
    url_NASA = "https://mars.nasa.gov/news"
    r = requests.get(url_NASA)  # sends a request to the url
    time.sleep(1)
    data = r.text  # turns response into texts
    soup = BeautifulSoup(
        data, "html.parser")  # changes the response from text to html

    # In[3]:

    # collect the latest News Title and Paragragh Text. Assign the text to variables that you can reference later.
    soup_div = soup.find(
        class_="slide")  # within div in body, within <ul>, <li class=slide>.
    soup_news = soup_div.find_all('a')  # search by anchor

    # In[4]:

    #getting the title
    NASA_latest_t = soup_news[1].get_text().strip()
    # ^^^Latest News Title
    scrape_rsult["Nasa_latest_title"] = NASA_latest_t

    # In[5]:

    #getting the paragraph
    # getting the paragraph url
    soup_p = soup_div.find_all('a', href=True)
    soup_p_url = soup_p[0]['href']
    # only the url of latest news article's paragraph

    # In[6]:

    #    Scrape the href of the first news article
    url = "https://mars.nasa.gov/"
    news_url = url + soup_p_url
    # request url
    r = requests.get(news_url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")

    soup_para = soup.find(class_='wysiwyg_content')
    soup_para = soup_para.find_all('p')

    # In[7]:

    #    save the text of the paragraphs to a list
    NASA_latest_p = []
    for entry in soup_para:
        paragraph = entry.get_text().strip()
        NASA_latest_p.append(paragraph)
        # ^^^ NASA_latest_p is list of paragraphs from the latest news article

    scrape_rsult["Nasa_latest_paragraph"] = NASA_latest_p

    # ### JPL Mars Space Images - Featured Image

    # In[8]:

    # Visit the url for JPL's Featured Space Image [here](https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars).
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)

    # In[9]:

    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    #     the mars featured images are under a list element of the slide class. '>' signifies a child element.
    browser.find_by_css('li.slide>a.fancybox').first.click()
    time.sleep(1)

    # clicks the 'more info' button (caution!: the 'share' button is under a similar but different class)
    browser.find_by_css('div.buttons>a.button').first.click()
    time.sleep(1)
    # In[10]:

    # assign the url string to a variable called `featured_image_url`.
    #     Here, I decide to get both the full-size .jpg and an 800x600 size image for the webpage
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    # full-size jpg (to be linked if image is clicked)
    feat_full_img_soup = soup.find(class_="main_image")
    feat_full_img = feat_full_img_soup.get('src')

    # smaller size jpg (to be displayed on the webpage)
    #     uses splinter instead of beautiful soup
    browser.click_link_by_partial_href('800x600.jpg')
    #     switch over to the next browser (window no. 2)
    #     save it's url, then close 2nd window
    browser.windows.current = browser.windows[1]
    featured_image_url = browser.url
    browser.windows[1].close()

    # save the two urls
    ori_url = 'https://www.jpl.nasa.gov'
    feat_full_img = ori_url + feat_full_img
    # ^^^ feat_full_img is https://www.jpl.nasa.gov + url of the full-sized featured image
    #     featured_image_url is the smaller 800x600 image that will be featured on the webpage

    scrape_rsult["featured_image_url"] = featured_image_url
    scrape_rsult['feat_full_img'] = feat_full_img

    # ### Mars Weather

    # In[11]:
    ''' 
    *** Visit the Mars Weather twitter account (https://twitter.com/marswxreport?lang=en) and scrape the latest 
    Mars weather tweet from the page. Save the tweet text for the weather report as a variable called `mars_weather`. ***
    '''
    url = 'https://twitter.com/marswxreport?lang=en'
    r = requests.get(url)
    time.sleep(1)
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')

    mars_tweets = soup.find(class_='stream-items js-navigable-stream')
    mars_tweets = mars_tweets.find(class_="js-tweet-text-container")

    mars_weather = mars_tweets.p.text
    # ^^^ mars_weather is the paragraph <p> text of the latest tweet from the Mars weather handle

    scrape_rsult["mars_weather_tweet"] = mars_weather

    # ### Mars Facts

    # In[12]:
    ''' 
    *** Visit the Mars Facts webpage (http://space-facts.com/mars/) and use Pandas to scrape the table containing 
    facts about the planet including Diameter, Mass, etc. ***
    '''
    facts_url = 'http://space-facts.com/mars/'
    all_facts_df = pd.read_html(
        facts_url)  # searches for html tables & returns list of dataframes
    all_facts_df = all_facts_df[0]

    # In[14]:

    # Use Pandas to convert the data to a HTML table string.
    facts_html = all_facts_df.to_html(header=False,
                                      index=False,
                                      justify='left')

    # ^^^ facts_html is the html table of the mars facts table
    scrape_rsult["mars_facts_table"] = facts_html

    # ### Mars Hemispheres

    # In[114]:
    ''' 
    *** Visit the USGS Astrogeology site 
    (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) 
    to obtain high resolution images for each of Mar's hemispheres.
    '''
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)

    # In[115]:

    # click each of the links to the hemispheres to find the image url to the full resolution image.
    # old code, may be useful later
    '''
    #    get list of <a href links> 
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemi_soup = soup.find_all(class_='itemLink product-item')
    hemi_href_ls = []
    for item in hemi_soup:
        url_index = 'https://astrogeology.usgs.gov'
        href = item['href']
        link = url_index + href
        hemi_href_ls.append(link)
    '''
    # Get unique hrefs
    '''     I could just go to these urls separately using browser.visit(url). But I interpret the instructions 
            as saying that I need to use splinter to click on the link in the browser.     '''
    # hemi_href_ls = np.unique(hemi_href_ls)
    # hemi_href_ls

    # In[116]:
    ''' Caution!: It seems splinter can only click link based on the exact wording of the text
    browser.click_link_by_partial_text('Cerberus Hemisphere')    #e.g. function will fail to find lower case 'cerberus'
    '''

    # In[117]:

    # Beautiful soup to search browser html for headers (these contain the hemisphere names)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    headers_soup = soup.find_all('h3')
    #test = headers_soup[2].text.replace(" Enhanced", "")
    #test

    # In[128]:

    # For each header in the beautiful soup, click link associated with it and get img_url
    hemisphere_image_urls = []
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    for header in headers_soup:
        #start at origin url for the Mars hemisphere section
        window = browser.windows[0]  # current window, the first window
        browser.visit(url)
        time.sleep(2)  # wait 2 secs for browser to load
        #getting title
        title = header.text
        title = title.replace(
            " Enhanced",
            "")  #get rid of " " + "Enhanced" for when dict is appended
        browser.click_link_by_partial_text(title)
        time.sleep(2)  # again, wait 2 secs for browser to load
        browser.click_link_by_text('Sample')
        browser.windows.current = browser.windows[
            1]  # switch current window to the window that just opened
        img_url = browser.url
        browser.windows.current = window  # switch the current window back
        hemisphere_image_urls.append({'title': title, 'img_url': img_url})
        window.close_others(
        )  # close all the other windows to keep browser nice and tidy!

    # ^^^ hemisphere_image_urls is list of dicts of img_url and title of hemisphere
    scrape_rsult["hemispheres"] = hemisphere_image_urls

    return scrape_rsult
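# Hedged debugging sketch (not part of the original notebook export): run the
# scraper directly and pretty-print the result; default=str covers any values
# that are not JSON-serializable.
if __name__ == "__main__":
    import json

    print(json.dumps(scrape(), indent=2, default=str))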
Exemple #38
0
class UserTest(StaticLiveServerTestCase):
    def setUp(self):
        check_permissions()
        self.username = '******'
        create_user(self.username)

        self.browser = Browser()
        self.browser.visit(self.live_server_url)

    def test_signup(self):
        signup_url = settings.SIGNUP_URL
        self.browser.click_link_by_partial_href(signup_url)

        username = '******'
        password = '******'
        email = '*****@*****.**'
        signup(
            self.browser,
            username,
            password,
            email,
        )

        user_exists = exists_user(username)
        self.assertTrue(user_exists)

        user = get_user(username)
        self.assertEquals(user.username, username)
        #self.assertEquals(user.password, password)
        self.assertEquals(user.email, email)

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, '@{}'.format(username))

        #        import time; time.sleep(3)
        self.browser.quit()

    def test_signin(self):
        login_url = settings.LOGIN_URL
        self.browser.click_link_by_partial_href(login_url)

        username = self.username
        password = self.username
        login(
            self.browser,
            username,
            password,
        )

        document_list_url = \
            self.live_server_url + reverse('documents.views.list_documents')
        self.assertEquals(self.browser.url, document_list_url)

        profile_xpath = '/html/body/div/div[1]/div/ul[2]/li[4]/a'
        profile_link = self.browser.find_by_xpath(profile_xpath)
        self.assertEquals(profile_link.value, '@{}'.format(username))

        #        import time; time.sleep(3)
        self.browser.quit()