def mars_Images():
    """Scrape JPL's "featured image" page and return {"featured_image_url": url}.

    Opens a visible Chrome browser via splinter, clicks through to the image
    detail page, and extracts the href of the full-resolution image.

    Fixes vs. original: removed the duplicated second-browser setup (the
    original created ``browser2`` and visited ``url2`` twice back to back),
    added the missing local ``time`` import, and quit the second browser so
    it is not leaked.
    """
    import time
    from splinter import Browser
    from bs4 import BeautifulSoup

    image_dict = {}

    # First pass: open the search page and find the detail-page link
    # (detail hrefs contain "PIA").
    browser = Browser('chrome', headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(3)  # let the page's JS render before clicking
    browser.click_link_by_id("full_image")
    elem = browser.find_link_by_partial_href("PIA")
    image_url = elem['href']
    browser.quit()

    # Second pass: open the detail page once and grab the image-file href.
    browser2 = Browser('chrome', headless=False)
    browser2.visit(image_url)
    elem = browser2.find_link_by_partial_href("/spaceimages/images")
    featured_image_url = elem['href']
    browser2.quit()

    image_dict["featured_image_url"] = featured_image_url
    return image_dict
def setHtml(roomDay, config):
    """Fill in and submit the room-booking form at the module-level ``htmlDir`` URL.

    ``roomDay`` supplies the start month and day; ``config`` supplies the
    room, description text, and start/end times.
    """
    browser = Browser(driver_name='chrome')
    browser.visit(htmlDir)
    # Populate each form control from the booking parameters.
    browser.select('selLocation', config['room'])
    browser.fill('textDescription', config['text'])
    browser.select('selStartMonth', roomDay.month)
    browser.select('selStartDay', roomDay.day)
    browser.select('selStartTime', config['startTime'])
    browser.select('selEndTime', config['endTime'])
    # Submit the completed form.
    browser.click_link_by_id('submit')
class StudentSplinterTestCase(ChannelsLiveServerTestCase):
    """Live-server test case that logs a student into a running workshop.

    Loads the workshop fixtures, opens a headless Chrome via splinter, and
    authenticates against workshop #1 in ``setUp``.
    """

    fixtures = ['workshops', "languages", "problems", "problem_tests"]

    def setUp(self):
        """Start workshop #1 and submit its join code through the auth form."""
        self.browser = Browser('chrome', headless=True)
        auth_url = f'{self.live_server_url}{reverse("workshop_auth")}'
        self.browser.visit(auth_url)
        # The workshop must be started before its code is accepted.
        self.workshop = Workshop.objects.get(pk=1)
        self.workshop.start()
        self.browser.fill('code', self.workshop.code)
        self.browser.click_link_by_id("submit-id-submit")

    def tearDown(self):
        """Close the browser opened in setUp."""
        self.browser.quit()
def ebay_kleinanzeigen(login_name, login_pw, title, pic_path, description, price, plz, street, company, phone):
    """Post a classified ad on ebay-kleinanzeigen.de end to end.

    Logs in, navigates the category tree to the parts ("teile") category,
    fills the ad form, uploads a picture through the native Windows
    "Öffnen" (Open) file dialog via pywinauto, fills the contact details,
    and submits. Windows-only due to the pywinauto dialog handling.
    """
    url = "https://www.ebay-kleinanzeigen.de/p-anzeige-aufgeben.html#?path=210/306/teile&isParent=false"
    browser = Browser('chrome')
    browser.driver.set_window_size(1200, 900)
    browser.visit(url)
    # Log in with the supplied credentials.
    browser.fill('loginMail', login_name)
    browser.fill('password', login_pw)
    browser.click_link_by_id("login-submit")
    # Walk the category tree: 210 -> 306 -> "teile", then confirm.
    browser.find_by_id("cat_210").click()
    browser.find_by_id("cat_306").click()
    browser.find_by_id("cat_teile").click()
    browser.find_by_css('.button').first.click()
    # Fill the ad form fields.
    browser.fill('title', title)
    browser.fill('description', description)
    browser.fill('priceAmount', price)
    browser.find_by_id("priceType2").click()
    # Scroll so the picture-upload control is in view before clicking it.
    browser.driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight/4);")
    browser.find_by_id('pictureupload-pickfiles').click()
    time.sleep(2)  # wait for the native file dialog to appear
    # Enumerate windows titled "Öffnen"; the process id is parsed from the
    # element description, but only the title-based connect below is used.
    apps = pywinauto.findwindows.find_elements(title_re='Öffnen')
    for app in apps:
        print(app)
        prozess = re.search('.+#([0-9]+)', str(app))
        prozess = int(prozess.group(1))
        print(prozess)
    app = pywinauto.Application().connect(title='Öffnen')
    # app = pywinauto.Application().connect(process=prozess)
    window = app.Dialog
    window.Wait('ready')
    # Type the picture path into the dialog's edit box and confirm.
    edit = window.Edit
    edit.ClickInput()
    edit.TypeKeys(pic_path)
    button = window.Button
    button.Click()
    time.sleep(10)  # allow the picture upload to finish
    # Contact / location details.
    browser.fill('zipCode', plz)
    browser.fill('streetName', street)
    browser.fill('contactName', company)
    browser.fill('phoneNumber', phone)
    # Scroll to the bottom so the submit button is clickable.
    browser.driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);")
    browser.find_by_id('pstad-submit').click()
    time.sleep(10)  # give the site time to process the submission
    browser.quit()
def get_url_code(auth_url, username, password, login='******'):
    """Drive an OAuth login flow and return the ``code`` query parameter.

    Visits ``auth_url``, follows the site's login link, authenticates
    through Facebook or Spotify depending on ``login``, then revisits
    ``auth_url`` and parses the authorization code out of the redirect URL.
    """
    browser = Browser(driver_name='chrome')
    browser.visit(auth_url)
    browser.click_link_by_partial_href("/en/login")
    if login == 'facebook':
        # Facebook path: follow the Facebook link and submit its form.
        browser.click_link_by_partial_href("https://www.facebook.com")
        browser.fill_form({'email': username, 'pass': password})
        browser.click_link_by_id('loginbutton')
    elif login == 'spotify':
        # Spotify path: fill credentials and press the "Log In" button.
        browser.fill_form({'username': username, 'password': password})
        browser.find_by_text('Log In')[0].click()
    # Revisit the auth URL; the redirect target now carries ?code=...
    browser.visit(auth_url)
    redirected = browser.url
    auth_code = redirected.split("?code=")[1].split('&')[0]
    browser.quit()
    return auth_code
def scrape():  # set up Browser
    """Scrape several Mars sites (NASA news, JPL featured image, weather
    tweet, facts table, USGS hemisphere links) and return one facts dict."""
    executable_path = {'executable_path': "chromedriver"}
    browser = Browser('chrome', **executable_path, headless=False)
    # Get Nasa News
    nasa_news = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_news)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find_all('li', class_="slide")
    # NOTE(review): iterating results[0] walks the *children* of the first
    # <li>, not the list of slides — confirm this matches the page markup.
    for result in results[0]:
        news_title = result.find('div',class_="content_title").text
        news_description = result.find('div',class_="article_teaser_body").text
        news_url = nasa_news + result.a['href']
    time.sleep(1)
    # Collect JPL Image
    jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_id('full_image')
    time.sleep(2)  # wait for the lightbox before following the details link
    browser.click_link_by_partial_href('/spaceimages/details')
    soup = bs(browser.html, 'html.parser')
    results = soup.find('figure', class_ = 'lede')
    base_url = browser.url[:24]  # scheme+host prefix of the current URL
    img = results.a.img['src']
    featured_img_url = base_url + img
    time.sleep(1)
    # Mars Weather
    weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather)
    html = browser.html
    soup = bs(html, 'html.parser')
    results = soup.find('div', class_="js-tweet-text-container")
    # Strip any embedded link from the tweet before reading its text;
    # best-effort: a missing <a> is silently ignored.
    try:
        results.a.decompose()
    except:
        pass
    mars_weather = results.find('p').text
    time.sleep(1)
    # Mars Facts: second table on the page, rendered to a one-line HTML string.
    space_facts = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(space_facts)[1].rename(columns = {0:'Fact',1:'Data'}).to_html(index=False).replace('\n','')
    time.sleep(1)
    # Mars Hemispheres
    hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Find list of image tags
    base_url = browser.url[:29]
    results = soup.find_all('div',attrs={'class':'collapsible results'})[0]
    images = results.find_all('div')[:]
    # Iterate through the tags, collect hrefs, navigate to each page and
    # collect the full image link.
    hemisphere_image_urls = []
    for image in range(0,len(images)):
        # NOTE(review): the even-index filter presumably skips interleaved
        # divs (`image == 0` is already covered by `image % 2 == 0`) —
        # confirm against the page markup.
        if image == 0 or image % 2 == 0:
            url = base_url+images[image].a['href']
            title = (images[image].h3.text)
            browser.visit(url)
            time.sleep(1)
            soup = bs(browser.html,'html.parser')
            results = soup.find_all('ul')[0]
            result = results.find_all('li')[0]
            hemi_url = (result.a['href'])
            hemisphere_image_urls.append({'title':title, 'img_url':hemi_url})
    facts = {'news_title':news_title,
             'news_description':news_description,
             'news_url':news_url,
             'featured_img_url':featured_img_url,
             'mars_weather':mars_weather,
             'mars_facts':mars_facts,
             'hemi_img_url':hemisphere_image_urls
             }
    # NOTE(review): purpose of this final image visit is unclear; the
    # browser is also never quit.
    browser.visit('https://i.pinimg.com/originals/49/78/3e/49783e18b9ac11c560362029ba1f3328.jpg')
    return facts
def scrape():
    """Scrape NASA news, the JPL featured image, a Mars weather tweet, the
    space-facts table, and USGS hemisphere images; return them as one dict."""
    # --- Latest NASA news (plain requests, no browser needed) ---
    url = 'https://mars.nasa.gov/news/'
    response = req.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find("div", class_="content_title").text
    description = soup.find("div", class_="rollover_description_inner").text
    # --- JPL featured image (needs a real browser to click through) ---
    browser = Browser('chrome', headless=False)
    img_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(img_url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    browser.click_link_by_id('full_image')
    browser.click_link_by_partial_text('more info')
    image_html = browser.html
    soup2 = BeautifulSoup(image_html, 'html.parser')
    main_img_url = soup2.find('img', class_='main_image')
    split_img_url = main_img_url.get('src')  # site-relative path
    featured_image_url = "https://www.jpl.nasa.gov" + split_img_url
    # --- Latest Mars weather tweet ---
    mars_twitter = "https://twitter.com/marswxreport?lang=en"
    browser.visit(mars_twitter)
    html = browser.html
    twitter_soup = BeautifulSoup(html, 'html.parser')
    mars_tweet = twitter_soup.find('div', class_="js-tweet-text-container")
    mars_weather = mars_tweet.find('p', 'tweet-text').get_text()
    # --- Mars facts table via pandas ---
    facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(facts_url)
    mars_df = tables[0]
    mars_df.columns = ['Mars Facts', 'Mars Data']
    mars_df.set_index('Mars Facts', inplace=True)
    html_table = mars_df.to_html()
    mars_df.to_html('table.html')  # side effect: writes table.html to CWD
    # --- USGS hemisphere images (requests only; follow each item link to
    # its download page for the full-resolution URL) ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = BeautifulSoup(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="itemLink product-item")
    # Collect {"Title", "Image_Url"} dicts, one per hemisphere.
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        img_title = hemi_img.find('h3').text
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = BeautifulSoup(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })
    # NOTE(review): the browser opened above is never quit.
    mars_data = {
        "News_Title": title,
        "Paragraph_Text": description,
        "Most_Recent_Mars_Image": featured_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }
    return mars_data
def scrape():
    """Scrape NASA Mars news, the JPL featured image, the space-facts table,
    and USGS hemisphere images; return everything in one dict.

    Fixes vs. original:
    - ``NASA_Mars_News_soup`` was referenced before ever being created
      (guaranteed NameError); the soup is now built from the visited page.
    - ``Mars_Facts_df.to_html('Mars_Facts_table.html')`` returns ``None``
      when a path is passed, so the returned ``mars_facts_html`` was always
      ``None``; the HTML string and the file write are now separate calls.
    """
    # Initialize Splinter for Windows Users
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --------NASA Mars News ---------
    NASA_News_url = 'https://mars.nasa.gov/news/'
    browser.visit(NASA_News_url)
    # Parse the visited page (missing in the original, which read
    # NASA_Mars_News_soup without defining it).
    NASA_Mars_News_soup = bs(browser.html, 'html.parser')
    # This was noted as not consistently returning the right element;
    # depends on live page markup.
    news_title = NASA_Mars_News_soup.find('div', class_='content_title').text
    news_p = NASA_Mars_News_soup.find('div', class_='article_teaser_body').text

    # ---------JPL Mars Space Images - Featured Image----------
    JPL_Mars_Images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(JPL_Mars_Images_url)
    time.sleep(5)  # page renders via JS; delays make the clicks reliable
    browser.click_link_by_id('full_image')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    # Parse the detail page and build the absolute image URL.
    JPL_Mars_Images_soup = bs(browser.html, 'html.parser')
    figure = JPL_Mars_Images_soup.find('figure')
    relative_mars_image_url = figure.find('a')['href']
    mars_image_url = f'https://www.jpl.nasa.gov{relative_mars_image_url}'

    # -------------Mars Facts ---------------------
    # First table on the page; index on the description column.
    Mars_Facts_df = pd.read_html('https://space-facts.com/mars/')[0]
    Mars_Facts_df.columns = ['description', 'value']
    Mars_Facts_df.set_index('description', inplace=True)
    Mars_Facts = Mars_Facts_df.to_html()
    Mars_Facts_df.to_html('Mars_Facts_table.html')  # side effect: file in CWD

    # --------------- Mars Hemispheres ------------------------
    Mars_Hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(Mars_Hemisphere_url)
    Mars_Hemisphere_soup = bs(browser.html, 'html.parser')
    hemisphere_items = Mars_Hemisphere_soup.find_all('div', class_='item')
    hemisphere_image_urls = []
    hemispheres_main_url = 'https://astrogeology.usgs.gov'
    # Visit each hemisphere's detail page and collect its full-size image.
    for i in hemisphere_items:
        hemisphere_title = i.find('h3').text
        partial_img_url = i.find('a', class_='itemLink product-item')['href']
        browser.visit(hemispheres_main_url + partial_img_url)
        partial_img_soup = bs(browser.html, 'html.parser')
        complete_img_url = hemispheres_main_url + partial_img_soup.find(
            'img', class_='wide-image')['src']
        hemisphere_image_urls.append({
            "title": hemisphere_title,
            "img_url": complete_img_url
        })

    mars_information = {
        'news_title': news_title,
        'news_paragraph': news_p,
        'mars_image_url': mars_image_url,
        'mars_facts_html': Mars_Facts,
        'mars_hemispheres': hemisphere_image_urls
    }
    return mars_information
def scrape():
    """Scrape Mars news, the JPL featured image, a facts table, and the four
    USGS hemisphere "Sample" image links; return everything in one dict.

    Fix vs. original: the returned "Hemisphere_urls" list contained
    ``hemis_url`` (the search-results page URL) instead of the scraped
    ``cerberus_hemis_url``, which was computed and printed but never used.
    """
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # --- Latest NASA news headline and teaser ---
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    # Wait (up to 10s) for the news list to render before parsing.
    if browser.is_element_present_by_tag('li', wait_time=10):
        soup = BeautifulSoup(browser.html, 'html.parser')
    soup = BeautifulSoup(browser.html, 'html.parser')
    section = soup.find('li', class_="slide")
    news_title = section.find('div', class_="content_title").text
    news_head = section.find('div', class_='article_teaser_body').text

    # --- JPL featured image ---
    images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(images_url)
    if browser.is_element_present_by_id('full_image', wait_time=10):
        soup = BeautifulSoup(browser.html, 'html.parser')
    browser.click_link_by_id("full_image")
    time.sleep(2)  # allow the lightbox to open before the next click
    browser.click_link_by_partial_text("more info")
    soup = BeautifulSoup(browser.html, 'html.parser')
    image_tags = soup.select_one('figure.lede a img').get("src")
    featured_image_url = "https://www.jpl.nasa.gov" + image_tags

    # --- Mars facts (second table on the page) ---
    facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(facts_url)
    df = tables[1]
    html_table = df.to_html()
    df.to_html('table.html')  # side effect: writes table.html to CWD

    # --- Hemisphere "Sample" image links ---
    hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemis_url)
    browser.click_link_by_partial_text("Cerberus")
    soup2 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title = soup2.select_one("div.content h2").text
    cerberus_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title}: {cerberus_hemis_url}")
    # NOTE(review): the following clicks assume the other hemisphere links
    # remain reachable from the current page without navigating back —
    # confirm against the live site.
    browser.click_link_by_partial_text("Schiaparelli")
    soup3 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_2 = soup3.select_one("div.content h2").text
    schiaparelli_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_2}: {schiaparelli_hemis_url}")
    browser.click_link_by_partial_text("Syrtis")
    soup4 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_3 = soup4.select_one("div.content h2").text
    syrtis_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_3}: {syrtis_hemis_url}")
    browser.click_link_by_partial_text("Valles")
    soup5 = BeautifulSoup(browser.html, 'html.parser')
    hemis_title_4 = soup5.select_one("div.content h2").text
    marineris_hemis_url = browser.find_by_text("Sample")["href"]
    print(f"{hemis_title_4}: {marineris_hemis_url}")

    return_dict = {
        "news_title": news_title,
        "news_head": news_head,
        "featured_img": featured_image_url,
        "table": html_table,
        "Hemisphere_urls": [
            cerberus_hemis_url,  # fixed: was hemis_url (the search page)
            schiaparelli_hemis_url,
            syrtis_hemis_url,
            marineris_hemis_url
        ]
    }
    browser.quit()
    return return_dict
def mars_scrape():
    """Scrape Mars news, the JPL featured image, the facts table, and the
    four USGS hemisphere sample links; return them in one dict."""
    mars = {}
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    # --- Latest news title and teaser paragraph ---
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    sleep(1)  # give the JS-rendered page a moment to load
    ourwebpage = browser.html
    soup = bs(ourwebpage, 'html.parser')
    x = soup.body.find_all(class_="content_title")
    # Skip the first match (x[1:]) and keep only the first remaining title.
    alltitle = []
    for i in x[1:]:
        alltitle.append(i.find('a').text.strip())
    alltitle = alltitle[0]
    mars['title'] = alltitle
    paragraph = soup.body.find_all(class_="article_teaser_body")
    news_p = []
    for i in paragraph:
        news_p.append(i.text)
    news_p = news_p[0]  # keep only the newest teaser
    mars['news_paragraph'] = news_p
    # --- JPL featured image: click through to the "more info" page ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    sleep(1)
    browser.click_link_by_id('full_image')
    z = browser.find_link_by_partial_text("more info")
    z.click()
    sleep(1)
    imgwebpage = browser.html
    soup2 = bs(imgwebpage, 'html.parser')
    image_path = soup2.find(class_="main_image")['src']  # site-relative
    image_full_path = "https://www.jpl.nasa.gov" + image_path
    mars["feature_img"] = image_full_path
    # --- Facts table (first table on the page, as a DataFrame) ---
    mars_table = pd.read_html("https://space-facts.com/mars/")[0]
    mars_table.rename(columns={0: "Category", 1: "Value"}, inplace=True)
    mars["mars_table"] = mars_table
    # --- Hemisphere sample links: click each item, grab "Sample", go back ---
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    sleep(1)
    image = []
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        sleep(1)
        html_image = browser.html
        soupitem = bs(html_image, 'html.parser')
        zz = soupitem.find('a', text="Sample")
        image.append(zz['href'])
        browser.back()
    # NOTE(review): titles are hard-coded by position; this assumes the
    # result page always lists the hemispheres in this exact order — verify.
    hemisphere_image_urls = [
        { "title": "Valles Marineris Hemisphere", "img_url": image[0] },
        { "title": "Cerberus Hemisphere", "img_url": image[1] },
        { "title": "Schiaparelli Hemisphere", "img_url": image[2] },
        { "title": "Syrtis Major Hemisphere", "img_url": image[3] },
    ]
    mars["mars_image"] = hemisphere_image_urls
    return mars
browser = Browser('chrome', **executable_path) ##chrome #browser = Browser(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 11_1 like Mac OS X) AppleWebKit/604.2.8 (KHTML, like Gecko) Version/11.0 Mobile/15B57 Safari/604.1", **executable_path) ## iPhone #browser = Browser('firefox', **executable_path) ##firefox #Test Case 1 print("=========================================================") print("Running Test Case 1: Upload photo for Classification") browser.visit('http://54.191.193.7:5000/') print("Visiting browser...") time.sleep(2) element = browser.driver.find_element_by_id("imageFile") pathToImage = os.path.abspath("static/testing/Capture5.JPG") element.send_keys(pathToImage) print("Image chosen...") time.sleep(2) browser.click_link_by_id('submit') print("Image submitted for classification...") time.sleep(2) assert browser.is_text_present('Image Uploaded') == True print("=========================================================") #Test Case 2 print("Running Test Case 2: Upload nothing and try to submit for classification") browser.visit('http://54.191.193.7:5000/') print("Visiting browser...") time.sleep(2) browser.click_link_by_id('submit') print("Submit button pressed...") time.sleep(2) assert browser.is_text_present('Please choose an image!') == True print("=========================================================")
def scrape():
    """Scrape Mars news, the JPL featured image, a weather tweet, the facts
    table, and the four hemisphere sample images; return one dict.

    Exported from a notebook — the ``# In[..]:`` markers are the original
    cell boundaries.
    """
    import pandas as pd
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import requests
    import re
    import nbconvert
    import time
    # In[76]:
    # News listing page, scraped with plain requests.
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    # In[77]:
    # Store the news title.
    news_title = soup.find('div',class_='content_title').text
    # In[78]:
    # NOTE(review): the article URL is hard-coded to one specific story, so
    # news_p will not track the latest headline — verify this is intended.
    p_url = "https://mars.nasa.gov/news/8719/nasa-invites-public-to-share-excitement-of-mars-2020-perseverance-rover-launch/"
    response = requests.get(p_url)
    soup = bs(response.text,'lxml')
    # In[79]:
    # Store the first body paragraph (third <p> on the page) in news_p.
    results = soup.find_all('p')
    paragraphs = []
    for result in results:
        paragraphs.append(result)
    news_p = paragraphs[2].text
    # In[80]:
    # Activate splinter (Windows chromedriver path).
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    # In[35]:
    splinter_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(splinter_url)
    # In[36]:
    browser.click_link_by_id('full_image')
    # In[37]:
    # Navigate to "more info" to find the full size image.
    browser.click_link_by_partial_text('more info')
    # In[38]:
    featured_image_url = browser.find_by_css('.main_image')[0]['src']
    browser.quit()
    # In[60]:
    # Fresh browser for the Twitter scrape.
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    twitter_url = 'https://twitter.com/MarsWxReport'
    browser.visit(twitter_url)
    time.sleep(5)  # let the JS-heavy page render
    # In[61]:
    html = browser.html
    soup = bs(html,'html.parser')
    # In[63]:
    # NOTE(review): the tweet text is picked by a hard-coded index [27] into
    # spans with an auto-generated CSS class — extremely brittle; verify.
    mars_weather = soup.find_all('span',class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")[27].text
    browser.quit()
    # In[1]:
    # Pandas scrape of the Mars facts table.
    url = 'https://space-facts.com/mars/'
    # In[7]:
    tables = pd.read_html(url)
    tables[0]
    # In[ ]:
    mars_facts = tables[0].to_html()
    # In[65]:
    # Fresh browser for the hemisphere image pages.
    executable_path = {'executable_path': r'C:\Users\nvora\AppData\Roaming\chromedriver_win32\chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_images_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_images_url)
    # In[67]:
    # For each hemisphere: click its heading, grab the "Sample" link and the
    # title, then go back to the listing.
    browser.find_by_css('h3')[0].click()
    cerberus_image = browser.find_by_text('Sample')['href']
    cerberus_title = browser.find_by_css('.title').text
    browser.back()
    # In[68]:
    browser.find_by_css('h3')[1].click()
    schiaparelli_image = browser.find_by_text('Sample')['href']
    schiaparelli_title = browser.find_by_css('.title').text
    browser.back()
    # In[69]:
    browser.find_by_css('h3')[2].click()
    syrtis_major_image = browser.find_by_text('Sample')['href']
    syrtis_major_title = browser.find_by_css('.title').text
    browser.back()
    # In[70]:
    browser.find_by_css('h3')[3].click()
    valles_marineris_image = browser.find_by_text('Sample')['href']
    valles_marineris_title = browser.find_by_css('.title').text
    browser.quit()
    # In[71]:
    # Collect the four hemisphere entries.
    hemisphere_images = [
        {"title": cerberus_title, "img_url": cerberus_image},
        {"title": schiaparelli_title, "img_url": schiaparelli_image},
        {"title": syrtis_major_title, "img_url": syrtis_major_image},
        {"title": valles_marineris_title, "img_url": valles_marineris_image}
    ]
    mars_dict = {'mars_news':news_title,'news_summary':news_p,'featured_mars_image':featured_image_url,'mars_weather':mars_weather,'mars_facts':mars_facts,'mars_hemispheres':hemisphere_images}
    return mars_dict
def scrape():
    """Scrape Mars news, the JPL featured image, a weather tweet, hemisphere
    images, and the facts table; return one dict.

    Exported from a notebook — the ``# In[..]:`` markers are the original
    cell boundaries; each site gets its own browser session.
    """
    # Visit Nasa URL through splinter and parse HTML with beautiful soup
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[3]:
    # Collect the latest news title and paragraph text.
    news = soup.find("div", class_="list_text")
    mars_title = news.find("div", class_="content_title").get_text()
    mars_p = news.find("div", class_="article_teaser_body").get_text()
    print(mars_title)
    print(mars_p)
    # In[4]:
    browser.quit()
    # ### JPL Mars Space Images - Featured Image
    # In[5]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[6]:
    # Navigate the site to display the full size .jpg image.
    browser.click_link_by_id('full_image')
    # In[7]:
    # NOTE(review): the trailing space in 'more info ' is matched against the
    # live page's link text — do not "fix" it without checking the site.
    button = 'more info '
    time.sleep(3)
    browser.find_by_text(button).click()
    # In[8]:
    # Retrieve the URL string for the full-size image.
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    url = soup.find("img", class_="main_image")['src']
    featured_url = 'https://www.jpl.nasa.gov' + url
    featured_url
    # In[9]:
    browser.quit()
    # ### Mars Weather
    # In[10]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[11]:
    # Scrape the latest Mars weather tweet from the page.
    mars_weather = soup.find(
        "p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_weather
    # In[12]:
    # Replace newlines with commas for a single-line weather string.
    mars_weather = ",".join(mars_weather.split("\n"))
    mars_weather
    # In[13]:
    browser.quit()
    # ### Mars Hemispheres
    # In[14]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # In[15]:
    items = soup.find_all('div', class_='description')
    # Collect {"title", "img_url"} dicts, one per hemisphere.
    hemisphere_image_urls = []
    hemispheres_main_url = 'https://astrogeology.usgs.gov'
    for i in items:
        title = i.find('h3').text
        # Link that leads to the full-image website.
        partial_img_url = i.find('a', class_='itemLink product-item')['href']
        browser.visit(hemispheres_main_url + partial_img_url)
        partial_img_html = browser.html
        soup = BeautifulSoup(partial_img_html, 'html.parser')
        # Full image source url.
        img_url = hemispheres_main_url + soup.find('img', class_='wide-image')['src']
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    hemisphere_image_urls
    # In[16]:
    browser.quit()
    # ### Mars Facts
    # In[24]:
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    # In[25]:
    # Use pandas read_html to scrape the tabular data; keep only the first
    # value column and flatten the HTML to one line.
    mars_factsDf = pd.read_html(url)
    mars_fact = mars_factsDf[0]
    mars_fact.columns = ['Description', 'Value', 'Value2']
    mars_fact.set_index('Description', inplace=True)
    mars_fact = mars_fact.iloc[:, 0:1]
    mars_facts = mars_fact.to_html()
    mars_facts = mars_facts.replace('\n', '')
    mars_fact.to_html('table1.html')  # side effect: writes table1.html to CWD
    mars_fact
    # In[20]:
    browser.quit()
    # In[27]:
    # Assemble everything scraped above into a single dict.
    mars_data = {
        "news_title": mars_title,
        "news_p": mars_p,
        "featured_url": featured_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    return mars_data
def scrape_info():
    """Scrape Mars data (news, JPL featured image, weather tweet, facts
    table, hemisphere images) and return it as one dictionary.

    Returns:
        dict: keys news_title0..2, description0..2, JPL_link,
        weather_tweet, facts_table, title1..4, img_url1..4.

    Side effects: opens a visible Chrome window (splinter) and prints
    progress messages.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars News: latest three titles and teaser paragraphs ---
    source = requests.get('https://mars.nasa.gov/news/').text
    soup = bs(source, 'html.parser')
    article = soup.find_all('div', class_='content_title')
    news_title0 = article[0].a.text
    news_title1 = article[1].a.text
    news_title2 = article[2].a.text
    description = soup.find_all('div', class_="rollover_description_inner")
    news_p0 = description[0].text
    news_p1 = description[1].text
    news_p2 = description[2].text

    # --- JPL featured space image ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    try:
        browser.click_link_by_id('full_image')
    except Exception:  # narrowed from bare except
        browser.click_link_by_partial_text('FULL IMAGE')
    else:
        print("Scraping Full Image Complete")

    # BUG FIX: url2/f1/f2 were assigned only inside the try/except bodies,
    # so a failure before the assignment raised NameError later.
    url2 = None
    f1 = None
    f2 = None
    check = 0
    try:
        links_found = browser.find_link_by_partial_href('spaceimages/details')
        url2 = links_found[0]["href"]
        browser.click_link_by_partial_text('more info')
        links_found2 = browser.find_link_by_partial_href(
            'spaceimages/images/largesize')
        f1 = links_found2[0]["href"]
        check = 1
    except Exception:
        # Fallback: navigate straight to the detail page, but only if we
        # actually captured its URL before the failure.
        if url2 is not None:
            browser.visit(url2)
            links_found3 = browser.find_link_by_partial_href(
                'spaceimages/images/largesize')
            f2 = links_found3[0]["href"]
    else:
        print("Scraping More Info Complete")
    featured_image_url = f1 if check == 1 else f2

    # --- Mars weather: first tweet text on the report account page ---
    source3 = requests.get('https://twitter.com/marswxreport?lang=en').text
    soup = bs(source3, 'html.parser')
    tweets = soup.find_all(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    mars_weather = tweets[0].text

    # --- Mars facts table rendered as single-line HTML ---
    facts = pd.read_html("https://space-facts.com/mars/")
    mars_facts_df = facts[1]
    mars_facts_df.columns = ['Description', 'Value']
    mars_facts_df.set_index("Description", inplace=True)
    mars_facts_table = mars_facts_df.to_html()
    mars_facts_table = mars_facts_table.replace('\n', '')

    # --- Mars hemispheres: title from <h3>, link from "Original" anchor ---
    hemi = ("https://astrogeology.usgs.gov/search/results"
            "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(hemi)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemi_class = soup.find_all('h3')
    cerberus_title = hemi_class[0].text
    schiaparelli_title = hemi_class[1].text
    syrtis_title = hemi_class[2].text
    valles_title = hemi_class[3].text

    # Get Cerberus information.
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    link1 = browser.find_link_by_partial_text('Original')
    cerberus_link = link1[0]["href"] + "/full.jpg"
    browser.back()
    # Get Schiaparelli information.
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    link2 = browser.find_link_by_partial_text('Original')
    schiaparelli_link = link2[0]["href"] + "/full.jpg"
    browser.back()
    # Get Syrtis Major information.
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    link3 = browser.find_link_by_partial_text('Original')
    syrtis_link = link3[0]["href"] + "/full.jpg"
    browser.back()
    # Get Valles Marineris information.
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    link4 = browser.find_link_by_partial_text('Original')
    valles_link = link4[0]["href"] + "/full.jpg"
    browser.back()

    marsdata = {
        "news_title0": news_title0, "description0": news_p0,
        "news_title1": news_title1, "description1": news_p1,
        "news_title2": news_title2, "description2": news_p2,
        "JPL_link": featured_image_url,
        "weather_tweet": mars_weather,
        "facts_table": mars_facts_table,
        "title1": cerberus_title, "img_url1": cerberus_link,
        "title2": schiaparelli_title, "img_url2": schiaparelli_link,
        "title3": syrtis_title, "img_url3": syrtis_link,
        "title4": valles_title, "img_url4": valles_link
    }
    # Close the browser after scraping.
    browser.quit()
    return marsdata
class tickets(object):
    """Automated 12306.cn train-ticket grabber driven by splinter.

    Configure the class attributes below, then call start(). login()
    blocks until the captcha has been solved manually in the browser.
    """
    # Account user name and password.
    username = None
    passwd = None
    # Departure and destination stations (12306 cookie-encoded values).
    starts = None
    ends = None
    # Travel date, format 2019-02-20.
    dtime = None
    # Train number.
    order = None
    # Passenger name as it appears in the 12306 passenger list.
    passenger = None
    # Seat type: index into __seatTypeList (1 = hard seat, 3 = hard sleeper).
    seatType = None
    # Whether this is a student ticket.
    isStudent = False
    # Seconds to wait between availability queries.
    refresh_period = 5
    # Seat-type element-id prefixes on the query page; indices 1/3 map to
    # hard seat ("YZ_") / hard sleeper ("YW_").
    __seatTypeList=[None,"YZ_",None,"YW_"]
    """网址"""
    # 12306 URLs: ticket query page, login page, account home, order page.
    ticket_url = "https://kyfw.12306.cn/otn/leftTicket/init"
    login_url = "https://kyfw.12306.cn/otn/resources/login.html"
    initmy_url = "https://kyfw.12306.cn/otn/view/index.html"
    buy_url="https://kyfw.12306.cn/otn/confirmPassenger/initDc"

    def __init__(self):
        # Browser backend used by splinter.
        self.driver_name='firefox'

    def login(self):
        """Fill in the login form, then poll until the captcha has been
        solved manually and 12306 redirects to the account home page."""
        self.driver.visit(self.login_url)
        sleep(1)
        # Switch to the account/password tab and fill in the credentials.
        self.driver.find_by_text("账号登录")[0].click()
        self.driver.find_by_id("J-userName")[0].fill(self.username)
        self.driver.find_by_id("J-password")[0].fill(self.passwd)
        # "Waiting for captcha, enter it yourself..."
        print("等待验证码,自行输入...")
        while True:
            # Poll once per second until the post-login home page loads.
            if self.driver.url != self.initmy_url:
                sleep(1)
            else:
                break

    def start(self):
        """Open the browser, log in, poll the query page until the
        configured train has a seat, then pre-fill the order form."""
        # Abort early if any required parameter is missing.
        if not self.username or not self.passwd or not self.starts or not self.ends \
            or not self.dtime or not self.order or not self.passenger or not self.seatType:
            # "Please initialize the parameters username, passwd, ..."
            print("请初始化参数username,passwd,starts,ends,dtime,order,passenger,seatType...")
            return
        # Open the browser.
        self.driver=Browser(driver_name=self.driver_name)
        # Log in (waits for manual captcha entry).
        self.login()
        # Begin ticket grabbing on the query page.
        self.driver.visit(self.ticket_url)
        print("购票页面开始...")  # "ticket page started..."
        # Pre-set the query by writing 12306's saved-search cookies
        # instead of filling in the form.
        self.driver.cookies.add({"_jc_save_fromStation": self.starts})
        self.driver.cookies.add({"_jc_save_toStation": self.ends})
        self.driver.cookies.add({"_jc_save_fromDate": self.dtime})
        # Reload so the cookie-driven query takes effect.
        self.driver.reload()
        # Keep querying until a successful booking click navigates away.
        while self.driver.url==self.ticket_url:
            # Click the query button each cycle.
            try:
                sleep(self.refresh_period)
                if self.driver.is_element_present_by_id("query_ticket",5):
                    self.driver.click_link_by_id("query_ticket")
                if self.driver.is_element_present_by_id("ticket_"+self.order,3):
                    train_info=self.driver.find_by_id("ticket_"+self.order)
                    # Availability cell for the requested seat type.
                    train_seat=train_info.find_by_id(self.__seatTypeList[self.seatType]+self.order)[0]
                    # "无" / "--" mean sold out / not offered on this train.
                    if train_seat.text!="无" and train_seat.text!="--":
                        print("有票,准备预订...")  # "tickets available, booking..."
                        train_info.find_by_text("预订")[0].click()
                    else:
                        self.driver.reload()
            # Any scraping hiccup: reload and retry on the next cycle.
            except:
                self.driver.reload()
        print('开始选择用户...')  # "selecting passenger..."
        # Wait for the passenger list to load.
        if self.driver.is_element_present_by_id("normal_passenger_id",5):
            if self.driver.is_element_present_by_text(self.passenger,5):
                psg_list=self.driver.find_by_id("normal_passenger_id")
                psg_list.find_by_text(self.passenger)[0].click()
                # Student tickets pop up an extra confirmation dialog.
                if "学生" in self.passenger:
                    self.driver.is_element_present_by_id("dialog_xsertcj_ok",5)
                    self.driver.find_by_id("dialog_xsertcj_ok")[0].click()
                print("开始选座...")  # "selecting seat..."
                # 3: hard sleeper, 1: hard seat.
                self.driver.is_element_present_by_id("seatType_1",5)
                # splinter's select() requires a name attribute, but 12306
                # only sets ids on the options, so emulate select() by
                # clicking the <option> located via xpath.
                self.driver.find_by_xpath(
                    '//select[@id="%s"]//option[@value="%s"]' % ("seatType_1",str(self.seatType))
                ).first._element.click()
                print('提交订单...')  # "submitting order..."
                self.driver.is_element_present_by_id("submitOrder_id",5)
                #self.driver.find_by_id('submitOrder_id').click()
                #self.driver.is_element_present_by_id("qr_submit_id",5)
                #self.driver.find_by_id('qr_submit_id').click()
            else:
                # "Cannot find an element with text ..., exiting..."
                print('找不到text为\"%s\"的元素,程序即将退出...'%self.passenger)
# for newPos in boardPositions: # boardLayout = boardLayout.replace(newPos, pos[len(pos)-2:]) # for ph in placeholders: # boardLayout = boardLayout.replace(ph, " ") # print(boardLayout) entryURL = "https://www.chess.com/login" secondaryURL = "https://www.chess.com/tactics" browser = Browser() browser.visit(entryURL) browser.driver.set_window_position(0, 0, windowHandle='current') browser.driver.set_window_size(1920, 1080, windowHandle='current') browser.find_by_id("username").first.fill(username) browser.find_by_id("password").first.fill(password) browser.click_link_by_id("login") browser.visit(secondaryURL) startBtn = browser.driver.find_element_by_css_selector( "#sidebar .tactics-sidebar .btn-primary.btn-start") startBtn.click() styles = [] locations = [] lastMove = [] lastCapture = "" time.sleep(3) elements = browser.driver.find_element_by_id( "chess_com_tactics_board_boardarea").find_elements_by_tag_name("img") lastMovePositions = browser.driver.find_element_by_id( "chess_com_tactics_board_boardarea").find_elements_by_tag_name("div") index = lastMovePositions[len(lastMovePositions) - 1].get_attribute("style").find("translate(")
def scrape():
    """Scrape Mars news, the JPL featured image, the latest weather tweet,
    the facts table and the four hemisphere images, then show a
    completion popup.

    Returns:
        dict: scraped values keyed by news_title, news_p, feat_img,
        weather, mfacts (a DataFrame) and hemi_img (list of dicts).

    Side effects: opens Chrome windows, writes ../html/mars_facts.html,
    and shows a Windows MessageBox (requires Windows / ctypes.windll).
    """
    # Dependencies, kept function-local as in the original notebook export.
    from splinter import Browser
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import time
    import ctypes  # An included library with Python install.

    def Mbox(title, text, style):
        # Thin wrapper around the Win32 MessageBox (Windows only).
        return ctypes.windll.user32.MessageBoxW(0, text, title, style)

    mars_data_dict = {}

    ## (1) NASA Mars News: latest title and paragraph text.
    url_nz = 'https://mars.nasa.gov/news/'
    response_nz = requests.get(url_nz)
    soup_nz = BeautifulSoup(response_nz.text, 'lxml')
    # The slices strip surrounding whitespace/newline characters.
    news_title = soup_nz.find("div", class_="content_title").a.text[1:-1]
    news_p = soup_nz.find(
        "div", class_="image_and_description_container").a.text[3:-7]
    mars_data_dict["news_title"] = news_title
    mars_data_dict["news_p"] = news_p

    ## (2) JPL featured image: click through to the large-size image and
    ## record the resulting URL.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/details.")
    time.sleep(2)
    browser.click_link_by_partial_href("/spaceimages/images/largesize")
    time.sleep(2)
    featured_image_url = browser.url
    mars_data_dict["feat_img"] = featured_image_url
    browser.quit()

    ## (3) Mars weather: first tweet whose text starts with "InSight sol".
    url_tweet = 'https://twitter.com/marswxreport?lang=en'
    response_tweet = requests.get(url_tweet)
    soup_tweet = BeautifulSoup(response_tweet.text, 'lxml')
    tweets = soup_tweet.find_all(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text")
    # BUG FIX: mars_weather raised NameError when no tweet matched;
    # default to an empty string.
    mars_weather = ""
    for tweet in tweets:
        find_text = tweet.text.find("InSight sol")
        if find_text == 0:
            mars_weather = tweet.text
            break
    mars_data_dict["weather"] = mars_weather

    ## (4) Mars facts table, also written to an HTML file.
    url_mfacts = 'https://space-facts.com/mars/'
    tables = pd.read_html(url_mfacts)[1]
    mars_data_dict["mfacts"] = tables
    tables.to_html("../html/mars_facts.html")

    ## (5) Hemispheres: click each result, open the "Sample" image in its
    ## popup window and record that window's URL. The four copy-pasted
    ## blocks of the original are folded into one loop (same order).
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    url_mhemi = ("https://astrogeology.usgs.gov/search/results"
                 "?q=hemisphere+enhanced&k1=target&v1=Mars")
    browser.visit(url_mhemi)
    time.sleep(2)
    hemisphere_image_urls = []
    for name in ("Cerberus Hemisphere Enhanced",
                 "Schiaparelli Hemisphere Enhanced",
                 "Syrtis Major Hemisphere Enhanced",
                 "Valles Marineris Hemisphere Enhanced"):
        browser.click_link_by_partial_text(name)
        time.sleep(2)
        title = browser.title.split("|")[0]
        browser.click_link_by_text("Sample")  # opens in a second window
        time.sleep(2)
        img_url = browser.windows[1].url
        time.sleep(2)
        browser.windows[1].close()
        browser.back()
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
    mars_data_dict["hemi_img"] = hemisphere_image_urls
    browser.quit()
    Mbox("Mission to Mars Completed",
         "Congratulations!!! You've mined Mars!", 1)
    # BUG FIX: the dictionary was built but never returned (the original
    # ended with a bare `mars_data_dict` expression, a no-op).
    return mars_data_dict
def scrape_info():
    """Scrape Mars news, the JPL featured image, the weather tweet, the
    facts table and hemisphere images; return everything in one dict.

    Returns:
        dict with keys news_title, news_paragraph, jpl_image_absolute,
        weather_tweet, "df_mars_space_facts.to_html" (HTML string) and
        USGS_images.

    Side effects: opens and closes several Chrome windows and prints
    each scraped value.
    """
    # BUG FIX: the original created this browser, immediately replaced it
    # and never quit it, leaking a Chrome process.
    browser = init_browser()
    browser.quit()

    # URLs of pages to be scraped.
    url_nasa_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    url_jpl_images = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    url_twitter_weather = 'https://twitter.com/marswxreport?lang=en'
    url_sf_facts = 'https://space-facts.com/mars/'
    # The real USGS hemisphere search page returned a 404; a mirror is
    # used instead.
    url_USGS_hemispheres = 'http://www.labellelube.com/mars.html'

    # --- NASA Mars News ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_nasa_news)
    time.sleep(1)
    html = browser.html
    soup_news = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    news_title = soup_news.find("div", class_="content_title").text
    news_paragraph = soup_news.find("div", class_="article_teaser_body").text
    print(news_title)
    print(news_paragraph)
    browser.quit()
    time.sleep(1)

    # --- JPL featured image ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_jpl_images)
    time.sleep(1)
    browser.click_link_by_id('full_image')
    time.sleep(1)
    browser.click_link_by_partial_text('more info')
    time.sleep(1)
    html = browser.html
    soup_jpl = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    # The <figure class="lede"> anchor holds the site-relative image path.
    jpl_image = soup_jpl.find('figure', class_='lede').a['href']
    url_root = "https://www.jpl.nasa.gov/"
    jpl_image_absolute = url_root + jpl_image
    print(jpl_image_absolute)
    browser.quit()
    time.sleep(1)

    # --- Mars weather tweet ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_twitter_weather)
    time.sleep(1)
    html = browser.html
    soup_jpl = BeautifulSoup(html, "html.parser")
    time.sleep(1)
    tweets = soup_jpl.find_all('div', class_='js-tweet-text-container')
    time.sleep(1)
    # BUG FIX: weather_tweet could be unbound if no tweet was found.
    weather_tweet = ""
    for tweet in tweets:
        weather_tweet = tweet.find('p').text
        # BUG FIX: the original tested `'Sol' and 'pressure' in ...`,
        # which ignores 'Sol' (a non-empty literal is always truthy);
        # require both substrings as clearly intended.
        if 'Sol' in weather_tweet and 'pressure' in weather_tweet:
            print(weather_tweet)
            break
    browser.quit()
    time.sleep(1)

    # --- Mars facts table ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_sf_facts)
    time.sleep(1)
    # Table index 1 is the Mars-only table (index 0 compares with Earth).
    df_mars_facts = pd.read_html(url_sf_facts)
    df_mars_space_facts = df_mars_facts[1]
    df_mars_space_facts.columns = ['Description', 'Value']
    df_mars_space_facts.set_index('Description', inplace=True)
    print(df_mars_space_facts.to_html())
    browser.quit()

    # --- Mars hemispheres (mirror site) ---
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url_USGS_hemispheres)
    time.sleep(2)
    html = browser.html
    soup_USGS = BeautifulSoup(html, "html.parser")
    items = soup_USGS.find_all('div', class_='item')
    USGS_images = []
    url_root = 'http://www.labellelube.com/'
    for i in items:
        title = i.find('h2').text
        image = i.find('img')['src']
        USGS_images.append({"title": title, "link": url_root + image})
    browser.quit()

    # Store data in a dictionary.
    mars_data = {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "jpl_image_absolute": jpl_image_absolute,
        "weather_tweet": weather_tweet,
        # BUG FIX: the original stored the bound method object
        # (df_mars_space_facts.to_html) instead of calling it; store the
        # rendered HTML string under the same key.
        "df_mars_space_facts.to_html": df_mars_space_facts.to_html(),
        "USGS_images": USGS_images
    }
    return mars_data
def scrape_info():
    """Scrape Mars news, featured image, weather, facts table and all
    hemisphere images into a single dictionary.

    Returns:
        dict with keys NewsTitle, NewsDescription, FeaturedImage,
        WeatherTweet, MarsTable, HemisphereImages (list of
        {"title", "img_url"}), plus ImageTitle/ImageURL kept for
        backward compatibility (the last hemisphere scraped).
    """
    # BUG FIX: the original created this browser, immediately replaced it
    # and never quit it, leaking a Chrome process.
    browser = init_browser()
    browser.quit()
    mars_info = {}
    browser = Browser("chrome", headless=False)

    # --- NASA Mars news: latest title and teaser paragraph ---
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    mars_info["NewsTitle"] = news_title
    mars_info["NewsDescription"] = news_p

    # --- JPL Mars Space Images - featured image ---
    base_url = "https://www.jpl.nasa.gov"
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(1)
    browser.click_link_by_id('full_image')
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    featured_image_url1 = soup.find('img', class_="fancybox-image")
    # src is site-relative; prefix the JPL host for a complete URL.
    featured_image_url = base_url + featured_image_url1['src']
    print(featured_image_url)
    mars_info["FeaturedImage"] = featured_image_url

    # --- Mars weather from Twitter ---
    url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    mars_info["WeatherTweet"] = mars_weather

    # --- Mars facts table, rendered to HTML ---
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    tables = pd.read_html(html)
    mars_info["MarsTable"] = tables[0].to_html()

    # --- Mars hemispheres: click each description link, grab the
    # full-resolution image URL and title ---
    base_url1 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(base_url1)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    link_lists = soup.find_all("div", class_="description")
    imagelist = []
    for link_list in link_lists:
        linktext = link_list.h3.text
        browser.click_link_by_partial_text(linktext)
        time.sleep(1)
        soup2 = BeautifulSoup(browser.html, "html.parser")
        # The full-size image opens in a new tab (target="_blank").
        image_url1 = soup2.find('a', target="_blank")
        img_url = image_url1['href']
        title = soup2.find('h2', class_="title").get_text()
        imagelist.append({"title": title, "img_url": img_url})
        browser.back()
        time.sleep(1)
    # BUG FIX: the collected hemisphere list was built but never stored —
    # only the last title/URL survived. Keep the old keys for
    # backward compatibility and add the full list.
    mars_info["HemisphereImages"] = imagelist
    mars_info["ImageTitle"] = title
    mars_info["ImageURL"] = img_url
    # BUG FIX: the browser was never closed.
    browser.quit()
    return (mars_info)
# In[4]: executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # In[5]: url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'lxml') # In[6]: #browser.click_link_by_partial_href('/spaceimages/images/mediumsize/PIA17932_ip.jpg') browser.click_link_by_id('full_image') # In[7]: html = browser.html soup = BeautifulSoup(html, 'lxml') featured_image = soup.find('img', class_='fancybox-image') featured_image_url = 'https://www.jpl.nasa.gov' + featured_image['src'] browser.quit() print(featured_image_url) # ### Mars Weather # In[8]: # Retrieve page with the requests module
def scrape():
    """Scrape Mars news, the JPL featured image, the weather tweet, the
    facts table and hemisphere image URLs into a single dictionary.

    Returns:
        dict with keys news_title, news_para, featured_img_title,
        featured_img_url, weather, html_table, hemispheres.

    Side effects: opens a Chrome window (splinter) and prints progress.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    print("SAM after setting browser")
    # Store all the scraped data in a dictionary.
    mars_dictionary = {}

    # --- NASA Mars news (parsed from a static requests fetch) ---
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    # Get the title.
    container = soup.find('div', class_="content_title")
    news_title = container.a.text
    # Get the paragraph description.
    container = soup.find('div', class_="image_and_description_container")
    text_tot = container.find('div', class_="rollover_description_inner")
    news_p = text_tot.text
    print("title: ", news_title)
    print("paragraph: ", news_p)
    mars_dictionary["news_title"] = news_title
    mars_dictionary["news_para"] = news_p

    # --- JPL featured image. BUG FIX: the original opened a second
    # Chrome instance here without quitting the first (process leak);
    # reuse the already-open browser instead. ---
    jpl_url = 'https://www.jpl.nasa.gov'
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Retrieve the article with the featured image and its title.
    article = soup.find('article', class_='carousel_item')
    h1 = article.find('h1', class_='media_feature_title').text
    print(h1)
    browser.click_link_by_id("full_image")
    # Then click "more info" to get to the full-size image page.
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img_url = soup.find('img', class_="main_image")['src']
    featured_image_url = jpl_url + img_url
    print(featured_image_url)
    mars_dictionary["featured_img_title"] = h1
    mars_dictionary["featured_img_url"] = featured_image_url
    # BUG FIX: the browser was never quit.
    browser.quit()

    # --- Mars weather tweet ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    streams = soup.find_all('div', class_="tweet")
    # BUG FIX: mars_weather could be unbound if no tweet matched.
    mars_weather = ""
    for tweet in streams:
        mars_weather = tweet.find('p').text
        # BUG FIX: the original condition `'Sol' and 'pressure' in ...`
        # ignored 'Sol' (a non-empty literal is always truthy); require
        # both substrings as clearly intended.
        if 'Sol' in mars_weather and 'pressure' in mars_weather:
            print(mars_weather)
            break
    mars_dictionary["weather"] = mars_weather

    # --- Mars facts table ---
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    facts_df = pd.DataFrame(tables[0])
    # Change the column headers (positional rename 0 -> Type, 1 -> Value).
    header = pd.Series(["Type", "Value"])
    facts_df.rename(columns=header, inplace=True)
    # BUG FIX: set_index() returns a new frame; the original discarded
    # the result, so the index was never set.
    facts_df = facts_df.set_index('Type')
    html_table = facts_df.to_html()
    # BUG FIX: str.replace returns a new string; the original discarded
    # the result, leaving the newlines in place.
    html_table = html_table.replace('\n', '')
    mars_dictionary["html_table"] = html_table

    # --- USGS hemisphere images ---
    base_url = "https://astrogeology.usgs.gov"
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    hemispheres = soup.find_all('a', class_="itemLink")
    print(hemispheres)
    hemisphere_image_urls = []
    for item in hemispheres:
        # Title and relative link from the search-result anchor.
        title = item.text
        link = item['href']
        full_link = base_url + link
        # Follow each hemisphere page to find the full-resolution image
        # inside the div class='downloads' block.
        response = requests.get(full_link)
        soup = BeautifulSoup(response.text, 'html.parser')
        high_res = soup.find('div', class_='downloads')
        full_href = high_res.find('a')['href']
        hemisphere_image_urls.append({"title": title, "img_url": full_href})
    mars_dictionary["hemispheres"] = hemisphere_image_urls
    print(mars_dictionary)
    return mars_dictionary
def scraping():
    """Scrape location metrics for the module-level `Address`: driving
    miles to the Domain and downtown Austin, median household income,
    population, year built and walk/transit scores, then redirect to the
    ML page.

    Results are published through module-level globals
    (miles_from_domain, miles_from_downtown, median_household_income,
    population, built_year, walk_score, transit_score).

    NOTE(review): a Google API key and CoStar credentials are hard-coded
    below — move them to configuration/secret storage.
    NOTE(review): `Address`, `req` (requests), `redirect`, `time`,
    `BeautifulSoup` and `Browser` are assumed to be defined or imported
    elsewhere in this file — confirm.
    """
    # Scrape miles from Domain and Downtown via the Distance Matrix API.
    payload = {
        "units": "imperial",
        "origins": Address,
        "destinations": "11410 Century Oaks Terrace, Austin, TX 78758|1100 Congress Ave, Austin, TX 78701",
        "key": "AIzaSyCQhKXIlYN6TQ3MHT4lujpN0lXAyB1Tvyo"
    }
    response = req.get(
        "https://maps.googleapis.com/maps/api/distancematrix/json",
        params=payload).json()
    global miles_from_domain
    global miles_from_downtown
    miles_from_domain = response['rows'][0]['elements'][0]['distance']['text']
    miles_from_downtown = response['rows'][0]['elements'][1]['distance'][
        'text']
    # Keep only the numeric part of e.g. "3.2 mi".
    miles_from_domain = miles_from_domain.split(' ')[0]
    miles_from_downtown = miles_from_downtown.split(' ')[0]

    # Scrape demographic data.
    # URL of page to be scraped
    url_income = "http://www.energyjustice.net/justice/index.php"
    url_population = "https://www.freemaptools.com/find-population.htm"
    # retrive page with the requests module
    response = req.get(url_income)
    # create beautifulsoup object; parse with 'html.parser'
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    global median_household_income
    browser.visit(url_income)
    browser.fill('gsLocation', Address)
    browser.find_by_name('gsSubmit_desk').first.click()
    time.sleep(5)
    browser.find_by_id('income_layer_desk').first.click()
    time.sleep(5)
    map_soup = BeautifulSoup(browser.html, 'html.parser')
    # presumably the cell reads like "$52,431 ..." — the splits strip the
    # trailing text, the "$" and the thousands comma. TODO confirm format.
    income = map_soup.find_all('table')[1].find_all('td')[1].text
    income = income.split(' ')[0]
    income = income.split('$')[1]
    income = income.split(',')[0] + income.split(',')[1]
    median_household_income = float(income)
    global population
    browser.visit(url_population)
    time.sleep(5)
    # 1.61 km is a one-mile radius.
    browser.fill('radiusinputkm', '1.61')
    time.sleep(5)
    browser.find_by_id('tb_searchlocation').fill(Address)
    # browser.find_by_id('tb_searchlocation').fill('\n')
    time.sleep(5)
    browser.find_by_tag('p')[3].click()
    population = browser.find_by_id('div_output').text
    population = population.split(' ')[-1]
    browser.quit()

    # Scrape the built_year from CoStar.
    # set the chromedriver path
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    costar_url = "https://costar.com/"
    browser.visit(costar_url)
    # login — NOTE(review): hard-coded credentials; move to secrets.
    browser.click_link_by_id('loginLink')
    browser.fill('username', 'SDong2')
    browser.fill('password', '719111719111')
    browser.click_link_by_id('loginButton')
    time.sleep(20)
    try:
        x_path = '//*[@id="cs-gateway-home-page"]/div[2]/div[1]/div/div/div[2]/div/div[1]/input'
        search_box = browser.find_by_xpath(x_path)
        search_box.fill(Address)
        # Pick the first autocomplete suggestion.
        search_button = browser.find_by_xpath(
            '//*[@id="react-autowhatever-1--item-0"]/div/span[1]')
        search_button.click()
        time.sleep(20)
        global built_year
        built_year = browser.find_by_xpath(
            '//*[@id="Building_YearBuilt"]/span[2]').text
        # presumably values longer than 4 chars include a prefix before
        # the year (e.g. "Built 1997") — TODO confirm.
        if len(built_year) > 4:
            built_year = built_year.split(' ')[1]
            built_year = float(built_year)
        else:
            built_year = float(built_year)
    except:
        # Fall back to a default year when the CoStar lookup fails.
        built_year = 1997
    browser.quit()

    # Scrape walk scores.
    # set the chromedriver path
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    walk_score_url = 'https://www.walkscore.com/'
    browser.visit(walk_score_url)
    search_entry = browser.find_by_id('gs-street')
    search_entry.fill(Address)
    time.sleep(10)
    browser.find_by_css('.go-btn')[1].click()
    time.sleep(10)
    html = browser.html
    score_soup = BeautifulSoup(html, 'html.parser')
    scores = []
    # Score badges are images whose file name encodes the score value.
    for i in range(1, len(score_soup.find_all('img'))):
        try:
            score_path = score_soup.find_all('img')[i]['src']
            split = score_path.split('/')[-1]
            score = split.split('.')[0]
            scores.append(score)
        except:
            print("no src")
            break
    global walk_score
    global transit_score
    # presumably the last three badges are walk/transit/bike — TODO confirm.
    walk_score = scores[-3]
    transit_score = scores[-2]
    browser.quit()
    # redirect to the machine learning page
    return redirect('/ml')
def scrape():
    """Scrape Mars news, featured image, weather tweet and facts table.

    Returns a dict with keys: news_title, news_paragraph, featured_image,
    weather, mars_facts_table. Also downloads the featured image into
    Resources/. Requires a local chromedriver at C:/chromedriver/.
    """
    # dependencies (function-local imports; some — splinter, webdriver, os,
    # pymongo, json — appear unused in this body)
    from bs4 import BeautifulSoup as bs
    import splinter
    import requests
    from splinter import Browser
    import time
    import pandas as pd
    from selenium import webdriver
    import os
    import pymongo
    import json

    # The dictionary accumulating all scraped results
    mars_facts_data = {}

    # 1 — latest NASA Mars news headline + teaser paragraph
    # emulate the browser and get the html
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    # url to visit
    url = 'https://mars.nasa.gov/news/'
    # We need the browser (not requests) because many elements only exist
    # after the page's JavaScript has run; requests would get raw HTML only.
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_p = soup.select_one("div.rollover_description_inner")
    news_title = soup.select_one("div.content_title")
    news_p = news_p.text
    news_title = news_title.text
    mars_facts_data['news_title'] = news_title
    mars_facts_data['news_paragraph'] = news_p

    # 2 — JPL featured image: click through to the full-size image URL
    executable_path = {'executable_path': 'C:/chromedriver/chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_id('full_image')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)
    time.sleep(3)
    browser.click_link_by_partial_href('/spaceimages/images/')
    # Download the image and store it under Resources/ using the URL basename
    response = requests.get(browser.url)
    if response.status_code == 200:
        linkname = (browser.url.rsplit('/', 1)[-1])
        SaveFile = (f'Resources/{linkname}')
        with open(SaveFile, 'wb') as f:
            f.write(response.content)
    print(browser.url)
    Space_image_dict = {}
    Space_image_dict['Url'] = browser.url
    mars_facts_data['featured_image'] = browser.url
    # collection.insert_one(Space_image_dict)

    # 3 — latest Mars weather tweet (CSS class is Twitter's legacy markup;
    # fragile against Twitter redesigns)
    mars_weather_dict = {}
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    mars_weather = soup.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text
    mars_weather = mars_weather.strip()
    mars_facts_data['weather'] = mars_weather
    mars_facts_data  # no-op bare expression (notebook remnant)
    # collection.insert_one(mars_weather_dict)

    # 4 — Mars facts table, scraped via pandas and re-emitted as HTML
    url = 'https://space-facts.com/mars/'
    df = pd.read_html(url)
    # df = pd.DataFrame(df)
    df = df[0]
    df.columns = ['Category', 'Measure']
    df.set_index('Category', inplace=True)
    mars_html_table = df.to_html()
    mars_html_table = mars_html_table.replace("\n", "")
    mars_facts_data['mars_facts_table'] = mars_html_table
    return mars_facts_data
class ChopeBrowser:
    """Automates the NTU PC/facility booking site via a splinter Chrome browser.

    Depends on module-level names: Browser (splinter), BeautifulSoup, time,
    pyautogui.

    Fixes vs. previous revision:
      * click_next passed a spurious `counter` argument to check_facility
        (TypeError at runtime).
      * is_registered lacked `self` and was unusable as an instance method;
        it is now a @staticmethod.
      * book_pc used a bare `except:`; narrowed to `except Exception`.
    """

    def __init__(self, headless=False):
        # One Chrome session shared by all methods.
        self.chrome = Browser('chrome', headless=headless)

    def time_delay(self, time):
        # Sleep hack: waiting for a name that never exists blocks for
        # `wait_time` seconds. (Parameter `time` shadows the time module,
        # which is fine here since the module isn't used in this method.)
        self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!',
                                               wait_time=time)

    def login(self, usr, pwd, domain='STUDENT'):
        """Sign in to the booking portal under the given domain."""
        url = 'https://ntupcb.ntu.edu.sg'
        url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs'
        self.chrome.visit(url)
        # Select the login domain from the <option> list.
        dropdown = self.chrome.find_by_tag('option')
        for option in dropdown:
            if option.text == domain:
                option.click()
        self.chrome.fill('Username', usr)
        # Trailing newline submits the form.
        self.chrome.fill('Password', pwd + '\n')

    # PC BOOKING STARTS HERE
    def pc_setup(self, usr, pwd, Type):
        """Log in and try to book the first free PC of the given type.

        Returns (pc_name_or_'no pc', 'booked'|'cannot book').
        """
        self.login(usr, pwd)
        button = self.chrome.find_by_id('tdPcBook')
        button.click()
        time.sleep(2)
        with self.chrome.get_iframe('frmAdminViewControls') as iframe:
            iframe.find_by_id('pnlInsLoc3').click()
        self.type_number(Type)
        data = self.scrape_pc()
        can_book = self.book_pc(data[1], data[2])
        self.chrome.quit()
        return data[0], can_book

    def type_number(self, Types):
        """Click the PC-group panel whose label matches `Types`.

        Returns None on success, 0 if no group matched (legacy contract).
        """
        for i in range(0, 4):
            with self.chrome.get_iframe('frmAdminViewControls') as iframe:
                page = iframe.find_by_id('pnlInsPcGrp' + str(i))
                if page != []:
                    label = BeautifulSoup(page.html, "lxml").find(
                        "span",
                        {"style": "display:inline-block;height:20px;width:80px;"}
                    ).get_text()
                    if label == Types:
                        iframe.find_by_id('pnlInsPcGrp' + str(i)).click()
                        return
        return 0

    def scrape_pc(self):
        """Scan the seating grid for the first available (white) PC.

        Returns (pc_name, col, row); ('no pc', 100, 100) when none is free.
        """
        with self.chrome.get_iframe('frmSeating') as iframe:
            for row in range(0, 6):
                for col in range(1, 11):
                    cell = iframe.find_by_id(
                        'grdSeating_tblCol' + str(col) + '_' + str(row))
                    if cell == []:
                        # Ran off the end of the grid — nothing free.
                        return 'no pc', 100, 100
                    # White background marks an available seat.
                    if self.color(cell.html) == '#FFFFFF':
                        return self.name_pc(cell.html), col, row
            return 'no pc', 100, 100

    def name_pc(self, codes):
        """Extract the PC's display name from a seating-cell HTML snippet."""
        soup = BeautifulSoup(codes, "lxml")
        mydivs = soup.findAll("span", {"class": "lblPcName"})
        return mydivs[0].get_text()

    def color(self, code):
        """Return '#FFFFFF' if the cell is available, else a dummy value."""
        soup = BeautifulSoup(code, "lxml")
        tag = soup.findAll('td', {"style": "background-color: #FFFFFF"})
        if tag != []:
            return '#FFFFFF'
        else:
            return 'blabla'

    def book_pc(self, col, row):
        """Click the selected seat and submit; return 'booked' or 'cannot book'."""
        with self.chrome.get_iframe('frmSeating') as iframe:
            # (100, 100) is the sentinel for "no PC found".
            if (col != 100) and (row != 100):
                try:
                    time.sleep(1)
                    butt = iframe.find_by_id("grdSeating_divOuterCol" +
                                             str(col) + "_" + str(row))
                    if butt != []:
                        butt.click()
                        time.sleep(1)
                        sub = iframe.find_by_name("btnsumit")
                        sub.click()
                        return "booked"
                except Exception:
                    # Site sometimes pops a native dialog — dismiss it.
                    pyautogui.press('enter')
                    return "cannot book"
            return "cannot book"

    def first_setup(self):
        """Navigate from the landing page to the facility-booking table."""
        button = self.chrome.find_by_id('tdFacilityBook')
        button.click()
        self.chrome.click_link_by_href('#8')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69')
        self.chrome.click_link_by_id('book')
        self.chrome.click_link_by_id('changeResource')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_id('book')

    # Eliminates unnecessary booking slots
    @staticmethod
    def is_registered(event):
        """True for a real booked slot (not a placeholder/current-event cell)."""
        if event.has_class('noShowWhite'):
            return False
        if event.has_class('currentEvent'):
            return False
        return True

    def check_facility(self, evFacilities):
        """Append one week of booked slots for the current facility.

        evFacilities gains a list of days, each day a list of
        [start, end] pairs split on the em-dash in the event text.
        """
        columnWeek = self.chrome.find_by_css('.wc-event-column')
        evWeek = []
        for columnDay in columnWeek:
            evToday = []
            evList = columnDay.find_by_css('.ui-corner-all')
            for event in evList:
                if not event.has_class('noShowWhite'):
                    if not event.has_class('currentEvent'):
                        event = event.text
                        # Keep only entries shaped "start—end".
                        if not event.find('—') == -1:
                            if event == '':
                                continue
                            evToday.append(event.split('—'))
            evWeek.append(evToday)
        evFacilities.append(evWeek)

    def click_next(self, counter, evFacilities):
        # Recursively check facilities, choosing one by `counter`.
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        if counter < len(options):
            nextOption = options[counter]
            nextOption.click()
            # FIX: check_facility takes only the accumulator list; the old
            # call passed `counter` as well and raised TypeError.
            self.check_facility(evFacilities)
        else:
            return evFacilities

    # Scrape seats main function
    # OPTIMIZE: by multithreading
    # and by running multiple browsers at once
    def scrape_seats(self, usr, pwd):
        """Log in and collect weekly bookings for every facility option.

        Returns a flat list alternating facility names and their week data.
        """
        self.login(usr, pwd)
        self.first_setup()
        evFacilities = []
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        optRange = range(len(options))
        for i in optRange:
            opt = options[i]
            nextOption = opt
            nextOption.click()
            self.time_delay(0.2)  # give the calendar time to refresh
            evFacilities.append(opt.text)
            self.check_facility(evFacilities)
        self.quit()
        return evFacilities

    def quit(self):
        """Close the underlying Chrome session."""
        self.chrome.quit()
def scrape():
    """Notebook-exported scraper: collects Mars news, featured image, weather,
    facts table and hemisphere images; inserts the combined dict into the
    mars_db.mars_data Mongo collection and returns it.
    """
    # Dependencies
    import time
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from splinter import Browser
    # from selenium.webdriver.common import action_chains, keys
    # from selenium import webdriver
    import pymongo

    # Local MongoDB; the target collection is dropped and rebuilt each run.
    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_data = db.mars_data
    db.mars_data.drop()

    # having issues with browser, use webdriver instead
    # driver = webdriver.Chrome()
    # url = 'https://mars.nasa.gov/news/'
    # driver.get(url)
    # html = driver.page_source
    # soup = BeautifulSoup(html, 'lxml')

    # --- NASA Mars news: latest title and teaser paragraph -----------------
    browser = Browser('chrome', headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(5)  # fixed waits throughout; fragile if the page is slow
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    print(soup.prettify())
    # Extract news title text
    title = soup.find('div', class_='bottom_gradient').text
    print(title)
    # Extract paragraph text
    paragraph = soup.find('div', class_='rollover_description_inner').text
    print(paragraph)

    # --- JPL featured space image ------------------------------------------
    # Click through 'full image' -> 'more info', then grab the <img> src.
    from splinter import Browser
    # img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    # executable_path = {'executable_path': './chromedriver'}
    # browser = Browser('chrome', **executable_path)
    # browser.visit(img_url)
    browser = Browser('chrome', headless=False)
    img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    browser.visit(img_url)
    time.sleep(5)
    browser.click_link_by_id('full_image')
    time.sleep(5)
    browser.find_link_by_partial_text('more info').click()
    # time.sleep(5)
    # browser.find_link_by_partial_text('.jpg').click()
    time.sleep(5)
    # NOTE(review): positional index [6] into all <img> tags is layout-
    # dependent and will break if the page markup changes.
    featured_image_url = browser.find_by_tag('img')[6]['src']
    featured_image_url  # no-op bare expression (notebook remnant)

    # --- Mars weather: latest tweet from @MarsWxReport ---------------------
    from splinter import Browser
    browser = Browser('chrome', headless=False)
    tw_acct_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tw_acct_url)
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    # print(soup.prettify())
    container = soup.find('div', class_='js-tweet-text-container')
    container  # no-op (notebook remnant)
    # Legacy Twitter CSS classes — fragile against redesigns.
    mars_weather = container.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_weather  # no-op (notebook remnant)

    # --- Mars facts table via pandas ---------------------------------------
    marsfacts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(marsfacts_url)
    tables  # no-op (notebook remnant)
    df = tables[0]
    df
    df = df.rename(columns={0: 'Measurement', 1: 'Value'})
    df = df.set_index('Measurement')
    df
    # convert table to html string
    html_table = df.to_html()
    html_table
    # strip unwanted newlines to clean up the table
    html_table = html_table.replace('\n', '')
    html_table

    # --- Mars hemispheres: title + full-res image URL for each of the 4 ----
    from splinter import Browser
    browser = Browser('chrome', headless=False)
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    time.sleep(5)
    # NOTE(review): each hemisphere uses the positional link index [41] for
    # the image URL — extremely layout-dependent; verify against the site.
    browser.find_by_css('h3')[0].click()
    img1_url = browser.find_by_tag('a')[41]['href']
    print(img1_url)
    img1_title = browser.find_by_css('h2')[0].text
    img1_title = img1_title.replace(' Enhanced', '')
    print(img1_title)

    browser.back()
    browser.find_by_css('h3')[1].click()
    img2_url = browser.find_by_tag('a')[41]['href']
    print(img2_url)
    img2_title = browser.find_by_css('h2')[0].text
    img2_title = img2_title.replace(' Enhanced', '')
    print(img2_title)

    browser.back()
    browser.find_by_css('h3')[2].click()
    img3_url = browser.find_by_tag('a')[41]['href']
    print(img3_url)
    img3_title = browser.find_by_css('h2')[0].text
    img3_title = img3_title.replace(' Enhanced', '')
    print(img3_title)

    browser.back()
    browser.find_by_css('h3')[3].click()
    img4_url = browser.find_by_tag('a')[41]['href']
    print(img4_url)
    img4_title = browser.find_by_css('h2')[0].text
    img4_title = img4_title.replace(' Enhanced', '')
    print(img4_title)

    # Use a Python dictionary to store the data using the keys img_url and title.
    hemisphere_img_dict = [
        {
            "title": img1_title,
            "img_url": img1_url
        },
        {
            "title": img2_title,
            "img_url": img2_url
        },
        {
            "title": img3_title,
            "img_url": img3_url
        },
        {
            "title": img4_title,
            "img_url": img4_url
        },
    ]

    # Persist everything and hand the same payload back to the caller.
    data_outputs = {
        'title': title,
        'paragraph': paragraph,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'html_table': html_table,
        'hemisphere_img_dict': hemisphere_img_dict
    }
    # NOTE(review): Collection.insert is deprecated in pymongo 3.x in favor
    # of insert_one.
    mars_data.insert(data_outputs)
    return data_outputs
browser.find_by_id('firstheader') browser.find_by_value('query') # get element first_found = browser.find_by_name('name').first last_found = browser.find_by_name('name').last second_found = browser.find_by_name('name')[1] # Get value of an element browser.find_by_css('h1').first.value # Clicking links,return the first link browser.click_link_by_href('http://www.the_site.com/my_link') browser.click_link_by_partial_href('my_link') browser.click_link_by_text('my link') browser.click_link_by_partial_text('part of link text') browser.click_link_by_id('link_id') # element is visible or invisible browser.find_by_css('h1').first.visible #fill content browser.find_by_id('productName').fill( 'splinter - python acceptance testing for web applications') browser.fill('q', 'splinter - python acceptance testing for web applications') # Verifying if element has a className browser.find_by_css('.content').first.has_class('content') # click button browser.find_by_name('send').first.click() browser.find_link_by_text('my link').first.click()
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres, and
    insert each piece into its own collection of the module-level Mongo `db`.

    Relies on module-level names not visible here: Browser, bs
    (BeautifulSoup), time, pd (pandas), db (pymongo database).
    """
    # --- NASA Mars news ----------------------------------------------------
    url = 'https://mars.nasa.gov/news/'
    browser = Browser('chrome')
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    # title of first article
    title = soup.find('div', class_='content_title')
    time.sleep(2)
    news_title = title.text
    print(news_title)
    # paragraph text of first article
    para = soup.find('div', class_='article_teaser_body')
    time.sleep(2)
    new_para = para.text
    print(new_para)
    news = {'title': news_title, 'paragraph': new_para}

    # --- JPL featured image: open the fancybox, read the <img> src ---------
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&q=Mars'
    browser.visit(url2)
    time.sleep(2)
    browser.click_link_by_id("full_image")
    time.sleep(2)
    html2 = browser.html
    soup2 = bs(html2, 'lxml')
    # soup2
    imagediv = soup2.find('img', class_="fancybox-image")['src']
    imagediv  # no-op bare expression (notebook remnant)
    featured_image_url = 'https://www.jpl.nasa.gov' + imagediv
    print(featured_image_url)
    imageurl = {'featured_image': featured_image_url}

    # --- Mars weather tweet (legacy Twitter CSS classes; fragile) ----------
    url3 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url3)
    html3 = browser.html
    soup3 = bs(html3, 'lxml')
    weather = soup3.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text')
    mars_weather = weather.text
    print(mars_weather)
    marsweather = {'weather': mars_weather}

    # --- Mars facts table via pandas ---------------------------------------
    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    html4 = browser.html
    soup4 = bs(html4, 'lxml')
    tables = pd.read_html(html4)
    info_table = tables[0]
    mars_info = {'table': info_table.to_html()}

    def retrieve_hemis():
        """Visit the USGS results page and collect {'title', 'img_url'} for
        each of the four hemisphere articles (closes over `browser`)."""
        # URL for USGS Astrogeology
        url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        # Visit URL and parse html
        browser.visit(url5)
        html5 = browser.html
        soup5 = bs(html5, 'lxml')
        # find the articles
        articles = soup5.find_all('div', class_='description')[0:4]
        # create list object to store output
        imgs = []
        # iterate over articles
        for article in articles:
            img = {}
            href = article.h3.text
            # Follow the article link by its heading text, then grab the
            # first target=_blank anchor (the full-resolution image link).
            browser.click_link_by_partial_text(href)
            html5 = browser.html
            soup5 = bs(html5, 'lxml')
            img['title'] = href
            img['img_url'] = soup5.find('a', target='_blank')['href']
            imgs.append(img)
            # restart process: back to the results page for the next article
            browser.visit(url5)
        return (imgs)

    imgs = retrieve_hemis()
    print(imgs)

    # Persist each payload to its own collection.
    # NOTE(review): Collection.insert is deprecated in pymongo 3.x in favor
    # of insert_one.
    db.news.insert(news)
    db.imageurl.insert(imageurl)
    db.marsweather.insert(marsweather)
    db.mars_hemispheres.insert_many(imgs)
    db.mars_info.insert(mars_info)
    # scrape()
def scrape_all():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Returns a dict with keys: news_title, news_paragraph, featured_image,
    featured_image_description, hemisphere_image_urls, facts, last_modified.

    Relies on module-level names not visible here: Browser, time,
    BeautifulSoup, pd (pandas), dt (datetime module).
    """
    # One browser session reused for all sites; expects chromedriver.exe on
    # the working directory / PATH.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars news ----------------------------------------------------
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', class_="list_text")
    # NOTE(review): bare excepts below swallow any failure (including
    # article being None) and substitute a retry message — deliberate
    # best-effort, but consider narrowing to AttributeError.
    try:
        title = article.find('div', class_='content_title').text.strip()
    except:
        title = '[No information returned. Click the button again.]'
    try:
        para = article.find('div', class_='article_teaser_body').text.strip()
    except:
        para = '[No information returned. Click the button again.]'

    # --- JPL featured image ------------------------------------------------
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_id('full_image')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # The fancybox anchor carries the image path in data-fancybox-href.
    featured_image_url = soup.article.a['data-fancybox-href']
    featured_image_url = (f'https://www.jpl.nasa.gov{featured_image_url}')
    descrip = soup.h1.text.strip()

    # --- Mars facts table via pandas ---------------------------------------
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['', 'Mars']
    df = df.set_index('')
    mars_facts = df.to_html(classes='table')

    # --- Mars hemispheres --------------------------------------------------
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Click the link for the large image, find title and link, add try and
    # except, store in dict format, append to list
    hemisphere_image_urls_dict = []
    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        hemi_soup = BeautifulSoup(browser.html, "html.parser")
        try:
            title_elem = hemi_soup.find("h2", class_="title").get_text()
            sample_elem = hemi_soup.find("a", text="Sample").get("href")
        except AttributeError:
            # Element not found on this page — record None rather than fail.
            title_elem = None
            sample_elem = None
        hemispheres = {
            "title": title_elem,
            "img_url": sample_elem
        }
        # Append hemisphere info
        hemisphere_image_urls_dict.append(hemispheres)
        # Finally, we navigate backwards to the results list
        browser.back()

    browser.quit()
    data = {
        'news_title': title,
        'news_paragraph': para,
        'featured_image': featured_image_url,
        'featured_image_description': descrip,
        'hemisphere_image_urls': hemisphere_image_urls_dict,
        'facts': mars_facts,
        'last_modified': dt.datetime.now()
    }
    return (data)
def scrape_all():
    """Scrape Mars news, featured image, facts table and hemisphere images.

    Returns a dict with keys: latest_title, latest_description,
    featured_image, mars_fact_table, hemispheres.

    Relies on module-level names not visible here: Browser, time,
    bs (BeautifulSoup), pd (pandas). Expects chromedriver.exe on PATH /
    working directory.

    Fix vs. previous revision: removed a dead statement that searched for
    <a class="description"> only to be immediately overwritten by the
    <div class="description"> search.
    """
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser('chrome', **executable_path, headless=True)

    # --- NASA Mars news: latest title and teaser ---------------------------
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(3)  # fixed waits throughout; fragile if pages are slow
    page = browser.html
    soup = bs(page, 'html.parser')
    # Title text and description
    results = soup.find('div', class_='image_and_description_container')
    title = results.find('div', class_='content_title')
    title_text = title.a.text
    description = results.find('div', class_='article_teaser_body').text

    # --- JPL featured image: click through to the large image --------------
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    # Large-image page HTML
    page = browser.html
    soup = bs(page, 'html.parser')
    results = soup.find('img', class_='main_image')
    image_link = results['src']
    featured_image_url = ("https://www.jpl.nasa.gov" + image_link)

    # --- Mars facts table via pandas ---------------------------------------
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)
    mars_facts = tables[0]
    mars_facts.columns = ['Facts', 'Mars']
    mars_facts.set_index('Facts', inplace=True)
    fact_table = mars_facts.to_html(classes="table table-striped")

    # --- Mars hemispheres: title + wide image for each result --------------
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(2)
    page = browser.html
    soup = bs(page, 'html.parser')
    results = soup.find_all('div', class_='description')
    hemispheres = []
    for result in results:
        # Each result's anchor points at the hemisphere detail page.
        detail_url = "https://astrogeology.usgs.gov" + result.a['href']
        browser.visit(detail_url)
        time.sleep(2)
        url_soup = bs(browser.html, 'html.parser')
        img_path = url_soup.find('img', class_="wide-image")['src']
        img_url = ("https://astrogeology.usgs.gov" + img_path)
        hemi_title = url_soup.find('h2', class_='title').text
        hemispheres.append({
            "title": hemi_title,
            "img_url": img_url
        })

    data = {
        "latest_title": title_text,
        "latest_description": description,
        "featured_image": featured_image_url,
        "mars_fact_table": fact_table,
        "hemispheres": hemispheres
    }
    browser.quit()
    return data
def scrape_mars(): # Defines the path to the chrome driver and create a browser object executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) # Defines url of site to be scraped and navigates to it url = 'https://mars.nasa.gov/news/' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'lxml') #print(soup.prettify()) time.sleep(5) # Scrapes the first news headline and description and save to variable news_title = soup.find('div', class_="bottom_gradient").h3.text news_p = soup.find('div', class_="article_teaser_body").text ### ### # ------------------------------------------------------------------------- # ### ### # executable_path = {'executable_path': 'chromedriver.exe'} # browser = Browser('chrome', **executable_path, headless=True) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) # HTML object html = browser.html soup = BeautifulSoup(html, 'html.parser') # Click the full image button image_class = soup.find('a', class_="button fancybox") full_image_click = image_class.get("id") browser.click_link_by_id(full_image_click) browser.is_element_present_by_id("fancybox-lock", wait_time=10) time.sleep(10) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') after_full_image_click = soup.body.prettify() # Clicks the more info button b = soup.body.find('div', class_="buttons") lin = b.find_all('a') more_in = lin[1].get('href') browser.links.find_by_partial_href(more_in).click() # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') # Scraping partial url for featured image and saving to variable. Then appending the partial url to a base url for a full url to the featured image. 
base_url = "jpl.nasa.gov" im_page = soup.select_one("figure.lede a img") im = im_page.get("src") im_url = base_url + im ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) time.sleep(5) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') mars_weather = soup.find( 'div', class_= "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0" ).text ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://space-facts.com/mars/' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') tables = soup.find_all('td') df = pd.read_html(url) mars_facts = df[0] mars_facts_table = mars_facts.to_html() #mars_facts_table = mars_facts_table.replace('\n', '') #pprint.pprint(mars_facts_table) ### ### # ------------------------------------------------------------------------- # ### ### url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') hemispheres = [] for i in range(4): browser.find_by_css('a.product-item h3')[i].click() # HTML object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') title = soup.find('h2', class_="title").get_text() link = soup.find("a", text="Sample").get("href") hemisphere = {"title": title, "link": link} hemispheres.append(hemisphere) browser.back() ### ### # ------------------------------------------------------------------------- # ### ### browser.quit() mars_dict = { "feat_im": im_url, "news_title": news_title, "news_desc": news_p, "weather": mars_weather, "facts": mars_facts_table, "hemishperes": hemispheres } print(mars_dict)