Example #1
def twitterMagic():
    print "Twitter Magic Time!!!"
    browser = Browser("firefox")
    browser.visit("https://twitter.com/signup")
    nameslist = grabNames()
    emaillist = grabEmails()
    passlist = grabPasses()
    userlist = grabUsers()
    # for each name in the list, fill the form in with data from the text files
    # note to self - you have to set variables to loop through and pick the next name after the first is signed up
    # STEPS!!!
    # fill name field
    # fill email
    # fill password
    # uncheck check mark
    # click signup button
    # (NEXT PAGE)
    # fill username?
    # profit
    # iterate by index so the same position is read from every list
    for i in range(len(nameslist)):
        browser.fill("full-name", nameslist[i])
        browser.fill("email", emaillist[i])
        browser.fill("password", passlist[i])
        browser.fill("username", userlist[i])
        browser.uncheck("checkbox")
        browser.find_by_name("Sign Up").first.click()
        browser.back()
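Since the four lists are consumed in lock step, `zip` can replace the index bookkeeping entirely. A minimal sketch of the same loop under the same assumptions (the `grab*` helpers and the Twitter form field names come from the example above):

from splinter import Browser

def twitterMagicZip():
    browser = Browser("firefox")
    # zip pairs the i-th entry of every list and stops at the shortest one
    for name, email, password, user in zip(grabNames(), grabEmails(), grabPasses(), grabUsers()):
        browser.visit("https://twitter.com/signup")
        browser.fill("full-name", name)  # Splinter's fill() takes the field name first, then the value
        browser.fill("email", email)
        browser.fill("password", password)
        browser.fill("username", user)
        browser.uncheck("checkbox")
        browser.find_by_name("Sign Up").first.click()
    browser.quit()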
Example #2
def splinter(url):
    #"""""""""""""""""""""""""MySQL DEF**********************************************
    conn = MySQLdb.connect(host='192.168.1.8',user='******',passwd='123123',db='gwycf')
    cursor = conn.cursor()#create cursor operate db
    #"""""""""""""""""""""""""MySQL DEF**********************************************
    data = xlrd.open_workbook('./chafen.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    print(nrows)
    
    browser = Browser('firefox')
#    browser = Browser('chrome')
    browser.visit(url)
    time.sleep(5)
    count = 0
    #<================================================>
    for i in range(nrows):
        #HaoMa = str(table.row_values(i)[1]).split(".")[0]
        name = table.row_values(i)[0]
        HaoMa = table.row_values(i)[1]
#        epost = table.row_values(i)[2]

        browser.find_by_name('TxtName').fill(name)
        browser.find_by_name('TxtHaoMa').fill(HaoMa)
        browser.find_by_id('btnSubmit').click()
        # ---------------- read the result fields from the page ----------------
        epost = browser.find_by_tag('td')[10].value
        ecode = browser.find_by_tag('td')[14].value
        xingce = browser.find_by_tag('td')[16].value
        shenlun = browser.find_by_tag('td')[18].value
        jiafen = browser.find_by_tag('td')[20].value
        zongfen = browser.find_by_tag('td')[22].value
        # -----------------------------------------------------------------------
        query = u"insert into info values('%s','%s','%s','%s','%s','%s','%s','%s',0)" % (name,HaoMa,epost,ecode,xingce,shenlun,jiafen,zongfen)
        print(count, query)
        cursor.execute(query.encode('utf-8'))  # the original data ran cleanly as GBK; now stored as UTF-8
        conn.commit()
        browser.back()
        count = count +1
    cursor.close()
    conn.commit()
    conn.close()
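Building the INSERT by string interpolation is what forces the manual `.encode('utf-8')` juggling, and it breaks as soon as a value contains a quote. A hedged sketch of the same insert using MySQLdb's parameter binding, reusing the row variables from the loop above; the `charset='utf8'` connection argument is an assumption that lets the driver handle encoding itself:

import MySQLdb

conn = MySQLdb.connect(host='192.168.1.8', user='******', passwd='123123',
                       db='gwycf', charset='utf8')
cursor = conn.cursor()
query = "INSERT INTO info VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 0)"
# the driver quotes and encodes every value itself
cursor.execute(query, (name, HaoMa, epost, ecode, xingce, shenlun, jiafen, zongfen))
conn.commit()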
Example #3
# Scrape the Daily Weather Report table
weather_table = weather_soup.find('table', class_='mb_table')
print(weather_table.prettify())

# D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

# 1. Use browser to visit the URL
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.
links = browser.find_by_css("a.product-item h3")
for i in range(len(links)):
    hemisphere = {}
    browser.find_by_css("a.product-item h3")[i].click()
    sample = browser.links.find_by_text('Sample').first
    hemisphere['img_url'] = sample['href']
    hemisphere['title'] = browser.find_by_css("h2.title").text
    hemisphere_image_urls.append(hemisphere)
    browser.back()

# 4. Print the list that holds the dictionary of each image url and title.
print(hemisphere_image_urls)

# 5. Quit the browser
browser.quit()
Example #4
def scrape():

    scraped_data = {}

    # URL of page to be scraped - Launch page first
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    # Use Beautiful Soup to parse the data
    html = browser.html
    soup = bs(html, 'html.parser')
    # Retrieve the Latest News Title and paragraph text
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description').text
    scraped_data['News_Title'] = news_title
    scraped_data['News_Paragraph'] = news_p

    # JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    # Request and parse the HTML
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(4)
    browser.click_link_by_partial_text('more info')

    # Request and parse again
    html_code = browser.html
    soup = BeautifulSoup(html_code, "html.parser")
    image = soup.find('figure', class_='lede').a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + image
    scraped_data['Featured_Img_URL'] = featured_image_url

    ## Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    time.sleep(3)
    browser.visit(url)
    # Request and parse
    html_code = browser.html
    soup = BeautifulSoup(html_code, "html.parser")
    mars_weather = soup.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    scraped_data['Mars_Weather'] = mars_weather

    ## Mars Facts
    url = 'https://space-facts.com/mars/'
    browser.visit(url)

    # Request and parse
    html_code = browser.html
    soup = BeautifulSoup(html_code, "html.parser")
    My_table = soup.find('table', {'class': 'tablepress tablepress-id-p-mars'})

    My_table_rows = My_table.find_all('tr')
    col_1 = []
    col_2 = []

    for row in My_table_rows:
        rows = row.find_all('td')
        col_1.append(rows[0].text)
        col_2.append(rows[1].text)

    facts_df = pd.DataFrame({'facts': col_1, 'values': col_2})
    facts_html = facts_df.to_html()
    scraped_data['Mars_Facts'] = facts_html

    ## Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # Request and parse the HTML
    html = browser.html
    soup = bs(html, 'html.parser')
    #print(soup.prettify())
    images = soup.find_all('h3')
    #     print(images)
    titles = []
    for image in images:
        titles.append(image.text)
#     for link in soup.find_all('a'):
#         print(link.get('href'))
    for title in titles:
        print(title)

    links = []
    for title in titles:
        browser.click_link_by_partial_text(title)
        time.sleep(1)
        html = browser.html
        soup = bs(html, 'html.parser')
        link_addr = soup.find('img', class_='wide-image')
        links.append('https://astrogeology.usgs.gov' + link_addr.attrs['src'])
        browser.back()

    title_link = []
    for title, link in zip(titles, links):
        title_link.append({'title': title, 'img_url': link})
    scraped_data['Hemisphere_Image_URLs'] = title_link

    return scraped_data
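The Mars facts table above is assembled cell by cell from the `td` elements; `pandas.read_html` pulls the same table in one call, which is the approach several of the later examples take. A short sketch against the same URL:

import pandas as pd

tables = pd.read_html('https://space-facts.com/mars/')  # parses every <table> on the page
facts_df = tables[0]
facts_df.columns = ['facts', 'values']
facts_html = facts_df.to_html()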
Example #5
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_library = {}

    ############### Mars News ###############

    # Website to be scraped
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Assign the most recent article, title, paragraph and date
    article = soup.find("div", class_="list_text")
    news_p = article.find("div", class_="article_teaser_body").text
    news_title = article.find("div", class_="content_title").text
    news_date = article.find("div", class_="list_date").text
    # Add to dictionary
    mars_library["news_date"] = news_date
    mars_library["news_title"] = news_title
    mars_library["summary"] = news_p

    ############### Image Search ###############

    # Second website to be scraped
    url2 = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)
    # This must be re-run every time unless the code is changed to each individual website
    html = browser.html
    soup = bs(html, 'html.parser')
    # Assign the image url for the current featured Mars image
    image = soup.find("img", class_="thumb")["src"]
    featured_image_url = "https://jpl.nasa.gov" + image
    # Add to dictionary
    mars_library["featured_image_url"] = featured_image_url

    ############### Mars Weather ###############

    # Third website to be scraped
    url3 = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url3)
    # Repost
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    # Add to dictionary
    mars_library["mars_weather"] = mars_weather

    ############### Mars Facts ###############

    # Fourth, but using Pandas
    url4 = "https://space-facts.com/mars/"
    table = pd.read_html(url4)
    # Cleaning of the table
    mars_table = table[0]
    mars_table = mars_table.drop(columns="Earth").rename(
        columns={
            "Mars - Earth Comparison": "",
            "Mars": "Mars Data"
        }).set_index("")
    html_table = mars_table.to_html()
    html_table = html_table.replace('\n', '')
    # Add to dictionary
    mars_library["mars_table"] = html_table

    ############### Mars Hemispheres ###############

    # Fifth
    url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url5)
    hemisphere_image_urls = []
    # Loop through the photos
    for i in range(4):
        images = browser.find_by_tag('h3')
        images[i].click()
        # Required each loop
        html = browser.html
        soup = bs(html, 'html.parser')
        partial_url = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial_url
        dictionary = {"title": img_title, "img_url": img_url}
        hemisphere_image_urls.append(dictionary)
        browser.back()
    # Add to dictionary
    mars_library["mars_hemisphere"] = hemisphere_image_urls

    # Return Library
    return mars_library
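Everything placed in `mars_library` is a plain string or a list of small dicts, so the return value serializes cleanly. A minimal usage sketch:

if __name__ == '__main__':
    import json
    print(json.dumps(scrape(), indent=2))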
Example #6
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #Scraping the title and the paragraph

    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, "html.parser")
    title = soup.find("div", class_="list_text")
    news_title = title.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text

    #Splinter the image

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")
    time.sleep(3)
    html = browser.html
    image_soup = bs(html, 'html.parser')
    img_url = image_soup.find('figure', class_='lede').a['href']
    image_url = f'https://www.jpl.nasa.gov{img_url}'

    #Scrape the weather
    url = 'https://twitter.com/marswxreport?lang=en'
    response = req.get(url)
    soup = bs(response.text, 'html.parser')

    tweet_container = soup.find_all('div', class_="js-tweet-text-container")
    for tweet in tweet_container:
        mars_weather = tweet.find('p').text
        if 'sol' in mars_weather and 'pressure' in mars_weather:
            #print(mars_weather)
            break

    # Scrape the table
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    mars_facts = df.to_html(classes="table table-striped")

    #Mars Hemispheres

    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, 'html.parser')

    title = []
    img = []
    for i in soup.body.find_all('h3'):
        title.append(i.text)
        browser.click_link_by_partial_text(i.text[0:6])
        time.sleep(2)
        browser.click_link_by_partial_text('Sample')
        browser.windows[1].is_current = True
        html = browser.html
        soup = bs(html, 'html.parser')
        img.append(soup.img.get('src'))
        browser.windows[1].close()
        browser.back()
        time.sleep(2)
    hemisphere_image_urls = []
    for x in range(0, 4):
        mydict = {"title": title[x], "img_url": img[x]}
        hemisphere_image_urls.append(mydict)


    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_data
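A chained test like `if 'sol' and 'pressure' in text:` is a classic pitfall: Python reads it as `'sol' and ('pressure' in text)`, so the first word is never actually checked. The loop above spells both membership tests out; `all()` scales the same idea to any number of words. A small self-contained illustration:

def looks_like_weather(text):
    # every word must appear, not just the last one in the chain
    return all(word in text for word in ('sol', 'pressure'))

print(looks_like_weather('InSight sol 581, pressure at 7.50 hPa'))  # True
print(looks_like_weather('Launch update'))                          # False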
Example #7
def scrape():
    # Get the path to the chromedriver.exe and run the browser.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Access the Mars news URL
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Get the latest title from the item list in the Mars news site.
    news_title = soup.find('ul', class_ = 'item_list').\
                find('div', class_ = 'content_title').text

    # Get the latest article teaser from the item list in the Mars news site.
    news_p = soup.find('ul', class_ = 'item_list').\
                find('div', class_='article_teaser_body').text

    # Specify the space images URL and visit page using the browser. Parse content as HTML
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Using BeautifulSoup on the browser HTML, get the latest featured image URL and save it as a string.
    img_url_short = soup.find('ul', class_ = 'articles').\
            find('li', class_ = 'slide').find('a')['data-fancybox-href']

    featured_image_url = 'https://www.jpl.nasa.gov' + img_url_short

    # Get response from Mars Weather Twitter page
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')

    # Get reference to the featured image in the latest tweet.
    mars_weather_img = soup.find(
        'div', class_='js-tweet-text-container').find('p').find('a').text

    # Remove reference to the image in the tweet text and save as a string.
    mars_weather = soup.find(
        'div', class_='js-tweet-text-container').find('p').text.replace(
            mars_weather_img, '')

    # Gather the tables at the Mars facts site.
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)

    # Extract the stats table and save as HTML.
    mars_table = tables[0].rename(columns={
        0: 'description',
        1: 'value'
    }).to_html().replace('\n', '')

    # Specify the Mars astrogeology URL and visit page using the browser. Parse content as HTML
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # Create a list with the hemisphere names and an empty list to save data dictionaries.
    hemisphere_image_urls = []
    hemispheres = [
        'Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris'
    ]

    # Iterate through the hemispheres list.
    for hemisphere in hemispheres:

        # Create an empty dictionary for each hemisphere and save its name in the 'title' entry.
        hemisphere_dict = {}
        hemisphere_dict['title'] = hemisphere + ' Hemisphere'

        # Navigate through the browser and get the image URL.
        browser.click_link_by_partial_text(hemisphere)
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = soup.find('div', class_ = 'container').\
            find('div', class_ = 'wide-image-wrapper').\
            find('img', class_ = 'wide-image')['src']

        # Store the image URL in the hemisphere dictionary.
        hemisphere_dict['img_url'] = 'https://astrogeology.usgs.gov' + img_url
        hemisphere_image_urls.append(hemisphere_dict)
        browser.back()

    mars_data = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'mars_table': mars_table,
        'hemisphere_image_urls': hemisphere_image_urls
    }

    browser.quit()

    return mars_data
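Absolute URLs pasted together by hand are easy to get wrong; a missing `//` or slash silently produces dead links. `urllib.parse.urljoin` applies the joining rules for you. A sketch with a hypothetical relative path of the shape this scrape returns:

from urllib.parse import urljoin

base = 'https://astrogeology.usgs.gov'
img_src = '/cache/images/cerberus_enhanced.tif_full.jpg'  # hypothetical value of img_url
print(urljoin(base, img_src))  # https://astrogeology.usgs.gov/cache/images/cerberus_enhanced.tif_full.jpg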
Example #8
def scrape():
    # Create dictionary to return
    return_dict = {}

    # Create initial browser object
    executable_path = {'executable_path': '/Users/joshchung/Bootcamp/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scrape NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find('li', class_="slide")
    article_date = results.find('div', class_="list_date").text
    article_title = results.find('div', class_="content_title").text
    article_teaser = results.find('div', class_="article_teaser_body").text
    return_dict.update({'article_date':article_date,
                        'article_title':article_title,
                        'article_teaser':article_teaser})

    # Scrape JPL image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find_all('article', class_="carousel_item")
    url_string = results[0].get('style')
    url_string = url_string.split("url('")
    url_string = url_string[1].split("');")
    url_string = url_string[0]
    img_url = 'https://www.jpl.nasa.gov' + url_string
    return_dict.update({'img_url':img_url})

    # Scrape Twitter
    url = 'https://twitter.com/marswxreport'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    last_tweet = soup.find('p', class_="tweet-text").text
    last_tweet = last_tweet.replace('\n', ' ')
    return_dict.update({'last_tweet':last_tweet})

    # Scrape Mars facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    mars_df.columns = ['Statistic','Values']
    mars_df = mars_df.set_index('Statistic')
    mars_table = mars_df.to_html()
    mars_table = mars_table.replace('\n', '')
    return_dict.update({'mars_table':mars_table})

    # Scrape Mars hemisphere images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    mars_urls = {}
    for x in range(0,4):
        browser.visit(url)
        links = browser.find_by_tag('h3')
        links[x].click()
        html = browser.html
        soup = bs(html, 'lxml')
        downloads = soup.find('div', class_="downloads")
        dl_links = downloads.find_all('a')
        img_link = dl_links[0].get('href')
        dld_link = dl_links[1].get('href')
        title = soup.find('h2', class_="title").text
        mars_urls.update({
            f"marsimg_{x}" : img_link,
            f"marstitle_{x}": title,
            f"marsdld_{x}": dld_link
        })
        browser.back()
    return_dict.update(mars_urls)

    # Return dictionary when function is run
    return return_dict
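The carousel image above is dug out of an inline `style` attribute with chained `split()` calls; a regular expression states the same intent more directly. A sketch over a hypothetical style string of the shape that page served:

import re

style = "background-image: url('/spaceimages/images/wallpaper/mars.jpg');"  # hypothetical sample
match = re.search(r"url\('([^']+)'\)", style)
if match:
    img_url = 'https://www.jpl.nasa.gov' + match.group(1)
    print(img_url)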
Example #9
def scrape():
    #####################################################################################
    #                                                                                   #
    # Import all the needed libraries                                                   #
    #                                                                                   #
    #####################################################################################
    from bs4 import BeautifulSoup as bs
    import requests
    import numpy
    import pandas as pd
    from splinter import Browser

    #####################################################################################
    #                                                                                   #
    # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph    #
    # Text. Assign the text to variables that you can reference later.                  #
    #                                                                                   #
    #####################################################################################
    url = "https://mars.nasa.gov/news/"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='slide')
    news_title = []
    news_p = []
    for result in results:
        news_title.append(
            result.find_all('div', class_='content_title')[0].find('a').text)
        news_p.append(
            result.find_all('div',
                            class_='rollover_description_inner')[0].text)

    #####################################################################################
    #                                                                                   #
    # Visit the url for JPL Featured Space Image here.                                  #
    #                                                                                   #
    # Use splinter to navigate the site and find the image url for the current Featured #
    # Mars Image and assign the url string to a variable called featured_image_url.     #
    #                                                                                   #
    # Make sure to find the image url to the full size .jpg image.                      #
    #                                                                                   #
    # Make sure to save a complete url string for this image.                           #
    #                                                                                   #
    #####################################################################################
    executable_path = {'executable_path': '/drivers/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.find_by_css('a[title="Display actual size"]').click()
    featured_image_url = browser.find_by_css(
        'img[class="fancybox-image"]')['src']

    #####################################################################################
    #                                                                                   #
    # Visit the Mars Weather twitter account here and scrape the latest Mars weather    #
    # tweet from the page. Save the tweet text for the weather report as a variable     #
    # called mars_weather.                                                              #
    #                                                                                   #
    #####################################################################################
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='js-tweet-text-container')
    mars_weather = []
    for result in results:
        mars_weather.append(
            result.find_all(
                'p',
                class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
            )[0].text)

    #####################################################################################
    #                                                                                   #
    # Visit the Mars Facts webpage here and use Pandas to scrape the table containing   #
    # facts about the planet including Diameter, Mass, etc.                             #
    #                                                                                   #
    # Use Pandas to convert the data to a HTML table string.                            #
    #                                                                                   #
    #####################################################################################
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ["Fact", "Value"]
    df.set_index('Fact', inplace=True)
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')
    html_table = html_table.replace(
        '<table border="1" class="dataframe">',
        '<table border="1" class="table table-striped table-sm table-condensed">'
    )

    #####################################################################################
    #                                                                                   #
    # Visit the USGS Astrogeology site here to obtain high resolution images for each   #
    # of Mar's hemispheres.                                                             #
    #                                                                                   #
    # You will need to click each of the links to the hemispheres in order to find the  #
    # image url to the full resolution image.                                           #
    #                                                                                   #
    # Save both the image url string for the full resolution hemisphere image, and the  #
    # Hemisphere title containing the hemisphere name. Use a Python dictionary to store #
    # the data using the keys img_url and title.                                        #
    #                                                                                   #
    # Append the dictionary with the image url string and the hemisphere title to a list#
    # This list will contain one dictionary for each hemisphere.                        #
    #                                                                                   #
    #####################################################################################
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='item')
    hemis_title = []
    hemis_url = []
    for result in results:
        hemis_title.append(
            result.find_all('div', class_='description')[0].find('h3').text)
    executable_path = {'executable_path': '/drivers/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    for hem in hemis_title:
        browser.click_link_by_partial_text(hem)
        response = requests.get(browser.url)
        soup = bs(response.text, 'html.parser')
        results = soup.find_all('li')
        for result in results:
            if result.find_all('a')[0].text == "Sample":
                hemis_url.append(result.find_all('a')[0]['href'])
        browser.back()
    hemisphere_image_urls = []
    for x in range(0, 4):
        myDict = {"title": hemis_title[x], "img_url": hemis_url[x]}
        hemisphere_image_urls.append(myDict)
    ret_dict = {
        "news_titles": news_title,
        "news_paragraphs": news_p,
        "feat_image": featured_image_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemis_images": hemisphere_image_urls
    }

    return ret_dict
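The final pairing step assumes exactly four hemispheres via `range(0, 4)`; `zip` builds the same list of title/img_url dicts for however many entries were actually collected:

hemisphere_image_urls = [
    {"title": t, "img_url": u} for t, u in zip(hemis_title, hemis_url)
]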
Example #10
def scrape():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Step 1 - Scraping

    # NASA Mars News

    mars_dict = {}

    #URL of NASA Mars News Site
    url1 = ('https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc'
            '&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')

    browser.visit(url1)

    #HTML object
    html1 = browser.html

    #Parse HTML with BeautifulSoup
    soup1 = BeautifulSoup(html1, 'html.parser')

    #Retrieve first article
    # first_art = soup1.find('li', class_= 'slide')

    #Use Beautiful Soup's find() method to navigate and retrieve attributes

    # step1 = soup1.find('div', class_='image_and_description_container')
    # step2 = step1.find('div', class_='list_text')
    # news_title = step2.find('div', class_='content_title').get_text

    try:
        step1 = soup1.select_one(
            'div.image_and_description_container div.list_text')
        #find news title
        news_title = step1.find("div", class_="content_title").text
        #find news paragraph
        news_p = step1.find("div", class_="article_teaser_body").text
    except:
        return None, None

    #Add news_title to the mars_dict dictionary
    mars_dict['News Title'] = news_title

    # news_p = soup1.find('div', class_= 'article_teaser_body').get_text

    #Add news_p to the mars_dict dictionary
    mars_dict["News Para."] = news_p

    # JPL Mars Space Images - Featured Image

    #URL of JPL Mars Space Images Site
    url2 = '''https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'''

    browser.visit(url2)

    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)

    browser.click_link_by_partial_text('more info')
    time.sleep(3)

    #HTML object
    html2 = browser.html

    #Parse HTML with BeautifulSoup
    soup2 = BeautifulSoup(html2, 'html.parser')

    image_url = soup2.find('figure', class_="lede").a['href']

    featured_image_url = 'https://www.jpl.nasa.gov' + image_url

    #Add featured_image_url to the mars_dict dictionary
    mars_dict['Featured Image URL'] = featured_image_url

    # Mars Facts

    #URL of Space Facts Site
    url3 = 'https://space-facts.com/mars/'

    #Read in table
    mars_table = pd.read_html(url3)

    #Create a DataFrame with the 1st table available on the site
    df = mars_table[0]

    #Remove header column
    df.columns = df.iloc[0]
    df = df[1:]

    #Convert the DataFrame table to HTML
    html_table = df.to_html(index=False)

    #Remove escape sequences
    html_table = html_table.replace('\n', '')

    #Add html_table to the mars_dict dictionary
    mars_dict['Mars Table'] = html_table

    # Mars Hemispheres

    # URL of page to be scraped
    url4 = '''https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'''

    browser.visit(url4)

    #HTML object
    html4 = browser.html

    # Find titles and image urls and build the dictionary
    titles = browser.find_by_css('a.product-item h3')

    hemi_list = []

    for i in range(len(titles)):
        hemi_dict = {}
        browser.find_by_css('a.product-item h3')[i].click()
        sample = browser.find_by_text('Sample')
        image_url = sample['href']
        hemi_dict['Title'] = browser.find_by_css('h2.title').text
        hemi_dict['ImageURL'] = image_url
        hemi_list.append(hemi_dict)
        browser.back()
        print("---")
        print(hemi_dict['Title'])
        print(image_url)

    #Add hemi_list to the mars_dict dictionary
    mars_dict['Hemispheres'] = hemi_list

    return mars_dict
Example #11
def scrape_all():
    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path)

    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Convert the browser html to a soup object and then quit the browser
    html = browser.html
    news_soup = BeautifulSoup(html, 'html.parser')

    slide_elem = news_soup.select_one('ul.item_list li.slide')
    slide_elem.find("div", class_='content_title')

    # Use the parent element to find the first a tag and save it as `news_title`
    news_title = slide_elem.find("div", class_='content_title').get_text()

    # Use the parent element to find the paragraph text
    news_p = slide_elem.find('div', class_="article_teaser_body").get_text()

    # JPL Space Images Featured Image - Visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    # Find and click the full image button
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = BeautifulSoup(html, 'html.parser')

    # find the relative image url
    img_url_rel = img_soup.select_one('figure.lede a img').get("src")

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'

    #Mars weather - visit url
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div',
                                           attrs={
                                               "class": "tweet",
                                               "data-name": "Mars Weather"
                                           })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()

    #Hemispheres of Mars
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}

        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()

        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']

        # Get Hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text

        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)

        # Finally, we navigate backwards
        browser.back()

    #mars facts
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    df = df.to_html()

    #final data dictionary
    data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": img_url,
        "hemispheres": hemisphere_image_urls,
        "weather": mars_weather,
        "facts": df,
        "last_modified": dt.datetime.now()
    }
    browser.quit()
    return data
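Splinter's `is_element_present_by_css(..., wait_time=1)` used above doubles as a soft wait: it polls the page for up to `wait_time` seconds and returns a boolean instead of raising. A small hedged helper wrapping the same call for reuse:

def wait_for(browser, css_selector, seconds=5):
    # browser is a splinter Browser; True once css_selector matches, False on timeout
    return browser.is_element_present_by_css(css_selector, wait_time=seconds)

# usage: wait_for(browser, 'ul.item_list li.slide') before reading browser.html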
Example #12
def scrape_info():
    # Get Mars News
    executable_path = {"executable_path" : "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')

    news_title = soup.find("div", class_="content_title").text
    news_p     = soup.find("div", class_="article_teaser_body").text

    # Get Mars Featured Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")

    html = browser.html
    soup = bs(html, 'html.parser')
    
    featured_image = soup.find("figure", class_="lede")
    print(featured_image)

    featured_image_url = "https://www.jpl.nasa.gov" + featured_image.find("a")["href"]
    print(featured_image_url)


    # Get Mars Weather

    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    def getText(parent):
        return ''.join(parent.find_all(text=True, recursive=False)).strip()

    result = soup.find("p", class_="tweet-text")
    weather_report = getText(result)
    print(weather_report)


    # Get Mars Facts

    url = "https://space-facts.com/mars/"
    response = requests.get(url)
    soup = bs(response.text, "lxml")

    result_labels = soup.find_all("td", class_="column-1")
    result_values = soup.find_all("td", class_="column-2")

    result_labels_text = []
    result_values_text = []
    for rlabel in result_labels:
        result_labels_text.append(rlabel.text)
    for rvalue in result_values:
        result_values_text.append(rvalue.text)

    mars_df = pd.DataFrame({"Stats": result_labels_text,
                            "Values":  result_values_text})

    mars_df.set_index("Stats",inplace=True)
    
    mars_facts_html = mars_df.to_html()
   
    # Get Hemisphere Images

    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemisphere_list = []

    hemispheres = ["Cerberus", "Schiaparelli", "Syrtis Major", "Valles Marineris"]
    for x in range(0,4):
        browser.click_link_by_partial_text(hemispheres[x])
        
        html = browser.html
        soup = bs(html, 'html.parser')
        
        img_url = "https://astrogeology.usgs.gov" + (soup.find("img", class_="wide-image")["src"])
        title = (soup.find("h2", class_="title").text)
        
        hemisphere_dict = {"title": title, "img_url":img_url}
        hemisphere_list.append(hemisphere_dict)
        
        browser.back()
 
    browser.quit()

    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "weather_report" : weather_report,
        "mars_facts_html" : mars_facts_html,
        "hemisphere_list" : hemisphere_list
    }

    return mars_data
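The `getText` helper above keeps only the tweet's direct text nodes, so nested `<a>` tags (links, pic.twitter.com references) drop out of the result. A standalone illustration with hypothetical tweet markup:

from bs4 import BeautifulSoup

html = '<p>InSight sol 581 <a href="#">pic.twitter.com/abc</a> pressure at 7.50 hPa</p>'  # hypothetical
p = BeautifulSoup(html, 'html.parser').p
direct = ''.join(p.find_all(text=True, recursive=False)).strip()
print(direct)  # the anchor text is gone: 'InSight sol 581  pressure at 7.50 hPa'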
Example #13
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Visit the NASA Mars News site, then scrape the page into soup
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # save the most recent article, title and date
    article = soup.find("div", class_="list_text")
    news_p = article.find("div", class_="article_teaser_body").text
    news_title = article.find("div", class_="content_title").text
    news_date = article.find("div", class_="list_date").text
    print(news_date)
    print(news_title)
    print(news_p)

    # Visit the JPL Mars URL
    url2 = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)

    # Scrape the browser into soup and use soup to find the image of mars
    # Save the image url to a variable called `img_url`
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    image = soup.find("img", class_="thumb")["src"]
    img_url = "https://jpl.nasa.gov"+image
    featured_image_url = img_url
    # Use the requests library to download and save the image from the `img_url` above
    import requests
    import shutil
    response = requests.get(img_url, stream=True)
    with open('img.jpg', 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
        
    # Display the image with IPython.display (this only renders inside a notebook)
    from IPython.display import Image
    Image(url='img.jpg')

    # Mars Weather using twitter

    #get mars weather's latest tweet from the website
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    mars_weather = soup.find("p", class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text

    url = 'https://space-facts.com/mars/'

    ### creating tables from webpage

    tables = pd.read_html(url)

    ### creating the table

    df = tables[0]
    df.columns = ['Item', 'Value']
    df = df.set_index("Item")

    marsdata = df.to_html(classes='marsdata')
    marsdata = marsdata.replace('\n', ' ')

    # Visit the USGS Astogeology site and scrape pictures of the hemispheres
    url4 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url4)

    # Use splinter to loop through the 4 images and load them into a dictionary
    import time 
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_hemis=[]

    # loop through the four tags and load the data to the dictionary

    for i in range(4):
        time.sleep(5)
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2",class_="title").text
        img_url = 'https://astrogeology.usgs.gov'+ partial
        dictionary={"title":img_title,"img_url":img_url}
        mars_hemis.append(dictionary)
        browser.back()

    output = {'news_title': news_title, 'news_p': news_p, 'img_url': featured_image_url,
              'mars_weather': mars_weather, 'marsdata': marsdata, 'images': mars_hemis}
   
    return output
Example #14
def scrape():
    scraped_data = {}

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA Mars News
    url_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url_news)

    html_news = browser.html
    soup_news = BeautifulSoup(html_news, 'html.parser')

    result_title = soup_news.find('div', class_='content_title').find('a')
    news_title = result_title.text.strip()
    scraped_data["news-headline"] = news_title

    result_p = soup_news.find('div',
                              class_='image_and_description_container').find(
                                  'div', class_='rollover_description_inner')
    news_p = result_p.text.strip()
    scraped_data["news-text"] = news_p

    # JPL Mars Space Images - Featured Image
    url_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_img)

    html_img = browser.html
    soup_img = BeautifulSoup(html_img, 'html.parser')

    featured_title = soup_img.find(
        'section', class_='primary_media_feature').find(
            'h1', class_='media_feature_title').text.strip()

    browser.find_by_id('full_image').click()

    browser.is_element_present_by_text('more info')
    browser.find_link_by_partial_text('more info').click()

    featured_image_url = browser.find_by_css('img[class="main_image"]')['src']
    scraped_data["featured-image"] = featured_image_url

    # Mars Weather
    url_twitter = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url_twitter)

    html_twitter = browser.html
    soup_twitter = BeautifulSoup(html_twitter, 'html.parser')

    mars_weather = soup_twitter.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    scraped_data["Mars-weather-tweet"] = mars_weather

    # Mars Facts
    url_facts = 'https://space-facts.com/mars/'
    facts_table = pd.read_html(url_facts)[0]
    facts_table.columns = ['description', 'measurement']
    facts_table_html = facts_table.to_html()
    scraped_data["table-of-facts-(html)"] = facts_table_html

    # Mars Hemispheres
    url_hems = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hems)

    html_hems = browser.html
    soup_hems = BeautifulSoup(html_hems, 'html.parser')

    mars_hemisphere_products = browser.find_by_css('a.product-item h3')
    hemisphere_image_urls = []

    for i in range(len(mars_hemisphere_products)):
        hemisphere = {}
        browser.find_by_css('a.product-item h3')[i].click()
        hemisphere["img_url"] = browser.find_link_by_partial_text(
            'Sample').first['href']
        hemisphere["title"] = browser.find_by_css('h2.title').text
        hemisphere_image_urls.append(hemisphere)

        browser.back()

    scraped_data["Mars-hemisphere-images"] = hemisphere_image_urls

    print(scraped_data)
    return scraped_data
Example #15
def scrape():
    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # Define url  and set up config splinter to the site
    #Create a function that takes the url and return the soup
    def create_soup(url):
        browser.visit(url)
        # Create BeautifulSoup object; parse with 'html.parser'
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    url = 'https://redplanetscience.com/'
    soup = create_soup(url)

    title = soup.find('div', class_="content_title").text

    news_p = soup.find('div', class_="article_teaser_body").text

    news_dict = {'title': title, 'news_p': news_p}


    space_url = 'https://spaceimages-mars.com/'
    soup = create_soup(space_url)

    try:
        target = 'button[class="btn btn-outline-light"]'
        browser.find_by_css(target).click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        image_src = soup.find('img', class_="fancybox-image")['src']
        featured_image_url = space_url + image_src
    except Exception:
        print("can't find the image")
        featured_image_url = None

    mars_url = 'https://galaxyfacts-mars.com/'
    # using pandas read html
    tables = pd.read_html(mars_url)
    mars_fact_df = tables[0]

    mars_fact_df.columns = ['Description', 'Mars', 'Earth']
    mars_fact_df = mars_fact_df.iloc[1:]
    mars_fact_df.set_index('Description', drop=True, inplace=True)


    # set url
    hem_url = 'https://marshemispheres.com/'
    soup = create_soup(hem_url)
    # get soup
    items = soup.find_all('div', class_="item")
    hemisphere_urls = []

    for item in items:
        hmsphere = {}
        name = item.h3.text
        #       link = item.a['href']

        # get full image
        try:
            browser.links.find_by_partial_text(name).click()
            print(browser.url)
            html2 = browser.html
            imgsoup = BeautifulSoup(html2, 'html.parser')
            img = imgsoup.find('img', class_="wide-image")

            hmsphere['title'] = name[:-9]
            hmsphere['img_url'] = hem_url + img['src']

        except:
            print("Could not get Image Link")

        hemisphere_urls.append(hmsphere)
        browser.back()

    browser.quit()

    print(hemisphere_urls)

    mars_data = {
        'news_title': title,
        'news_p': news_p,
        'featured_image': featured_image_url,
        'hemisphere_image_urls': hemisphere_urls,
        'table': mars_fact_df
    }

    return mars_data
Example #16
def scrape():
    # browser = init_browser()
    browser = Browser('chrome')
    #Visit the URL
    Nasa_news_url = 'https://mars.nasa.gov/news/'
    browser.visit(Nasa_news_url)
    html = browser.html

    #Parse HTML with Beautiful Soup
    soup_nasa = BeautifulSoup(html, 'html.parser')

    ### NASA Mars News
    #<div class="content_title"><a href="/news/8782/sensors-on-mars-2020-spacecraft-answer-long-distance-call-from-earth/" target="_self">
    #Sensors on Mars 2020 Spacecraft Answer Long-Distance Call From Earth</a></div>
    #<div class="article_teaser_body">Instruments tailored to collect data during the descent of NASA's next rover through the Red Planet's atmosphere have been checked in flight.</div>
    #news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text
    news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
    news_paragraphs = soup_nasa.find_all('div',
                                         class_="article_teaser_body")[0].text
    print(news_titles)
    print('------------------')
    print(news_paragraphs)

    ### JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)
    #print(soup.prettify())
    #go to the full image
    #data-fancybox-href
    image = browser.find_by_id('full_image')
    image.click()
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    url_image_find = soup.find('img', class_='main_image').get("src")

    featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find
    ### Mars Facts
    url = 'https://space-facts.com/mars/'
    mars_facts_df = pd.read_html(url)[2]
    mars_facts_df.columns = ["Details", "Measures"]
    mars_facts_df = mars_facts_df.to_html()
    ### Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    web_links = browser.find_by_css("a.product-item h3")
    web_list = []
    for i in range(len(web_links)):
        web_hemispheres = {}
        browser.find_by_css("a.product-item h3")[i].click()
        web_hemispheres["link"] = browser.find_link_by_text(
            'Sample').first["href"]
        web_hemispheres["Title"] = browser.find_by_css('h2.title').text
        web_list.append(web_hemispheres)
        browser.back()

    browser.quit()
Example #17
class SplinterBrowserDriver(BaseBrowserDriver):
    """
        This is a BrowserDriver for splinter
        (http://splinter.cobrateam.info)
        that implements the BaseBrowserDriver API.

        To use it, you must have splinter installed on your env.

        For itself it's a browser driver that supports multiple browsing
        technologies such as selenium, phantomjs, zope, etc.
    """

    driver_name = 'splinter'

    def __init__(self):
        super(SplinterBrowserDriver, self).__init__()
        if not splinter_available:
            raise ImportError(
                "In order to use splinter Base Driver you have to install it. "
                "Check the instructions at http://splinter.cobrateam.info")
        self._browser = Browser(config.default_browser)

    def _handle_empty_element_action(self, element):
        if not element:
            raise ActionNotPerformableException(
                "The action couldn't be performed because the element couldn't "
                "be found; try checking that your element "
                "selector is correct and that the page loaded properly.")

    @property
    def page_url(self):
        return self._browser.url

    @property
    def page_source(self):
        return self._browser.html

    @property
    def page_title(self):
        return self._browser.title

    def open_url(self, url):
        self._browser.driver.get(url)

    def quit(self):
        return self._browser.quit()

    def is_element_visible(self, element):
        return element.visible

    def get_element_text(self, element):
        return element.text

    def get_element_by_xpath(self, selector):
        return self._browser.find_by_xpath(selector)

    def get_element_by_css(self, selector):
        return self._browser.find_by_css(selector)

    def get_element_by_id(self, selector):
        return self._browser.find_by_id(selector)

    def get_element_by_tag(self, selector):
        return self._browser.find_by_tag(selector)

    @element_action
    def type(self, element, text, slowly=False):
        return element.type(text, slowly)

    @element_action
    def fill(self, element, text):
        return element.fill(text)

    @element_action
    def clear(self, element):
        self.fill(element, '')

    @element_action
    def click(self, element):
        return element.click()

    @element_action
    def check(self, element):
        return element.check()

    @element_action
    def uncheck(self, element):
        return element.uncheck()

    @element_action
    def mouse_over(self, element):
        return element.mouse_over()

    @element_action
    def mouse_out(self, element):
        return element.mouse_out()

    def reload(self):
        return self._browser.reload()

    def go_back(self):
        return self._browser.back()

    def go_forward(self):
        return self._browser.forward()

    def execute_script(self, script):
        return self._browser.evaluate_script(script)

    def get_iframe(self, iframe_id):
        return self._browser.get_iframe(iframe_id)

    def get_alert(self):
        return self._browser.get_alert()

    def attach_file(self, input_name, file_path):
        return self._browser.attach_file(input_name, file_path)

    def wait_pageload(self, timeout=30):
        wait_interval = 0.05
        elapsed = 0

        while self.execute_script('document.readyState') != 'complete':
            self.wait(wait_interval)
            elapsed += wait_interval

            if elapsed > timeout:
                raise PageNotLoadedException

    def click_and_wait(self, element, timeout=30):
        self.click(element)
        self.wait_pageload(timeout)
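A hedged usage sketch of the driver above, assuming `config.default_browser` names an installed backend (e.g. 'firefox') and that the visited page contains an `h1`:

driver = SplinterBrowserDriver()
driver.open_url('https://splinter.readthedocs.io')
heading = driver.get_element_by_css('h1')
print(driver.get_element_text(heading))
driver.quit()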
Example #18
def scrape():
    #Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    #Assign the text to variables that you can reference later.
    #https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find
    news_date = soup.find('div', class_='list_date')

    news_title = soup.find('div', class_='content_title').text

    news_parag = soup.find('div', class_='article_teaser_body').text

    # JPL Mars Space Images - Featured Image

    #Visit the url for JPL Featured Space Image.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)

    #Use splinter to navigate the site and find the image url for the current Featured Mars Image
    #https://splinter.readthedocs.io/en/latest/finding.html
    browser.find_by_id('full_image').click()
    time.sleep(3)
    browser.click_link_by_partial_text('more info')

    #find and parse new url
    new_jpl_html = browser.html
    new_image_soup = bs(new_jpl_html, 'html.parser')

    #Make sure to find the image url to the full size .jpg image.
    image_url = new_image_soup.find('img', class_='main_image')
    partial_url = image_url.get('src')

    #Make sure to save a complete url string for this image...and assign the url string to a variable called featured_image_url.
    featured_image_url = f'https://www.jpl.nasa.gov{partial_url}'
    time.sleep(1)
    print(featured_image_url)

    # # Mars Weather

    # In[11]:

    #Visit the Mars Weather twitter account
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    weather_url = "https://twitter.com/marswxreport"
    browser.visit(weather_url)
    weather_html = browser.html
    weather_soup = bs(weather_html, 'html.parser')

    # In[12]:

    #scrape the latest Mars weather tweet from the page.
    mars_weather = weather_soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    print(mars_weather)

    # # Mars Facts

    # In[13]:

    #Visit the Mars Facts webpage
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)

    # In[14]:

    #use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    facts_table = pd.DataFrame(pd.read_html(facts_url)[0])
    facts_table.head()

    # In[15]:

    #Use Pandas to convert the data to a HTML table string.
    mars_facts = facts_table.to_html(header=False, index=False)
    print(mars_facts)
    facts_table.to_html('mars_facts.html')

    # # Mars Hemispheres

    # In[16]:

    #Visit the USGS Astrogeology site to obtain high resolution images for each of Mar's hemispheres.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_url)

    # In[17]:

    #You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    #hemi data container
    hemi_info = []
    #loop through, click, find and store url, title related to each hyperlinked hemisphere
    for hemi in range(4):
        time.sleep(3)
        #find hyperlink
        images = browser.find_by_tag('h3')
        #click hyperlink
        images[hemi].click()
        #read and find title and url
        hemi_loop = browser.html
        soup = bs(hemi_loop, "html.parser")
        img_title = soup.find('h2', class_='title').text
        back_url = soup.find("img", class_="wide-image")["src"]
        #append url src to create full url
        img_url = f'https://astrogeology.usgs.gov{back_url}'
        #Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.
        #store loop list in hemi data container as a dictionary
        hemi_info.append({'title': img_title, 'img_url': img_url})
        browser.back()
    #print hemi data container after loop
    pprint(hemi_info)

    mars_data = {
        "Headline": news_title,
        "Desription": news_parag,
        "Featured_Image": featured_image_url,
        "Current_Weather": mars_weather,
        "Facts": mars_facts,
        "Hemis": hemi_info
    }

    return mars_data
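A quick usage sketch for the scrape() above, using the keys of the mars_data dictionary it returns:

# Minimal usage sketch for the scrape() defined above
result = scrape()
print(result['Headline'])
print(result['Featured_Image'])
for hemi in result['Hemis']:
    print(hemi['title'], hemi['img_url'])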
class SurfThread(threading.Thread):

   
    def __init__(self, hoehe, breite, _format):
        threading.Thread.__init__(self) 
        self.seiten = []
        self.words = []
        self.toWait = None
        self.elemNo = None
        self.wordNo = None
        self.clickNo = None
        self.clickX = None
        self.clickY = None
        self.back = None
        self.changeTabs = None
        self.__browser = Browser("firefox", profile=constants.profile)
        time.sleep(5)
        #self.__maximizeWindow()
        #time.sleep(5)        
        SurfThread.timer = False
        SurfThread.hoehe = hoehe
        SurfThread.breite = breite 
        SurfThread._format = _format


    def __readData(self):
        # read homepages to visit 
        surfListe = open("/home/steffi/Dokumente/surfListe.txt", "rb")
        for line in surfListe: 
            self.seiten.append(line)
        surfListe.close()
        # read words for search in google, wikipedia, amazon, youtube
        keyWords = open("/home/steffi/Dokumente/keyWords.txt", "rb")
        for line in keyWords.readlines():
            self.words.append(line.decode("utf-8"))
        keyWords.close()
        print "data read"
    
    
    def run(self):
        
        self.__readData()    
       
        rand = random.randint(2,5)
        for i in range(0, rand):
            print "noch "+ str(i) +" mal"
	    print "TIMER:" +str(SurfThread.timer)
            if SurfThread.timer == False :
            
                self.__generateRandom()
                    
                print "visit: "+self.seiten[self.elemNo]
                self.__visitHomepage( self.seiten[self.elemNo].strip())
                print "clickNo: "+ str(self.clickNo)
		print "towait = "+ str(self.toWait)
                time.sleep(self.toWait)
                for i in range(self.clickNo):
                    time.sleep(random.randrange(5,10))
                    if i % 2 == 0:
                        self.__generateRandomClick()
                    if i == 2:
                        self.__pageDown()
                        time.sleep(random.randrange(1,5))
                    if i == (self.clickNo-1):
                        self.__pageBottom()
                        time.sleep(random.randrange(2,10))
                    if i%2 == 0 and self.back == 1:
                        self.__goBack()
                        time.sleep(random.randrange(2,10))  

        path = self.__browser.driver.firefox_profile.profile_dir
        print path
        os.remove(constants.profile+'/places.sqlite')
        shutil.copyfile(path+'/places.sqlite', constants.profile+'/places.sqlite')
        self.__closeWindow()
        shutil.rmtree(path)
        #os.rmdir(path)
        print "Firefox closed"
        
        
    def starte(self):
        self.run()
    
    def __generateRandom(self):
        self.toWait = random.randrange(5,45)
        self.elemNo = random.randrange(0,len(self.seiten))
        self.clickNo = random.randrange(2,7)
        self.back = random.randrange(0,10)
        self.wordNo = random.randrange(0, len(self.words))
    
    def __generateRandomClick(self):
        self.clickX = random.randrange(100,constants.BREITE - 50) #1366
        self.clickY = random.randrange(50,constants.HOEHE-50) #768
        command = "mousemove "+ str(self.clickX) + " "+ str(self.clickY)
        print command
        subprocess.call(["xte", command])
        subprocess.call(["xte", "mouseclick 1"])
      
    def __followLink(self, text, index=0):
        if index == None:
            index = 0

        try:
            # click_link_by_partial_text returns None, so it cannot be indexed;
            # find the matching links first, then click the requested one
            self.__browser.find_link_by_partial_text(text)[index].click()
        except ElementDoesNotExist:
            print "Element does not exist"
        except TypeError:
            print "Type Error"
        except Exception as e:
            print "nothing happened: " + str(e)
    
    def __visitGooglePage(self, url):
             
        print "google"
        
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('q', searchWord)
        time.sleep(random.randrange(2,15))
        self.__findElementAndClick("btnG", "name", None)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(10,30))
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass
        
        
    def __visitHomepage(self, url):
       
        clickNoMod4 = self.clickNo % 4
        toWaitMod4 = self.toWait % 4
        
        if "google" in url:
            self.__visitGooglePage(url)
        elif "wikipedia" in url:
            self.__visitWikipediaPage(url)
        elif "amazon" in url:
            self.__visitAmazonPage(url)
        elif "ebay" in url:
            self.__visitEbayPage(url)
        elif "youtube" in url:
            print "youtube"
            self.__watchYoutubeVideo(url)
        elif "facebook" in url:
            print "facebook"
            self.__visitFacebook(url)
        elif "twitter" in url:
            print "twitter"
            self.__twitterSomething(url)
        else:
            try:
                self.__browser.visit(url)
            except Exception as e:
                print e
        
       
    def __goBack(self): 
        self.__browser.back()
        
    def shutdown(self):
        print "flip the timer and quit firefox"
        changeTimer()
        
    def __fillInput(self, _id, _input):
        try:
            self.__browser.fill(_id, _input)
        except Exception as e:
            print e.message
            pass
        
    def __findElementAndClick(self, name, identifier, index):
        # default to index 0 if no index was passed
        if index == None:
            index = 0
        # look the element up and click it
        try:
            if identifier == "name":
                button = self.__browser.find_by_name(name)[index]
            elif identifier == "id":
                button = self.__browser.find_by_id(name)[index]

            button.click()
        except (exceptions.ElementDoesNotExist, ElementNotVisibleException, URLError):
            print "ElementDoesNotExist OR ElementNotVisible OR URLError"
        except Exception as e:
            print e
        
    def __closeWindow(self):
        time.sleep(3)  
        subprocess.call(["xte", "keydown Control_L"])
        #subprocess.call(["xte", "keydown Shift_L"])
        subprocess.call(["xte", "key q"])
        #subprocess.call(["xte", "keyup Shift_L"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster geschlossen"
    
    def __maximizeWindow(self):
        time.sleep(2)  
        subprocess.call(["xte", "keydown Control_L"])
        subprocess.call(["xte", "key F10"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster maximiert"
    
    def __pageDown(self):
        time.sleep(3)
        subprocess.call(["xte", "key Page_Down"])
    
    def __pageBottom(self):
        subprocess.call(["xte", "key End"])
    
    def __watchYoutubeVideo(self, url):
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        
        self.__fillInput('search_query', searchWord)
        time.sleep(random.randrange(2,15))

        subprocess.call(["xte", "key Return"])
        time.sleep(random.randrange(2,15))
        
        # only for 16:9 monitors
        index = None
        breite = 0
        if SurfThread._format == "16:9":
            index = [int(SurfThread.hoehe // 4.59),
                     int(SurfThread.hoehe // 3.04),
                     int(SurfThread.hoehe // 2.22),
                     int(SurfThread.hoehe // 1.77)]
            breite = int(SurfThread.breite//4.74)
        else:
            index = [int(SurfThread.hoehe // 4.10),
                     int(SurfThread.hoehe // 2.19),
                     int(SurfThread.hoehe // 1.54),
                     int(SurfThread.hoehe // 1.28)]
            breite = int(SurfThread.breite//2.15)
                
        #self.__followLink(searchWord, None)
        #235 1 - 355 2 - 4853   
        rand = random.randint(0, (len(index)-1))
        subprocess.call(["xte", "mousemove "+ str(breite) + " " +str(index[rand])])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "mouseclick 1"])
        
        time.sleep(5)
        print "mousemove + anschauen"
    
        #breite höhe von links oben
        #subprocess.call(["xte", "mousemove "+ str(int(SurfThread.breite//3.17)) + " " + str(int(SurfThread.hoehe//3.2225))])
        #time.sleep(2)
        subprocess.call(["xte", "mouseclick 1"])
        #todo mehr zeit
        time.sleep(random.randrange(2,45))
        
    
        
        
    def __visitWikipediaPage(self, url):
        print "wikipedia"
        
        self.__browser.visit(url)
        time.sleep(2)
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search', searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(2)    
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass
                
    def __visitAmazonPage(self, url):
        print "amazon"
        
        self.__browser.visit(url)

        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('field-keywords', searchWord+'\n')
        time.sleep(2)
       
	subprocess.call(["xte", "key Return"])
        
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))  
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass
    
    def __visitEbayPage(self, url):
        print "ebay"
        
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__typeWord(searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))
        # baaaad practice
        self.__followLink(wordSplit[0], self.wordNo%10)
        
    def __visitFacebook(self, url):
        print "facebook"
        
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        
        # log in if necessary
        if self.__browser.is_text_present(constants.FB_USER) == False:
            print "not logged in yet"
            self.__fillInput('email', constants.FB_EMAIL)
            time.sleep(2)
            self.__fillInput('pass', constants.FB_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(5)
            
    def __twitterSomething(self, url):
        print "twitter"
        
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        # TODO: log in if the start page ("Startseite") is not visible
        if self.__browser.is_text_present('Startseite') == False:
            print "not logged in yet"
            
            '''name = self.__browser.find_by_name('session[username_or_email]').first
            if name != None:
                print "name gefunden"
            name.click()
            time.sleep(3)
            self.__typeWord('steffi_spam')
            
            passW = self.__browser.find_by_id('signin-password').first
            passW.click()
            time.sleep(3)
            self.__typeWord('steffispam')'''
            
            
            #self.__fillInput("session[username_or_email]", "*****@*****.**")
            #time.sleep(2)
            #self.__fillInput('signin-pass', "steffispam")
            #self.__fillInput('signin-pass', "session[password]")
            #time.sleep(2)
            #subprocess.call(["xte", "key Return"])
            #time.sleep(5)
            
            # this works, 13.5.13
            time.sleep(random.randrange(2,15))
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(random.randrange(2,15))
            self.__typeWord(constants.TWITTER_USER)
            subprocess.call(["xte", "key Tab"])
            time.sleep(2)
            self.__typeWord(constants.TWITTER_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(random.randrange(2,15))
            ''' self.__followLink("Kleine Zeitung")
           # time.sleep(5)
           # self.back()
           # self.__followLink("ORF Sport")
           # time.sleep(5)
           # self.back()'''
        
        self.__followLink("Startseite")
        time.sleep(3)
        print "input twitter"
        field = self.__browser.find_by_id("tweet-box-mini-home-profile").first
        field.click()
        print "geklickt"
        self.__typeWord(twittertext[random.randrange(0,len(twittertext)-1)])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Tab"])
        time.sleep(2)   
        subprocess.call(["xte", "key Return"])
        print "tweet gepostet"
        
            
            
    def __typeWord(self, word):
        spell = ""
        for i in range(0, len(word)):
            #special character
            if spell == "/":
                spell = "/"+word[i]
            else:    
                spell = word[i]
                
            # TODO: algorithm that decides whether the special or the normal character comes first
            if spell == "@":
                subprocess.call(["xte", "keydown Control_L"])
                subprocess.call(["xte", "key at"])
                subprocess.call(["xte", "keyup Control_L"])
            # special character
            elif spell not in string.ascii_letters:
                spell = keySyms[spell]
                # special character requiring Shift
                if spell in upKeys:
                    subprocess.call(["xte", "keydown Shift_L"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Shift_L"])
                # special character requiring AltGr
                elif spell in altGrKeys:
                    subprocess.call(["xte", "keydown Alt_R"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Alt_R"])
                else:     
                    subprocess.call(["xte", "key "+spell])
            elif spell == "ß":
                spell = "question"
                subprocess.call(["xte", "key "+spell])
            else:    
                subprocess.call(["xte", "key "+spell])
Exemple #20
0
def scrape():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)


    # # In[9]:


    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)


    # # In[10]:


    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_partial_text('FULL IMAGE')


    # # In[11]:


    # #needs a pause or else code runs too fast
    time.sleep(2)
    browser.click_link_by_partial_text('more info')


    # # In[12]:


    html2 = browser.html
    soup2 = bs(html2, 'html.parser')
    image = soup2.find('img', class_='main_image')


    url = image.get('src')

    featured_image_url = 'https://www.jpl.nasa.gov' + url
    # #print(featured_image_url)
    time.sleep(2)
    browser.quit()


    # # In[13]:


    # #Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page. Save the tweet text for the weather report as a variable called mars_weather.
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    # #print(soup.prettify())


    # # In[14]:


    results = soup.find_all('div', class_='js-tweet-text-container')
    # #print(results)


    # # In[15]:


    mars_tweet= results[0].text
    # #print(mars_tweet)


    # # In[16]:


    # #Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    # #Use Pandas to convert the data to a HTML table string.
    mars_facts_url = 'https://space-facts.com/mars/'


    # # In[17]:


    tables = pd.read_html(mars_facts_url)
    tables


    # # In[18]:


    df = tables[0]
    df.head()


    # # In[19]:


    df.set_index(0, inplace=True)
    clean_df = df
    clean_df


    # # In[20]:


    html_table = clean_df.to_html()
    html_table


    # # In[21]:


    html_table = html_table.replace('\n', '')


    # # In[22]:


    df.to_html('mars_table.html')


    # # In[23]:


    # #Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
    # #You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # #Save both the image url string for the full resolution hemipshere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.
    # #Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)


    # # In[24]:


    # #opening browser
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)


    # # In[25]:


    # #clicking into each hemisphere's Enhanced page
    # #the hyperlink text for each page is stored in a list so the loop can click a new link each time
    hemisphere_info = []
    hyperlinks = ['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced', 'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']

    for hyperlink in hyperlinks:
        browser.click_link_by_partial_text(hyperlink)
        html = browser.html
        soup = bs(html, 'html.parser')
        image = soup.find('img', class_='wide-image')
        url = image.get('src')
        image_url = 'https://astrogeology.usgs.gov' + url
        results = soup.find('h2', class_="title").text
        hemisphere_info.append({'title':results, 'img_url': image_url})
        time.sleep(1)
        browser.back()



    # # In[26]:


    # #print(hemisphere_info)


    # # In[ ]:


    browser.quit()
    mars_info = {
        "image_URL": featured_image_url,
        "Mars_weather": mars_tweet,
        "Mars_table": mars_table(),
       # 'mars_facts': 'foo bar baz', 
        "Hemisphere_info": hemisphere_info
    }
    return mars_info
def scrape():
    # ------------------------------------------
    # 1. Scraping the headline and sub-headline
    # ------------------------------------------
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    driver.get(url)

    # Give the JS time to render
    time.sleep(1)

    # Scrape the page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Create a dictionary
    marsData = {}

    # Finding all the information that we want regarding the title and news.
    news_title = soup.find(class_='content_title').text
    news_p = soup.find(class_='article_teaser_body').text

    # Creating dictionary
    marsData['news_title'] = news_title
    marsData['news_p'] = news_p

    driver.close()

    # ------------------------------------------
    # 2. Scraping the photo
    # ------------------------------------------
    # Scraping the photo
    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    driver.get('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    element = driver.find_element_by_id("full_image")
    element.click()

    # Give the JS time to render
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    images = soup.find_all(class_="fancybox-image")

    for x in images:
        f_image = x['src']

    featured_image_url = f"https://www.jpl.nasa.gov/{f_image}"

    # Creating dictionary
    marsData['featured_image_url'] = featured_image_url

    # Close the browser
    driver.close()

    # ------------------------------------------
    # 3. Scraping the weather from Twitter
    # ------------------------------------------
    # Scraping the weather
    response = requests.get('https://twitter.com/marswxreport?lang=en')
    bs = BeautifulSoup(response.text, 'html.parser')
    weather = bs.find(class_='TweetTextSize').text

    # Using a regex to get rid of the picture link
    mars_weather = re.sub(r'pic.twitter.com/\w+', "", weather)

    # Creating dictionary
    marsData['mars_weather'] = mars_weather

    # ------------------------------------------
    # 4. Scraping tables with pandas
    # ------------------------------------------
    # Scraping the table with pandas
    marsFactsUrl = "https://space-facts.com/mars/"
    marsFactsTable = pd.read_html(marsFactsUrl)

    # Picking the first table and set index
    MarsFactRename = marsFactsTable[0]
    marsFact = MarsFactRename.rename(columns={0: "Descriptions", 1: "Values"})

    # turning the table into html and get rid of \n
    marsFact = marsFact.to_html()
    marsFact = marsFact.replace('\n', '')

    # Create dictionary
    marsData['marsFact'] = marsFact

    # ------------------------------------------
    # 5. Scraping 4 images of the hemispheres
    # ------------------------------------------
    # Use Splinter
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    browser.visit(url_hemisphere)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_hemis = []

    for i in range(4):
        time.sleep(1)

        #Find all the h3 tags and store them in img
        img = browser.find_by_tag('h3')

        # For each img, click on it
        img[i].click()

        # Re-parse the page with BeautifulSoup after the click
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Finding the image link
        partialLink = soup.find('img', class_="wide-image")['src']

        # Finding the title within the page
        imageTitle = soup.find('h2', class_='title').text

        # Concatenate the link and image link to create the link for the pics
        imageUrl = 'https://astrogeology.usgs.gov' + partialLink

        # Create a dictionary
        dic = {'title': imageTitle, 'img_url': imageUrl}

        # Appending the dictionary to mars_hemis
        mars_hemis.append(dic)

        # After finding the information, go back and find the next set of information
        browser.back()

    # Create a dictionary
    marsData['mars_hemis'] = mars_hemis

    return marsData
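The dictionary returned by scrape() above can be consumed directly; for example, marsData['mars_hemis'] is a list of {'title', 'img_url'} dictionaries and marsData['marsFact'] is a ready-to-embed HTML string. A minimal usage sketch:

# Minimal usage sketch for the scrape() defined above
data = scrape()
print(data['news_title'])
for hemi in data['mars_hemis']:
    print(hemi['title'], hemi['img_url'])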
Exemple #22
0
def scrape():
    browser = init_browser()

    ##### __NASA Mars News__ #####
    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the latest News Title assign the text to a variable that can be referenced later.
    news_title = soup.find_all('div', class_='content_title')[0].text
    # Collect the latest paragragph and assign the text to a variable that can be referenced later.
    news_p = soup.find_all('div', class_='rollover_description_inner')[0].text
    # Close the browser after scraping
    browser.quit()

    #### __JPL Mars Space Images - Featured Image__ ####
    # Setup Splinter (the redundant init_browser() call leaked an extra browser)
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    # Set up browser to connect to url and scrape
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)
    # Click on FULL IMAGE button
    browser.links.find_by_partial_text('FULL IMAGE').click()
    # Delay code to allow the full image to load before scraping
    time.sleep(1)
    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Scrape page to find the featured Mars image
    mars_image = soup.find('img', class_='fancybox-image')
    url = mars_image['src']
    featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + url
    # Close the browser after scraping
    browser.quit()

    ##### __Mars Facts__ #####
    browser = init_browser()
    # Use Pandas to scrape the table and convert the data to a HTML table string
    url = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url)
    mars_data_df = mars_table[0]
    mars_html_table = mars_data_df.to_html(classes='table table-striped '
                                           'table-bordered',
                                           index=False,
                                           header=False,
                                           border=1)
    # #Close the browser after scraping
    browser.quit()

    ##### __Mars Hemispheres__ #####
    # Setup splinter (again, without the redundant init_browser() call)
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    # Set up browser to connect to url to scrape
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Setup empty list
    hemisphere_image_urls = []
    # Get list of hemispheres
    for i in range(4):
        hemisphere = {}

        time.sleep(1)

        # Click on each hemispher enhanced link
        browser.find_by_css("a.product-item h3")[i].click()

        # Scrape page to find Hemisphere title
        hemisphere["title"] = browser.find_by_css("h2.title").text

        # Locate sample jpg image & scrape url
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        # download = soup.find('div', class_ = 'downloads')
        # image_url = download.ul.li.a["href"]
        # hemisphere["image_url"] = image_url

        # Add data to hemisphere dictionary
        hemisphere_image_urls.append(hemisphere)

        # Navigate back to Products page to continue through range
        browser.back()

    # Close the browser after scraping
    browser.quit()

    # Python dictionary containing all of the scraped data.
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_html_table": mars_html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    # Close remaining browser
    browser.quit()
    # Return results
    return mars_data
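The example above calls an init_browser() helper that is not included in this excerpt; based on the Browser(...) setup used throughout these examples, it presumably looks something like the following sketch (an assumption, not the original helper):

# Assumed shape of the init_browser() helper used in this and later examples
def init_browser():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)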
def scrape():

    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(news_url)
    #soup = BeautifulSoup(html, 'html.parser')
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all elements that contain book information
    article_title = soup.find(class_='content_title')
    article_text = soup.find(class_='article_teaser_body')

    print(article_title)
    print(article_text)

    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    image = soup.find('img', class_='thumb').get('src')
    image = 'https://www.jpl.nasa.gov' + image

    print(image)

    facts_url = 'https://space-facts.com/mars/'
    browser.visit(facts_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    tables = pd.read_html(facts_url)[0]
    tables.columns = ['Desc', 'Mars']
    tables = tables.set_index('Desc').to_html()
    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    h3_loop = soup.find_all('h3')

    h3_list = []
    for x in h3_loop:
        h3_list.append(x.text)

    print(h3_list)
    hemisphere_image_urls = []

    for x in h3_list:

        mars_dict = {}
        browser.click_link_by_partial_text(x)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        mars_title = soup.find('h2', class_="title")
        sample_1 = soup.find('img', class_="wide-image").get('src')
        sample_1 = 'https://astrogeology.usgs.gov' + sample_1
        print(sample_1['src'])
        mars_dict['title'] = mars_title.text
        mars_dict['image_url'] = sample_1['src']

        hemisphere_image_urls.append(mars_dict)
        browser.back()

    # build the results dict and quit the browser after the loop, not inside it
    scraped_data = {
        "title": article_title.text,
        "paragraph": article_text.text,
        "image": image,
        "mars_tables": tables,
        "hemispheres": hemisphere_image_urls
    }

    browser.quit()
    return scraped_data
Exemple #24
0
def scrape():
    mars_dict = {}

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict['News'] = {'Title': news_title, 'Description': news_p}

    #JPL Mars Images
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_image = soup.find('img', class_='main_image')['src']
    feat_image_url = 'https://www.jpl.nasa.gov' + mars_image
    mars_dict['Featured Image'] = feat_image_url

    #Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_weather = soup.find_all('div', class_='content')
    indicators = ['Sol', 'InSight']
    for tweet in mars_weather:
        twit_user = tweet.find('a', class_='account-group')['data-user-id']
        if twit_user == '786939553':
            weather_text = tweet.find('p', class_='tweet-text').text
            #if weather_text.split()[0] == 'Sol':
            if weather_text.split()[0] in indicators:
                break
        continue
    mars_dict['Weather'] = weather_text
    print(weather_text)

    #Mars Data
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    # df.columns = ['Parameter', 'Value(s)']
    # df.set_index('Parameter',inplace=True)
    web_table = df.to_html(classes='table', index=False)
    mars_dict['Facts'] = web_table
    #print(web_table)

    #Mars Hemispheres
    #First url stopped working, page was changed or deleted, or is down
    #url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # hemispheres = soup.find_all('div',class_='item')
    #hemis_array = []
    #url_front = 'https://astrogeology.usgs.gov'

    hemispheres = soup.find_all('a', class_='item')
    hemis_array = []
    url_front = 'https://astrogeology.usgs.gov'
    skip = [0, 2, 4, 6]
    iter_num = 0
    for item in hemispheres:
        if iter_num in skip:
            iter_num += 1
            continue
        else:
            iter_num += 1
            item_dict = {}
            text_header = item.find('h3').text
            item_dict['Title'] = text_header

            #link = item.find('a',class_='itemLink')['href']
            link = item['href']
            full_url = url_front + link
            browser.visit(full_url)

            html = browser.html
            soup = bs(html, 'html.parser')

            big_link = soup.find('img', class_='wide-image')['src']
            item_dict['img_url'] = url_front + big_link

            hemis_array.append(item_dict)

            browser.back()
    mars_dict['Hemispheres'] = hemis_array
    #print(hemis_array)

    #<img class="wide-image" src="/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg">

    # #click functions for elements wouldn't work, apparently a chrome driver issue, so I constructed a full link and used browser.visit
    # for item in hemispheres:

    #     item_dict = {}
    #     text_header = item.find('h3').text
    #     item_dict['Title'] = text_header

    #     link = item.find('a',class_='itemLink')['href']
    #     full_url = url_front + link
    #     browser.visit(full_url)

    #     html = browser.html
    #     soup = bs(html, 'html.parser')

    #     big_link = soup.find('img',class_='wide-image')['src']
    #     item_dict['img_url'] = url_front + big_link

    #     hemis_array.append(item_dict)

    #     browser.back()

    # mars_dict['Hemispheres'] = hemis_array

    return mars_dict
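In the hemisphere loop above, skip = [0, 2, 4, 6] drops the even-indexed anchors and processes the odd-indexed ones. An equivalent sketch using a step slice, under the same assumptions about the page structure:

# Equivalent to the skip-list loop above: keep only the odd-indexed anchors
for item in hemispheres[1::2]:
    item_dict = {'Title': item.find('h3').text}
    browser.visit(url_front + item['href'])
    soup_page = bs(browser.html, 'html.parser')
    item_dict['img_url'] = url_front + soup_page.find('img', class_='wide-image')['src']
    hemis_array.append(item_dict)
    browser.back()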
Exemple #25
0
def scrape():
    mars_dict = {}

    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    #scrape the NASA Mars News site, collect the news title and paragraph text,
    #and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    soup = bs(html, 'html.parser')

    #scrape the title and accompanying paragraph
    ListTitle = soup.find("ul", class_="item_list")
    title = ListTitle.find('div', class_="content_title").get_text()
    paragraph = ListTitle.find("div", class_="article_teaser_body").get_text()

    mars_dict["title"] = title
    mars_dict["paragraph"] = paragraph

    # JPL Mars Space Images - Featured Image
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_image)

    #Getting the base url
    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image))
    time.sleep(10)

    #Design an xpath selector to grab the image
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"

    #Use splinter to click on the mars featured image
    #to bring the full resolution image
    results = browser.find_by_xpath(xpath)
    img = results[0]
    img.click()

    #get image url using BeautifulSoup
    time.sleep(5)
    html_image = browser.html
    soup = bs(html_image, "html.parser")
    img_url = soup.find("img", class_="fancybox-image")["src"]
    full_img_url = base_url + img_url

    mars_dict["full_img_url"] = full_img_url

    # Mars Weather

    import GetOldTweets3 as got
    tweetCriteria = got.manager.TweetCriteria().setUsername(
        "MarsWxReport").setMaxTweets(5)
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)[3]

    mars_dict["tweet"] = tweet.text

    # Mars Facts

    facts_url = 'https://space-facts.com/mars/'
    table = pd.read_html(facts_url)
    table[0]

    df_mars_facts = table[0]
    df_mars_facts.columns = ["Parameter", "Values"]
    df_mars_facts = df_mars_facts.set_index("Parameter")

    mars_html_table = df_mars_facts.to_html()
    mars_html_table = mars_html_table.replace("\n", "")
    mars_html_table

    mars_dict["mars_html_table"] = mars_html_table

    # Mars Hemispheres

    hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere)
    time.sleep(15)

    #Getting the base url
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(hemisphere))
    time.sleep(15)

    cerberus_image = browser.find_by_tag('h3')[0].text
    schiaparelli_image = browser.find_by_tag('h3')[1].text
    syrtis_image = browser.find_by_tag('h3')[2].text
    marineris_image = browser.find_by_tag('h3')[3].text

    browser.find_by_css('.thumb')[0].click()
    first_img = browser.find_by_text('Sample')['href']
    browser.back()

    browser.find_by_css('.thumb')[1].click()
    second_img = browser.find_by_text('Sample')['href']
    browser.back()

    browser.find_by_css('.thumb')[2].click()
    third_img = browser.find_by_text('Sample')['href']
    browser.back()

    browser.find_by_css('.thumb')[3].click()
    fourth_img = browser.find_by_text('Sample')['href']

    mars_hemispheres_images = [{
        'title': cerberus_image,
        'img_url': first_img
    }, {
        'title': schiaparelli_image,
        'img_url': second_img
    }, {
        'title': syrtis_image,
        'img_url': third_img
    }, {
        'title': marineris_image,
        'img_url': fourth_img
    }]
    time.sleep(10)

    mars_dict["mars_hemispheres_images"] = mars_hemispheres_images

    return mars_dict
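The four .thumb blocks above differ only by index; under the same page assumptions they collapse into a loop (a sketch; note it also navigates back after the fourth image, which the original skips):

# Loop form of the four .thumb click blocks above
titles = [h.text for h in browser.find_by_tag('h3')[:4]]
links = []
for i in range(4):
    browser.find_by_css('.thumb')[i].click()
    links.append(browser.find_by_text('Sample')['href'])
    browser.back()
mars_hemispheres_images = [{'title': t, 'img_url': u} for t, u in zip(titles, links)]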
def scrape():
    browser = init_browser()
    mars_facts_data = {}

    # url of page to be scraped
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

    # Retrieve page with requests module
    response = requests.get(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')

    print(soup.prettify())

    # Look for news titles
    news_title = soup.find('div', class_="content_title").text
    news_title

    # find paragraph descriptions
    news_p = soup.find('div', class_='rollover_description_inner').text
    news_p

    # Use splinter to navigate the site and find the image url for the current Featured Mars
    # Image and assign the url string to a variable called featured_image_url.

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())
    image_url = soup.find('img', class_="fancybox-image")["src"]
    featured_image_url = "https://www.jpl.nasa.gov" + image_url
    featured_image_url

    # Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts
    # about the planet including Diameter, Mass, etc.

    url = "https://space-facts.com/mars/"

    mars_table = pd.read_html(url)
    mars_table

    # Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    hemisphere_image_urls = []

    for x in range(4):
        images = browser.find_by_tag('h3')
        images[x].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        image_url_end = soup.find("img", class_="wide-image")["src"]
        title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + image_url_end
        image_dict = {"title": title, "img_url": img_url}
        hemisphere_image_urls.append(image_dict)
        browser.back()

    # Store the scraped results in the dictionary before returning (it was empty otherwise)
    mars_facts_data["news_title"] = news_title
    mars_facts_data["news_p"] = news_p
    mars_facts_data["featured_image_url"] = featured_image_url
    mars_facts_data["mars_table"] = mars_table
    mars_facts_data["hemisphere_image_urls"] = hemisphere_image_urls

    return mars_facts_data
Exemple #27
0
def scrape_info():
    executable_path = {
        'executable_path': '/Users/prashantkapadia/Desktop/chromedriver'
    }
    browser = Browser('chrome', **executable_path, headless=False)

    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    time.sleep(1)

    # Scrape page into Soup

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get the News title and paragraph
    news_title = soup.select_one(
        'ul.item_list li.slide div.content_title a').text
    news_p = soup.select_one(
        'ul.item_list li.slide div.article_teaser_body').text

    ### JPL Mars Space Images - Featured Image
    images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(images_url)
    time.sleep(1)
    full_image_bt = browser.find_by_id('full_image')
    full_image_bt.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_bt = browser.links.find_by_partial_text('more info')
    more_info_bt.click()
    img_html = browser.html
    img_soup = BeautifulSoup(img_html, 'html.parser')
    image_path = img_soup.select_one('figure.lede a img').get('src')
    featured_image_url = f'https://www.jpl.nasa.gov{image_path}'

    # Mars Weather from Twitter
    twitter_url = ('https://twitter.com/marswxreport?lang=en')
    browser.visit(twitter_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    time.sleep(1)
    tweets = soup.find("span", text=re.compile("InSight sol"))

    # Pulling only text part and assigning to current_weather variable.
    time.sleep(3)
    current_weather = tweets.text

    # Mars Hemispheres: scraping image titles and image URLs.

    hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemispheres_url)
    hemisphere_image_urls = []
    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")
    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}
        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()
        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.links.find_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']
        # Get Hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text
        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)
        # Finally, we navigate backwards
        browser.back()

    # Store data in a dictionary
    mars_data = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image': featured_image_url,
        'current_weather': current_weather,
        'hemisphere_image_urls': hemisphere_image_urls
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_data
Exemple #28
0
def mars_hemi():
    # scraping the hemisphere urls and title
    # Windows users
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. Use browser to visit the hemisphere URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    # b. Cerberus
    browser.click_link_by_partial_text('Cerberus')
    cerberus_html = browser.html
    cerberus_soup = soup(cerberus_html, 'html.parser')

    # find title
    cerberus_title = cerberus_soup.find("h2", class_='title').text

    # Find the relative image url
    cerberus = cerberus_soup.find('img', class_='wide-image')
    cerberus_img = cerberus['src']

    # add base url to rel url
    hemi_url = 'https://astrogeology.usgs.gov'
    cerberus_url = hemi_url + cerberus_img

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    # c. Schiaparelli
    browser.back()
    browser.click_link_by_partial_text('Schiaparelli')
    schiaparelli_html = browser.html
    schiaparelli_soup = soup(schiaparelli_html, 'html.parser')

    # find title
    schiaparelli_title = schiaparelli_soup.find("h2", class_='title').text

    # find the relative image url
    schiaparelli = schiaparelli_soup.find('img', class_='wide-image')
    schiaparelli_img = schiaparelli['src']

    # add base url to rel url
    hemi_url = 'https://astrogeology.usgs.gov'
    schiaparelli_url = hemi_url + schiaparelli_img

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    # d. Syrtis Major
    browser.back()
    browser.click_link_by_partial_text('Syrtis')
    syrtis_html = browser.html
    syrtis_soup = soup(syrtis_html, 'html.parser')

    # find title
    syrtis_title = syrtis_soup.find("h2", class_='title').text

    # find the relative image url
    syrtis = syrtis_soup.find('img', class_='wide-image')
    syrtis_img = syrtis['src']

    # add base url to rel url
    hemi_url = 'https://astrogeology.usgs.gov'
    syrtis_url = hemi_url + syrtis_img

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    # e. Valles Marineris
    browser.back()
    browser.click_link_by_partial_text('Valles')
    valles_html = browser.html
    valles_soup = soup(valles_html, 'html.parser')

    # find title
    valles_title = valles_soup.find("h2", class_='title').text

    # find the relative image url
    valles = valles_soup.find('img', class_='wide-image')
    valles_img = valles['src']

    # add base url to rel url
    hemi_url = 'https://astrogeology.usgs.gov'
    valles_url = hemi_url + valles_img

    return [{
        'img_url': cerberus_url,
        'title': cerberus_title
    }, {
        'img_url': schiaparelli_url,
        'title': schiaparelli_title
    }, {
        'img_url': syrtis_url,
        'title': syrtis_title
    }, {
        'img_url': valles_url,
        'title': valles_title
    }]
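The four hemisphere blocks in mars_hemi() are identical apart from the partial link text; under the same assumptions about the Splinter API and page layout they could be collapsed into a loop, e.g.:

# Loop sketch equivalent to the four blocks above (same assumptions)
def mars_hemi_loop():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    hemi_url = 'https://astrogeology.usgs.gov'
    results = []
    for name in ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']:
        browser.click_link_by_partial_text(name)
        page = soup(browser.html, 'html.parser')
        results.append({'img_url': hemi_url + page.find('img', class_='wide-image')['src'],
                        'title': page.find('h2', class_='title').text})
        browser.back()
    return results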
def scrape_info():

    browser = Browser('chrome')
    mars = {}
    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news'
    browser.visit(url)
    time.sleep(5)

    # Retrieve page with the requests module
    #response = requests.get(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Examine the results, then determine element that contains sought info
    # print(soup.prettify())

    # # NASA Mars News

    results = soup.find_all('div', class_="slide")
    title = []
    description = []
    for result in results:

        try:
            title.append(result.find('div', class_="content_title").a.text)

            description.append(
                result.find('div', class_="rollover_description_inner").text)

            print("title and descriptions are :")
            print("-----------------------------")
            if (title and description):

                print(title)
                print(description)

        except AttributeError as e:
            print(e)

    news_title = title[0]

    news_p = description[0]

    mars["news_title"] = news_title
    mars["news_paragraph"] = news_p
    print(mars["news_title"], " ", mars["news_paragraph"])

    # # JPL Mars Space Images - Featured Image

    # jpl_fullsize_url = 'https://photojournal.jpl.nasa.gov/jpeg/'
    # jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # browser.visit(jpl_url)
    # time.sleep(5)
    # jpl_html = browser.html
    # jpl_soup = BeautifulSoup(jpl_html, 'html.parser')
    # time.sleep(5)
    # featured_image_list=[]
    # for image in jpl_soup.find_all('div',class_="img"):
    #     featured_image_list.append(image.find('img').get('src'))

    # feature_image = featured_image_list[0]
    # temp_list_1 = feature_image.split('-')
    # temp_list_2 = temp_list_1[0].split('/')
    # featured_image_url = jpl_fullsize_url + temp_list_2[-1] + '.jpg'

    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    #response = requests.get(url)
    browser = Browser("chrome")
    browser.visit(url)
    time.sleep(5)
    click_image = browser.find_by_id("full_image")
    click_image.click()
    time.sleep(5)
    print(click_image)
    links_found1 = browser.find_link_by_partial_text('more info')
    print(links_found1)
    links_found1.click()
    time.sleep(5)

    soup = BeautifulSoup(browser.html, 'html.parser')
    result = soup.find('figure', class_="lede")
    featured_image_url = "https://www.jpl.nasa.gov" + result.a.img["src"]
    featured_image_url
    mars["featured_image"] = featured_image_url
    mars["featured_image"]

    # Mars Weather

    twitterurl = "https://twitter.com/marswxreport?lang=en"

    browser.visit(twitterurl)
    response = requests.get(twitterurl)

    soup2 = BeautifulSoup(browser.html, 'html.parser')

    results = soup2.find_all('div', class_="js-tweet-text-container")
    results

    for result in results:
        mars_weather = result.find(
            'p',
            class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
        ).text
    print(mars_weather)
    print(
        "<---------------------------------------------------------------------------------->"
    )

    mars["weather"] = mars_weather

    # Mars Facts

    url = "http://space-facts.com/mars/"

    tables = pd.read_html(url)
    tables[0]

    df = tables[0]
    df

    df.columns = ['Attributes', 'Values']
    df
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')
    mars['facts'] = html_table

    df.to_html('table.html')

    # # Mars Hemispheres
    url_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemi)
    time.sleep(5)
    usgs_soup = BeautifulSoup(browser.html, 'html.parser')
    headers = []
    titles = usgs_soup.find_all('h3')
    time.sleep(5)

    for title in titles:
        headers.append(title.text)

    images = []
    count = 0
    for thumb in headers:
        browser.find_by_css('img.thumb')[count].click()
        images.append(browser.find_by_text('Sample')['href'])
        browser.back()
        count = count + 1

    hemisphere_image_urls = []  #initialize empty list to collect titles
    counter = 0
    for item in images:
        hemisphere_image_urls.append({
            "title": headers[counter],
            "img_url": images[counter]
        })
        counter = counter + 1
    # closeBrowser(browser)
    browser.back()
    time.sleep(1)
    mars["hemisphere"] = hemisphere_image_urls
    print(hemisphere_image_urls)

    return mars
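Since headers and images are filled in the same order, the counter-based assembly at the end of scrape_info() can be written more directly with zip (an equivalent sketch):

# Equivalent to the counter loop above
hemisphere_image_urls = [{"title": t, "img_url": u} for t, u in zip(headers, images)]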
def scrape():

    # Get the driver and set the executable path
    executable_path = {
        "executable_path": "/Users/shiva/downloads/chromedriver"
    }
    browser = Browser("chrome", **executable_path, headless=False)

    # In[7]:

    mars_data = {}
    # visit mars url - mission starts
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    # In[4]:

    ### NASA Mars News
    print('### NASA Mars News')

    # In[10]:

    # collect the latest News Title and Paragraph Text
    ## Example:
    # news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"
    html = browser.html
    soup = bs(html, 'html.parser')

    latest_news = soup.find("div", class_="list_text")
    news_p = latest_news.find("div", class_="article_teaser_body").text
    news_title = latest_news.find("div", class_="content_title").text
    news_date = latest_news.find("div", class_="list_date").text
    print(news_date)
    print(news_title)
    print(news_p)

    # Add the news date, title and summary to the dictionary
    mars_data["news_date"] = news_date
    mars_data["news_title"] = news_title
    mars_data["summary"] = news_p

    # In[13]:

    print("### JPL Mars Space Images - Featured Image")

    # In[11]:

    # visit the image url
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    html = browser.html

    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    # Example: featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    # right click on the image and inspect it to get the tag and class of the thumbnail
    soup = bs(html, 'html.parser')

    # get the src; assuming img_src is a root-relative path, prepend the site root
    img_src = soup.find("img", class_="thumb")["src"]
    print(img_src)
    featured_image_url = "https://www.jpl.nasa.gov" + img_src
    print("***************************")
    print("featured_image_url " + featured_image_url)
    mars_data["featured_image_url"] = featured_image_url

    print('### Mars Weather')

    # Visit the Mars Weather twitter account
    # https://twitter.com/marswxreport?lang=en
    twit_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(twit_url)
    html = browser.html

    soup = bs(html, 'html.parser')
    # scrape the latest Mars weather tweet from the page. Save the tweet text
    # Example: mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa

    weather_div = soup.find("div", class_="js-tweet-text-container")
    print(weather_div.p.text)

    # assign it to variable
    mars_weather = weather_div.p.text
    mars_data["mars_weather"] = mars_weather

    print("### Mars Facts")

    # Visit the Mars Facts webpage
    fact_url = "https://space-facts.com/mars/"
    browser.visit(fact_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # get the facts
    fact_header = soup.find("div", class_="widget-header")
    print(fact_header.h3.text)

    fact_data = soup.find("table", class_="tablepress tablepress-id-p-mars")

    # find all rows
    rows = fact_data.find_all('tr')
    fact = []
    for row in rows:
        print(row.text)
        fact.append(row.text)

    mars_data["mars_table"] = fact

    print("### Mars Hemispheres")

    # Visit the USGS Astrogeology site
    #[here](https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars)
    astro_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(astro_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # You will need to click each of the links to the hemispheres
    # in order to find the image url to the full resolution image

    mars_hspr = []

    for i in range(len(browser.find_by_tag('h3'))):
        # re-find the h3 links on each pass; they go stale after browser.back()
        astro_link = browser.find_by_tag('h3')

        time.sleep(3)
        astro_link[i].click()
        html = browser.html

        soup = bs(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text

        img_url = 'https://astrogeology.usgs.gov' + partial
        dictionary = {"title": img_title, "img_url": img_url}

        mars_hspr.append(dictionary)
        browser.back()

    mars_data['mars_hemis'] = mars_hspr
    return mars_data
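
# Usage sketch (an addition): scrape() functions in this style are usually
# exposed through a Flask route. The module name "scrape_mars" is hypothetical.
from flask import Flask, jsonify
import scrape_mars  # hypothetical module holding a scrape() like the one above

app = Flask(__name__)

@app.route("/scrape")
def run_scrape():
    return jsonify(scrape_mars.scrape())
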
Exemple #31
0
def scrape_info():

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://redplanetscience.com/'

    browser.visit(url)

    html = browser.html
    firstsoup = BeautifulSoup(html, 'html.parser')

    ## Step 1 - Scraping
    ### NASA Mars News

    # the first list_text div holds the latest headline and its teaser
    results = firstsoup.find_all('div', class_='list_text')[0]
    latest_News_title = results.find('div', class_='content_title').text
    Paragraphtext = results.find('div', class_='article_teaser_body').text

    ### JPL Mars Space Images - Featured Image

    url = 'https://spaceimages-mars.com'
    browser.visit(url)

    browser.links.find_by_partial_text('FULL IMAGE').click()
    # note: the featured image URL is hard-coded here instead of being scraped from the lightbox
    featured_image_url = 'https://spaceimages-mars.com/image/featured/mars2.jpg'

    ### Mars Facts

    url = 'https://galaxyfacts-mars.com'
    table = pd.read_html(url)[0]
    print(table)
    # convert this table to html
    tables = table.to_html()

    ### Mars Hemispheres

    url = 'https://marshemispheres.com/'

    browser.visit(url)
    links = browser.find_by_css('a.product-item img')

    hemisphere_img_url = []

    for i in range(len(links)):
        browser.find_by_css('a.product-item img')[i].click()
        # we are on the page finding the picture
        sample_elem = browser.links.find_by_text('Sample').first
        title = browser.find_by_css('h2.title').text

        # we found the picture, now we save it into our list (append)
        img_url = sample_elem['href']
        print(f' Page {i} image url: {img_url}')
        hemisphere_img_url.append({"title": title, "img_url": img_url})
        #we are done with this page, lets go back for the next page.
        browser.back()
    browser.quit()

    scrape_data = {
        "news_title": latest_News_title,
        "news_paragraph": Paragraphtext,
        "featured_image_url": featured_image_url,
        "html_table": tables,
        "hemisphere_img_urls": hemisphere_img_url
    }
    return scrape_data
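
# Quick-test sketch (an addition): run the module directly to inspect the
# payload scrape_info() returns.
if __name__ == "__main__":
    from pprint import pprint
    pprint(scrape_info())
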
def scrape():
    executable_path = {'executable_path': 'C:chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    # Create a dictionary for all of the scraped data
    mars_data = {}
    # Retrieve page with the requests module

    # Create BeautifulSoup object; parse with 'html.parser'
    #Getting all article titles and article description from home page

    url2 = 'https://mars.nasa.gov/news/'
    response = requests.get(url2)
    soup = BeautifulSoup(response.text, features="lxml")

    titles = []
    title_results = soup.find_all('div', class_="content_title")
    for i in title_results:
        titles.append(i.text)

    paragraphs = []
    p_results = soup.find_all('div', class_="rollover_description_inner")
    for i in p_results:
        paragraphs.append(i.text)

    mars_data["news_titles"] = titles[0]
    mars_data["summarys"] = paragraphs[0]

    ##Mars Weather

    url3 = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url3)
    soup = BeautifulSoup(response.text, "html.parser")

    #create empty list for weather tweets
    weather_tweets = []
    #scrape html for tweets
    tweet_results = soup.find_all('div', class_="js-tweet-text-container")
    #find weather tweets only
    for i in tweet_results:
        if "sol" in i.text:
            weather_tweets.append(i.text)

    mars_data["Weather"] = weather_tweets[0]

    #Mars Facts

    url4 = 'https://space-facts.com/mars/'
    #use pandas to scrape url
    tables = pd.read_html(url4)
    mars_facts = pd.DataFrame(tables[0])
    mars_facts.columns = ['Mars - Earth Comparison', 'Mars', 'Data']
    mars_facts = mars_facts.set_index("Mars")
    mars_facts = mars_facts.to_html()
    mars_facts = mars_facts.replace('\n', ' ')
    mars_data["mars_facts"] = mars_facts

    #Scrape for featured Image
    url3 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url3)
    # Scrape the browser into soup and use soup to find the full resolution image of mars
    # Save the image url to a variable called `featured_image_url`
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    image = soup.find('img', class_="thumb")["src"]
    img_url = "https://jpl.nasa.gov" + image
    mars_data["featured_img"] = img_url

    #Mars Hemisphere
    #Create dictionaries with the image url string and the hemisphere title to a list.
    # Visit the USGS Astogeology site and scrape pictures of the hemispheres
    url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url5)

    # Use splinter to loop through the 4 images and load them into a dictionary
    import time
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    hemisphere_image_url = []

    # loop through the four tags and load the data to the dictionary

    for i in range(4):
        time.sleep(5)
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial
        dictionary = {"title": img_title, "img_url": img_url}
        hemisphere_image_url.append(dictionary)
        browser.back()

    mars_data['hemisphere_image'] = hemisphere_image_url
    return mars_data
def scrape():

    #Splinter Setup
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA Mars News
    #Retrieve webpage and create an object
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    #Scrape site for news title and paragraph text
    news_heading = soup.find_all('div', class_="content_title")[1].text
    news_snip = soup.find("div", class_="rollover_description_inner").text

    #Mars Facts
    url = 'https://space-facts.com/mars/'

    #Retrieve webpage and create an object
    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    #Convert the HTML into a df
    info_df = pd.read_html(url)
    mars_df = info_df[0]

    #Convert df to HTML table string
    htmltbl = mars_df.to_html()
    htmltbl = htmltbl.replace('\n', '')

    #Mars Hemispheres
    image_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    main_url = 'https://astrogeology.usgs.gov'

    #Splinter Setup
    browser.visit(image_url)

    #Create object and parse
    html = browser.html
    soup = bs(html, 'lxml')

    #Scrape the site for all mars info
    hemisphere = soup.find_all('div', class_="item")

    #Empty list full link
    all_info = []

    for i in hemisphere:
        #find title
        title = i.find('h3').text
        browser.click_link_by_partial_text(title)
        #drop the trailing " Enhanced" from the title
        title = title.replace(' Enhanced', '')

        html = browser.html
        soup = bs(html, 'lxml')

        img_url = soup.find("div",
                            class_="downloads").find("ul").find('a')['href']

        marsdict = {'title': title, 'img_url': img_url}
        all_info.append(marsdict)

        browser.back()

    browser.quit()

    #Create dict to scraped info
    output = {
        "newstitle": news_heading,
        "newspara": news_snip,
        "mfacts": htmltbl,
        "hemi": all_info
    }
    return output
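
# Side note (an addition, assumes Python 3.9+): str.removesuffix is a cleaner
# way to drop the trailing " Enhanced" than slicing or strip().
title = "Cerberus Hemisphere Enhanced".removesuffix(" Enhanced")
print(title)  # -> "Cerberus Hemisphere"
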
Exemple #34
0
def scrape():

        # # Mission to Mars

        from splinter import Browser
        from bs4 import BeautifulSoup


        executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
        browser = Browser('chrome', **executable_path, headless=False)

        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        #print(soup.prettify())

        articles = soup.find_all('li', class_='slide')
        mars_text = {}

        for article in articles:
                link = article.find('a')
                href = link['href']
                
                nasa_title = article.find('div', class_='content_title').text
                print(nasa_title)
                
                nasa_text = article.find('div', class_='article_teaser_body').text
                print(nasa_text)    
                mars_text[nasa_title] = nasa_text





        ### JPL Mars Space Images - Featured Image

        url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url)

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        #print(soup.prettify())
        mars_image = {}

        articles = soup.find_all('section', class_='centered_text clearfix main_feature primary_media_feature single')

        for article in articles:
                featured_image_title = article.find('h1', class_='media_feature_title').text
                print(featured_image_title)

                featured_image_url = article.find('a')['data-fancybox-href']
                featured_image_url = 'https://www.jpl.nasa.gov' + featured_image_url
                print(featured_image_url)

                mars_image[featured_image_title] = featured_image_url



        ### Mars Weather
        import json
        import tweepy 
        from pprint import pprint
        import sys
        sys.path.append('..')
        from config import consumer_key, consumer_secret, access_token, access_token_secret

        mars_temp = {}

        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

        mars_weather = []
        tweets = api.user_timeline(id='MarsWxReport', count=1)
        #pprint(tweets)

        for tweet in tweets:
                mars_weather = tweet['text']
                print(mars_weather)
        
        mars_temp["weather"] = mars_weather
        


        ### Mars Facts
        import pandas as pd
        mars_facts = {}

        url = 'https://space-facts.com/mars/'

        tables = pd.read_html(url)
        fact_table = tables[0]

        fact_table.columns = ["Fact", "Value"]
        html_table = fact_table.to_html()

        mars_facts["table"] = html_table



        ### Mars Hemispheres
        executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
        browser = Browser('chrome', **executable_path, headless=False)

        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        #print(soup.prettify())

        mars_urls = {}
        hemisphere_image_urls = []
        hemisphere_url_base = 'https://astrogeology.usgs.gov'

        images = soup.find_all('div', class_='item')

        for image in images:
                hemisphere_url = image.find('a')['href']
                browser.visit(hemisphere_url_base + hemisphere_url)
                title = browser.title

                img_url = browser.find_by_text('Sample')['href']
                browser.back()
                mars_urls[title] = img_url
        
        return mars_text, mars_image, mars_temp, mars_facts, mars_urls
Exemple #35
0
class DownPatent(object):
    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        self.browser = Browser("phantomjs", wait_time=10)
        #self.browser = Browser()

    # Download a patent
    def download(self, patentno):
        # Visit the page; the page load may time out
        # down_flag: 0 = not downloaded, 1 = does not exist, 2 = download failed, 3 = link found
        download_link = ""
        down_flag = 0
        if True:
            print "Opening page"
            self.browser.visit(self.down_url)
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):  # "查询" = Search
                # Fill in the patent number
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print "Filled in patent number"
                # Connection timed out / 404
                if self.browser:
                    print "Opening captcha page"
                    # Try the captcha at most 20 times
                    code_handler = CodeHandler()
                    # Captcha text filled in on each attempt
                    list_fill_text = []
                    # Saved captcha image paths
                    list_code_path = []
                    # Captcha segmentation flags
                    list_split_flag = []
                    # Captcha recognition flags
                    list_reg_flag = []
                    for code_num in xrange(20):
                        print code_num
                        # Look for the captcha element
                        if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                            print "Found captcha"
                            # Screenshot the page
                            #self.browser.driver.maximize_window()
                            self.browser.driver.save_screenshot("screenshot.png")
                            # Crop the captcha image out of the screenshot
                            image = Image.open("screenshot.png")
                            image_location = self.find_location(image)
                            image_code = image.crop((image_location[0], image_location[1], image_location[0]+52, image_location[1]+21))
                            save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                            save_path_temp = "../%s" % save_path
                            image_code.save(save_path_temp)
                            list_code_path.append(save_path)

                            # Split the captcha into single characters
                            list_split_image = self.deal_split(code_handler, image_code)

                            # If it splits into exactly 4 characters, recognize them;
                            # otherwise reload and fetch a fresh captcha
                            if len(list_split_image) == 4:
                                print "Split succeeded"
                                list_split_flag.append(1)
                                reg_plain_text = self.reg_code(list_split_image)
                                fill_text = "".join(reg_plain_text)
                                list_fill_text.append(fill_text)
                                # Fill in the recognized captcha
                                #hand_fill_text = raw_input("Enter fill text:")
                                self.browser.fill("ValidCode", fill_text)
                                self.browser.find_by_value("确定").first.click()  # "确定" = OK

                                # "验证码输入错误" = "captcha entered incorrectly"
                                print self.browser.html.encode("utf-8").find("验证码输入错误")
                                if self.browser.html.encode("utf-8").find("验证码输入错误") == -1:
                                    list_reg_flag.append(1)
                                    # "没有找到该专利" = "patent not found"
                                    if self.browser.html.encode("utf-8").find("没有找到该专利") == -1:
                                        # Links to the standard and fast versions of the spec image download
                                        down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                        down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                        if down_link_one or down_link_two:
                                            print "Found specification image download link"
                                            list_reg_flag.append(1)
                                            if down_link_one:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                            else:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")
                                            
                                            print "查找下载链接"
                                            #查找下载链接
                                            download_a = self.browser.find_link_by_text("下载专利")
                                            if download_a:
                                                download_link = download_a["href"]
                                            
                                                #找到下载链接
                                                down_flag = 3
                                                break
                                            else:
                                                print "下载失败"
                                                #下载失败
                                                down_flag = 2
                                                break
                                    else:
                                        print "Patent not found"
                                        down_flag = 1
                                        break
                                else:
                                    print "Recognition wrong, reloading"
                                    list_reg_flag.append(0)
                                    self.browser.back()
                                    self.browser.reload()
                            else:
                                print "Could not split captcha"
                                list_fill_text.append("")
                                list_split_flag.append(0)
                                list_reg_flag.append(0)
                                self.browser.reload()
                    
                    # Save each attempt to the onlinecode collection: patent number,
                    # captcha path, recognized text, recognition/split flags, timestamp

                    for code_path, fill_text, split_flag, reg_flag in zip(list_code_path, list_fill_text, list_split_flag, list_reg_flag):
                        try:
                            self.db.onlinecode.insert({"indexflag": patentno, "codepath": code_path, "filltext": fill_text, \
                                                       "splitflag": split_flag, "regflag": reg_flag, "time": time.ctime()})
                        except: pass
        return download_link

    # Segment the captcha image into characters
    def deal_split(self, code_handler, image):
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # Recognize the segmented characters with the neural network
    def reg_code(self, list_split_image):
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in xrange(each_split_image.size[1]):
                for y in xrange(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
        for each in out:
            # each output neuron approximates index/100, so scale by 100 and round
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # Locate the captcha image inside the screenshot (first non-black pixel, scanning column by column)
    def find_location(self, image):
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        
        flag = image_width
        location = [0, 0]
        for y in xrange(image_width):
            for x in xrange(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break

        return location
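
# Usage sketch (an addition): DownPatent expects a MongoDB handle and the URL
# of the patent-search page; both values below are placeholders, not taken
# from the original code.
from pymongo import MongoClient

db = MongoClient("mongodb://localhost:27017")["patents"]  # assumed local MongoDB
down_url = "http://example.com/patent-search"             # placeholder URL
downloader = DownPatent(db, down_url)
link = downloader.download("CN201310000000")              # hypothetical patent number
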
Exemple #36
0
def scrape():
    #set up connection
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)

    #visit nasa news site
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    html = browser.html
    nasasoup = BeautifulSoup(html,'html.parser')

    #find most recent news title and description
    result = nasasoup.find_all(class_="slide")
    news_title = result[0].find('h3').text
    news_p = result[0].find(class_='rollover_description_inner').text

    #visit jpl.nasa site
    nasa_url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_url2)
    html = browser.html
    nasasoup2 = BeautifulSoup(html, 'html.parser')

    #get imageurl for featured image
    featuredimageurl = 'https://www.jpl.nasa.gov' + nasasoup2.select('#full_image')[0]['data-fancybox-href']

    #visit twitter
    twitterfeed_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(twitterfeed_url)
    html = browser.html
    twittersoup = BeautifulSoup(html,'html.parser')

    #get most recent weather tweet
    mars_weather = twittersoup.find('p',class_="TweetTextSize").text

    #visit space-facts.com
    spacefacts_url = 'https://space-facts.com/mars/'
    browser.visit(spacefacts_url)
    html = browser.html
    spacefactsoup = BeautifulSoup(html,'html.parser')

    #read in table via pandas
    spacefacttabledf = pd.read_html(html)[0]

    #convert table back to html
    spacefacttable = spacefacttabledf.to_html(index=False)

    #visit usgs.gov
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)

    #grab hemisphere name and img_url for each of the four hemispheres
    imagelinks = []
    for x in range(4):
        links = browser.find_link_by_partial_text('Enhanced')
        browser.click_link_by_partial_text(links[x].text)
        html = browser.html
        imagesoup = BeautifulSoup(html,'html.parser')
        result = imagesoup.find('a',text='Sample')
        hemistring = imagesoup.find('h2').text
        imagelinks.append({'title':hemistring[:len(hemistring)-9],'img_url':result.attrs['href']})
        browser.back()

    output = {'news_title':news_title, 'news_p':news_p, 'featuredimageurl':featuredimageurl,
              'mars_weather':mars_weather,'spacefacttable':spacefacttable, 'imagelinks':imagelinks}

    return output
# This snippet assumes a live splinter session; one is created here so it runs standalone
from splinter import Browser

browser = Browser('chrome')
browser.visit('https://egov.uscis.gov/cris/Dashboard/CaseStatus.do')
receipt_search = '000'
total_num = 0
while True:
    receipt_input = browser.find_by_id('receipt')
    button = browser.find_by_id('dashboardForm').find_by_name('submit')
    receipt_pre = 'EAC1490146'
    receipt_input.first.fill(receipt_pre + receipt_search)
    button.first.click()
    status = browser.find_by_id('caseStatus').find_by_xpath('//div/div/h4')
    details = browser.find_by_id('caseStatus').find_by_xpath('//div/div/p')
    target = False
    index_end = 3
    date = ""
    for detail in details:
        if 'we received this I765 APPLICATION FOR EMPLOYMENT AUTHORIZATION' in detail.value:
            target = True
            index_end = detail.value.index('we received this I765 APPLICATION FOR EMPLOYMENT AUTHORIZATION')
            date = detail.value[3:index_end-2]
            break
    #time.sleep(60)
    if target and 'Initial Review' in status[0].value:
        print receipt_pre + receipt_search + "    " + date
        total_num = total_num + 1
    receipt_search = str(int(receipt_search) + 1).zfill(3)
    if int(receipt_search) >= 999:
        break
    browser.back()
print 'done'
print str(total_num)
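
# Refactor sketch (an addition): the manual zfill/increment bookkeeping above
# can be expressed as a single range() loop over the same receipt numbers.
for n in range(999):
    receipt = 'EAC1490146' + str(n).zfill(3)
    # ... fill the form, submit, and parse the status exactly as above ...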
def scrape():

    # ===========================================
    # declare dictionary for all results
    all_dict = {
        "mars_news_title": "",
        "mars_news_text": "",
        "featured_image_url": "",
        "mars_weather": "",
        "mars_facts": "",
        "hemisphere_list": ""
    }

    # ===========================================
    # Mars news url to be scraped
    mars_news_url = "https://mars.nasa.gov/news/"

    # fetch the page with requests
    response = requests.get(mars_news_url)

    # scrape raw text from page
    soup = bs(response.text, "html.parser")

    # print soup
    #print(soup.prettify())

    # get all the responses as an iterable list
    results = soup.find_all('div', class_="slide")

    # print the latest news
    #print(results[0].prettify)

    # get news title
    mars_news_title = results[0].find(
        "div", class_="content_title").find("a").text.strip()
    print(mars_news_title)

    # get news text
    mars_news_text = results[0].find(
        "div", class_="rollover_description_inner").text.strip()
    print(mars_news_text)

    all_dict["mars_news_title"] = mars_news_title
    all_dict["mars_news_text"] = mars_news_text

    # ===========================================
    # open browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # visit the page for image
    mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(mars_image_url)

    # click through to the featured image
    browser.click_link_by_partial_text("FULL IMAGE")

    # Otherwise, this code cannot run in one flow; please blame Splinter
    time.sleep(1)

    # get the full-size image url from the lightbox
    soup = bs(browser.html, "html.parser")
    featured_image = soup.find("img", {"class": "fancybox-image"})
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image["src"]
    print(featured_image_url)

    browser.quit()

    all_dict["featured_image_url"] = featured_image_url

    # ===========================================
    # Mars weather url to be scraped
    mars_weather_url = "https://twitter.com/marswxreport?lang=en"

    # fetch the page with requests
    response = requests.get(mars_weather_url)

    # scrape raw text from page
    soup = bs(response.text, "html.parser")

    # print soup
    #print(soup.prettify())

    # get all the responses as an iterable list
    results = soup.find_all('div', class_="js-tweet-text-container")

    # print the latest weather tweet
    # print(results[0].prettify)

    # get tweet text
    for result in results:
        # get rid of the unwanted tail
        trash = result.find("a", class_="twitter-timeline-link")
        _ = trash.extract()
        # now get the "pure" output
        mars_weather = result.find("p", class_="js-tweet-text").text.strip()
        # if it's a valid weather tweet
        if "InSight" in mars_weather:
            print(mars_weather)
            break

    all_dict["mars_weather"] = mars_weather

    # ===========================================
    # Mars facts url to be scraped
    mars_facts_url = "https://space-facts.com/mars/"

    # read table into pandas
    tables = pd.read_html(mars_facts_url)
    table = tables[0]

    # change name of columns
    table.columns = ['Parameter', 'Value']
    #display(table)

    # convert table to html
    mars_facts = table.to_html()

    all_dict["mars_facts"] = mars_facts

    # ===========================================
    # open browser (if closed already)
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # visit the page for image
    mars_hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemis_url)

    # find the clickable thumbnail images
    buttons = browser.find_by_css('img[class="thumb"]')
    buttons_length = len(buttons)
    button = buttons[0]

    dict_list = []

    # loop over all the buttons
    for i in range(buttons_length):
        button.click()

        #extract elements with beautifulsoup
        soup = bs(browser.html, "html.parser")
        img_title = soup.find('h2', class_="title").text.strip()
        img_url = soup.find('a', target="_blank")['href']

        # append list of dictionaries
        this_dict = {"title": "", "img_url": ""}
        this_dict["title"] = img_title
        this_dict["img_url"] = img_url
        dict_list.append(this_dict)

        # go back one level and re-find the thumbnails (the old elements go stale)
        browser.back()
        buttons = browser.find_by_css('img[class="thumb"]')
        if i + 1 < buttons_length:
            button = buttons[i + 1]

    browser.quit()

    all_dict["hemisphere_list"] = dict_list
    print(all_dict)

    return all_dict