# Imports needed for this example to run standalone
from splinter import Browser
from bs4 import BeautifulSoup
import requests
import re
import time


def scrape():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    marsdata = {}

    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Page title (the page <title> text, not the article headline)
    news_title = soup.title.text

    # Print all paragraph texts; the loop leaves the last one bound
    paragraphs = soup.find_all('p')
    for paragraph in paragraphs:
        print(paragraph.text)
    news_p = paragraph.text  # last <p> on the page

    # add the news title and last paragraph to marsdata
    marsdata["news_title"] = news_title
    marsdata["news_p"] = news_p

    #Mars Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image = browser.find_by_id('full_image')
    full_image.click()

    browser.is_element_present_by_text('more info', wait_time=3)
    info = browser.find_link_by_partial_text('more info')
    info.click()

    html = browser.html

    img = BeautifulSoup(html, 'html.parser')

    marsdata["featured_image_url"] = img.select_one('figure.lede a img').get(
        'src')

    # Mars Weather (note: this searches the news-page soup for a string containing "Sol";
    # other examples below scrape https://twitter.com/marswxreport instead)
    mars_weather = soup.find(string=re.compile("Sol"))
    marsdata["mars_weather"] = mars_weather

    # Space Facts
    url = 'https://space-facts.com/mars/'
    browser.visit(url)
    # <td> cells alternate between a label and its value
    cells = browser.find_by_tag('td')
    head = []
    data = []
    for r, cell in enumerate(cells):
        if r % 2 == 0:
            head.append(cell.text)
        else:
            data.append(cell.text)
    mars_facts = list(zip(head, data))
    marsdata["Mars_facts"] = mars_facts

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    links = ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']
    hemisphere_image_urls = []
    for link in links:
        hemisphere_image_urls_dic = {}
        link_click = browser.find_link_by_partial_text(link)
        link_click.click()
        time.sleep(15)
        # browser.is_element_present_by_css("img.wide-image", wait_time=10)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        image_url = soup.find("img", class_="wide-image")["src"]
        title = soup.find("h2", class_="title").text
        if "https://astrogeology.usgs.gov:" not in image_url:
            image_url = "https://astrogeology.usgs.gov" + image_url
        hemisphere_image_urls_dic['title'] = title
        hemisphere_image_urls_dic['image_url'] = image_url
        hemisphere_image_urls.append(hemisphere_image_urls_dic)
        browser.back()

    marsdata["hemisphere_title_urls"] = hemisphere_image_urls
    browser.quit()
    return marsdata
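Several of the scrape() functions here build a dict "that we can insert into mongo" (see the comments in Example #11 below). A minimal sketch of that step, assuming a local MongoDB and the pymongo package:

import pymongo

# connect to a local MongoDB instance (assumed host, port, and collection names)
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client.mars_app
collection = db.mars

# replace the stored document with freshly scraped data
data = scrape()
collection.update_one({}, {"$set": data}, upsert=True)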
Example #2
# Import Splinter, BeautifulSoup, and Pandas
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
 
# Path to chromedriver (IPython shell command; works in a notebook)
!which chromedriver
 
# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path)
 
# Visit the mars nasa news site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
 
# Optional delay for loading the page
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
 
# Convert the browser html to a soup object and then quit the browser
html = browser.html
news_soup = soup(html, 'html.parser')
 
slide_elem = news_soup.select_one('ul.item_list li.slide')
 
slide_elem.find("div", class_='content_title')
 
# Use the parent element to find the news title div and save its text as `news_title`
news_title = slide_elem.find("div", class_='content_title').get_text()
news_title
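Example #2 stops at the headline; the matching teaser paragraph lives in the same slide element (the article_teaser_body class the other examples here use), so one more cell would be:

# Use the parent element to find the paragraph text
news_p = slide_elem.find("div", class_="article_teaser_body").get_text()
news_p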
 
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser("chrome", **executable_path, headless=False)
Example #4
def scrape(driver, driverpath):

    # set up spliter browser
    executable_path = {"executable_path": driverpath}

    ## Latest news
    # set up splinter browser
    with Browser(driver, **executable_path, headless=False) as browser:

        # visit url
        url = "https://mars.nasa.gov/news/"
        browser.visit(url)
        time.sleep(T)  # T is a module-level delay (seconds) defined outside this excerpt
        # pull html text
        html = browser.html
        # parse html
        soup = BeautifulSoup(html, "html.parser")
        # grab news title
        news_title = soup.find("div", {"class": "bottom_gradient"}).text
        # grab news content
        news_content = soup.find("div", {
            "class": "rollover_description_inner"
        }).text

        # with Browser("chrome", **executable_path, headless=False) as browser:

        # Latest featured images
        # featured image url
        url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(url)
        time.sleep(T)
        # navigate to link
        browser.click_link_by_id("full_image")
        # browser.click_link_by_partial_text("FULL IMAGE")
        time.sleep(T)
        browser.click_link_by_partial_text("more info")
        time.sleep(T)
        # pull/off-load html text
        html = browser.html
        # parse html
        soup = BeautifulSoup(html, "html.parser")

        # grab the image path
        image_path = soup.find('figure', class_='lede').a['href']
        # make the full path
        featured_image_url = "https://www.jpl.nasa.gov/" + image_path
        # # grab the image path
        # image_path = soup.find("div", {"class": "download_tiff"}).p.a["href"]
        # # make the full path
        # featured_image_url = "https://www.jpl.nasa.gov/" + image_path
        # with Browser("chrome", **executable_path, headless=False) as browser:

        ## Latest weather
        url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(url)
        # pull/off-load html text
        html = browser.html
        # parse html
        soup = BeautifulSoup(html, "html.parser")
        # grab latest tweet
        weather = soup.find(
            "p", {
                "class":
                "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
            }).text

        # with Browser("chrome", **executable_path, headless=False) as browser:

        ## Mars facts
        url = "https://space-facts.com/mars/"
        browser.visit(url)
        # pull/off-load html text
        html = browser.html
        # parse html
        soup = BeautifulSoup(html, "html.parser")
        #get the entire table
        facts_table = soup.find('table', {
            "class": "tablepress tablepress-id-mars"
        }).find_all("tr")

        facts_dict = dict(label=[], value=[])
        for tr in facts_table:
            elements = tr.find_all("td")
            facts_dict["label"].append(elements[0].text)
            facts_dict["value"].append(elements[1].text)

        facts_df = pd.DataFrame(facts_dict)
        facts_html = facts_df.to_html(header=False, index=False)
        # with Browser("chrome", **executable_path, headless=False) as browser:

        ## Mars hemispheres
        url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        browser.visit(url)
        time.sleep(T)

        # pull/off-load html text
        html = browser.html
        # parse html
        soup = BeautifulSoup(html, "html.parser")

        # get class holding hemisphere picture
        collapsible_results = soup.find("div",
                                        {"class": "collapsible results"})
        hemispheres = collapsible_results.find_all("div",
                                                   {"class": "description"})

        hemisphere_image_urls = []

        for item in hemispheres:
            # get title
            title = item.a.h3.text
            # get link to the hemisphere page
            url_item = "https://astrogeology.usgs.gov" + item.a['href']

            # visit the hemisphere page
            browser.visit(url_item)
            time.sleep(T)

            # off-load html text
            html_item = browser.html
            # parse html
            soup_item = BeautifulSoup(html_item, 'html.parser')
            image_url = soup_item.find('div', {
                "class": "downloads"
            }).find('li').a['href']

            hemisphere_image_urls.append(dict(title=title, url=image_url))
            # check on the retrieved link
            browser.visit(image_url)
            time.sleep(T)

    scrape_dict = dict(news_title=news_title,
                       news_content=news_content,
                       featured_image_url=featured_image_url,
                       weather=weather,
                       facts_html=facts_html,
                       hemisphere_image_urls=hemisphere_image_urls,
                       time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return scrape_dict
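Example #4 leans on module-level names the excerpt omits (T, time, datetime, pd, BeautifulSoup, Browser). A minimal preamble and call, under those assumptions:

import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

T = 2  # page-load delay in seconds (assumed value)

data = scrape("chrome", "/usr/local/bin/chromedriver")
print(data["news_title"], data["time"])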
def scrape():
    # requires: from splinter import Browser; from bs4 import BeautifulSoup as bs;
    #           import requests; import pandas as pd
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

# URL of page to be scraped
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

# HTML object
    html = browser.html
# Parse HTML with Beautiful Soup
    soup = bs(html,"html.parser")

# Retrieve elements
    news_title = soup.find("div",class_="content_title").text
    news_paragraph = soup.find("div", class_="article_teaser_body").text

    print(news_title)

    print(news_paragraph)

# URL of page to be scraped (reuse the browser that is already open)
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)

# HTML object
    html = browser.html
# Parse HTML with Beautiful Soup
    soup = bs(html, "html.parser")

# Retrieve elements: the image url is embedded in the article's inline style
    featured_image = soup.find("div", class_="carousel_items").find("article")["style"]
# use split function to pull the url out of background-image: url('...')
    featured_image_split = featured_image.split("'")[1]
#return with base url
    featured_image_url = f'https://www.jpl.nasa.gov{featured_image_split}'

    print(featured_image_url)
    
# URL of page to be scraped (fetched with requests, so no browser visit is needed)
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(twitter_url)

# Parse HTML with Beautiful Soup
    soup = bs(response.text,"html.parser")

# Retrieve elements in text
    mars_weather = soup.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text

    print(mars_weather)

# URL of page to be scraped
    facts_url = "https://space-facts.com/mars/"

#create dataframe (read_html returns a list of tables; take the first)
    facts_df = pd.read_html(facts_url)[0]



#Use Pandas to convert the data to a HTML table string.
    facts_df_html = facts_df.to_html()


# URL of page to be scraped
    #hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    #browser.visit(hemisphere_url)

# HTML object
    #html = browser.html
# Parse HTML with Beautiful Soup
    #soup = bs(html, "html.parser")

    #hemisphere = []
# Retrieve elements
    #results = soup.find_all("div", class_="item")
# Loop through results 
    #for result in results:
        #hemisphere_dict = {}
        # Use Beautiful Soup's find() method to navigate and retrieve attributes
        #h3 = result.find("h3").text
        #href = result.find("div", class_="description").a["href"]
        #title = 'https://astrogeology.usgs.gov' + href
    
        #browser.visit(title)
    
    # HTML object
        #html = browser.html
    # Parse HTML with Beautiful Soup
        #soup = bs(html, "html.parser")
    # Retrieve elements
        #url = soup.find("img", class_="wide-image")["src"]

        #hemisphere_dict["title"] = h3
        #hemisphere_dict["img_url"] = 'https://astrogeology.usgs.gov' + url
        #print(hemisphere_dict["img_url"])
    
        #hemisphere.append(hemisphere_dict)

        #hemisphere

    mars = {
     "news_title": news_title,
     "news_paragraph": news_paragraph,
     "featured_image_url": featured_image_url,
     "facts_df_html": facts_df_html,
     "mars_weather": mars_weather,
     }

    return mars

# load web pages without loading images in selenium
from selenium import webdriver
from splinter import Browser
import time

chromeOptions = webdriver.ChromeOptions()
prefs = {
    # 2 = block images; keep the disk cache small
    'profile.managed_default_content_settings.images': 2,
    'disk-cache-size': 4096
}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=chromeOptions)  # chrome_options= is deprecated

executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome',
                  **executable_path,
                  headless=False,
                  options=chromeOptions)

# visit main page first
main_url = 'https://www.basketball-reference.com/'
browser.visit(main_url)
# wait before going through loop of each team's stats
time.sleep(60)

# loop to start the scraping of the stats
team_list = []
x = 0
while x == 0:
    try:
        for team in teams:
            # URL
Example #7
def scrape():

    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import time
    import pandas as pd
    import requests

    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    mars_info = {}

    ##### NASA Mars News #####

    url_news = "https://mars.nasa.gov/news/"
    browser.visit(url_news)

    html_news = browser.html
    soup_news = bs(html_news, "html.parser")

    news_title = soup_news.find("div", class_="content_title").text
    news_p = soup_news.find("div", class_="article_teaser_body").text

    mars_info["news_title"] = news_title
    mars_info["news_p"] = news_p

    ##### JPL Mars Space Images - Featured Image #####

    url_jpl = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_jpl)

    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")

    html_img = browser.html
    soup_img = bs(html_img, "html.parser")

    featured_image = soup_img.find("figure").find("a")["href"]

    featured_image_url = f"https://www.jpl.nasa.gov{featured_image}"

    #print(featured_image_url)

    mars_info["featured_image_url"] = featured_image_url

    ##### Mars Weather #####

    url_twt = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(url_twt)

    soup_twt = bs(response.text, "html.parser")

    mars_weather = soup_twt.find(
        "div", class_="js-tweet-text-container").text.strip()

    mars_info["mars_weather"] = mars_weather

    ##### Mars Facts #####

    url_facts = "https://space-facts.com/mars/"
    facts = pd.read_html(url_facts)
    facts_df = facts[0]
    facts_df.columns = ["description", "value"]
    facts_df = facts_df.set_index("description")
    facts_html = facts_df.to_html().strip()

    mars_info["facts_html"] = facts_html

    ##### Mars Hemispheres #####

    url_astro = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_astro)

    html_astro = browser.html
    soup_astro = bs(html_astro, "html.parser")

    hemisphere_image_urls = []

    hemis = soup_astro.find_all("div", class_="description")

    for hemi in hemis:
        title = hemi.find("h3").text
        next_page = hemi.find("a")["href"]
        browser.visit(f"https://astrogeology.usgs.gov{next_page}")
        html_hemi = browser.html
        soup_hemi = bs(html_hemi, "html.parser")
        img_url = soup_hemi.find("div", class_="downloads").find("a")["href"]
        hemisphere_image_urls.append({"title": title, "img_url": img_url})

    mars_info["hemisphere_image_urls"] = hemisphere_image_urls

    browser.quit()

    return mars_info
Example #8
def main():
    browser = Browser(
        'chrome',
        headless=True,
        user_agent=
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
    )
    url = 'https://www.toutiao.com/c/user/5551493118/#mid=5551493118'
    get_full_page(browser, url)
    name = browser.find_by_css('span[class="name"]')[0].text  #get author name
    contents = browser.find_by_css(
        'div[class="ugc-content"]')  #get contents of each blog
    time_stamps = browser.find_by_css(
        'span[class="lbtn"]')  #get time stamps of each
    results = browser.find_by_css(
        'div[class="y-left"] a')  #get views, likes, comments
    num_of_toutiao = len(contents)

    total_num_of_likes = 0
    total_num_of_comments = 0
    total_num_of_views = 0.0

    # extract the data from the strings
    print("parsing the data")
    for result in results:
        if "阅读" in result.text:  # "阅读" = views
            if "万" in result.text:  # "万" = units of 10,000
                total_num_of_views += float(extract_num(
                    result.text)[0]) + float(extract_num(result.text)[1]) / 10
            else:
                total_num_of_views += float(extract_num(
                    result.text)[0]) / 10000
        elif "赞" in result.text:  # "赞" = likes
            total_num_of_likes += int(extract_num(result.text)[0])
        elif "评论" in result.text:  # "评论" = comments
            total_num_of_comments += int(extract_num(result.text)[0])

    total_num_of_views = round(total_num_of_views, 4)

    avg_num_of_likes = round(total_num_of_likes / num_of_toutiao, 2)
    avg_num_of_comments = round(total_num_of_comments / num_of_toutiao, 2)
    avg_num_of_views = round(total_num_of_views / num_of_toutiao, 2)

    # write data to csv file
    print("writing the data to the csv file")
    with open('tou_tiao2.csv', mode='w', encoding="utf-8-sig") as csv_file:
        writer = csv.writer(csv_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        x = 0
        for i in range(num_of_toutiao):
            content = contents[i].text
            view = results[x].text
            like = results[x + 1].text[3:]
            comment = results[x + 2].text[3:]
            time_stamp = time_stamps[i].text[3:]
            writer.writerow([name, content, view, like, comment, time_stamp])
            x += 3
        writer.writerow([
            "总阅读数: " + str(total_num_of_views) + "万",  # total views
            "总赞数: " + str(total_num_of_likes),  # total likes
            "总评论数: " + str(total_num_of_comments)  # total comments
        ])
        writer.writerow([
            "平均阅读数: " + str(avg_num_of_views) + "万",  # average views
            "平均赞数: " + str(avg_num_of_likes),  # average likes
            "平均评论数: " + str(avg_num_of_comments)  # average comments
        ])

    print("平均阅读数:", avg_num_of_views, "万 ", "平均赞数:", avg_num_of_likes,
          "平均评论数:", avg_num_of_comments)
    print("总阅读数:", total_num_of_views, "万 ", "总赞数:", total_num_of_likes,
          "总评论数:", total_num_of_comments)
Example #9
def scrape():
    # requires: from splinter import Browser; from bs4 import BeautifulSoup; import pandas as pd
    mars_news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    first = None
    # for some reason it doesn't work every time, so it could just keep trying:
    # while first is None:

    browser.visit(mars_news_url)
    html = browser.html

    more_soup = BeautifulSoup(html, 'html.parser')

    first = more_soup.find('li', class_='slide')

    if first is None:
        return_this = {
            'news_title':
            'Something went wrong talking to Nasa!',
            'news_summary':
            "For some reason when using scrape from the python file, it doesn't find any html for the page."
        }
    else:
        news_title = first.h3.text

        news_summary = first.find('div',
                                  class_='rollover_description_inner').text

        return_this = {"news_title": news_title, 'news_summary': news_summary}

    perseverance_image_url = 'https://www.nasa.gov/perseverance/images'

    try:
        browser.visit(perseverance_image_url)
        image_html = browser.html
        image_soup = BeautifulSoup(image_html, 'html.parser')

        images = image_soup.find('div', class_='is-gallery')
        first_img = images.find('div', class_='image')
        first_img_href = first_img.find('img')['src']

        return_this.update(
            {'perseverance_image': 'https://www.nasa.gov' + first_img_href})

    except Exception:
        # ignore failures fetching the Perseverance image
        pass

    browser.quit()

    facts_url = 'https://space-facts.com/mars/'

    tables = pd.read_html(facts_url)
    df = tables[0]
    df = df.rename(columns={0: '', 1: 'Mars'})
    facts_table = df.to_html(index=False,
                             classes='table table-striped',
                             justify='left')

    return_this.update({"data_table": facts_table})

    hemisphere_image_urls = [{
        'title':
        'Cerberus Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'
    }, {
        'title':
        'Schiaparelli Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'
    }, {
        'title':
        'Syrtis Major Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'
    }, {
        'title':
        'Valles Marineris Hemisphere',
        'img_url':
        'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'
    }]

    return_this.update({'hemisphere_image_urls': hemisphere_image_urls})

    return return_this
def init_browser():
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    return Browser('chrome', **executable_path, headless=False)
Example #11
def scrape():

    executable_path = {'executable_path': 'chromedriver.exe'}

    browser = Browser('chrome', **executable_path, headless=False)

    # create mars_data dict that we can insert into mongo

    mars_data = {}

    # visit NASA Mars News site and scrape headlines
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    time.sleep(1)
    news_html = browser.html
    news_soup = bs(news_html, 'html.parser')

    # Scrape the Latest News Title
    slide_element = news_soup.select_one("ul.item_list li.slide")
    #slide_element.find("div", class_="content_title")

    news_title = slide_element.find("div", class_="content_title").get_text()
    print(f"The latest news title is: {news_title}")

    # Scrape the Latest Paragraph Text
    news_paragraph = slide_element.find(
        "div", class_="article_teaser_body").get_text()
    print(f"The lanews_paragraphtest news paragraph is: {news_paragraph}")

    # Visit the JPL website and scrape the featured image
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)
    time.sleep(1)

    # Ask Splinter to Go to Site and Click Button with Class Name full_image
    full_image_button = browser.find_by_id("full_image")
    full_image_button.click()

    # Find "More Info" Button and Click It
    browser.is_element_present_by_text("more info", wait_time=1)
    more_info_element = browser.find_link_by_partial_text("more info")
    more_info_element.click()

    # Parse Results HTML with BeautifulSoup
    html = browser.html
    image_soup = bs(html, "html.parser")

    img_url = image_soup.select_one("figure.lede a img").get("src")

    # Use Base URL to Create Absolute URL
    img_url = f"https://www.jpl.nasa.gov{img_url}"
    print(img_url)

    mars_df = pd.read_html("https://space-facts.com/mars/")[0]
    mars_df.columns = ["Description", "Value"]
    mars_df.set_index("Description", inplace=True)

    mars_df_html = mars_df.to_html(header=False, index=False)

    hemisphere_image_urls = [{
        "title":
        "Cerberus Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg"
    }, {
        "title":
        "Valles Marineris Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg"
    }, {
        "title":
        "Schiaparelli Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg"
    }, {
        "title":
        "Syrtis Major Hemisphere",
        "img_url":
        "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg"
    }]

    scrape_data = {
        "news_title": news_title,
        "news_paragraph": news_paragraph,
        "image_URL": img_url,
        "mars_data": mars_df_html,
        "hemisphere_image": hemisphere_image_urls
    }

    return scrape_data
Example #12
import selenium
import splinter
from splinter import Browser
import csv
import sys
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'F:\python\Scripts\geckodriver')
browser = Browser('firefox')
'''
###########STEP1 GET THE URLS OF RATING PAGES##########
f = open('c:\\Users\\Administrator\\url.txt', 'r')# url of the search results
with open("rating_url.csv",'w',encoding='utf-8') as fileout:
	writer = csv.writer(fileout)
	for url in f:
		driver.get(url)
		div = driver.find_elements_by_xpath('//*[@class="charity-name-desktop"]')
		for j in div:
			a = j.find_element_by_tag_name('a')
			href = a.get_attribute('href')
			writer.writerow(href)
######After this, I use excel to replace 'summary' with 'history' in the links to create history_page_url and save them into 'uu.txt'.(Save the 'click'.)
######If you don't have names of the list of charities,then can use this to extract names from search result page:
	names = browser.find_by_xpath('//*[@class="charity-name-desktop"]')
	for j in names:
		obj = [j.value]
'''


##########STEP2 SAVE THE HISTORICAL RATINGS########
def rat():
    name = browser.find_by_xpath('//*[@class="charityname"]')
Example #13
def init_browser():
    from sys import platform  # "darwin" = macOS
    if platform == "darwin":
        executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    else:
        executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser("chrome", **executable_path, headless=True)
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path)
def scrape():
    from splinter import Browser
    from bs4 import BeautifulSoup
    import pandas as pd
    import time

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    time.sleep(3)

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(1)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(1)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(1)
    featured_image_url = browser.find_by_css('.fancybox-image')['src']

    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(1)

    mars_weather = ''
    i = 0
    # walk recent tweets until one mentions InSight
    while 'InSight' not in mars_weather:
        mars_weather = browser.find_by_css(
            '.js-tweet-text-container')[i].find_by_tag('p').text
        i += 1

    url = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(url)[0]
    mars_facts.columns = ['a', 'b']
    mars_facts_df = mars_facts.set_index('a')
    mars_facts_df.index.names = ['']
    mars_dict = {}
    for row in mars_facts_df.iterrows():
        mars_dict[row[0][:-1]] = row[1][0]  # strip the trailing ":" from each label

    hemisphere_image_urls = [
        {
            "title":
            "Valles Marineris Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg"
        },
        {
            "title":
            "Cerberus Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg"
        },
        {
            "title":
            "Schiaparelli Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg"
        },
        {
            "title":
            "Syrtis Major Hemisphere",
            "img_url":
            "https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg"
        },
    ]

    browser.quit()

    dict_out = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image': featured_image_url,
        'mars_weather': mars_weather,
        'mars_facts': mars_dict,
        'hemisphere_imgs': hemisphere_image_urls
    }

    return dict_out
def scrape():
    # requires: import requests as req; from bs4 import BeautifulSoup as bs;
    #           import pandas as pd; import time; from splinter import Browser
    #scrape the NASA Mars News Site, collect news title, paragraph text, assign
    #to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    #scrape the title and accompanying paragraph
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    #set up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    #stir soup for scraping
    html = browser.html
    soup = bs(html, "html.parser")

    #have webdriver click links to get to the full image I want
    browser.click_link_by_partial_text('FULL IMAGE')

    #had to add this, wasn't working and docs recommended waiting between clicks
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    #stir new soup for scraping the image url
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    back_half_img_url = temp_img_url.get('src')

    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    #get mars weather. THE INSTRUCTIONS SAY SPECIFICALLY TO SCRAPE THE DATA
    #stir soup
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')

    #use find_all to get all the tweets on the page, scan the 10 most recent for "Sol"
    tweet_containers = twitter_soup.find_all('div',
                                             class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

#Mars Facts....visit webpage, use pandas to scrape the page for facts,
#convert pandas table to html table string.
    request_mars_space_facts = req.get("https://space-facts.com/mars/")

    #use pandas to scrape html table data
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]

    #set the index to the titles of each statistic/value
    df.set_index(0, inplace=True)
    mars_data_df = df

    #convert new pandas df to html; strip the "\n" to get a clean html string
    mars_data_html = mars_data_df.to_html().replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    #..Visit the USGS Astrogeology site to obtain high resolution images for
    #....each of Mars's hemispheres
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)

    #..You will need to click each of the links to the hemispheres in order
    #....to find full res image

    #had trouble doing this with splinter, decided to just do a bunch of loops for img urls
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="item product-item")
    #list to keep the dictionaries that have title and image url
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        #get the img title
        img_title = hemi_img.find('h3').text
        #print(img_title)
        #get the link to stir another soup, this is the page with the actual image url
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        #print(link_to_img)
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }

    return mars_data
def scrape():
    # requires: from splinter import Browser; from bs4 import BeautifulSoup as bs;
    #           import pandas as pd; import time
    # Chromedriver execution
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)

    # URL path
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)

    # Save html and parser
    html = browser.html
    soup = bs(html, "html.parser")

    # get first news date from the url
    news_date = soup.find('li', class_='slide').find('div',
                                                     class_="list_date").text
    # get first news title from the url
    news_title = soup.find('div', class_="list_text").find(
        'div', class_="content_title").text
    # get first news text from the url
    news_text = soup.find('div', class_="list_text").find(
        'div', class_="article_teaser_body").text

    # URL path
    url2 = "https://www.jpl.nasa.gov/spaceimages/"

    # Visiting url2 to click and response
    browser.visit(url2)
    browser.find_by_id('full_image').click()
    time.sleep(3)

    # Clicking on more info button
    browser.links.find_by_partial_text('more info').click()

    # Getting image URL
    featured_image_url = browser.find_by_xpath(
        "//img[@class='main_image']")._element.get_attribute("src")

    # URL path
    url3 = "https://space-facts.com/mars/"

    # Finding all tables on a web page
    table = pd.read_html(url3)

    # Pick first table (Mars facts)
    table[0].columns = ['Parameter', 'Value']
    fact_table = table[0]

    # Converting DataFrame to HTML table
    table_html = fact_table.to_html()

    # Getting mars facts table data from the web page
    browser.visit(url3)

    html = browser.html
    soup = bs(html, "html.parser")

    tables = soup.findChildren('table')
    table_data = []
    table1 = tables[0]
    rows = table1.findChildren('tr')  # only <tr> rows; <th> cells would break the td lookup below

    for row in rows:
        title = row.find('td', class_="column-1").text.strip()
        value = row.find('td', class_="column-2").text.strip()
        table_data.append({'title': title, 'value': value})

    # URL path
    url4 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url4)
    html = browser.html

    # Parsering and scrapping list of images
    soup = bs(html, "html.parser")
    images = soup.find_all('div', class_="description")
    link = f"https://astrogeology.usgs.gov"
    time.sleep(3)

    # Looping thorugh images list, pick href and add it to link, visit new link, scrap for image url and title, append to a list
    hem_img_urls = []
    for image in images:
        img_link = f"{link}{image.find('a')['href']}"
        browser.visit(img_link)
        img_url = browser.find_by_xpath(
            "//img[@class='wide-image']")._element.get_attribute("src")
        title = browser.find_by_xpath("//h2[@class='title']").text
        title = title.replace(' Enhanced', '')  # rstrip would strip characters, not the suffix
        hem_img_urls.append({"title": title, "img_url": img_url})
    time.sleep(3)

    # DataBase dictionary
    mars_web_dict = {
        'news_date': news_date,
        'news_title': news_title,
        'news_text': news_text,
        'featured_image_url': featured_image_url,
        'row1_title': table_data[0]['title'],
        'row1_value': table_data[0]['value'],
        'row2_title': table_data[1]['title'],
        'row2_value': table_data[1]['value'],
        'row3_title': table_data[2]['title'],
        'row3_value': table_data[2]['value'],
        'row4_title': table_data[3]['title'],
        'row4_value': table_data[3]['value'],
        'row5_title': table_data[4]['title'],
        'row5_value': table_data[4]['value'],
        'row6_title': table_data[5]['title'],
        'row6_value': table_data[5]['value'],
        'row7_title': table_data[6]['title'],
        'row7_value': table_data[6]['value'],
        'row8_title': table_data[7]['title'],
        'row8_value': table_data[7]['value'],
        'row9_title': table_data[8]['title'],
        'row9_value': table_data[8]['value'],
        'url1_title': hem_img_urls[0]['title'],
        'url1_img': hem_img_urls[0]['img_url'],
        'url2_title': hem_img_urls[1]['title'],
        'url2_img': hem_img_urls[1]['img_url'],
        'url3_title': hem_img_urls[2]['title'],
        'url3_img': hem_img_urls[2]['img_url'],
        'url4_title': hem_img_urls[3]['title'],
        'url4_img': hem_img_urls[3]['img_url']
    }
    browser.quit()

    return mars_web_dict
def firefox_installed():
    try:
        Browser("firefox")
    except OSError:
        return False
    return True
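firefox_installed() probes for Firefox by actually trying to launch it. A sketch of how such a probe might pick a driver (the Chrome fallback is an assumption):

def make_browser():
    # prefer Firefox when available, otherwise fall back to Chrome
    if firefox_installed():
        return Browser("firefox")
    return Browser("chrome")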
Example #19
def init_browser():
    executable_path = {"executable_path": "./chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
 def setUpClass(cls):
     cls.browser = Browser("firefox")
def scrape():
    # requires: from splinter import Browser; from bs4 import BeautifulSoup;
    #           import pandas as pd; import time

    mars_data = {}
    # browser = init_browser()
    # mars_dict = {}
    #import pdb;pdb.set_trace()
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # # NASA Mars News
    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #News Title
    news_title = soup.find('div', class_="bottom_gradient").text
    print(news_title)
    #Paragraph text
    news_p = soup.find('div', class_='article_teaser_body').text
    # print('--------------------------------------------------')
    print(news_p)

    # Add the news title and summary to the dictionary
    mars_data["news_title"] = news_title
    mars_data["new_p"] = news_p


    # # Featured Image
    #import pdb; pdb.set_trace()
    Image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(Image_url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting image
    Image_path = soup.find('figure', class_='lede').a['href']
    featured_image_url = 'https://www.jpl.nasa.gov/' + Image_path
    print(featured_image_url)

    # Add the featured image url to the dictionary
    mars_data["featured_image_url"] = featured_image_url



    # # Mars Weather
    mars_tweet = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_tweet)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting tweet
    mars_weather = soup.find('div', class_='js-tweet-text-container').text.replace('\n', '')
    print(mars_weather)

    # Add the weather to the dictionary
    mars_data["mars_weather"] = mars_weather


    # #  Mars Facts
    mars_fact = 'https://space-facts.com/mars/'
    browser.visit(mars_fact)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting mars table
    #grab every table row
    trs = soup.find_all('tr')
    #set up lists to hold td elements which alternate between label and value
    labels = []
    values = []

    #for each tr element append the first td element to labels and the second to values
    for tr in trs:
        td_elements = tr.find_all('td')
        labels.append(td_elements[0].text)
        values.append(td_elements[1].text)
    print(labels,values)

    mars_fact_table = pd.DataFrame({
        "Label": labels,
        "Values": values
    })
    #mars_fact_table

    # convert the data to a HTML table string
    fact_table = mars_fact_table.to_html(header=False, index=False)
    print(fact_table)

    # Add the Mars facts table to the dictionary
    mars_data["mars_table"] = fact_table



    # # Mars Hemispheres
    USGS_site= 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(USGS_site)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Get the div element that holds the images. 
    images = soup.find('div', class_='collapsible results')
    #Loop through the class="item" by clicking the h3 tag and getting the title and url. 

    hemispheres_image_urls = []

    # print(len(images.find_all("div", class_="item")))
    for i in range(len(images.find_all("div", class_="item"))):
        # print(i)
        time.sleep(5)
        image = browser.find_by_tag('h3')
        image[i].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find("h2", class_="title").text
        # print(title)
        div = soup.find("div", class_="downloads")
        # for li in div:
        link = div.find('a')
        # print(link)
        url = link.attrs['href']
        
        # print(url)
        hemispheres = {
                'title' : title,
                'img_url' : url
            }
        hemispheres_image_urls.append(hemispheres)
        browser.back()

    print(hemispheres_image_urls)

    # Add the hemispheres data to the dictionary
    mars_data["hemispheres_image_urls"] = hemispheres_image_urls

    # Return the dictionary
    return mars_data
 def test_should_support_with_statement(self):
     with Browser('firefox'):
         pass
def scrape():
    # requires: from splinter import Browser; from bs4 import BeautifulSoup as bs; import pandas as pd
#Replace the path with actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    #mars.nasa.gov/news/
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    #Scrape page into Soup
    html = browser.html
    nasa_soup = bs(html, 'html.parser')

    #Returned results
    summary = nasa_soup.find('div', class_="rollover_description_inner").text
    title = nasa_soup.find('div', class_="content_title").text
        
    print(f"Title: {title}")
    print(f"Summary: {summary}")

    #Visit URL for JPL Featured Space Image https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)

    html = browser.html
    space_soup = bs(html, 'html.parser')

    #use the parsed html to find the mars featured image
    image = space_soup.find('a', class_='fancybox')['data-fancybox-href']
    image_url = 'https://www.jpl.nasa.gov' + image
    print(image_url)

    #Mars weather tweets
    tweet_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tweet_url)

    try:
        html = browser.html
        weather_soup = bs(html, 'html.parser')

        #save tweet
        mars_weather = weather_soup.find('p', class_="TweetTextSize").text
        print(mars_weather)
    except Exception as e:
        print(e)
        mars_weather = "Latest Mars Weather Tweet not Available. Try again later."

    #Mars facts scrape
    facts_url = 'https://space-facts.com/mars/'
    mars_info = pd.read_html(facts_url)[0].to_html(index=False, header=False)

    #hemispheres photo scraping
    base_hemisphere_url = "https://astrogeology.usgs.gov"
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)

    html = browser.html
    soup = bs(html, 'html.parser')

    hemisphere_image_urls = []

    links = soup.find_all("div", class_="item")

    for link in links:
        img_dict = {}
        # use a separate name so the news title scraped above isn't clobbered
        hemi_title = link.find("h3").text
        next_link = link.find("div", class_="description").a["href"]
        full_next_link = base_hemisphere_url + next_link
        
        browser.visit(full_next_link)
        
        pic_html = browser.html
        pic_soup = bs(pic_html, 'html.parser')
        
        url = pic_soup.find("img", class_= "wide-image")["src"]
        
        img_dict["title"] = title
        img_dict["img_url"] = base_hemisphere_url + url
        print(img_dict["img_url"])
        
        hemisphere_image_urls.append(img_dict)
        
    mars_data = {
        "title": title,
        "summary": summary,
        "image_url": image_url,
        "mars_weather": mars_weather,
        "mars_info": mars_info,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    return mars_data
 def setUpClass(cls):
     extension_path = os.path.join(
         os.path.abspath(os.path.dirname(__file__)), 'firebug.xpi')
     cls.browser = Browser('firefox', extensions=[extension_path])
Example #25
        #print("added to db")
    print(mars_news)   
    #Stick it all into Mongo
    #Collect it all up
    #collection.insert_one(mars_news)


    #Ok, let's use splinter.
    #Get current featured image url from here:
    #https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars


    #import splinter, etc
    from splinter import Browser
    from bs4 import BeautifulSoup
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #go to the URL
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    #Navigate to the full image
    browser.click_link_by_partial_text('FULL IMAGE')

    #soup it up
    image_html = browser.html
    image_soup = BeautifulSoup(image_html, 'html.parser')

    #grab that image
    image_ext = image_soup.find('img', {'class': 'fancybox-image'})['src']
 def setUpClass(cls):
     preferences = {
         'dom.max_script_run_time': 360,
         'devtools.inspector.enabled': True,
     }
     cls.browser = Browser("firefox", profile_preferences=preferences)
def init_browser():
    '''Initialize a splinter Chrome browser'''
    return Browser("chrome", headless=True)
 def setUpClass(cls):
     cls.browser = Browser("firefox", fullscreen=True)
def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)
def init_browser():
    executable_path = {
        'executable_path': '/Users/abhsharm/Softwares/Drivers/chromedriver'
    }
    return Browser('chrome', **executable_path, headless=False)