def featured_image(browser):
    """Return the absolute URL of the currently featured Mars image.

    Visits spaceimages-mars.com, opens the full-size view and scrapes the
    fancybox image source. Returns None if the image tag cannot be found.
    """
    base = 'https://spaceimages-mars.com'
    browser.visit(base)

    # The second <button> on the page opens the full-size image overlay.
    browser.find_by_tag('button')[1].click()

    page_soup = soup(browser.html, 'html.parser')

    try:
        relative_src = page_soup.find('img', class_='fancybox-image').get('src')
    except AttributeError:
        # Image tag missing (page layout changed or load failed).
        return None

    return f'{base}/{relative_src}'
def mars_news(browser):
    """Scrape the latest headline and teaser from the NASA Mars news site.

    Returns a (title, paragraph) tuple, or (None, None) when the expected
    elements are not present on the page.
    """
    browser.visit('https://mars.nasa.gov/news/')

    # Give the page up to one second for the news list to render.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    news_soup = soup(browser.html, 'html.parser')

    try:
        first_slide = news_soup.select_one('ul.item_list li.slide')
        headline = first_slide.find("div", class_="content_title").get_text()
        teaser = first_slide.find("div", class_="article_teaser_body").get_text()
    except AttributeError:
        # Either the slide or one of its children was missing.
        return None, None

    return headline, teaser
def featured_image(browser):
    """Return the absolute URL of JPL's featured Mars space image.

    Clicks through the "full image" and "more info" pages, then scrapes the
    lede figure's image source. Returns None when the image is not found.
    """
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')

    # Open the full-image view.
    browser.find_by_id('full_image').click()

    # Wait for, then follow, the "more info" link to the detail page.
    browser.is_element_present_by_text('more info', wait_time=1)
    browser.links.find_by_partial_text('more info').click()

    detail_soup = soup(browser.html, 'html.parser')

    try:
        relative_src = detail_soup.select_one('figure.lede a img').get("src")
    except AttributeError:
        # The lede figure (or its nested img) is absent.
        return None

    return f'https://www.jpl.nasa.gov{relative_src}'
def LoadDataHtml(writedata: list):
    """Scrape product details from apteka-ot-sklada.ru for the current item.

    Relies on module-level globals: `browser` (splinter browser), `item`
    (the current price-position record being processed) and `VerifiedData`
    (list of codes that produced no search results).

    Returns a dict of product fields (name, counts, description, image URL),
    or None when the search yields no product cards — in that case the code
    is appended to VerifiedData and the list is persisted to Verified.json.

    NOTE(review): the `writedata` parameter is immediately shadowed by the
    result dict and its incoming value is never read — confirm with callers
    whether it can be dropped.
    """
    browser.visit('https://apteka-ot-sklada.ru/catalog?q=' + item['pricepos_code'])
    soup = BeautifulSoup(browser.html, 'html.parser')
    data = soup.find_all(
        "div", class_="ui-card goods-card goods-grid__cell goods-grid__cell_size_3")

    if len(data) == 0:
        # No product found: remember the code and persist the whole list.
        VerifiedData.append(item['pricepos_code'])
        # BUG FIX: the original opened the file without a context manager,
        # leaking the handle if the write raised.
        with open('Verified.json', 'w') as fh:
            json.dump(VerifiedData, fh)
        return

    # Re-parse just the matched cards and follow the first product link.
    soup = BeautifulSoup(str(data), 'html.parser')
    links = soup.find_all("a", href=True)
    browser.visit('https://apteka-ot-sklada.ru' + str(links[0]['href']))
    soup = BeautifulSoup(browser.html, 'html.parser')

    image = soup.find_all(
        'img', class_='goods-photo goods-gallery__picture')[0]['src']
    image = 'https://apteka-ot-sklada.ru' + image

    description = soup.find_all('div', class_='custom-html content-html')[0].get_text()

    writedata = dict(pricepos_name=item['pricepos_name'],
                     pricepos_count=item['pricepos_count'],
                     pricepos_value=item['pricepos_value'],
                     pricepos_country=item['pricepos_country'],
                     pricepos_code=item['pricepos_code'],
                     description=description,
                     image=image)
    return writedata
def scrape_mars_weather():
    """Scrape the latest Mars weather tweet from @MarsWxReport.

    Stores the tweet text under "weather_tweet" in the module-level
    `mars_information` dict and returns that dict. The browser is always
    closed, even on error.
    """
    browser = None  # FIX: defined before try so `finally` can't hit NameError
    try:
        browser = initialize_browser()
        weather_url = "https://twitter.com/marswxreport?lang=en"
        browser.visit(weather_url)
        html_weather = browser.html
        soup = bs(html_weather, "html.parser")
        recent_tweets = soup.find_all("div", class_="js-tweet-text-container")
        weather_tweet = None  # FIX: defined even when no tweets are found
        for tweet in recent_tweets:
            weather_tweet = tweet.find("p").text
            # BUG FIX: the original condition was
            # `if "Sol" and "pressure" in weather_tweet:` which only tests
            # "pressure" because the literal "Sol" is always truthy.
            if "Sol" in weather_tweet and "pressure" in weather_tweet:
                print(weather_tweet)
                break
        mars_information["weather_tweet"] = weather_tweet
        return mars_information
    finally:
        if browser is not None:
            browser.quit()
def mars_news(browser):
    """Scrape the newest headline and teaser from redplanetscience.com.

    Returns a (title, paragraph) tuple, or (None, None) when the expected
    elements are missing from the page.
    """
    browser.visit('https://redplanetscience.com')

    # Allow up to a second for the article list to appear.
    browser.is_element_present_by_css('div.list_text', wait_time=1)

    news_soup = soup(browser.html, 'html.parser')

    try:
        latest = news_soup.select_one('div.list_text')
        headline = latest.find('div', class_='content_title').get_text()
        teaser = latest.find('div', class_='article_teaser_body').get_text()
    except AttributeError:
        # The list container or one of its children was not found.
        return None, None

    return headline, teaser
def mars_weather(browser):
    """Scrape InSight's daily weather report table and return it as HTML.

    Prints the prettified table for inspection, then returns the same
    prettified HTML string.
    """
    browser.visit('https://mars.nasa.gov/insight/weather/')

    weather_soup = soup(browser.html, 'html.parser')

    # The daily weather report lives in the table carrying class "mb_table".
    report_table = weather_soup.find('table', class_='mb_table')
    print(report_table.prettify())

    return report_table.prettify()
def scrape_mars_news():
    """Scrape the latest NASA Mars news headline and story teaser.

    Stores them under "news_headline" / "news_story" in the module-level
    `mars_information` dict and returns that dict. The browser is closed in
    all cases via the finally clause.
    """
    try:
        browser = initialize_browser()
        browser.visit("https://mars.nasa.gov/news/")
        page = bs(browser.html, "html.parser")

        # First headline link and its accompanying teaser paragraph.
        headline = page.find("div", class_="content_title").find("a").text
        story = page.find("div", class_="article_teaser_body").text

        mars_information["news_headline"] = headline
        mars_information["news_story"] = story
        return mars_information
    finally:
        browser.quit()
def scrape_mars_image():
    """Scrape the JPL featured-image URL from the carousel's inline style.

    The URL is embedded as `background-image: url('...')` on the <article>
    element; the wrapper text and surrounding quotes are stripped before the
    site root is prepended. Result is stored under "featured_image_url" in
    the module-level `mars_information` dict, which is returned.
    """
    try:
        browser = initialize_browser()
        browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
        page = bs(browser.html, "html.parser")

        # Strip "background-image: url(" / ");" then the quote characters.
        style_value = page.find("article")["style"]
        relative = style_value.replace("background-image: url(", "").replace(");", "")[1:-1]

        featured_image_url = "https://www.jpl.nasa.gov" + relative
        mars_information["featured_image_url"] = featured_image_url
        return mars_information
    finally:
        browser.quit()
def hemispheres(browser):
    """Collect full-resolution image URLs and titles for all four Mars
    hemispheres from marshemispheres.com.

    Returns a list of four dicts, each with "img_url" and "title" keys.
    """
    browser.visit('https://marshemispheres.com/')

    collected = []

    # The landing page lists exactly four hemisphere thumbnails.
    for idx in range(4):
        entry = {}

        # Open the detail page for this hemisphere.
        browser.find_by_tag('a.itemLink h3')[idx].click()

        # The "Sample" link points at the full-resolution image.
        entry["img_url"] = browser.links.find_by_text('Sample')['href']
        entry["title"] = browser.find_by_tag('h2').text

        collected.append(entry)

        # Return to the index page for the next thumbnail.
        browser.back()

    return collected
def scrape():
    """Notebook-export scraper: JPL featured image and Mars weather tweets.

    NOTE(review): the NASA news page is visited but nothing is scraped from
    it, and the function ends without returning — this looks like a
    truncated notebook export; confirm against the original notebook.
    """
    browser = init_browser()

    # NASA Mars News page (opened only; no data extracted here).
    browser.visit('https://mars.nasa.gov/news/')
    time.sleep(2)

    # JPL Mars Space Images - featured image.
    featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(featured_image_url)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')

    # The image URL is embedded in the article's inline background style.
    featured_image = soup.find('article')['style'].replace(
        'background-image: url(', '').replace(');', '')
    # Slice off the surrounding quote characters.
    featured_image = featured_image[1:-1]
    image_url = 'https://www.jpl.nasa.gov' + featured_image

    # Mars weather: load the rover's Twitter feed and grab the tweet blocks.
    browser.visit('https://twitter.com/marswxreport?lang=en')
    time.sleep(2)
    soup = BeautifulSoup(browser.html, 'html.parser')

    current_tweet = soup.find_all('div', class_='js-tweet-text-container')
def scrape():
    """Scrape NASA news, the JPL featured image, the Mars weather tweet,
    the facts table and USGS hemisphere images into one dict.

    Returns a dict with keys: news_title, news_p, featured_image_url,
    mars_weather, mars_facts_table, hemisphere_img.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_mission = {}

    # --- Latest NASA Mars news ---
    browser.visit('https://mars.nasa.gov/news/')
    soup = bs(browser.html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured space image ---
    browser.visit('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    soup = bs(browser.html, 'html.parser')
    image = soup.find_all('img')[5]["src"]
    featured_img_url = "https://jpl.nasa.gov" + image
    print(featured_img_url)

    # --- Latest Mars weather tweet ---
    browser.visit("https://twitter.com/marswxreport?lang=en")
    soup = bs(browser.html, 'html.parser')
    mars_weathers = []
    for weather_info in soup.find_all(
            'p',
            class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'):
        mars_weathers.append(weather_info.text.strip())
    mars_weather = mars_weathers[0]

    # --- Mars facts table ---
    url4 = 'https://space-facts.com/mars/'
    browser.visit(url4)
    mars_facts = pd.read_html(url4)[0]
    mars_facts.columns = ["Description", "Value"]
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, so the stored HTML kept its newlines.
    mars_facts_html = mars_facts.to_html().replace('\n', '')
    mars_facts.to_html('mars_table.html')

    # --- USGS hemisphere names and images ---
    url5 = 'https://astrogeology.usgs.gov'
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemispheres_url)
    hemispheres_soup = bs(browser.html, 'html.parser')
    all_mars_hemispheres = hemispheres_soup.find('div', class_='collapsible results')
    mars_hemispheres = all_mars_hemispheres.find_all('div', class_='item')

    hemisphere_img_urls = []
    for i in mars_hemispheres:
        hemisphere = i.find('div', class_="description")
        title = hemisphere.h3.text
        # Follow the detail link to reach the full-resolution download link.
        browser.visit(url5 + hemisphere.a["href"])
        img_soup = bs(browser.html, 'html.parser')
        img_url = img_soup.find('div', class_='downloads').find('li').a['href']
        hemisphere_img_urls.append({'title': title, 'img_url': img_url})

    browser.quit()  # FIX: original leaked the chromedriver session

    mars_mission = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_img_url,
        "mars_weather": mars_weather,
        "mars_facts_table": str(mars_facts_html),
        "hemisphere_img": hemisphere_img_urls
    }

    return mars_mission
def Scrape():
    """Scrape NASA news (via requests), the JPL featured image (via
    splinter) and the latest Mars weather tweet into `mars_dict`.

    NOTE(review): the function builds `mars_dict` but never returns it and
    never closes the browser — it looks like a truncated draft; confirm
    against the original notebook/script.
    """
    mars_dict = {}

    # --- NASA Mars news, fetched with plain requests ---
    html = requests.get("https://mars.nasa.gov/news/")
    soup = BeautifulSoup(html.text, 'html.parser')
    mars_dict["news_title"] = soup.find('div', 'content_title').text
    mars_dict["news_p"] = soup.find('div', 'rollover_description_inner').text
    print("NEWS TITLE & DESCRIPTION")

    # --- JPL featured image, needs a live browser for the click ---
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_image)

    # Derive the scheme://host/ prefix for building the absolute image URL.
    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image))
    print(base_url)

    # XPath straight to the featured thumbnail; clicking opens the
    # full-resolution fancybox view.
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"
    browser.find_by_xpath(xpath)[0].click()

    soup = bs(browser.html, "html.parser")
    featured_image_url = base_url + soup.find("img", class_="fancybox-image")["src"]
    print(featured_image_url)
    mars_dict["featured_image_url"] = featured_image_url
    print("FEATURED IMAGE")

    # --- Latest Mars weather tweet ---
    browser.visit("https://twitter.com/marswxreport?lang=en")
    soup = bs(browser.html, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text
    print(mars_weather)
    mars_dict["mars_weather"] = mars_weather
def scrape():
    """Scrape NASA news, the JPL featured image and USGS hemisphere images.

    Returns a dict with the latest headline, teaser, featured image URL and
    a list of hemisphere {"title", "img_url"} dicts.
    """
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- Latest NASA Mars news ---
    Nasa_News_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(Nasa_News_url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    # Index 1: index 0 is the nav-bar's content_title, not an article.
    latest_title = soup.find_all('div', class_='content_title')[1].text
    latest_teaser = soup.find_all('div', class_="article_teaser_body")[0].text

    # --- JPL featured image ---
    JPL_Mars_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(JPL_Mars_url)
    browser.click_link_by_id("full_image")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")
    soup = BeautifulSoup(browser.html, "html.parser")
    sub_img = soup.find("figure", class_="lede")
    featured_image = "https://www.jpl.nasa.gov" + sub_img.a["href"]

    # --- USGS hemisphere images ---
    USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(USGS_url)
    soup = BeautifulSoup(browser.html, "html.parser")
    hemi_list = []
    hemispheres = soup.find_all("div", class_="item")
    for hemi in hemispheres:
        hemi_div = hemi.find("div", class_="description")
        hemi_title = hemi_div.a.h3.text
        # NOTE(review): this clicks the first "Hemisphere Enhanced" link on
        # whatever page the browser currently shows, and never navigates
        # back — later iterations may click from a detail page. Confirm
        # against the live site before relying on all four results.
        browser.click_link_by_partial_text("Hemisphere Enhanced")
        time.sleep(3)
        soup_4 = BeautifulSoup(browser.html, "html.parser")
        usgs_src = soup_4.find("img", class_="wide-image")["src"]
        hemi_list.append({"title": hemi_title,
                          "img_url": "https://www.astrogeology.usgs.gov" + usgs_src})

    browser.quit()  # FIX: original leaked the chromedriver session

    # BUG FIX: the original dict literal listed `latest_teaser` and
    # `hemi_list` bare (no keys), which is a syntax error.
    mars_scrape_data = {
        'Latest Headline': latest_title,
        'Latest Teaser': latest_teaser,
        'Featured Image': featured_image,
        'Hemispheres': hemi_list,
    }
    return mars_scrape_data
# 3.个人信息 # 4.可用教室 # 5.选课 # 6.评教 targetpage = r"http://121.194.57.131/loginAction.do" #0 my_class_url = r'http://121.194.57.131/xskbAction.do?actionType=1' #1 my_grade_url = r'http://121.194.57.131/gradeLnAllAction.do?type=ln&oper=qb' #2 my_info_url = r'http://121.194.57.131/xjInfoAction.do?oper=xjxx' #3 ava_classroom_url = r'http://121.194.57.131/oneDayJasAction.do?oper=tjcx' #4 cho_class_url = r'http://121.194.57.131/xkMainAction.do?actionType=6' #5 list_wj_url = r'http://121.194.57.131/jxpgXsAction.do?oper=listWj' #6 cho_bxk_url = r'http://121.194.57.131/zytzAction.do?oper=bxqkc' #7 start = time.clock() browser = splinter.Browser() browser.visit(jw_log_url) if len(sys.argv) == 1: print u''' # 0.登陆\n # 1.课程表\n # 2.成绩\n # 3.个人信息\n # 4.可用教室\n # 5.选课\n # 6.评教\n ''' student_id = raw_input("ID: ") passWord = getpass.getpass(r"password(jwc.bjtu): ") choice = int(raw_input("choice: ")) elif len(sys.argv) == 2:
def scrape():
    """Scrape Mars news, the JPL featured image, facts tables and hemisphere
    images; persist the result to MongoDB and return it.

    Returns a dict with keys: article_title, news_list, featured_image,
    mars_table, hemisphere_dict_list.
    """
    dict_data = {}
    browser = init_browser()
    url = "https://mars.nasa.gov/news/"
    jpl_url = "https://www.jpl.nasa.gov/images?search=&category=Mars"
    mars_url = "https://space-facts.com/mars/"
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    # --- News: titles and teasers interleave in the matched divs ---
    browser.visit(url)
    time.sleep(1)
    soup = BeautifulSoup(browser.html, 'html.parser')
    titles_body_soup = soup.find_all(
        "div", class_=("content_title", "article_teaser_body"))
    title_list = []
    news_list = []
    for i, x in enumerate(titles_body_soup):
        if i % 2 == 0:
            news_list.append(x.text)
        else:
            title_list.append(x.text)
    # Drop the extra entry contributed by the nav bar's content_title div.
    news_list = news_list[1:49]

    # --- JPL featured image ---
    browser.visit(jpl_url)
    time.sleep(.5)
    browser.find_by_css("img.BaseImage").click()
    browser.find_by_css("svg.IconExpand").click()
    soup0 = BeautifulSoup(browser.html, "html.parser")
    featured_image_jpg = soup0.find_all(
        "div", class_="BaseLightbox__slide__img")[0]("img")[0]["src"]

    # --- Facts tables via pandas ---
    browser.visit(mars_url)
    mars_table = pd.read_html(mars_url)
    mars_facts = mars_table[0].rename(columns=({
        0: "Description",
        1: "Mars"
    })).set_index("Description")
    mars_html = mars_facts.to_html()

    # --- Hemisphere images ---
    hemisphere_dict_list = []
    browser.visit(hemi_url)
    time.sleep(.5)
    for x in range(4):
        browser.find_by_css("img.thumb")[x].click()
        browser.find_by_css("a.open-toggle").click()
        hemi_soup = BeautifulSoup(browser.html, "html.parser")
        # FIX: build a fresh dict each pass; reusing one dict object would
        # make every list entry alias the last hemisphere scraped.
        entry = {
            "title": hemi_soup("h2", class_="title")[0].text.replace(" Enhanced", ""),
            "img_url": hemi_soup("img", class_="wide-image")[0]["src"],
        }
        hemisphere_dict_list.append(entry)
        # Return to the results page for the next thumbnail.
        browser.visit(hemi_url)

    browser.quit()

    dict_data["article_title"] = title_list[0]
    dict_data["news_list"] = news_list[0]
    dict_data["featured_image"] = featured_image_jpg
    dict_data["mars_table"] = mars_html
    dict_data["hemisphere_dict_list"] = hemisphere_dict_list

    # Persist the scrape to the local MongoDB instance.
    from pymongo import MongoClient
    mongo_conn = MongoClient('mongodb://localhost:27017')
    mars_db = mongo_conn["mars_db"]
    mars_coll = mars_db["mars"]
    # BUG FIX: `mars_db.mars_coll.insert_one(...)` wrote to a collection
    # literally named "mars_coll" instead of the "mars" collection handle.
    mars_coll.insert_one(dict_data)

    return dict_data
def scrape():
    """Scrape NASA news, the JPL featured image, the latest "Sol" weather
    tweet, and the Mars facts table (written to mars_table.html).

    NOTE(review): this function never returns a value and never closes the
    browser — it appears to be a truncated draft of the fuller scrape()
    later in this file; confirm before relying on it.
    """
    # --- NASA Mars news via plain requests ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image: needs splinter to click through the overlay ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Docs recommend waiting between clicks; without this the second click
    # was flaky.
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    new_soup = bs(browser.html, 'html.parser')
    back_half_img_url = new_soup.find('img', class_='main_image').get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    # --- Mars weather: scan the ten most recent tweets for "Sol " ---
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')
    tweet_containers = twitter_soup.find_all('div', class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table via pandas ---
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]
    df.set_index(0, inplace=True)
    mars_data_df = df
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, leaving mars_data_html with its newlines.
    mars_data_html = mars_data_df.to_html().replace('\n', '')
    mars_data_df.to_html('mars_table.html')
def scrape_all():
    """Scrape NASA news, the JPL featured image, the Mars facts table and
    USGS hemisphere images into one dict.

    Returns a dict with keys: latest_news_title, latest_news_parag,
    JPL_featured_image, mars_facts_table, hemisphere_images.
    """
    browser = init_browser()

    # --- Latest NASA Mars news ---
    browser.visit('https://mars.nasa.gov/news/')
    news_soup = BeautifulSoup(browser.html, 'lxml')
    # Index 1: index 0 is the nav bar's content_title, not an article.
    title = news_soup.find_all('div', class_='content_title')
    news_title = title[1].text.strip()
    print(news_title)
    parag = news_soup.find_all('div', class_='article_teaser_body')
    # BUG FIX: the original assigned the whole ResultSet to news_p, storing
    # bs4 Tag objects in the result dict; take the first teaser's text,
    # mirroring how the title is handled above.
    news_p = parag[0].text.strip()
    print(news_p)

    # --- JPL featured image ---
    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    time.sleep(3)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    feat_soup = BeautifulSoup(browser.html, 'html.parser')
    mars_img_url = feat_soup.find('figure', class_='lede').a['href']
    featured_image_url = "https://www.jpl.nasa.gov" + mars_img_url
    print(f"{featured_image_url}")
    time.sleep(2)

    # --- Mars facts table via pandas ---
    mars_facts_url = 'https://space-facts.com/mars/'
    time.sleep(3)
    tables_found = pd.read_html(mars_facts_url)
    mars_facts_df = tables_found[0]
    mars_html_table = mars_facts_df.to_html()
    print(mars_html_table)

    # --- USGS hemisphere images ---
    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    hemis_soup = BeautifulSoup(browser.html, 'html.parser')
    hemis_orig_url = 'https://astrogeology.usgs.gov'
    hemisphere_urls = []
    for item in hemis_soup.find_all('div', class_='item'):
        hemi_title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        # Visit the detail page to pull the full-resolution image URL.
        browser.visit(hemis_orig_url + partial_img_url)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        img_url = hemis_orig_url + detail_soup.find('img', class_='wide-image')['src']
        hemisphere_urls.append({"title": hemi_title, "img_url": img_url})

    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls
    }

    browser.quit()
    return mars_dictionary
def scrape():
    """Scrape NASA news, the JPL featured image, the Mars weather tweet,
    the facts table (written to mars_table.html) and USGS hemisphere images.

    Returns a dict with keys: News_Title, Paragraph_Text,
    Most_Recent_Mars_Image, Mars_Weather, mars_h.
    """
    # --- NASA Mars news via plain requests ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image: splinter clicks through to the full image ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    # Docs recommend waiting between clicks; without this the second click
    # was flaky.
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    new_soup = bs(browser.html, 'html.parser')
    back_half_img_url = new_soup.find('img', class_='main_image').get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    # --- Mars weather: scan the ten most recent tweets for "Sol " ---
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')
    tweet_containers = twitter_soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = None  # FIX: original raised NameError when no tweet matched
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table via pandas ---
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]
    df.set_index(0, inplace=True)
    mars_data_df = df
    # FIX: str.replace returns a new string; the original discarded it.
    mars_data_html = mars_data_df.to_html().replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    # --- USGS hemisphere images via requests (no browser needed) ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="item product-item")
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        img_title = hemi_img.find('h3').text
        # Detail page holds the actual full-resolution download link.
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    browser.quit()  # FIX: original leaked the webdriver session

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }

    return mars_data
# Notebook-export fragment: filter the scraped tweets for a weather report,
# then build the Mars facts table.

current_tweet = soup.find_all('div', class_='js-tweet-text-container')

for tweet in current_tweet:
    mars_weather_tweet = tweet.find('p').text
    # BUG FIX: the original tested `'sol' and 'pressure' in ...`, which only
    # checks 'pressure' because the literal 'sol' is always truthy.
    if 'sol' in mars_weather_tweet and 'pressure' in mars_weather_tweet:
        print(mars_weather_tweet)
        print('-------------')

# Mars Facts
mars_facts_url = 'https://space-facts.com/mars/'
browser.visit(mars_facts_url)
time.sleep(1)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# Second table on the page holds the description/value fact pairs.
mars_facts_second_table = pd.read_html(mars_facts_url)[1]
mars_facts_second_table = mars_facts_second_table.rename(index=str,
                                                         columns={
                                                             0: "Description",
                                                             1: "Value"
                                                         })
# BUG FIX: index='False' is a truthy *string*, so the index was still
# rendered; pass the boolean False to actually drop it.
mars_facts_second_tablehtml = mars_facts_second_table.to_html(index=False)
# Notebook-export fragment: pull the news title/paragraph from the soup
# built in an earlier cell, then start the JPL featured-image scrape.

title = soup.find("div", class_="content_title").text
paragraph_text = soup.find("div", class_="rollover_description_inner").text

print(paragraph_text)

# Visit the URL for JPL's Space Images and use splinter to navigate to the
# current featured image (assigned to featured_image_url; use .jpg).
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
featured_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# BUG FIX: the original called browser.visit(url), re-visiting the news URL
# left over from the earlier cell instead of the JPL page just assigned.
browser.visit(featured_image_url)

html = browser.html
soup = bs(html, "html.parser")

browser.click_link_by_partial_text('FULL IMAGE')
#time.sleep(5)

browser.click_link_by_partial_text('more info')
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere links.

    Returns a dict ready for Mongo storage. Requires network access,
    chromedriver, and the module-level imports (requests, bs, pd, Browser).
    """
    executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path)
    mars_scrape_data = {}

    # --- NASA Mars news: latest title and teaser paragraph ---
    url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description_inner').text
    mars_scrape_data['news_title'] = news_title
    mars_scrape_data['news_p'] = news_p

    # --- JPL featured image (carousel needs a real browser session) ---
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
    image = soup.find("article", class_="carousel_item")
    div = image.find("div", class_="default floating_text_area ms-layer")
    footer = div.find('footer')
    image = footer.a['data-fancybox-href']
    featured_image_url = "https://www.jpl.nasa.gov" + image
    mars_scrape_data['featured_image_url'] = featured_image_url

    # --- Latest Mars weather tweet ---
    twitter_url = 'https://twitter.com/marswxreport?lang=en'
    twitter_response = requests.get(twitter_url)
    twitter_soup = bs(twitter_response.text, 'lxml')
    twitter_result = twitter_soup.find('div', class_='js-tweet-text-container')
    mars_weather = twitter_result.find('p', class_='js-tweet-text').text
    mars_scrape_data['mars_weather'] = mars_weather

    # --- Mars facts table via pandas ---
    mars_facts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
    df.columns = ['Description', 'Value']
    df.set_index('Description', inplace=True)
    mars_facts_table = df.to_html()
    # BUGFIX: str.replace returns a new string; the original discarded the
    # result, leaving the newlines in place.
    mars_facts_table = mars_facts_table.replace("\n", "")
    # Export scraped table into an html file as well.
    df.to_html('mars_facts_table.html')
    mars_scrape_data['mars_facts_table'] = mars_facts_table

    # --- USGS hemisphere titles and image page links ---
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    usgs_html = browser.html
    usgs_soup = bs(usgs_html, 'html.parser')
    usgs_images = usgs_soup.find_all('div', class_='description')
    # BUGFIX: mars_data was never initialised before .append(), which raised
    # NameError on the first loop iteration.
    mars_data = []
    for usgs_image in usgs_images:
        title = usgs_image.find('h3').text
        image_url = "https://astrogeology.usgs.gov" + usgs_image.a['href']
        mars_dict = {"title": title, "image_url": image_url}
        mars_data.append(mars_dict)
    mars_scrape_data["marsdata"] = mars_data

    return mars_scrape_data
# Persist the credentials blob to disk, validate it, then read it back.
# NOTE(review): `jsdata` and CheckDataAuthorization() are defined elsewhere in
# this file — presumably jsdata holds the JSON credential string; confirm.
temp = open('Authorization.json', 'w')
temp.write(jsdata)
temp.close()
CheckDataAuthorization()
temp = open("Authorization.json", 'r')
data = json.loads(temp.read())
#executable_path = {'executable_path': os.getcwd()+"/chromedriver"}
login = data['login']
password = data['password']
# NOTE(review): `browser.Browser` implies a module named `browser` is imported
# above this chunk (likely `from splinter import browser`) — verify.
browser = browser.Browser('chrome', incognito=True)
# Log in to the ordering site and navigate to the order / price-list pages.
browser.visit('https://zakaz.godovalov.ru/')
browser.reload()
browser.fill('login', login)
browser.fill('password', password)
browser.click_link_by_id('ext-gen1022')
browser.visit('https://zakaz.godovalov.ru/ordersale')
# Fixed wait for the order page to finish loading before requesting the JSON.
time.sleep(20)
browser.visit('https://zakaz.godovalov.ru/priceposlist_json?ordersale_id')
# NOTE(review): fragment of an `if len(sys.argv) == ...:` chain — the opening
# branch header was lost when this chunk was extracted, and the `"******` run
# below looks like secret-redaction damage to the original source; the tokens
# are preserved as found.
    student_id = raw_input("ID: ")
    passWord = getpass.getpass("password: "******"choice: "))
elif len(sys.argv) == 2:
    # One CLI argument: hard-coded account, choice taken from argv.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh',student_id)  # 'zjh' is the student-id form field
browser.fill('mm',passWord)     # 'mm' is the password form field

# CAPTCHA
v_yzm = ''
vrifycodeUrl = "http://121.194.57.131/validateCodeAction.do?"
# Extract the CAPTCHA text: fetch the image and save it locally for recognition.
while True:
    file = urllib2.urlopen(vrifycodeUrl)
    pic = file.read()
    picName = u'urf_login_temp.jpg'
    localpic = open(picName,"wb")
    localpic.write(pic)
    localpic.close()
# NOTE(review): fragment — the opening `if` branch header of this argv chain
# was lost when the chunk was extracted; the '******' strings look like
# secret-redaction placeholders. Tokens preserved as found.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Create a temp directory for CAPTCHA screenshots (ignore "already exists").
try:
    os.mkdir('temp')
except:
    pass

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh', student_id)
browser.fill('mm', passWord)

# Blank reference image used to recognise a failed/garbled CAPTCHA render.
error_image = Image.new("RGB", [100, 100], (255, 255, 255))
error_time = 0


def getCode(rand_code):
    # NOTE(review): body continues beyond this chunk — shown here truncated.
    global error_image
    global error_time
    # CAPTCHA text accumulator
    v_yzm = ''
    # List the files in the directory to get the screenshot file.
# NOTE(review): fragment — near-duplicate of the previous chunk; the opening
# `if` branch header of this argv chain was lost on extraction and the
# '******' strings look like secret-redaction placeholders.
    student_id = '14281023'
    passWord = '******'
    choice = int(sys.argv[1])
elif len(sys.argv) == 4:
    # Three CLI arguments: id, password and choice all come from argv.
    student_id = sys.argv[1]
    passWord = sys.argv[2]
    choice = int(sys.argv[3])

# Create a temp directory for CAPTCHA screenshots (ignore "already exists").
try:
    os.mkdir('temp')
except:
    pass

# Open the login page and fill in the credentials form.
browser = splinter.Browser()
browser.visit(targetpage)
# time.sleep(2)
browser.fill('zjh',student_id)
browser.fill('mm',passWord)

# Blank reference image used to recognise a failed/garbled CAPTCHA render.
error_image = Image.new("RGB", [100,100], (255,255,255))
error_time = 0


def getCode(rand_code):
    global error_image
    global error_time
    # CAPTCHA text accumulator
    v_yzm = ''
    # List the files in temp/ to locate the CAPTCHA screenshot.
    files = os.listdir('temp')
    picName = u'temp/'
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere images.

    Returns a dict of string/list results suitable for storage. Requires
    network access, chromedriver, and the module-level imports
    (init_browser, BeautifulSoup, pd, re, time).
    """
    browser = init_browser()

    # --- Mars News: latest title and teaser ---
    url_news = "https://mars.nasa.gov/news/"
    browser.visit(url_news)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # BUGFIX: store the text, not the bs4 Tag objects — the Tags are not
    # serialisable and every consumer wants the string anyway.
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    # print text to confirm
    print(news_title)
    print(news_p)

    # --- Featured Mars image ---
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_image)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    footer = soup.find("footer")
    link = footer.find('a')
    # BUGFIX: the original stored the relative href and only printed the full
    # url; keep the complete url so the returned dict is directly usable.
    featured_image_url = 'https://www.jpl.nasa.gov/' + link['data-fancybox-href']
    print(featured_image_url)

    # --- Mars weather: first span whose text matches 'sol' ---
    url_mars_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_mars_weather)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    sol_pattern = re.compile(r'sol')
    mars_weather = soup.find('span', text=sol_pattern).text
    print(mars_weather)

    # --- Mars facts ---
    url_facts = "https://space-facts.com/mars/"
    browser.visit(url_facts)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Raw text of the facts table, scraped by its tablepress id.
    mars_facts = soup.find('table', id="tablepress-p-mars-no-2").text
    print(mars_facts)
    # Same table via pandas, rendered to html (and saved to disk).
    table = pd.read_html(url_facts)
    df = table[0]
    html_table = df.to_html()
    df.to_html('table.html')

    # --- Mars hemispheres: title + full-resolution image per hemisphere ---
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemisphere)
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='item')
    url_hemisphere_img = []
    main_url = 'https://astrogeology.usgs.gov'
    for item in items:
        title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        # Follow the item link and scrape the wide image from the detail page.
        browser.visit(main_url + partial_img_url)
        partial_img_html = browser.html
        soup = BeautifulSoup(partial_img_html, 'html.parser')
        img_url = main_url + soup.find('img', class_='wide-image')['src']
        url_hemisphere_img.append({"title": title, "img_url": img_url})

    # Close the browser after scraping.
    browser.quit()

    # Store data in a dictionary.
    mars_info = {
        "News Title": news_title,
        "News Paragraph": news_p,
        "Featured Image": featured_image_url,
        "Mars Weather": mars_weather,
        "Mars Facts": mars_facts,
        # BUGFIX: the original stored the raw read_html() list of DataFrames
        # here while the rendered html_table went unused; store the html.
        "Mars Table": html_table,
        "Mars Hemisphere": url_hemisphere_img,
    }
    # Return results
    return mars_info
# In[25]: def init_browser(): executable_path = {"executable_path": "chromedriver.exe"} return Browser("chrome", **executable_path, headless=False) # In[26]: # NASA Mars News browser = init_browser() # NASA Mars News Site url = "https://mars.nasa.gov/news/" browser.visit(url) time.sleep(3) # Scrape Page html = browser.html soup = bs(html, "html.parser") # News news = soup.find_all('div', class_="list_text")[0] # Title news_title = news.find(class_="content_title").text # News Article news_p = news.find(class_="article_teaser_body").text
# # Find the relative image url #img_url_rel = weather_soup.select_one('a', class_='inline_image_enlarge fancybox').get('src') #weather_img_url = f'https://www.jpl.nasa.gov{img_url_rel}' return weather_table.prettify() # Deliverable-1, Scrape High-Resolution Mars Hemisphere Images and Titles # Initiate headless driver for deployment (initialize the browser) browser = Browser("chrome", executable_path="chromedriver", headless=False) # 1. Use browser to visit the URL long_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(long_url) # Optional delay for loading the page browser.is_element_present_by_css("ul.item_list li.slide", wait_time=2) # main_url short_url = 'https://astrogeology.usgs.gov' # 2. Create a list to hold the images and titles. hemisphere_image_urls = [] # 3. Write code to retrieve the image urls and titles for each hemisphere. browser.visit(long_url) html = browser.html hemi_soup = soup(html, 'html.parser') main_url = hemi_soup.find_all('div', class_='item') titles = []
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere data.

    Returns a dict for downstream storage. Requires network access,
    chromedriver, and the module-level imports (req, bs, pd, Browser, time).
    """
    # --- NASA Mars News: collect title and teaser paragraph ---
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # --- JPL featured image (needs a real browser to click through) ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")
    # Auto-click through to the full image page.
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    # BUGFIX: the scraped image tag was ignored and a fixed URL was always
    # returned; build the URL from the page and keep the old constant only
    # as a fallback.
    if temp_img_url is not None and temp_img_url.get('src'):
        recent_mars_image_url = "https://www.jpl.nasa.gov" + temp_img_url['src']
    else:
        recent_mars_image_url = "https://www.imagecache.jpl.nasa.gov/images/640x350/PIA18605-16-640x350.jpg"

    # --- Mars weather from Twitter: first tweet mentioning "Sol " ---
    twitter_req = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_bs = bs(twitter_req.text, 'html.parser')
    tweet_output = twitter_bs.find_all('div', class_="js-tweet-text-container")
    # BUGFIX: the original indexed tweet_output[0..9] (IndexError when fewer
    # than 10 tweets) and left mars_weather unbound when nothing matched.
    mars_weather = None
    for tweet_container in tweet_output:
        tweets = tweet_container.text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # --- Mars facts table ---
    request_mars_facts = req.get("https://space-facts.com/mars/")
    mars_table = pd.read_html(request_mars_facts.text)
    mars_df = mars_table[0]
    # Index on the statistic names in column 0.
    mars_df.set_index(0, inplace=True)
    mars_df2 = mars_df
    # BUGFIX: str.replace returns a new string; the original discarded it.
    mars_data_html = mars_df2.to_html().replace('\n', '')
    mars_df2.to_html('mars_table.html')

    # --- Hemisphere titles and full-resolution image links ---
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemis_list = soup.find_all('a', class_="itemLink product-item")
    hemisphere_image_urls = []
    for hemi_img in hemis_list:
        img_title = hemi_img.find('h3').text
        # Visit each hemisphere's detail page to get the download link.
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({
            "Title": img_title,
            "Image_Url": img_url
        })

    # BUGFIX: release the browser session instead of leaking it.
    browser.quit()

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns the mars_info dict for Mongo storage. Requires network access,
    chromedriver, and the module-level imports (init_browser, bs, cssutils,
    pd, time).
    """
    # Create Dictionary for Mongo
    mars_info = {}

    # --- Mars News ---
    browser = init_browser()
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # Scrape the most recent article, then store its title and teaser.
    latest_article = soup.find("div", "list_text")
    news_title = latest_article.find("div", class_="content_title").text
    news_p = latest_article.find("div", class_="article_teaser_body").text
    mars_info["news_title"] = news_title
    mars_info["teaser"] = news_p

    # --- JPL featured image: pull the carousel's background-image style ---
    jpl_url = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    carousel = soup.find('div', class_='carousel_items')
    div_style = carousel.find('article')['style']
    style = cssutils.parseStyle(div_style)
    partial_url = style['background-image']
    # Cleaning up image url: strip the CSS url(...) wrapper to leave the path.
    partial_url = partial_url.replace('url(', '').replace(')', '')
    featured_image_url = "https://jpl.nasa.gov" + partial_url
    mars_info["featured_image_url"] = featured_image_url

    # --- Most Recent Tweet for Weather ---
    tweet_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(tweet_url)
    html = browser.html
    # CONSISTENCY: use the same `bs` alias as the rest of this function
    # (the original mixed bs(...) and BeautifulSoup(...)).
    soup = bs(html, 'html.parser')
    mars_weather = soup.find("p", class_="tweet-text").text
    print(mars_weather)
    mars_info["mars_weather"] = mars_weather

    # --- Mars facts table via pandas ---
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    facts = pd.read_html(facts_url)
    facts_df = pd.DataFrame(facts[0])
    facts_df.columns = ['Fact', 'Result']
    # DataFrame rendered to a single-line HTML string for templating.
    mars_table = facts_df.to_html(index=False, justify='left',
                                  classes='mars-table')
    mars_table = mars_table.replace('\n', ' ')
    mars_info["mars_table"] = mars_table

    # --- Hemisphere images: click each of the four thumbnails in turn ---
    hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemi_url)
    hemisphere_image_urls = []
    for i in range(4):
        time.sleep(5)  # allow the results page to re-render after .back()
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        partial_url = soup.find("img", class_="wide-image")["src"]
        image_title = soup.find("h2", class_="title").text
        image_url = 'https://astrogeology.usgs.gov' + partial_url
        image_dict = {"title": image_title, "image_url": image_url}
        hemisphere_image_urls.append(image_dict)
        browser.back()
    mars_info["hemispheres"] = hemisphere_image_urls

    # BUGFIX: the original ended with the bare attribute access `browser.quit`
    # (a no-op — the browser was never closed) and never returned the scraped
    # data; call quit() and return mars_info so callers receive the results.
    browser.quit()
    return mars_info