Example #1
def get_answers_html(exam_title):
    logger.info(f"begin: get_answers_html, q = {exam_title}")
    html = ''
    browser = Browser(driver_name='chrome',
                      executable_path='chromedriver.exe',
                      headless=True)
    examtitle = exam_title
    # examtitle = 'POS机、银行卡概述'
    url = 'https://www.tiku88.com/'
    browser.visit(url)
    browser.fill('q', examtitle)
    time.sleep(2)
    browser.find_by_id('search_submit').click()
    time.sleep(2)
    html = browser.html

    while True:
        try:
            browser.find_link_by_partial_text('下一页').first.click()  # link text: "next page"
            time.sleep(3)
            print("appending the next page's content")
            html += browser.html
        except Exception:
            print("no next page")
            break
    browser.quit()
    return html
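Note: most examples on this page use Splinter's find_link_by_* helpers, which were deprecated in favor of the browser.links namespace and removed in later Splinter releases. A small compatibility sketch (the helper name is mine, not from any example):

from splinter import Browser

def click_partial_link(browser, text):
    """Click the first link whose text contains `text`, on old or new Splinter."""
    try:
        # Newer Splinter (>= 0.14) exposes link lookups under browser.links
        browser.links.find_by_partial_text(text).first.click()
    except AttributeError:  # older Splinter without the links namespace
        browser.find_link_by_partial_text(text).first.click()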
Example #2
def scrape():
    # executable_path is not defined in this excerpt; assume a local chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    data = {}

    browser.visit('https://redplanetscience.com/')
    data['title'] = browser.find_by_css('div.content_title').text
    data['paragraph'] = browser.find_by_css('div.article_teaser_body').text

    browser.visit('https://spaceimages-mars.com')
    browser.find_link_by_partial_text('FULL IMAGE').click()
    data['image'] = browser.find_by_css('img.fancybox-image')['src']

    data['table'] = pd.read_html('https://galaxyfacts-mars.com')[1].to_html()

    browser.visit('https://marshemispheres.com/')

    hemispheres = []
    for i in range(4):
        hemisphere = {}
        hemisphere['title'] = browser.find_by_css('a.itemLink h3')[i].text
        browser.find_by_css('a.itemLink h3')[i].click()
        hemisphere['url'] = browser.find_by_text('Sample')['href']
        browser.back()
        hemispheres.append(hemisphere)
    browser.quit()
    data['hemispheres'] = hemispheres  

    return data
Example #3
def scrape_5():
    page = requests.get(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )
    soup = BeautifulSoup(page.content, 'html.parser')

    results = soup.find_all('div', class_='item')
    hemisphere_list = []

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    for result in results:

        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)

        try:
            h3 = result.h3.text
            browser.find_link_by_partial_text(h3).click()

            new_html = browser.html
            img_soup = BeautifulSoup(new_html, 'html.parser')
            img_link = img_soup.find("img", class_='wide-image')['src']
            full_url = 'https://astrogeology.usgs.gov/' + img_link

            entry = {"text": h3, "url": full_url}

            hemisphere_list.append(dict(entry))

        except Exception:
            # skip entries whose link or image can't be found
            pass

    browser.quit()
    return hemisphere_list
Example #4
def image_link():

    path = {}
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_link_by_partial_text('FULL').first.click()
    html = browser.html
    
    soup = bs(html, 'html.parser')

    image = soup.find('a', class_='button')
    link = image['data-link']

    image_url = 'https://www.jpl.nasa.gov' + str(link)

    browser.visit(image_url)

    html2 = browser.html
    soup2 = bs(html2, 'html.parser')

    jpeg_image = soup2.find('figure',class_='lede')
    
    final_jpeg = jpeg_image.a['href']

    featured_image_url = 'https://www.jpl.nasa.gov' + str(final_jpeg)

    path["src"] = featured_image_url

    return (path)
Example #5
def featured_image():

    executable_path = {"executable_path": "./chromedriver.exe"}
    browser = Browser("chrome", **executable_path)

    # Visit the NASA JPL (Jet Propulsion Laboratory) site
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Ask Splinter to go to the site and click the button with id "full_image"
    full_image_button = browser.find_by_id("full_image")
    full_image_button.click()

    # Find "More Info" button and Click It
    browser.is_element_present_by_text("more info", wait_time=1)
    more_info_element = browser.find_link_by_partial_text("more info")
    more_info_element.click()

    # Parse results HTML with BeautifulSoup
    html = browser.html
    image_soup = BeautifulSoup(html, "html.parser")

    img = image_soup.select_one("figure.lede a img")
    try:
        img_url = img.get("src")
    except AttributeError:
        browser.quit()
        return None

    # Use base URL to create absolute URL
    img_url = f"https://www.jpl.nasa.gov{img_url}"
    browser.quit()
    return img_url
Example #6
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = User(name="Alice",
                         email="*****@*****.**",
                         password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run,
                                               kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()

    def test_2_add_post(self):
        self.browser.visit("http://127.0.0.1:8080")
        print("current url = ", self.browser.url)

        self.browser.driver.set_window_size(1920, 1080)
        self.browser.click_link_by_text('login')
        print("current url = ", self.browser.url)

        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        add_link = self.browser.find_link_by_partial_text('add')
        add_link.click()
        print(self.browser.url)

        title = "test_acceptance_add_post"
        self.browser.fill("title", title)
        now = datetime.datetime.now()
        now = str(now)
        self.browser.fill("content", now)
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        print(self.browser.url)

        new_post_appears = self.browser.is_text_present(
            title) and self.browser.is_text_present(now)
        print("new_post_appears = ", new_post_appears)
        self.assertEqual(new_post_appears, True)
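Note: PhantomJS has been discontinued and recent Splinter/Selenium releases no longer support it; in setUp above, headless Chrome is the usual drop-in replacement (assumes a locally installed chromedriver):

# Hedged substitute for Browser("phantomjs") in the setUp above;
# headless Chrome fills the same role.
self.browser = Browser("chrome", headless=True)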
Example #7
def Hemi_image(name):

    # Visit URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    browser.visit(url)

    # Find and click the title link (avoid shadowing the function name)
    title_link = browser.find_link_by_partial_text(name)
    title_link.click()

    # Find the open button and click that
    open_toggle = browser.find_by_id('wide-image-toggle')
    open_toggle.click()

    # Parse the resulting html with soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Find the relative image url
    try:
        img_url_1 = soup.find('img', {"class": "wide-image"})['src']
    except (AttributeError, TypeError):
        # find() returns None when the image is missing, so subscripting raises
        browser.quit()
        return None

    # Use the base URL to create an absolute URL
    img_url1 = f'https://astrogeology.usgs.gov{img_url_1}'
    browser.quit()
    return img_url1
Example #8
def scrape_2():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(url)
    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()

    new_html = browser.html
    img_soup = BeautifulSoup(new_html, 'html.parser')
    img_link = img_soup.find("figure", class_='lede').a['href']
    recent_image = 'https://www.jpl.nasa.gov' + str(img_link)

    browser.quit()
    return recent_image
Example #10
def scrape():
    results = {}

    executable_path = {
        'executable_path': r'C:\p\HomeWork\Web-Scraping-Challenge\Mission_to_Mars\chromedriver.exe'
    }

    browser = Browser('chrome', **executable_path)

    # 1. latest news title and paragraph from mars.nasa.gov
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    browser.is_element_present_by_css('ul.item_list', wait_time=2)
    soup = BeautifulSoup(browser.html, 'html.parser')
    title = soup.find('div', 'content_title').get_text()
    news_p = soup.find('div', 'article_teaser_body').get_text()
    results['news_title'] = title
    results['news_paragraph'] = news_p
    
    # 2. jpl.nasa.gov/spaceimages
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_btn = browser.find_by_id('full_image')
    full_image_btn.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_btn = browser.find_link_by_partial_text('more info')
    more_info_btn.click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    img_url_rel = soup.select_one('figure.lede a img').get('src')
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'
    results['featured_images'] = img_url

    # 3. table from space-facts.com/mars/
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    results['facts'] = df.to_html(classes='table table-striped')

    # 4. hemisphere images from astrogeology.usgs.gov
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = []
    links = browser.find_by_css('a.product-item h3')

    for i in range(len(links)):
        hemi = {}
        browser.find_by_css('a.product-item h3')[i].click()
        sample_elm = browser.find_link_by_text('Sample').first
        img_url = sample_elm['href']
        title = browser.find_by_css('h2.title').text
        hemi['title'] = title
        hemi['img_url'] = img_url
        hemispheres.append(hemi)
        browser.back()
    results['hemispheres'] = hemispheres

    browser.quit()
    return results
Example #11
def retrieve_hemispheres():
    browser = Browser("chrome",headless=True)
    browser.visit(source_urls['hemispheres'])
    browser.click_link_by_partial_text('Enhanced')

    browser.click_link_by_partial_text('Back')

    hemisphere_links = browser.find_link_by_partial_text('Hemisphere')
    link_text = []
    for link in hemisphere_links:
        link_text.append(link.text)
    hemisphere_image_urls = []
    for link in link_text:
        browser.click_link_by_partial_text(link)
        hemisphere_image_urls.append({
            'title' : link[:-9],
            'tif_url' : browser.find_link_by_partial_text('Original')['href'],
            'jpg_url' : browser.find_link_by_text('Sample')['href'],
        })
        browser.click_link_by_partial_text('Back')
    browser.quit()
    return hemisphere_image_urls
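Note: retrieve_hemispheres() above references a module-level source_urls mapping that is not shown. A plausible minimal definition, using the hemisphere search URL that appears throughout these examples:

# Hypothetical config assumed by retrieve_hemispheres(); the key name
# comes from the code above, the URL from the other examples here.
source_urls = {
    'hemispheres': ('https://astrogeology.usgs.gov/search/results'
                    '?q=hemisphere+enhanced&k1=target&v1=Mars'),
}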
Example #12
def scrape():
    browser = init_browser()
    mars = {}

    #marsimage
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')

    image = browser.find_by_id('full_image')
    image.click()

    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    html = browser.html
    soup = bs(html, 'html.parser')
    image_find = soup.find('figure', class_='lede').find('img')['src']

    featured_image_url = f"https://www.jpl.nasa.gov/{image_find}"
    mars["featured_image_url"] = featured_image_url

    # Mars Weather
    url2 = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url2)

    html = browser.html
    soup = bs(html, 'html.parser')

    mydivs = soup.find_all("p", {
        "class":
        "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    })
    mars_weather = mydivs[0].text

    mars['mars_weather'] = mars_weather

    # Mars Facts
    url3 = 'https://space-facts.com/mars/'
    tables = pd.read_html(url3)

    df = tables[0]
    df.columns = ['Details', 'Values']
    df.set_index('Details', inplace=True)
    # html_table = df.to_html()
    # html_table.replace('\n', '')

    mars['mars_facts'] = df

    browser.quit()
    return mars
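Note: scrape() above relies on an init_browser() helper that is not shown in the excerpt. A minimal sketch of what it presumably looks like (the chromedriver path is an assumption):

from splinter import Browser

def init_browser():
    # Hypothetical helper assumed by the scrape() above;
    # adjust the executable_path for your platform.
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path, headless=False)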
Example #13
class Submitter:
    def __init__(self, url, username, password, course_id, homework_id,
                 submit_list):
        self._callback = None
        self._browser = Browser()
        self._url = url
        self._username = username
        self._password = password
        self._course_id = course_id
        self._homework_id = homework_id
        self._submit_list = submit_list

    def _login(self):
        self._browser.visit(self._url)
        self._browser.fill("i_user", self._username)
        self._browser.fill("i_pass", self._password)
        self._browser.find_by_id("loginButtonId").click()

    def _nvi2course(self):
        self._browser.find_link_by_partial_text(self._course_id).first.click()
        self._browser.windows.current.close()

    def _nvi2homework(self):
        self._browser.find_link_by_partial_text("课程作业").first.click()
        self._browser.find_link_by_partial_text(
            self._homework_id).first.click()

    def _submit(self, stu_id, grade, comment, ex_file):
        xpath_str = '//tbody/tr[td[3]=' + stu_id + ']/td[last()]/a'
        self._browser.find_by_xpath(xpath_str).last.click()
        self._browser.fill('cj', grade)
        self._browser.fill('pynr', comment)
        if os.path.splitext(ex_file)[1] == '.pdf':
            self._browser.driver.find_element_by_name('fileupload').send_keys(
                ex_file)
        submit_btn_css = 'div[class="sub-back sub-back-3 absolute"] > input[class="btn"]'
        self._browser.find_by_css(submit_btn_css).first.click()
        while not self._browser.is_text_present('关闭', wait_time=1):
            pass
        self._browser.find_by_text('关闭').click()
        self._browser.back()
        self._browser.back()

    def add_single_task_callback(self, callback):
        self._callback = callback

    def start(self):
        self._login()
        self._nvi2course()
        self._nvi2homework()
        for stu_id, grade, comment, ex_file in self._submit_list:
            self._submit(stu_id, grade, comment, ex_file)
            self._callback([stu_id, grade, comment, ex_file])
        self._browser.quit()

    @staticmethod
    def clean():
        work_dir = os.getcwd()
        os.remove(work_dir + "/geckodriver.log")
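Note: a minimal usage sketch for the Submitter class above; every value below (URL, credentials, IDs, file path) is a placeholder, not taken from the source.

submit_list = [
    # (student id, grade, comment, attachment path) -- all hypothetical
    ("2019012345", "95", "Well done", "/tmp/feedback_2019012345.pdf"),
]
submitter = Submitter(
    url="http://learn.example.edu/login",   # placeholder
    username="teacher01",                   # placeholder
    password="secret",                      # placeholder
    course_id="CS101",
    homework_id="HW3",
    submit_list=submit_list,
)
submitter.add_single_task_callback(lambda task: print("submitted:", task))
submitter.start()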
Example #14
def mars_hemi():
    browser = Browser("chrome", executable_path="chromedriver", headless=True)
    # Visit the mars hemisphere site
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Set up HTML parser
    html = browser.html
    hem_soup = BeautifulSoup(html, 'html.parser')

    # Find all h3 headings
    hem_title = hem_soup.find_all('h3')

    # Put titles in a list
    hem_list = []
    for title in hem_title:
        hem_list.append(title.text)

    url_list = []
    for index in range(4):
        
        # Visit the mars hemisphere site
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)

        # Optional delay for loading the page
        browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

        # Find the title link and click it
        title_link = browser.find_link_by_partial_text(hem_list[index])
        title_link.click()

        # Parse the resulting html with soup
        html = browser.html
        img_soup = BeautifulSoup(html, 'html.parser')

        # Find the relative image url
        hem_url_rel = img_soup.find('img', class_ = "wide-image").get("src")

        # Use the base URL to create an absolute URL
        hem_url = f'https://astrogeology.usgs.gov/{hem_url_rel}'

        url_list.append(hem_url)

    browser.quit()
    return hem_list, url_list
Example #15
def JPL_image():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    browser.visit(url_jpl)
    browser.find_link_by_partial_text('FULL IMAGE').click()
    browser.is_element_not_present_by_id('images', wait_time=2)
    browser.find_link_by_partial_text('more info').click()

    link = browser.find_link_by_partial_href('largesize')

    # Pull the image path out of the anchor's html string
    image_url = link.html.split("=")[-1].lstrip('"').rstrip('">')

    featured_image_url = 'https://www.jpl.nasa.gov' + image_url

    browser.quit()
    return featured_image_url
Example #16
def get_featured_img_func(url):
    # Path to chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Go to website
    browser.visit(url)

    # find "Full Image" button to click on it to get to next webpage
    full_img = browser.find_by_id("full_image")
    full_img.click()

    # find "More Info" button to click on it to get to next webpage
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # read website's html
    html = browser.html
    soup = bs(html, 'html.parser')

    # find "a" tag to find href containing the URL
    result = browser.find_by_tag("a")
    relative_image_path = result[58]["href"]

    # get image title (the h1 text is tab-padded, so split on tabs)
    relative_image_title = soup.find('h1', class_='article_title')
    relative_image_title = relative_image_title.get_text()
    relative_image_title = relative_image_title.split('\t')
    final_title_feature_img = []
    final_title_feature_img.append({
        'Title': relative_image_title[4],
        'URL': relative_image_path
    })

    # Close the browser after scraping
    browser.quit()

    #return scraped object
    return final_title_feature_img
Example #17
def featured_img():

    executable_path = {"executable_path": (r"C:\Users\Mickey\anaconda3\Scripts\chromedriver.exe")}
    browser = Browser("chrome", **executable_path, headless=False)

    #Browse URL
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    time.sleep(1)

    #Ask Splinter to Go to Site and Click Button with Class Name full_image
    # <button class="full_image">Full Image</button>
    full_image_button = browser.find_by_id("full_image")
    full_image_button.click()

    #Find "More Info" Button and Click It
    browser.is_element_present_by_text("more info", wait_time=1)
    more_info_element = browser.find_link_by_partial_text("more info")
    more_info_element.click()

    #Parse Results with BeautifulSoup (Allow for Try and Except)
    html = browser.html
    image_soup = bs(html, "html.parser")

    img = image_soup.select_one("figure.lede a img")

    try:
        img_url = img.get("src")
    except AttributeError:
        return None
    
    #Combine with Base URL
    img_url = f"https://www.jpl.nasa.gov{img_url}"

    #Close Browser when done
    browser.quit()
    
    #Return Results
    return img_url
Example #18
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # URL of NASA Mars News Site to be scraped
    url = 'https://mars.nasa.gov/news/'

    # In[14]:

    # Retrieve page with the requests module
    browser.visit(url)

    # In[16]:

    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # In[17]:

    html = browser.html

    # In[19]:

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')

    # In[20]:

    # Examine the results, then determine element that contains sought info
    print(soup.prettify())

    # In[23]:

    element = soup.select_one("ul.item_list li.slide")
    news_title = element.find("div", class_="content_title").get_text()

    # In[24]:

    paragraph = element.find("div", class_='article_teaser_body').get_text()

    # In[21]:

    # # Extract title text
    # news_title = soup.find('div', class_='content_title').text
    # news_p = soup.find('div', class_='article_teaser_body')
    # print(news_title)
    # print(news_p)

    # ## JPL Mars Space Images - Featured Image

    # In[26]:

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # In[27]:

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    # ### featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'

    # In[36]:

    element = browser.find_by_id("full_image")
    element.click()

    # In[37]:

    browser.is_element_present_by_text("more info", wait_time=1)
    findElement = browser.find_link_by_partial_text("more info")
    findElement.click()

    # In[38]:

    html = browser.html
    imagesoup = BeautifulSoup(html, 'html.parser')

    # In[39]:

    image = imagesoup.select_one("figure.lede a img")
    imagesource = image.get("src")

    # In[41]:

    featured_image_url = 'https://www.jpl.nasa.gov' + imagesource

    # ## Mars Facts

    # In[64]:

    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    fact_df = tables[0]
    fact_df.columns = ["Description", "value"]
    fact_df.set_index("Description", inplace=True)
    # In[67]:
    html_table = fact_df.to_html(classes="table table-striped")

    hemurl = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response = requests.get(hemurl)
    hemsoup = BeautifulSoup(response.text, "html.parser")
    item = hemsoup.find_all(class_="itemLink product-item")

    # In[92]:

    hem = []
    for image in item:

        image_url = "https://astrogeology.usgs.gov" + image.get("href")
        hem.append(image_url)

    # In[ ]:

    hemurl = []
    for url in hem:
        response = requests.get(url)
        imgsoup = BeautifulSoup(response.text, "html.parser")
        #time.sleep(2)
        imageurl = imgsoup.find("a", href=True, text="Sample")
        href = imageurl["href"]
        title = imgsoup.find(class_="title").text.strip().replace(
            ' Enhanced', '')

        hemurl.append({"title": title, "img_url": href})

    # In[ ]:
    data = {
        "news_title": news_title,
        "news_p": paragraph,
        "image": featured_image_url,
        "mars_df": html_table,
        "mars_hem": hemurl
    }

    browser.quit()
    return data
Example #19
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    nasaURL = 'https://mars.nasa.gov/news/'
    browser.visit(nasaURL)
    time.sleep(2)
    soupNasa = bs(browser.html, 'html.parser')

    news_title = soupNasa.find('ul', class_='item_list').find(
        'div', class_="content_title").find('a').text.strip()
    news_p = soupNasa.find('div', class_="article_teaser_body").text.strip()
    news_date = soupNasa.find('div', class_="list_date").text.strip()

    jplURL = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jplURL)
    time.sleep(2)
    soupJPL = bs(browser.html, 'html.parser')
    browser.find_by_id("full_image").click()
    browser.find_link_by_partial_text("more info").click()
    soupJPL = bs(browser.html, 'html.parser')

    featured_image_url = 'https://www.jpl.nasa.gov' + soupJPL.find(
        'figure').find('img')['src']

    twitterURL = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(twitterURL)
    time.sleep(2)
    soupTwitter = bs(browser.html, 'html.parser')

    mars_weather = soupTwitter.find("div", {
        "data-testid": "tweet"
    }).find('div', {
        "lang": "en"
    }).find('span').text

    factsURL = 'https://space-facts.com/mars/'
    tables = pd.read_html(factsURL)[0]
    tables.set_index(0, inplace=True)
    marsFacts = tables.to_html(header=False)

    hemisphereURL = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    hemisphereBaseURL = 'https://astrogeology.usgs.gov'
    browser.visit(hemisphereURL)
    time.sleep(2)
    soupHemi = bs(browser.html, 'html.parser')

    hemisphere_image_urls = []
    hemis = soupHemi.find_all('div', class_="item")

    for h in hemis:
        tempURL = hemisphereBaseURL + h.find('a')['href']
        browser.visit(tempURL)
        time.sleep(2)
        soupSearch = bs(browser.html, 'html.parser')
        downloads = soupSearch.find("div", class_="downloads").find_all("li")
        for dl in downloads:
            if (dl.find('a').text == "Sample"):
                temp_dict = {
                    "title": soupSearch.find("h2", class_="title").text,
                    "img_url": dl.find('a')['href']
                }
                hemisphere_image_urls.append(temp_dict)

    returnDictionary = {
        "news_title": news_title,
        "news_p": news_p,
        "news_date": news_date,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts_table": marsFacts,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    browser.quit()
    return returnDictionary
Example #20
url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url_image)

# In[73]:

html_i = browser.html
soup_i = BeautifulSoup(html_i, 'html.parser')

# In[74]:

browser.find_by_id("full_image").click()
browser.find_link_by_partial_text("more info").click()

# In[89]:

# re-parse the page after navigating, then collect the image tags
soup_i = BeautifulSoup(browser.html, 'html.parser')
results_i = soup_i.find_all('img')

# In[90]:

results_i

# In[91]:

image_i_big = []
for result_i in results_i:
    big_image = result_i['src']
    image_i_big.append(big_image)
Example #21
def scrape_info():

    browser = Browser("chrome")
    mars = {}

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)


    # Parse HTML with Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')


    # Find all the content title and paragraph
    news_title =  soup.find_all('div', class_='content_title')
    news_p = soup.find_all('div', class_='article_teaser_body')

    print(news_title)
    print(news_p)


    # A blank list to hold the headlines " we are trying to display all titles "
    news_titles = []
    # Loop over div elements
    for result in news_title:
        # Identify the anchor...
        if (result.a):
            # And the anchor has non-blank text...
            if (result.a.text):
                # Append the text to the list
                news_titles.append(result)
    news_titles



    # A blank list to hold the paragraphs " we are trying to display all paragraph "
    news_para = []
    # Loop over div elements
    for result in news_p:
        # Identify the anchor...
        if (result.text):
            # Append the text to the list
            news_para.append(result)
    news_para


    #Top 5 Titles
    top_titles = []
    # Print only the headlines
    for x in range(5):
        temp=news_titles[x].text
        newvar = temp.strip('\n\n')
        top_titles.append(newvar)
    mars["news_title"] =top_titles[0]

    #Top 5 Paragraph
    top_paragraph = []
    # Print only the headlines
    for x in range(5):
        temp=news_para[x].text
        newvar = temp.strip('\n\n')
        top_paragraph.append(newvar)
    top_paragraph
    mars["news_paragraph"] =top_paragraph[0]


    # URL of page to be scraped
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(2)
    browser.find_by_id('full_image').click()
    time.sleep(2)
    browser.find_link_by_partial_text('more info').click()
    # Parse HTML with Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    featured_image_url = soup.find('figure', class_="lede")
    featured_image_url = featured_image_url.a.img["src"]
    featured_image_url

    main_url = 'https://www.jpl.nasa.gov'

    featured_image_url = main_url + featured_image_url

    featured_image_url
    mars["featured_image"] =featured_image_url


    ### Mars Facts



    # define url
    mars_facts_url = "https://space-facts.com/mars/"

    # read html into pandas
    tables = pd.read_html(mars_facts_url)

    # It returns 3 tables. The first has the data needed, so will convert to a dataframe and clean up nameing

    facts_mars = tables[0]
    facts_mars.columns = ["Description", "Value"]

    facts_mars

    #setting index
    facts_mars.set_index('Description', inplace=True)
    facts_mars.head()

    #Use Pandas to convert the data to a HTML table string.
    html_table = facts_mars.to_html()
    html_table
    mars["facts"] =html_table



    # define url and open in browser

    mars_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    browser.visit(mars_url)



    # Parse HTML with Beautiful Soup
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    #finding titles of hemispheres
    hemisphere_titles = soup.find_all('h3')
    hemisphere_titles



    #Use a Python dictionary to store the data using the keys `img_url` and `title`.

    hemisphere_image_urls = []

    # go through each title: click it, open the wide image, copy the url, and store it in the dictionary
    for i in range(len(hemisphere_titles)):
        hemisphere_title = hemisphere_titles[i].text
        print(hemisphere_title)
        
        hemisphere_images = browser.find_by_tag('h3')
        hemisphere_images[i].click()
        
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        
        img_url = soup.find('img', class_='wide-image')['src']
        img_url = "https://astrogeology.usgs.gov" + img_url
        print(img_url)
        
        hemisphere_dict = {"title": hemisphere_title, "img_url":img_url}
        hemisphere_image_urls.append(hemisphere_dict)
        
        browser.back()

    mars["hemispheres"] = hemisphere_image_urls

    browser.quit()
    return mars
Example #22
def scrape_all():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    url ='https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    dic={}

    #Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title
    #and Paragraph Text. Assign the text to variables that you can reference later.

    title = soup.find_all('div', class_='content_title')
    body = soup.find('div', class_='article_teaser_body')
    print(title[1].text)
    print(body.text)

    news_title = title[1].text
    news_p = body.text

    dic['news_title'] = news_title
    dic['news_p'] = news_p

    #browser.quit()
    # JPL Mars Space Images - Featured Image
    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    imgs=soup.find('figure', class_='lede')
    print(imgs)
    print(imgs.a)
    print(imgs.a.img)
    print(imgs.a.img['src'])

    featured_image_url='https://www.jpl.nasa.gov'+imgs.a.img['src']
    print(featured_image_url)
    dic['featured_image_url'] = featured_image_url

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    facts = pd.read_html(url)

    df = facts[0]
    df.columns = ['Profile', 'Values']
    df.set_index('Profile', inplace=True)

    html_facts = df.to_html()

    # strip unwanted newlines to clean up the table
    html_facts = html_facts.replace('\n', '')
    dic['facts'] = html_facts

    df.to_html('facts.html')

    # Mars Hemispheres (reuse the browser that is already open)

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    image_urls = []
    imgs = browser.find_by_css("a.product-item h3")

    # For loop

    for i in range(len(imgs)):
        hemisphere = {}   
        browser.find_by_css("a.product-item h3")[i].click()
        
        # Find Sample Image
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        
        # Get the Title
        hemisphere["title"] = browser.find_by_css("h2.title").text
        
        # Append
        image_urls.append(hemisphere)
        
        # find imgs back
        browser.back()
    dic['hemisphere'] = image_urls

    browser.quit()
    return dic
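Note: scrape_all() above is typically driven from a small Flask + PyMongo app (the last example on this page opens the same kind of MongoDB connection). A hedged sketch of that calling side; the database and collection names are illustrative:

import pymongo

# Illustrative names, not from the source above.
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client.mars_db
data = scrape_all()
# Upsert the freshly scraped document into a single-record collection.
db.mars_data.update_one({}, {"$set": data}, upsert=True)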
Example #23
def scrape_all():
    # Create the exe path for chrome to open chrome page
    # Will open a chrome window
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    print(
        f'\n-------------------------------------------------------------------------------------\nScraping Started'
    )
    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )

    # Visit the site to scrape
    # Will go to the website and extract the browser url
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Find the actual website path we are going to scrape and read/show the data using BeautifulSoup
    news_html = browser.html
    soup = bs(news_html, 'lxml')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text

    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )
    print(f'\nNews Title: {news_title}')
    print(f'\nNews Para: {news_p}')
    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )

    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Visit the site to scrape
    # Will go to the website and extract the browser url
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    time.sleep(1)

    # Find the actual website path we are going to scrape and read/show the data using BeautifulSoup
    jpl_html = browser.html
    soup = bs(jpl_html, 'lxml')
    #print(soup.prettify())

    image_link = soup.find(
        'div',
        class_='carousel_container').article.footer.a['data-fancybox-href']
    featured_image_url_medium = f'https://www.jpl.nasa.gov{image_link}'

    time.sleep(1)
    full_image_elem = browser.find_by_id("full_image")
    full_image_elem.click()

    time.sleep(1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    html = browser.html
    img_soup = bs(html, 'lxml')

    img_url_rel = img_soup.select_one('figure.lede a img').get("src")
    featured_image_url_large = f'https://www.jpl.nasa.gov{img_url_rel}'

    print(f'Featured Image: {featured_image_url_large}')
    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )
    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Visit the site to scrape
    # Will go to the website and extract the browser url
    weather_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(weather_url)
    time.sleep(1)

    # Find the actual website path we are going to scrape and read/show the data using BeautifulSoup
    weather_html = browser.html
    soup = bs(weather_html, 'lxml')

    weather_all = soup.find_all('div', class_='js-tweet-text-container')

    weather_list = []
    for x in weather_all:
        y = x.find('p', class_='js-tweet-text').text
        if "InSight" in y:
            weather_list.append(y)

    mars_weather = weather_list[0]

    print(f'Mars Weather: {mars_weather}')
    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )
    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Visit the site to scrape
    # Will go to the website and extract the browser url
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)

    # Find the actual website path we are going to scrape and read/show the data using BeautifulSoup
    facts_html = browser.html
    soup = bs(facts_html, 'lxml')

    facts_str = pd.read_html(facts_url)

    # https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.to_html.html
    facts_html = facts_str[1].to_html(index=False, header=False)
    #facts_str[1].to_html("facts.html", index = False, header = False)

    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    # Visit the site to scrape
    # Will go to the website and extract the browser url
    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)

    # Find the actual website path we are going to scrape and read/show the data using BeautifulSoup
    hemisphere_html = browser.html
    soup = bs(hemisphere_html, 'lxml')

    # Find the links
    image_urls = [(a.text, a['href'])
                  for a in browser.find_by_css('div[class="description"] a')]

    hemisphere_image_urls = []

    for title, url in image_urls:
        temp = {}
        temp['title'] = title
        browser.visit(url)
        img_url = browser.find_by_css('img[class="wide-image"]')['src']
        temp['img_url'] = img_url
        hemisphere_image_urls.append(temp)

    print(f'Dict: {hemisphere_image_urls}')
    print(
        f'\n-------------------------------------------------------------------------------------\n'
    )

    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    data = {
        "news_title": news_title,
        'news_paragraph': news_p,
        "featured_image": featured_image_url_large,
        "weather": mars_weather,
        "facts_html": facts_html,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    return data


#test = scrape_all()
Example #24
# In[69]:

#visiting the page
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

# In[70]:

time.sleep(2)
browser.find_by_id("full_image").click()

# In[71]:

time.sleep(2)
browser.find_link_by_partial_text("more info").click()

# In[72]:

image = bs(browser.html, "html.parser")

# In[75]:

image_url = image.find("figure", class_="lede").find("img")["src"]
image_url

# In[77]:

final_url = "https://www.jpl.nasa.gov" + image_url
final_url
Example #25
if JANUS:
    url = 'http://apps.webofknowledge.com.dianus.libr.tue.nl/DIIDW_AdvancedSearch_input.do?' \
          'SID=V2i7L6wGDEBBsnkAWFI&product=DIIDW&search_mode=AdvancedSearch'
    browser.visit(url)
    #this redirects to janus, fill in login info
    browser.fill('user',USERNAME)
    browser.fill('pass', PASSWORD)
    #find and click the login button
    browser.find_by_value('Login').first.click()
else:
    url = 'http://apps.webofknowledge.com/DIIDW_AdvancedSearch_input.do?SID=N1cpglrQOdCmC16gM44&product=DIIDW&search_mode=AdvancedSearch'
    browser.visit(url)

# if a new session needs to be started, click the link
try:
    browser.find_link_by_partial_text('new session').first.click()
except Exception:
    pass

def Build_Query_Citations(codes):
    #iterate through the list
    #build the query
    query = "CD=("
    for code in codes:
        if query == "CD=(":
            query += code
        else:
            query += " OR " + code
    query += ")"

    return query
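Note: for reference, the query string Build_Query_Citations produces (the codes below are made-up placeholders):

# The codes are illustrative placeholders, not real Derwent class codes.
print(Build_Query_Citations(['X21', 'T01-J05B']))
# -> CD=(X21 OR T01-J05B)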
Example #26
def scrape_info():

    #splinter exercise
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    mars = {}

    #scrape website
    url= 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(browser.html, 'html.parser')

    #pulling title
    news_title = soup.find_all('div', class_='content_title')
    news_title = news_title[1].a.text
    #print(news_title)

    mars["news_title"] = news_title

    news_p = soup.find_all('div', class_='article_teaser_body')
    news_p = news_p[0].text
    mars["news_p"]= news_p











    soup


    # *Splinter*




    url= 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(2)
    browser.find_by_id("full_image").click()
    time.sleep(2)
    browser.find_link_by_partial_text("more info").click()
    time.sleep(2)
    soup = BeautifulSoup(browser.html, 'html.parser')
    result = soup.find("figure", class_="lede")
    result = result.a.img["src"]
    featured_image_link = "https://www.jpl.nasa.gov" + result
    mars["featured_image_link"]= featured_image_link


    # FEATURED IMAGE

    # MARS FACTS




    url= 'https://space-facts.com/mars/'
    table= pd.read_html(url)
    table[0]
    df= table[0]
    df.columns= ["Description", "Values"]
    df.set_index("Description", inplace= True)
    df
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')
    mars["Facts"] = html_table

    # Mars Hemispheres
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    hemisphere_image_url = []

    for i in range(4):
        hemispheres = {}
        time.sleep(2)
        browser.find_by_css("a.product-item h3")[i].click()
        soup = BeautifulSoup(browser.html, 'html.parser')
        title = soup.find("h2", class_="title").get_text()
        image = soup.find("a", text="Sample").get("href")
        hemispheres["title"] = title
        hemispheres["img_url"] = image
        hemisphere_image_url.append(hemispheres)
        browser.back()
        
    mars["hemispheres"]= hemisphere_image_url

    mars= {
    "news_title": news_title,
    "news_p": news_p,
    "featured_image_link": featured_image_link,
    "Facts": html_table,
    "hemisphere_image_title_1": hemisphere_image_url[0]["title"],
    "hemisphere_image_url_1": hemisphere_image_url[0]["img_url"],
    "hemisphere_image_title_2": hemisphere_image_url[1]["title"],
    "hemisphere_image_url_2": hemisphere_image_url[1]["img_url"],
    "hemisphere_image_title_3": hemisphere_image_url[2]["title"],
    "hemisphere_image_url_3": hemisphere_image_url[2]["img_url"],
    "hemisphere_image_title_4": hemisphere_image_url[3]["title"],
    "hemisphere_image_url_4": hemisphere_image_url[3]["img_url"]
    }





    return mars
Example #27
class MegBotMentions:
    """MegBotMentions is an interface to Facebook Messages. You can use it by
    logging into your account and giving it a group message you are a member
    of. You can navigate to different pages of the message, read the page,
    and write to the group chat. Please make sure the other members are aware
    MegBotMentions is joining y'all in your conversation."""

    def __init__(self, un, pw):
        self.username = un
        self.password = pw
        self.currentPage = 0
        self.messageID = 0
        self.browser = Browser("phantomjs")

    def login(self):
        self.browser.visit(_base_url)
        self.browser.fill('email', self.username)
        self.browser.fill('pass', self.password)
        self.browser.find_by_css('input[type="submit"]').first.click()

        print("Logged in!")

        if self.messageID:
            self.move_to_message(self.messageID)

    def move_to_message(self, mID):
        self.currentPage = 0
        self.messageID = mID.strip()
        self.browser.visit(_base_msg_url + self.messageID)
        print(_base_msg_url + self.messageID)
        time.sleep(3)
        #self.send_message(_intro_message)

    def refresh_messages(self):
        if self.messageID == 0:
            return

        self.currentPage = 0
        self.browser.visit(_base_msg_url + self.messageID)
        time.sleep(3)

    def next_page(self):
        if self.messageID == 0:
            return False

        self.currentPage += 1
        print(self.currentPage)
        self.browser.visit(_base_msg_url + self.messageID +
                           _msg_url_mid + str(5 * self.currentPage))
        time.sleep(3)

    def send_message(self, inWords, receiver):
        # navigate back to the messages page
        self.browser.visit("https://mbasic.facebook.com/messages/")
        newconvo = self.browser.find_link_by_partial_text(receiver.title()).first
        newconvo.click()

        self.browser.find_by_id('composerInput')[0].fill(
            "Hey, " + receiver + ", you've been mentioned in a chat")
        self.browser.find_by_css('input[name="send"]').first.click()
        time.sleep(3)

    def read_messages(self):
        messages = []
        mHTML = self.browser.find_by_id("messageGroup").find_by_css("div")
        # unnecessary right now
        link = mHTML.pop(0).find_by_css("a")["href"]
        lines = mHTML.find_by_css("span")
        for line in lines:
            # keep only printable characters
            l = ''.join(c for c in line.text.strip() if c in string.printable)
            if l and l != "." and "Sent from" not in l and "Seen by" not in l:
                messages.insert(0, l)

        return messages
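Note: MegBotMentions references module-level constants that are not shown in the excerpt. Plausible hypothetical values, inferred only from how they are concatenated above:

_base_url = "https://mbasic.facebook.com/"  # hypothetical login page
_base_msg_url = "https://mbasic.facebook.com/messages/read/?tid="  # hypothetical; messageID is appended
_msg_url_mid = "&start="  # hypothetical; page offset is appended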
Example #28
from conn_info import *

parser = OptionParser()
parser.add_option("-d", "--dept", dest="dept_index", default=0, help="department index")
options, args = parser.parse_args()

dept_index = int(options.dept_index)

t0 = time.time()
browser = Browser('zope.testbrowser')
browser.visit(SCODOC)
print('Start: title:', browser.title)
print('URL: ', browser.url)
# print(browser.html)

links = browser.find_link_by_partial_text('Scolarit')
print('%d departments' % len(links))

links[dept_index].click()  # go to the first department

# ---- Authentication form
print('Authentication: ', browser.url)

browser.fill('__ac_name', USER)
browser.fill('__ac_password', PASSWD)
button = browser.find_by_id('submit')
button[0].click()

# ---- Dept home page
print(browser.url)
Example #29
class KuchIterator:
    def __init__(self):
        self.browser = Browser('phantomjs')
        self.browser.visit(PAGE_URL)
        self.get_next_row()

    def __iter__(self):
        return self

    def get_next_row(self):
        soup = bs4.BeautifulSoup(self.browser.html, 'html.parser')
        self.schedule = soup.find_all('table')[0]
        self.cur_row = self.schedule.tbody.tr

    def get_a_children(self, parent):
        out = ""
        for c1 in parent.children:
            if getattr(c1, 'name', None):
                if c1.name == 'a':
                    if c1.string:
                        out += c1.string
                elif c1.name in ['i', 'br']:
                    out += ", "
                    out += self.get_a_children(c1)
            else:
                if c1.string:
                    out += c1.string
        return out

    def __next__(self):
        try:
            c1 = self.cur_row.td
        except AttributeError:
            self.browser.find_link_by_partial_text('next').click()
            time.sleep(5)
            self.get_next_row()
            c1 = self.cur_row.td

        when1 = c1.span['content']
        c1 = c1.next_sibling
        who1 = c1.next_sibling.div.ul.li.contents[0]
        if not isinstance(who1, bs4.element.NavigableString):
            who1 = who1.contents[0]
        c1 = c1.next_sibling
        c1 = c1.next_sibling
        c1 = c1.next_sibling
        c1 = c1.next_sibling
        where1 = c1.next_sibling.a.contents[0]
        self.cur_row = self.cur_row.next_sibling
        self.cur_row = self.cur_row.next_sibling
        pos1 = when1.find('+')
        if pos1 != -1:
            when1 = when1[:pos1]
        date1 = datetime.datetime.strptime(when1, '%Y-%m-%dT%H:%M:%S')
        dict1 = {}
        dict1['year'] = date1.year
        dict1['month'] = date1.month
        dict1['day'] = date1.day
        dict1['hour'] = date1.hour
        dict1['min'] = date1.minute
        dict1['what'] = who1
        dict1['where'] = where1
        dict1['content'] = 'musicrux'
        return dict1
Example #30
def scrape_info():

    #run ChromeDriverManager
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news'
    browser.visit(url)

    mars = {}

    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = bs(html, 'html.parser')
    # Retrieve all elements that contain mars article information
    result1 = soup.find_all('div', class_='content_title')
    result2 = soup.find('div', class_='article_teaser_body')

    mars_article = result1[1].text.strip()
    mars_body_text = result2.text.strip()

    print(mars_article)
    print('-----------')
    print(mars_body_text)
    mars["news_title"] = mars_article
    mars["news_p"] = mars_body_text

    #visit the url for JPL Featured Space Image
    url_2 = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url_2)
    time.sleep(.5)
    browser.find_link_by_partial_text('FULL IMAGE').click()

    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = bs(html, 'html.parser')

    # Retrieve all elements that contain image information
    mars_image = soup.find('img', class_='headerimage fade-in')

    print('https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/')
    print(mars_image)
    print('-----------')
    mars_image["src"]

    featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + mars_image[
        "src"]

    mars["featured_image_url"] = featured_image_url

    url_3 = 'https://space-facts.com/mars/'
    mars_tables = pd.read_html(url_3)

    mars_df = mars_tables[0]
    # set_index(..., inplace=True) returns None, so don't assign its result
    mars_df.set_index(0, inplace=True)

    mars3_df = mars_df.rename(columns={0: ' ', 1: ' '})

    mars_html_table = mars3_df.to_html()
    clean_mars_html_table = mars_html_table.replace('\n', '')

    mars["facts"] = clean_mars_html_table

    url_4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_4)
    link_list = browser.find_by_css("a.product-item h3")
    mars_hemisphere_image_urls = []
    for x in range(len(link_list)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[x].click()
        sample = browser.links.find_by_text("Sample").first
        hemisphere["img_url"] = sample["href"]
        hemisphere["title"] = browser.find_by_css("h2.title").text
        mars_hemisphere_image_urls.append(hemisphere)
        browser.back()
    mars["hemisphere"] = mars_hemisphere_image_urls

    browser.quit()

    # Return results
    return mars
Example #31
def mars_scrape():
    mars = {}

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    sleep(1)
    ourwebpage = browser.html
    soup = bs(ourwebpage, 'html.parser')
    x = soup.body.find_all(class_="content_title")
    # Skip the first content_title match, which is not an article headline,
    # and keep the first real title.
    alltitle = [i.find('a').text.strip() for i in x[1:]]

    mars['title'] = alltitle[0]

    paragraph = soup.body.find_all(class_="article_teaser_body")
    news_p = paragraph[0].text

    mars['news_paragraph'] = news_p

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    sleep(1)

    browser.click_link_by_id('full_image')

    z = browser.find_link_by_partial_text("more info")
    z.click()
    sleep(1)

    imgwebpage = browser.html
    soup2 = bs(imgwebpage, 'html.parser')

    image_path = soup2.find(class_="main_image")['src']
    image_full_path = "https://www.jpl.nasa.gov" + image_path

    mars["feature_img"] = image_full_path

    mars_table = pd.read_html("https://space-facts.com/mars/")[0]
    mars_table.rename(columns={0: "Category", 1: "Value"}, inplace=True)

    mars["mars_table"] = mars_table

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    sleep(1)

    image = []

    for i in range(4):
        browser.find_by_css("a.product-item h3")[i].click()
        sleep(1)
        html_image = browser.html

        soupitem = bs(html_image, 'html.parser')

        zz = soupitem.find('a', text="Sample")
        image.append(zz['href'])

        browser.back()
    hemisphere_image_urls = [
        {
            "title": "Valles Marineris Hemisphere",
            "img_url": image[0]
        },
        {
            "title": "Cerberus Hemisphere",
            "img_url": image[1]
        },
        {
            "title": "Schiaparelli Hemisphere",
            "img_url": image[2]
        },
        {
            "title": "Syrtis Major Hemisphere",
            "img_url": image[3]
        },
    ]

    mars["mars_image"] = hemisphere_image_urls

    return mars
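
# The fixed sleep(1) pauses above are timing-dependent; a sturdier sketch
# using splinter's built-in polling (wait_and_click is a hypothetical helper,
# not part of the original code):
def wait_and_click(browser, css, index=0, wait_time=5):
    # Poll up to wait_time seconds for the element before clicking.
    if browser.is_element_present_by_css(css, wait_time=wait_time):
        browser.find_by_css(css)[index].click()
        return True
    return False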
# ""### Featured Images"
# In[9]:
# Visit URL
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

# In[10]:
# Find and click the full image button
full_image_elem = browser.find_by_id('full_image')
full_image_elem.click()

# In[11]:
# Find the more info button and click that
browser.is_element_present_by_text('more info', wait_time=1)
more_info_elem = browser.find_link_by_partial_text('more info')
more_info_elem.click()

# In[12]:
# Parse the resulting html with soup
html = browser.html
img_soup = BeautifulSoup(html, 'html.parser')

# In[13]:
browser.is_element_present_by_text('main_image', wait_time=1)
# Find the relative image url
img_url_rel = img_soup.select_one('figure.lede a img').get("src")
img_url_rel

# In[14]:
# Use the base URL to create an absolute URL
def scrape():
    # Dependencies
    import time
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from splinter import Browser
    # from selenium.webdriver.common import action_chains, keys
    # from selenium import webdriver
    import pymongo
    conn = "mongodb://localhost:27017"
    client = pymongo.MongoClient(conn)
    db = client.mars_db
    mars_data = db.mars_data
    db.mars_data.drop()

    # An earlier workaround drove the page with selenium's webdriver directly;
    # the commented-out lines are kept for reference.

    # driver = webdriver.Chrome()
    # url = 'https://mars.nasa.gov/news/'
    # driver.get(url)

    # html = driver.page_source
    # soup = BeautifulSoup(html, 'lxml')
    # In[35]:

    browser = Browser('chrome', headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(5)

    # In[36]:

    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    print(soup.prettify())

    # In[37]:

    # soup.body.prettify()

    # In[38]:

    # Extract news title text
    title = soup.find('div', class_='bottom_gradient').text
    print(title)

    # In[39]:

    # Extract paragraph text
    paragraph = soup.find('div', class_='rollover_description_inner').text
    print(paragraph)

    # ### JPL Mars Space Images - Featured Image

    # In[7]:

    # Visit the url for JPL's Featured Space Image here.
    # Use splinter to navigate the site and find the image url for
    # the current Featured Mars Image and assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.

    # # Example:
    # featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'

    # In[11]:

    #img_url = 'https://www.jpl.nasa.gov/spaceimages/'

    #executable_path = {'executable_path': './chromedriver'}
    #browser = Browser('chrome', **executable_path)
    #browser.visit(img_url)

    browser = Browser('chrome', headless=False)
    img_url = 'https://www.jpl.nasa.gov/spaceimages/'
    browser.visit(img_url)
    time.sleep(5)

    # In[12]:

    browser.click_link_by_id('full_image')

    # In[13]:

    time.sleep(5)
    browser.find_link_by_partial_text('more info').click()

    # In[14]:

    #time.sleep(5)
    #browser.find_link_by_partial_text('.jpg').click()

    # In[15]:

    time.sleep(5)
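    # NOTE: indexing the page's <img> tags by position is brittle; index 6
    # happened to match the featured image in the page layout at the time.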
    featured_image_url = browser.find_by_tag('img')[6]['src']
    featured_image_url

    # ### Mars Weather

    # In[16]:

    # Visit the Mars Weather twitter account here
    # and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report
    # as a variable called mars_weather.

    # Example:
    # mars_weather = \
    # 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa, daylight 06:09-17:55'

    # In[17]:

    browser = Browser('chrome', headless=False)
    tw_acct_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tw_acct_url)
    time.sleep(5)

    # In[18]:

    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    # print(soup.prettify())

    # In[19]:

    container = soup.find('div', class_='js-tweet-text-container')
    container

    # In[20]:

    mars_weather = container.find(
        'p',
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    mars_weather

    # ### Mars Facts

    # In[21]:

    # Visit the Mars Facts webpage here and use Pandas
    # to scrape the table containing facts about the planet
    # including Diameter, Mass, etc.
    # Use Pandas to convert the data to a HTML table string.

    # In[22]:

    marsfacts_url = 'https://space-facts.com/mars/'
    tables = pd.read_html(marsfacts_url)
    tables

    # In[23]:

    df = tables[0]
    df

    # In[24]:

    df = df.rename(columns={0: 'Measurement', 1: 'Value'})
    df = df.set_index('Measurement')
    df

    # In[25]:

    # convert table to html string
    html_table = df.to_html()
    html_table

    # In[26]:

    # strip unwanted newlines to clean up the table.
    html_table = html_table.replace('\n', '')
    html_table

    # ### Mars Hemispheres

    # In[27]:

    # Visit the USGS Astrogeology site here to obtain
    # high-resolution images for each of Mars's hemispheres.
    # You will need to click each of the links to the hemispheres
    # in order to find the image url to the full-resolution image.
    # Save both the image url string for the full-resolution hemisphere image,
    # and the Hemisphere title containing the hemisphere name.

    # Use a Python dictionary to store the data using the keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list.
    # This list will contain one dictionary for each hemisphere.

    # # Example:
    # hemisphere_image_urls = [
    #     {"title": "Valles Marineris Hemisphere", "img_url": "..."},
    #     {"title": "Cerberus Hemisphere", "img_url": "..."},
    #     {"title": "Schiaparelli Hemisphere", "img_url": "..."},
    #     {"title": "Syrtis Major Hemisphere", "img_url": "..."},
    # ]

    # In[28]:

    browser = Browser('chrome', headless=False)
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    time.sleep(5)

    # In[29]:
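
    # NOTE: find_by_tag('a')[41] assumes the 'Sample' download link is the
    # 42nd anchor on each detail page; a text lookup such as
    # browser.links.find_by_text('Sample') is less position-dependent.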

    browser.find_by_css('h3')[0].click()
    img1_url = browser.find_by_tag('a')[41]['href']
    print(img1_url)

    img1_title = browser.find_by_css('h2')[0].text
    img1_title = img1_title.replace(' Enhanced', '')
    print(img1_title)

    # In[30]:

    browser.back()
    browser.find_by_css('h3')[1].click()
    img2_url = browser.find_by_tag('a')[41]['href']
    print(img2_url)

    img2_title = browser.find_by_css('h2')[0].text
    img2_title = img2_title.replace(' Enhanced', '')
    print(img2_title)

    # In[31]:

    browser.back()
    browser.find_by_css('h3')[2].click()
    img3_url = browser.find_by_tag('a')[41]['href']
    print(img3_url)

    img3_title = browser.find_by_css('h2')[0].text
    img3_title = img3_title.replace(' Enhanced', '')
    print(img3_title)

    # In[32]:

    browser.back()
    browser.find_by_css('h3')[3].click()
    img4_url = browser.find_by_tag('a')[41]['href']
    print(img4_url)

    img4_title = browser.find_by_css('h2')[0].text
    img4_title = img4_title.replace(' Enhanced', '')
    print(img4_title)

    # In[33]:

    # Use a Python dictionary to store the data using the keys img_url and title.
    hemisphere_img_dict = [
        {
            "title": img1_title,
            "img_url": img1_url
        },
        {
            "title": img2_title,
            "img_url": img2_url
        },
        {
            "title": img3_title,
            "img_url": img3_url
        },
        {
            "title": img4_title,
            "img_url": img4_url
        },
    ]

    data_outputs = {
        'title': title,
        'paragraph': paragraph,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'html_table': html_table,
        'hemisphere_img_dict': hemisphere_img_dict
    }

    mars_data.insert_one(data_outputs)
    return data_outputs
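
# Verification sketch (an assumed way to check the Mongo write above, not
# part of the original example; requires a local mongod on port 27017):
def check_mars_data():
    import pymongo
    client = pymongo.MongoClient("mongodb://localhost:27017")
    # Read back the document that scrape() inserted into mars_db.mars_data.
    return client.mars_db.mars_data.find_one()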
def scrape_info():
    browser = Browser('chrome')
    mars = {}

    # # Scraping

    # # NASA Mars News

    # pull titles from website
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.find_all('div', class_="content_title")
    news_title = titles[1].text
    body = soup.find_all('div', class_="article_teaser_body")
    news_p = body[0].text
    mars["news_title"] = news_title
    mars["news_p"] = news_p


    # pull titles and body from website
    results = soup.find_all('div', class_="slide")
    for result in results:
        titles = result.find('div', class_="content_title")
        title = titles.find('a').text
        bodies = result.find('div', class_="rollover_description")
        body = bodies.find('div', class_="rollover_description_inner").text
        print('----------------')
        print(title)
        print(body)

    # # JPL Mars Space Images - Featured Image

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.find_by_id("full_image").click()
    time.sleep(2)
    browser.find_link_by_partial_text('more info').click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    result = soup.find('figure', class_='lede')
    featured_image_url = 'https://www.jpl.nasa.gov' + result.a.img["src"]
    mars["featured_image"] = featured_image_url

    # # Mars Facts

    mars_facts_url = "https://space-facts.com/mars/"
    table = pd.read_html(mars_facts_url)

    df = table[0]
    df.columns = ["Facts", "Value"]
    # set_index is not in-place by default, so reassign the result
    df = df.set_index("Facts")

    facts_html = df.to_html()
    facts_html = facts_html.replace("\n", "")
    mars["facts"] = facts_html

    # # Mars Hemispheres

    hemisphere_image_urls = []

    # Cerberus Hemispheres

    url = (
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced'
    )

    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    # print(soup.prettify())

    cerberus_img = soup.find_all('div', class_="wide-image-wrapper")

    for img in cerberus_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
    cerberus_title = soup.find('h2', class_='title').text
    # Build a fresh dict for each hemisphere; reusing a single dict object
    # would leave the list holding four references to the same entry.
    hemisphere_image_urls.append({"title": cerberus_title, "img_url": full_img})

    # Schiaparelli Hemisphere

    url = (
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced'
    )
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    #print(soup.prettify())

    schiaparelli_img = soup.find_all('div', class_="wide-image-wrapper")

    for img in schiaparelli_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
    schiaparelli_title = soup.find('h2', class_='title').text
    hemisphere_image_urls.append({"title": schiaparelli_title,
                                  "img_url": full_img})

    # Syrtis Hemisphere

    url = (
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced'
    )

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    #print(soup.prettify())

    syrtis_img = soup.find_all('div', class_="wide-image-wrapper")

    for img in syrtis_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
    syrtis_title = soup.find('h2', class_='title').text
    hemisphere_image_urls.append({"title": syrtis_title, "img_url": full_img})

    # Valles Marineris Hemisphere

    url = (
        'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced'
    )

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    valles_marineris_img = soup.find_all('div', class_="wide-image-wrapper")

    for img in valles_marineris_img:
        pic = img.find('li')
        full_img = pic.find('a')['href']
    valles_marineris_title = soup.find('h2', class_='title').text
    hemisphere_image_urls.append({"title": valles_marineris_title,
                                  "img_url": full_img})

    mars["hemisphere"] = hemisphere_image_urls
    return mars
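
# The four per-hemisphere blocks above repeat one pattern; a compact sketch of
# the same scrape as a loop (same USGS URLs; requests + BeautifulSoup only):
def hemisphere_urls():
    import requests
    from bs4 import BeautifulSoup
    base = 'https://astrogeology.usgs.gov/search/map/Mars/Viking/'
    slugs = ['cerberus_enhanced', 'schiaparelli_enhanced',
             'syrtis_major_enhanced', 'valles_marineris_enhanced']
    results = []
    for slug in slugs:
        soup = BeautifulSoup(requests.get(base + slug).text, 'html.parser')
        # The full-resolution link sits inside the wide-image-wrapper block.
        full_img = soup.find('div', class_='wide-image-wrapper').find('li').find('a')['href']
        title = soup.find('h2', class_='title').text
        results.append({"title": title, "img_url": full_img})
    return results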
Exemple #35
0
def scrape():
    mars = mongo.db.mars

    # Put everything from Jupyter Notebook Here
    # Set the executable path and initialize the chrome browser in splinter

    executable_path = {
        'executable_path': 'C:\\Users\\enere\\Desktop\\chromedriver'
    }
    browser = Browser('chrome', **executable_path)

    ##### MARS NEWS Scrape #####
    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Convert the browser html to a soup object
    html = browser.html
    news_scraper = BeautifulSoup(html, 'html.parser')

    # Use the parent element to find the first a tag and save it as `news_title`
    title_element = news_scraper.find('div', {'class': 'content_title'})
    news_title = title_element.get_text()

    # Use the parent element to find the paragraph text
    teaser_element = news_scraper.find('div', {'class': 'article_teaser_body'})
    teaser_text = teaser_element.get_text()

    ##### JPL Space Images Featured Image #####
    # Visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_scraper = BeautifulSoup(html, 'html.parser')

    # find the relative image url
    img_element = img_scraper.find('img', {'class': 'main_image'})

    # find the relative image url
    img_src = img_element.get('src')

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_src}'

    ##### Mars Weather Scrape
    # Visit URL
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)

    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div',
                                           attrs={
                                               "class": "tweet",
                                               "data-name": "Mars Weather"
                                           })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()
    mars_weather

    ##### Mars Facts Scrape

    # Visit URL
    url = 'https://space-facts.com/mars/'
    browser.visit(url)

    tables = pd.read_html(url)

    df = tables[0]
    df.columns = ['Mars - Earth Comparison', 'Mars', 'Earth']

    # Set the index to the Mars - Earth Comparison column
    df.set_index('Mars - Earth Comparison', inplace=True)

    # Convert the DataFrame back to an HTML table string
    html_table = df.to_html()

    ##### Mars Hemisphere Scrape
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}

        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()

        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']

        # Get Hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text

        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)

        # Finally, we navigate backwards
        browser.back()

    browser.quit()
    ##### Create a dictionary to store our scraped data
    scraped_data = {
        'News Title': news_title,
        'Teaser Text': teaser_text,
        'Image URL': img_url,
        'Mars Weather': mars_weather,
        'Mars Hemisphere': hemisphere_image_urls,
        'Mars Facts': html_table
    }

    ##### Put into MongoDB
    mars.update_one({}, {"$set": scraped_data}, upsert=True)

    return jsonify(scraped_data)
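
# The example above assumes a surrounding Flask app that supplies the global
# `mongo` and `jsonify`; a minimal sketch of that context (the app name,
# route, and MONGO_URI are assumptions, not from the original):
from flask import Flask, jsonify
from flask_pymongo import PyMongo

app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_app"
mongo = PyMongo(app)

@app.route("/scrape")
def run_scrape():
    # jsonify needs an active app/request context, hence the route wrapper.
    return scrape()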