class LoginTestCase(unittest.TestCase):

  def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    self.browser = Browser('chrome')

  def tearDown(self):
    self.testbed.deactivate()

  def test_login(self):
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")

    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

  def test_logout(self):
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")

    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.find_link_by_text("Log out").first.click()
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
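
These tests assume an App Engine dev server is already running on 127.0.0.1:8080. A minimal sketch of a guard that skips the suite cleanly when it is not (the URL and timeout are assumptions, not part of the original tests):

import unittest
import requests

def require_dev_server(url="http://127.0.0.1:8080/", timeout=2):
    # Skip the whole suite instead of failing every test with a connection error.
    try:
        requests.get(url, timeout=timeout)
    except requests.ConnectionError:
        raise unittest.SkipTest("dev server not running at %s" % url)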
Example #2
from splinter import Browser
from bs4 import BeautifulSoup

# module-level URL assumed from the other examples in this collection
usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

def doScrape():
    executable_path = {'executable_path': './chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Looking at the USGS webpage
    titles = list()
    img_links = list()

    browser.visit(usgs_url)
    html = browser.html
    soup_usgs = BeautifulSoup(html, 'html.parser')

    results = soup_usgs.find_all('a', class_='itemLink product-item')
    for result in results:
        title = result.find('h3')
        if title is not None:
            titles.append(title.text)

    for title in titles:
        browser.visit(usgs_url)
        browser.click_link_by_partial_text(title)
        browser.find_link_by_text('Sample').click()
        img_links.append(browser.windows[1].url)
        browser.windows[1].close()

    hemisphere_image_urls = []
    for title, img_url in zip(titles, img_links):
        hemisphere_image_urls.append({"title": title, "img_url": img_url})

    #browser.quit()

    print(hemisphere_image_urls)

    return hemisphere_image_urls
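
Clicking the Sample link above opens a new browser window, and the loop assumes it always appears at index 1. A hedged helper that reads whichever window is newest and always closes it (browser.windows is splinter's window API, as used above):

def newest_window_url(browser):
    # The most recently opened window sits at the end of browser.windows.
    window = browser.windows[-1]
    try:
        return window.url
    finally:
        if len(browser.windows) > 1:
            window.close()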
Example #3
def enable():
    import time
    import requests
    import settings
    from splinter import Browser
    from xvfbwrapper import Xvfb

    print("Trying to enable myself.")
    vdisplay = Xvfb()
    vdisplay.start()


    email = settings.getEmail()
    password = settings.getPassword()
    team_name = settings.getTeamName()
    bot_user = settings.getBotUser()

    browser = Browser('chrome')
    url = 'https://{}.slack.com/services/{}'.format(team_name, bot_user)
    browser.visit(url)
    browser.fill('email', email)
    browser.fill('password', password)
    browser.find_by_id('signin_btn').first.click()
    browser.find_link_by_text('Enable').first.click()
    time.sleep(2) # Sometimes I saw a crash where there was no alert, so we'll wait a bit first.
    alert = browser.get_alert()
    alert.accept()
    time.sleep(2) # If you close the display too quickly, the request doesn't get processed.

    vdisplay.stop()
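
Xvfb is only needed here because Chrome runs headed on a machine without a display. Recent Chrome and splinter versions accept a headless flag directly; a minimal sketch under that assumption:

from splinter import Browser

# headless=True starts Chrome in headless mode, so no virtual display is required.
browser = Browser('chrome', headless=True)
browser.visit('https://example.com')
browser.quit()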
Example #4
def hemisphere():

    # Visit the USGS Astrogeology Science Center Site
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemi_img_urls = []

    # Get a List of All the Hemispheres
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}

        browser.find_by_css("a.product-item h3")[item].click()

        # Find the Sample image anchor tag & extract its href
        sample = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample["href"]

        # Get Hemisphere Title
        hemisphere["title"] = browser.find_by_css("h2.title").text

        # Append List
        hemi_img_urls.append(hemisphere)

        # Navigate Backwards
        browser.back()


    return {'Hemisphere Image URLS': hemi_img_urls}
Example #5
def hemisphere():

    executable_path = {"executable_path": "./chromedriver.exe"}
    browser = Browser("chrome", **executable_path)

    # Visit the USGS Astrogeology Science center site
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemisphere_image_urls = []

    # Get a list of all the hemisphere
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        
        # Find element on each loop to avoid stale element exception
        browser.find_by_css("a.product-item h3")[item].click()
        
        # Find sample image anchor tag & extract href
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        
        # Get hemisphere title
        hemisphere["title"] = browser.find_by_css("h2.title").text
        
        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)
        
        # Navigate back
        browser.back()
    return hemisphere_image_urls
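
The comment about the stale element exception is the key idea in this loop: clicking and navigating back reloads the page, which invalidates any elements found before the reload. A generic sketch of that re-query pattern (the selector and the per-page action are placeholders):

def click_each(browser, css_selector, action):
    # Re-run the CSS query on every iteration; elements located before a page
    # reload raise a stale element exception when touched afterwards.
    results = []
    for i in range(len(browser.find_by_css(css_selector))):
        browser.find_by_css(css_selector)[i].click()
        results.append(action(browser))
        browser.back()
    return results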
Example #6
def submitQueue(NETID, PASSWORD, SECURITY_QUESTIONS):

    browser = Browser()

    # netid page
    browser.visit("https://puaccess.princeton.edu/psp/hsprod/EMPLOYEE/HRMS/h/?tab=DEFAULT")
    browser.fill('userid', NETID)
    browser.find_by_value("Continue").first.click()

    # password page
    browser.fill('Bharosa_Password_PadDataField', PASSWORD)
    browser.evaluate_script("Bharosa_Password_Pad.keyPress('ENTERKEY');")

    # security question page
    html = browser.html

    answer = None
    for key in SECURITY_QUESTIONS:
        if key in html:
            answer = SECURITY_QUESTIONS[key]
            break

    browser.fill('Bharosa_Challenge_PadDataField', answer)
    browser.evaluate_script("Bharosa_Challenge_Pad.keyPress('ENTERKEY');")

    time.sleep(2)

    # welcome to SCORE
    browser.find_link_by_text("Student Center").first.click()


    # student center, start by busting out of the iframe
    browser.visit("https://puaccess.princeton.edu/psc/hsprod/EMPLOYEE/HRMS/c/SA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL?PORTALPARAM_PTCNAV=HC_SSS_STUDENT_CENTER&EOPP.SCNode=HRMS&EOPP.SCPortal=EMPLOYEE&EOPP.SCName=ADMN_SCORE&EOPP.SCLabel=&EOPP.SCPTcname=ADMN_SC_SP_SCORE&FolderPath=PORTAL_ROOT_OBJECT.PORTAL_BASE_DATA.CO_NAVIGATION_COLLECTIONS.ADMN_SCORE.ADMN_S200801281459482840968047&IsFolder=false&PortalActualURL=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentURL=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2fEMPLOYEE%2fHRMS%2fc%2fSA_LEARNER_SERVICES.SSS_STUDENT_CENTER.GBL&PortalContentProvider=HRMS&PortalCRefLabel=Student%20Center&PortalRegistryName=EMPLOYEE&PortalServletURI=https%3a%2f%2fpuaccess.princeton.edu%2fpsp%2fhsprod%2f&PortalURI=https%3a%2f%2fpuaccess.princeton.edu%2fpsc%2fhsprod%2f&PortalHostNode=HRMS&NoCrumbs=yes&PortalKeyStruct=yes")
    browser.select('DERIVED_SSS_SCL_SSS_MORE_ACADEMICS', "1005")
    browser.find_by_id("DERIVED_SSS_SCL_SSS_GO_1").first.click()

    # pick semester
    browser.choose("SSR_DUMMY_RECV1$sels$0", "1")
    browser.find_by_id("DERIVED_SSS_SCT_SSR_PB_GO").first.click()

    # select classes to add... class should already be in queue
    browser.find_by_id("DERIVED_REGFRM1_LINK_ADD_ENRL$115$").first.click()

    # confirm classes
    browser.find_by_id("DERIVED_REGFRM1_SSR_PB_SUBMIT").first.click()
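
The security-question step scans the page HTML for a known question string. A compact sketch of that lookup which also fails loudly when no question matches (the helper name is mine, not from the original):

def pick_answer(html, security_questions):
    # Return the answer whose question text appears in the page, or raise.
    for question, answer in security_questions.items():
        if question in html:
            return answer
    raise ValueError('no known security question found on the page')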
Example #7
def make(b,c):
	browser=Browser('chrome') 
	url='http://admin2.okzaijia.com.cn/Account/login'
	browser.visit(url)
	browser.find_by_id('UserName').fill('Tina')
	browser.find_by_id('Password').fill('13916099416')
	browser.find_by_id('LoginOn').click()
	browser.find_by_xpath('/html/body/div[1]/div[1]/div/div[2]/div/div/ul/li/a').click()
	if b==1:
		browser.find_link_by_text(u'新增订单').click()
		browser.windows.current=browser.windows[1]
	#print 	browser.windows.current
		textnew=browser.find_by_name('RepairContent')
		textnew.fill(random.randint(10000,19999))
		a=''.join([chr(random.randint(97,122)) for _ in range(4)])
		browser.find_by_id('UserName').fill(a)
		browser.find_by_id('UserMobile').fill(random.randint(15138460867,19000000000))
		browser.select('Source',random.randint(1,10))
		browser.select('AreaId',random.randint(801,819))
		browser.find_by_id('UserAddress').fill(random.randint(3000,9999))
		browser.find_by_xpath('//*[@id="submit"]').click()
		time.sleep(2)
		
	else:
		browser.find_by_name('orderno').fill(c)
		browser.find_by_xpath('//*[@id="searchForm"]/div[7]/button').click()
		browser.find_by_text(u'维修记录').click()
		browser.find_by_xpath("/html/body/div[1]/div[1]/div/div[2]/div[1]/a").click()
		browser.windows.current=browser.windows[1]
		b=''.join([chr(random.randint(97,122)) for _ in range(5)])
		browser.find_by_name('RepairContent').fill(b)
		browser.find_by_name('Remark').fill(random.randint(20000,29999))
		browser.find_by_id('submit').click()
		time.sleep(3)
	browser.visit('http://admin2.okzaijia.com.cn/Task/MyTask?TaskType=4&Status=1')
	browser.windows.current=browser.windows[1]
#print 	browser.windows.current	
	browser.find_by_xpath('//*[@id="searchForm"]/div[3]/button').click()
	browser.find_by_xpath('//*[@id="pages"]/div/a[7]').click()
	browser.find_by_text(u'执行任务').last.click()
	time.sleep(2)
	browser.windows.current=browser.windows[2]
	browser.find_by_value('37').click()  # choose the construction crew that takes the order
	#print browser.find_by_value('17').text
	browser.find_by_id('submit').click()
Example #8
def scrape():
    results={}

    executable_path = {
        'executable_path': r'C:\p\HomeWork\Web-Scraping-Challenge\Mission_to_Mars\chromedriver.exe'
    }

    browser = Browser('chrome',**executable_path)

    url='https://mars.nasa.gov/news/'
    browser.visit(url)
    browser.is_element_present_by_css('ul.item_list', wait_time=2)
    soup = BeautifulSoup(browser.html, 'html.parser')
    title=soup.find('div','content_title').get_text()
    news_p=soup.find('div','article_teaser_body').get_text()
    results['news_title']=title
    results['news_paragraph']=news_p
    
    # 2. jpl.nasa.gov/spaceimages
    url='https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    full_image_btn=browser.find_by_id('full_image')
    full_image_btn.click()
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_btn = browser.find_link_by_partial_text('more info')
    more_info_btn.click()
    soup = BeautifulSoup(browser.html, 'html.parser')
    img_url_rel=soup.select_one('figure.lede a img').get('src')
    img_url=f'http://www.jpl.nasa.gov{img_url_rel}'
    results['featured_images']= img_url


    #3 table from space-facts.com/mars/
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns=['description','value']
    df.set_index('description',inplace=True)
    results['facts'] = df.to_html(classes='table table-striped')

    # 4. hemisphere images from astrogeology.usgs.gov
    url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemispheres = []
    links=browser.find_by_css('a.product-item h3')

    for i in range(len(links)):
        hemi={}
        browser.find_by_css('a.product-item h3')[i].click()
        sample_elm=browser.find_link_by_text('Sample').first
        img_url=sample_elm['href']
        title=browser.find_by_css('h2.title').text
        hemi['title']=title
        hemi['img_url']=img_url
        hemispheres.append(hemi)
        browser.back()
    results['hemispheres'] = hemispheres

    return results
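
is_element_present_by_css above waits up to two seconds but its boolean result is discarded, so a slow page still falls through to parsing incomplete HTML. A sketch that acts on the result (same selector, longer wait as an assumption):

if not browser.is_element_present_by_css('ul.item_list', wait_time=5):
    raise RuntimeError('news list did not load in time')
soup = BeautifulSoup(browser.html, 'html.parser')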
class Retriever():
	def __init__(self, folder):
		self.aux = Auxiliary()
		self.folder = folder
		self.tickers = None
		
	def click(self, destination):
		try:
			self.browser.find_by_text(destination).first.click()
		except splinter.exceptions.ElementDoesNotExist:
			self.browser.click_link_by_text(destination)
		
	def retrieve(self):
		print('Please enter the period for retrieval.')
		raw_dates = input('Dates in European format: dd/mm/yyyy\n>')
		eurodates = self.aux.date_parse(raw_dates)[0]
		dates = self.aux.european_dates_to_american(eurodates)
		raw_tickers = input('Tickers:\n>')
		self.tickers = self.aux.parse_tickers(raw_tickers)

		self.browser = Browser('chrome')
		for ticker in self.tickers:
			self.browser.visit('https://beta.finance.yahoo.com/quote/%s/history' % ticker)
			time.sleep(5)
			input_boxes = self.browser.find_by_tag('input')
			for i in range(0, 6):
				input_boxes[i + 2].fill(dates[i])  # the six date fields are input boxes 2-7
			self.click('Apply')
			download_link = self.browser.find_link_by_text('Download data').first
			response = requests.get(download_link['href'])
			with open('%s/%s.csv' % (self.folder, ticker), 'wb') as f:
				f.write(response.content)		
		self.browser.quit()
		
	def put_together(self):
		if not self.tickers:
			self.tickers = []
			for f in os.listdir(self.folder):
				self.tickers.append(f[:-4])
		target = openpyxl.Workbook()
		sheet = target.active
		sheet.append(self.tickers)
		for filename in os.listdir(self.folder):
			source = open('%s/%s' % (self.folder, filename), 'r', encoding='utf-8')
			sheet = target.create_sheet()
			sheet.title = filename[:-4] #strip out the extension
			for line in source:
				sheet.append(self.aux.parse_comma_separated_line(line))
			source.close()
		target.save('Historical_data_together.xlsx')
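
A usage sketch for the class above (the folder name is an assumption):

retriever = Retriever('history_csvs')
retriever.retrieve()       # prompts for dates and tickers, downloads one CSV per ticker
retriever.put_together()   # merges the CSVs into Historical_data_together.xlsx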
def hemispheres():
    astropedia_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser = Browser('chrome')
    browser.visit(astropedia_url)
    links = browser.find_by_css("a.product-item h3")
    hemisphere_urls = []
    for i in range(len(links)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[i].click()
        sample = browser.find_link_by_text('Sample').first
        hemisphere['title'] = browser.find_by_css('h2.title').text
        hemisphere['image_url'] = sample['href']
        hemisphere_urls.append(hemisphere)
        browser.back()
    return hemisphere_urls
Example #11
def mars_hemis():

    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path)

    #create list for Hemisphere content
    hemi_list = []

    for i in range(0, 4):  # the page lists four hemispheres
        # visit splash page
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)

        # Get the hemisphere title with splinter
        hemi_titles_list = browser.find_by_css('.description .itemLink')
        hemi_title = hemi_titles_list[i].value
        #print(hemi_title)

        # Parse the resulting html with soup
        hemi_splinter_link = browser.find_by_css('div.description a.itemLink')

        hemi_splinter_links = hemi_splinter_link[i]["href"]

        # generate new url and visit page
        browser.visit(hemi_splinter_links)

        # scrape site for high-res image
        #img_soup = BeautifulSoup(html, 'html.parser')
        sample_example = browser.find_link_by_text('Sample').first
        sample_img = sample_example["href"]
        #print(sample_img)

        # create dictionary pair
        hemi_pair = {'Title': hemi_title, 'Image': sample_img}
        #print(hemi_pair)

        # append hemi_list with new dictionary content
        hemi_list.append(dict(hemi_pair))

    return hemi_list
Example #12
def retrieve_hemispheres():
    browser = Browser("chrome",headless=True)
    browser.visit(source_urls['hemispheres'])
    browser.click_link_by_partial_text('Enhanced')

    browser.click_link_by_partial_text('Back')

    hemisphere_links = browser.find_link_by_partial_text('Hemisphere')
    link_text = []
    for link in hemisphere_links:
        link_text.append(link.text)
    hemisphere_image_urls = []
    for link in link_text:
        browser.click_link_by_partial_text(link)
        hemisphere_image_urls.append({
            'title' : link[:-9],  # drop the trailing " Enhanced"
            'tif_url' : browser.find_link_by_partial_text('Original')['href'],
            'jpg_url' : browser.find_link_by_text('Sample')['href'],
        })
        browser.click_link_by_partial_text('Back')
    return hemisphere_image_urls
def scrap_hemisphereInfo():
    
    from splinter import Browser
    from bs4 import BeautifulSoup
    import time
    # get branch links and name:
    browser = Browser('chrome', headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    # loop over the result cards to collect the names and links
    title_names = soup.find_all("div", class_ = "description")
    titles = []
    branch_links = []
    for title_name in title_names:
        
        # build the absolute link
        link = title_name.a['href']
        href = f"https://astrogeology.usgs.gov/{link}"
        branch_links.append(href)
        
        # get the names ready 
        name = title_name.h3.text
        titles.append(name.replace(" Enhanced", "").strip())
        
    hemisphere_image_urls = []

    for i in range(len(branch_links)):
        browser = Browser('chrome', headless=False)
        url = branch_links[i]
        browser.visit(url)
        full_image_link = browser.find_link_by_text("Sample")
        image_link = full_image_link['href']
        # build a fresh dict on each pass; reusing one dict would fill the list
        # with references to the same (last) entry
        dictt = {}
        dictt['title'] = titles[i]
        dictt['image_url'] = image_link
        hemisphere_image_urls.append(dictt)

    return hemisphere_image_urls
Example #14
def hemisphere():

    executable_path = {"executable_path": (r"C:\Users\Mickey\anaconda3\Scripts\chromedriver.exe")}
    browser = Browser("chrome", **executable_path, headless=False)

    #Browse URL
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    #Create an Empty List to store Result
    hemi_img_urls = []

    #Get a List of Hemispheres
    products = browser.find_by_css("a.product-item h3")

    #Begin For Loop 
    for item in range(len(products)):
        hemisphere = {}

        browser.find_by_css("a.product-item h3")[item].click()
        time.sleep(1)
        
        #Find Sample Image
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        #Get Hemisphere Title
        hemisphere["title"] = browser.find_by_css("h2.title").text

        #Add to List
        hemi_img_urls.append(hemisphere)

        #Navigate to Previous Page
        browser.back()

    #Close Browser when done
    browser.quit()

    #Return Results
    return hemi_img_urls
def hemisphere_image():

    # Visit web page
    browser = Browser('chrome', headless=False)
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    hemisphere_image_urls = []

    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        browser.find_by_css("a.product-item h3")[item].click()
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        hemisphere["title"] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()

    # browser.quit()

    return hemisphere_image_urls
Example #16


# In[38]:


length_loop=len(title_array)
hemisphere_image_urls=[]
for marker in range(length_loop):
    temp_dict={'title': title_array[marker], 'img_url':list_image_urls[marker]}
    hemisphere_image_urls.append(temp_dict)


# In[39]:


hemisphere_image_urls


# In[40]:


links_found = browser.find_link_by_text('Link for Example.com')
Example #17
hemi_dicts = []

for i in range(1,9,2):
    hemi_dict = {}
    
    browser.visit(mars_hemisphere_url)
    time.sleep(1)
    hemispheres_html = browser.html
    hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser')
    hemi_name_links = hemispheres_soup.find_all('a', class_='product-item')
    hemi_name = hemi_name_links[i].text.replace('Enhanced', '').strip()
    
    detail_links = browser.find_by_css('a.product-item')
    detail_links[i].click()
    time.sleep(1)
    browser.find_link_by_text('Sample').first.click()
    time.sleep(1)
    browser.windows.current = browser.windows[-1]
    hemi_img_html = browser.html
    browser.windows.current = browser.windows[0]
    browser.windows[-1].close()
    
    hemi_img_soup = BeautifulSoup(hemi_img_html, 'html.parser')
    hemi_img_path = hemi_img_soup.find('img')['src']

    print(hemi_name)
    hemi_dict['title'] = hemi_name.strip()
    
    print(hemi_img_path)
    hemi_dict['img_url'] = hemi_img_path

    hemi_dicts.append(hemi_dict)
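
The window juggling above (switch to the newest window, grab its HTML, switch back, close) can be wrapped in a context manager so the original window is restored even if parsing fails. A sketch; the helper name is mine:

from contextlib import contextmanager

@contextmanager
def in_newest_window(browser):
    # Enter the most recently opened window and hand its HTML to the caller.
    original = browser.windows.current
    browser.windows.current = browser.windows[-1]
    try:
        yield browser.html
    finally:
        newest = browser.windows.current
        browser.windows.current = original
        if newest is not original:
            newest.close()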
Example #18
def scrape_all():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    url ='https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    dic={}

    #Scrape the [NASA Mars News Site](https://mars.nasa.gov/news/) and collect the latest News Title
    #and Paragraph Text. Assign the text to variables that you can reference later.

    title = soup.find_all('div', class_='content_title')
    body = soup.find('div', class_='article_teaser_body')

    news_title = title[1].text
    news_p = body.text

    dic['news_title'] = news_title
    dic['news_p'] = news_p

    #browser.quit()
    # JPL Mars Space Images - Featured Image
    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    browser.find_by_id('full_image').click()
    browser.find_link_by_partial_text('more info').click()

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find('figure', class_='lede')

    featured_image_url = 'https://www.jpl.nasa.gov' + imgs.a.img['src']
    dic['featured_image_url'] = featured_image_url

    # Mars Facts
    url ='https://space-facts.com/mars/'
    facts = pd.read_html(url)

    df = facts[0]
    df.columns = ['Profile', 'Values']
    df.set_index('Profile', inplace=True)

    # strip unwanted newlines to clean up the table
    html_facts = df.to_html().replace('\n', '')
    dic['facts'] = html_facts

    df.to_html('facts.html')

    #Mars Hemispheres - reuse the browser opened above

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    image_urls= []
    imgs = browser.find_by_css("a.product-item h3")

    # For loop

    for i in range(len(imgs)):
        hemisphere = {}   
        browser.find_by_css("a.product-item h3")[i].click()
        
        # Find Sample Image
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        
        # Get the Title
        hemisphere["title"] = browser.find_by_css("h2.title").text
        
        # Append
        image_urls.append(hemisphere)
        
        # find imgs back
        browser.back()
    dic['hemisphere'] = image_urls

    return dic
Example #19
def scrape():
    browser = init_browser()

    ##### __NASA Mars News__ #####
    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news/'
    # Retrieve page with the requests module
    response = requests.get(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect the latest News Title assign the text to a variable that can be referenced later.
    news_title = soup.find_all('div', class_='content_title')[0].text
    # Collect the latest paragragph and assign the text to a variable that can be referenced later.
    news_p = soup.find_all('div', class_='rollover_description_inner')[0].text
    # Close the browser after scraping
    browser.quit()

    #### __JPL Mars Space Images - Featured Image__ ####
    # Setup Splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    # Set up browser to connect to url and scrape
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)
    # Click on FULL IMAGE button
    browser.links.find_by_partial_text('FULL IMAGE').click()
    # Create Browser and BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Delay code to allow link to open before trying to scrape
    time.sleep(1)
    # Scrape page to find the featured Mars image
    mars_image = soup.find('img', class_='fancybox-image')
    url = mars_image['src']
    featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + url
    # Close the browser after scraping
    browser.quit()

    ##### __Mars Facts__ #####
    # Use Pandas to scrape the table and convert the data to a HTML table string
    url = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url)
    mars_data_df = mars_table[0]
    mars_html_table = mars_data_df.to_html(classes='table table-striped table-bordered',
                                           index=False,
                                           header=False,
                                           border=1)

    ##### __Mars Hemispheres__ #####
    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    # Set up browser to connect to url to scrape
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    # Setup empty list
    hemisphere_image_urls = []
    # Get list of hemispheres
    for i in range(4):
        hemisphere = {}

        time.sleep(1)

        # Click on each hemisphere enhanced link
        browser.find_by_css("a.product-item h3")[i].click()

        # Scrape page to find Hemisphere title
        hemisphere["title"] = browser.find_by_css("h2.title").text

        # Locate sample jpg image & scrape url
        sample_element = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]

        # download = soup.find('div', class_ = 'downloads')
        # image_url = download.ul.li.a["href"]
        # hemisphere["image_url"] = image_url

        # Add data to hemisphere dictionary
        hemisphere_image_urls.append(hemisphere)

        # Navigate back to Products page to continue through range
        browser.back()

    # Close the browser after scraping
    browser.quit()

    # Python dictionary containing all of the scraped data.
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_html_table": mars_html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    # Return results
    return mars_data
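
This example re-creates and quits a browser once per section. A sketch of a single shared factory built on the same webdriver_manager call (init_browser is assumed to wrap something like this already):

from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

def init_browser(headless=True):
    # One place to configure the driver for every scraping section.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=headless)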
class UploadTestCase(unittest.TestCase):

  def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    self.browser = Browser('chrome')

  def tearDown(self):
    self.testbed.deactivate()

  def test_when_create_task_upload_file(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]', wait_time=10)

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()


    self.assertEqual(1, len(self.browser.find_by_css('.template-download.fade.in')))
    self.assertEqual(4, len(self.browser.find_by_css('.template-download.fade.in td')))

  def test_when_create_task_upload_many_files(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]')

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))

    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()
    sleep(3)

    self.assertEqual(3, len(self.browser.find_by_css('.files tr.template-download')))
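
Both tests lean on fixed sleeps before asserting on the upload table, which is racy. A drop-in sketch for the final assertion that polls the DOM instead (selector taken from the test; the 10-second ceiling is an assumption):

# wait_time polls for up to 10 seconds instead of sleeping blindly
self.assertTrue(self.browser.is_element_present_by_css(
    '.files tr.template-download', wait_time=10))
self.assertEqual(3, len(self.browser.find_by_css('.files tr.template-download')))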
					button.click() 		# Click OK
					break

	# --Download page reached--
	format = browser.find_by_id('downloadFormatMenu0').first # Open download format chooser
	format.click()

	# Switch to your desired download format
	for a in browser.find_by_tag('a'):
		if dl_format+' -' in a.value:
			a.click()
			print('Switching to '+dl_format+' format.')
			break

	# Print format being used
	format = browser.find_by_id('downloadFormatMenu0').first
	print('Format: '+format.value)

	# Wait while the download is being prepared...
	print('Preparing download.')
	while browser.is_text_present('preparing'):
		time.sleep(5)

	# Grab final download link
	downloadLink = browser.find_link_by_text('Download').first
	print('Got download link! Starting download...')
	url = downloadLink['href']
	file_name = wget.download(url)	# Download the link using wget

# Repeat for other albums in the list
def scrape():
    # browser = init_browser()
    browser = Browser('chrome')
    #Visit the URL
    Nasa_news_url = 'https://mars.nasa.gov/news/'
    browser.visit(Nasa_news_url)
    html = browser.html

    #Parse HTML with Beautiful Soup
    soup_nasa = BeautifulSoup(html, 'html.parser')

    ### NASA Mars News
    #<div class="content_title"><a href="/news/8782/sensors-on-mars-2020-spacecraft-answer-long-distance-call-from-earth/" target="_self">
    #Sensors on Mars 2020 Spacecraft Answer Long-Distance Call From Earth</a></div>
    #<div class="article_teaser_body">Instruments tailored to collect data during the descent of NASA's next rover through the Red Planet's atmosphere have been checked in flight.</div>
    #news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text
    news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
    news_paragraphs = soup_nasa.find_all('div',
                                         class_="article_teaser_body")[0].text
    print(news_titles)
    print('------------------')
    print(news_paragraphs)

    ### JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)
    #print(soup.prettify())
    #go to the full image
    #data-fancybox-href
    image = browser.find_by_id('full_image')
    image.click()
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    url_image_find = soup.find('img', class_='main_image').get("src")

    featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find
    ### Mars Facts
    url = 'https://space-facts.com/mars/'
    mars_facts_df = pd.read_html(url)[2]
    mars_facts_df.columns = ["Details", "Measures"]
    mars_facts_html = mars_facts_df.to_html()

    ### Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    web_links = browser.find_by_css("a.product-item h3")
    web_list = []
    for i in range(len(web_links)):
        web_hemispheres = {}
        browser.find_by_css("a.product-item h3")[i].click()
        web_hemispheres["link"] = browser.find_link_by_text(
            'Sample').first["href"]
        web_hemispheres["Title"] = browser.find_by_css('h2.title').text
        web_list.append(web_hemispheres)
        browser.back()

    browser.quit()

    # Collect the scraped pieces into one dictionary
    return {
        "news_title": news_titles,
        "news_paragraph": news_paragraphs,
        "featured_image_url": featured_image_url,
        "facts": mars_facts_html,
        "hemispheres": web_list,
    }
class TestViews(unittest.TestCase):
    def setUp(self):
        """Test setup """
        #define browser instance
        self.browser = Browser("phantomjs")
        
        #Set up the tables in the database
        Base.metadata.create_all(engine)
        
        #Create an example user
        self.user = User(name="Alice", email="*****@*****.**", password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()
        
        self.process = multiprocessing.Process(target=app.run, kwargs={"port": 8080})
        self.process.start()
        time.sleep(1)
        
        
    def test_login_correct(self):
        #navigate to demo website
        
        self.browser.visit("http://127.0.0.1:8080/login")
        #enter user name and password in their fields
        
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        #define Log in button
       
        button = self.browser.find_by_css("button[type=submit]")
        #click on the Log in button
        
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")     
            
        
    def test_login_incorrect(self):
        
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")    
        
    def test_logout(self):
        #navigate to demo log in website
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        
        #confirm return to home page
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")     
        
        #navigate to demo log out website
        self.browser.visit("http://127.0.0.1:8080/logout")
        
        #confirm log out link exists
        logout_link= self.browser.find_link_by_text("Log out")
        
        #confirm return to log in page
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/login")
        
    
    
    
    def test_add_entry_edit(self):
        self.browser.visit("http://127.0.0.1:8080/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        
        self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")     
        # check add entry link exists
        
        self.browser.visit("http://127.0.0.1:8080/entry/add")
        first_found = self.browser.find_by_name("title").first
        last_found = self.browser.find_by_name("content").last
        button = self.browser.find_by_css("button[type=submit]")
        
        
        self.browser.visit("http://127.0.0.1:8080/entry/edit")
        
        self.browser.find_by_name("title")
        self.browser.find_by_name("content")
        # self.browser.find_by_value("entry_title").first  # why is splinter not recognising the Flask template values in the HTML?
        # self.browser.find_by_value("entry_content").last
        button = self.browser.find_by_css("button[type=submit]")
        # self.assertEqual(self.browser.url, "http://127.0.0.1:8080/")  # this raises an error
        
        # all tests run OK, but GET /entry/edit returned a 404. Why?
    
        
# test that the entry author is the person editing and is logged in
    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
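
setUp starts the Flask app in a subprocess and sleeps for one second, which can flake on slow machines. A polling sketch that waits for the server to answer (the URL matches the tests; the timeout is an assumption):

import time
import requests

def wait_for_server(url='http://127.0.0.1:8080/login', timeout=10):
    # Poll until the app responds or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            requests.get(url, timeout=1)
            return
        except requests.ConnectionError:
            time.sleep(0.2)
    raise RuntimeError('server never came up at %s' % url)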
# In[87]:


#Python code with loop
for i in range(number):
    hemisphere = {}
    i = i + 1  # note: shifts the index by one, so element 0 is never clicked
    
    print(i)
    try:
        browser.find_by_css('a.product-item')[i].click()
        
    except Exception:
        # the shifted index can run past the last product link; skip and move on
        continue
    
    hemi_href = browser.find_link_by_text('Sample').first
    hemisphere['img_url'] = hemi_href['href']
    hemisphere['title'] = browser.find_by_css('h2.title').text
    
    hemisphere_images.append(hemisphere)
    print(i)
    browser.back()


# In[88]:


#flat url
hemisphere_images

browser.visit(url)

# In[17]:

# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.
for item in range(4):
    hemisphere = {}

    # Find Element on Each Loop to Avoid a Stale Element Exception
    browser.find_by_css("a.product-item h3")[item].click()

    # Find the Sample image anchor tag & extract its href
    sample_element = browser.find_link_by_text("Sample").first
    hemisphere["img_url"] = sample_element["href"]

    # Get Hemisphere Title
    hemisphere["title"] = browser.find_by_css("h2.title").text

    # Append Hemisphere Object to List
    hemisphere_image_urls.append(hemisphere)

    # Navigate Backwards
    browser.back()

# In[18]:

# 4. Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls
		typeaName = typeaNameList[cType]

		if cType == 0 or cType == 1:
			# pass

			for igType in igTypeListScotia:

				browser.find_by_name('addScenario').first.click()
				browser.fill('scName', countryTypeList[conIndex]+typeaName+igType)
				browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
				browser.type('scEffDate', '2015-10-31')
				browser.find_by_name('update').first.click()

				browser.find_link_by_text('Obligor').first.click()

				# choose the companyType type
				element = browser.find_by_name('companyType').first
				element.select(str(cType))

				browser.fill('obligorName', companyName)
				browser.find_by_name('ObligorSearch').first.click()
			
				browser.find_link_by_partial_href('javascript:refPortResult')[0].click()

				# select "B-III counterparty type" to be "corporate"
				element = browser.find_by_name('counterPartyType').first
				element.select('1')
				# select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
				element = browser.find_by_name('avc').first
def scrape_all():
    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path)

    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Convert the browser html to a soup object and then quit the browser
    html = browser.html
    news_soup = BeautifulSoup(html, 'html.parser')

    slide_elem = news_soup.select_one('ul.item_list li.slide')

    # Use the parent element to find the first a tag and save it as `news_title`
    news_title = slide_elem.find("div", class_='content_title').get_text()

    # Use the parent element to find the paragraph text
    news_p = slide_elem.find('div', class_="article_teaser_body").get_text()

    # JPL Space Images Featured Image - Visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    # Find and click the full image button
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = BeautifulSoup(html, 'html.parser')

    # find the relative image url
    img_url_rel = img_soup.select_one('figure.lede a img').get("src")

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'

    #Mars weather - visit url
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div',
                                           attrs={
                                               "class": "tweet",
                                               "data-name": "Mars Weather"
                                           })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()

    #Hemispheres of Mars
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}

        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()

        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']

        # Get Hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text

        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)

        # Finally, we navigate backwards
        browser.back()

    #mars facts
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    df = df.to_html()

    #final data dictionary
    data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": img_url,
        "hemispheres": hemisphere_image_urls,
        "weather": mars_weather,
        "facts": df,
        "last_modified": dt.datetime.now()
    }
    browser.quit()
    return data
Example #28
def scrape():
    # Import dependencies ----------------------------------------------------------------
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import requests
    import time
    import pandas as pd

    # set up Splinter ----------------------------------------------------------------------
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. NASA Mars News---------------------------------------------------------------------
    ## Scrape the NASA Mars News Site (https://mars.nasa.gov/news) and collect the latest News Title and Paragraph Text
    ## Assign the text to variables to reference later

    # NOTE: requests can't be used here because the news items are rendered by JavaScript after the page loads; requests.get would only return the pre-render contents

    # 1.1 Retrieve page with splinter
    url_news = "https://mars.nasa.gov/news"
    browser.visit(url_news)
    html = browser.html

    # 1.2 Get the first news from html retrieved
    # Create BeautifulSoup object; parse with 'html.parser'
    bsoup = bs(html, 'html.parser')

    # reach the container of the first news
    li = bsoup.find("li", class_="slide")

    news_t = li.find("div", class_="content_title").text  # title
    news_p = li.find("div", class_="article_teaser_body").text  # paragraph
    news_link = url_news.replace("/news", "") + li.find(
        "div", class_="content_title").a[
            "href"]  # link to the news (added to base url)
    news_date = li.find("div", class_="list_date").text  # date

    # 2. JPL Mars Space Images - Featured Image----------------------------------------------
    ## Get the current Featured Image from JPL (https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars)

    url_img = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"

    # navigate to to full-size image url with splinter
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')

    # --- Clicking the "more info" button directly sometimes fails with "element not visible".
    # --- The only way to avoid that is to wait until the element becomes visible, which takes time;
    # --- the workaround is to grab the href and visit it instead of trying to click the link directly.
    # time.sleep(30)
    # browser.click_link_by_partial_text('more info')

    href = browser.find_link_by_partial_text("more info")[0]["href"]
    browser.visit(href)

    browser.find_by_css(".main_image").click()

    # store the image url
    featured_image_url = browser.url

    # 3. Mars Weather ------------------------------------------------------------------------
    ## Visit the Mars Weather twitter account page (https://twitter.com/marswxreport?lang=en) and scrape the latest Mars weather tweet from the page

    # 3.1 Retrieve page using requests
    url_twitter = "https://twitter.com/marswxreport?lang=en"
    html = requests.get(url_twitter).text

    # 3.2 Get the weather post from html retrieved
    bsoup = bs(html, "html.parser")

    # all tweets are under ol
    ol = bsoup.find(id="stream-items-id")

    # put tweets in lis list
    lis = ol.findAll("li")
    # use a for loop to find the first tweet with weather info (criterion: has hPa in the post)
    mars_weather = ""
    for li in lis:
        tweet = li.find("div", class_="js-tweet-text-container").p.text
        if "hPa" in tweet:
            mars_weather = tweet
            break

    # 4. Mars Facts----------------------------------------------------------------------------
    ## Visit the Mars Facts webpage (https://space-facts.com/mars/) and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    # Use Pandas to convert the data to a HTML table string.
    url_fact = "https://space-facts.com/mars/"

    # use pandas to scrape tabular data from the page
    tables = pd.read_html(url_fact)
    facts = tables[0]

    # store data in a list of lists
    facts = facts.values.tolist()

    # 5. Mars Hemispheres-------------------------------------------------------------------------
    ## Visit the USGS Astrogeology site (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars) to obtain high resolution images for each of Mar's hemispheres.

    # 5.1 Retrieve the html with splinter
    url_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemi)
    html = browser.html

    # 5.2 Get the urls needed from the html retrieved
    bsoup = bs(html, "html.parser")

    items = bsoup.findAll("div", class_="item")

    hemisphere_image_urls = []  # initialize list
    for item in items:
        title = item.find("h3").text  # title
        url = "https://astrogeology.usgs.gov/" + item.find(
            "div",
            class_="description").a["href"]  # get the url for picture details
        browser.visit(url)
        img_url = browser.find_link_by_text("Sample")[0][
            "href"]  # get the url to the full-size picture
        hemisphere_image_urls.append({
            "title": title,
            "img_url": img_url
        })  # append a dictionary to the hemisphere_image_urls list

    # store data scraped into a dictionary--------------------------------------------------------------------
    data = {
        "news": {
            "title": news_t,
            "body": news_p,
            "link": news_link,
            "date": news_date
        },
        "feature_img": featured_image_url,
        "weather": mars_weather,
        "facts": facts,
        "hemi_img": hemisphere_image_urls
    }

    print(data)  # print to console
    return data
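
The comment in step 2 explains the workaround: clicking 'more info' directly can fail with "element not visible", so the code visits its href instead. The explicit-wait alternative it mentions looks roughly like this (the wait time is an assumption):

browser.is_element_present_by_text('more info', wait_time=10)
more_info = browser.find_link_by_partial_text('more info').first
if more_info.visible:
    more_info.click()
else:
    # Fall back to the href when the element never becomes visible.
    browser.visit(more_info['href'])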
Example #29
def scrape():
    # Scrape NASA Mars News Site and collect news title and paragraph text
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)

    soup = bs(response.text, 'lxml')

    # Create variables for title and paragraph text
    news_title = soup.find('div', class_='content_title').text
    paragraph_text = soup.find('div', class_='rollover_description_inner').text

    #Visit the URL for JPL's Space Images-Use splinter to navigate site
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')

    # Click the 'FULL IMAGE' button
    browser.click_link_by_partial_text('FULL IMAGE')

    # Click the 'more info' button
    browser.click_link_by_partial_text('more info')

    # Get feature image url from 'more info' page
    html_2 = browser.html
    soup_2 = bs(html_2, 'html.parser')
    img_url = soup_2.find('img', class_='main_image')
    end_img_url = img_url.get('src')

    feature_image_url = 'https://www.jpl.nasa.gov' + end_img_url

    # Scrape latest Mars weather tweet from 'https://twitter.com/marswxreport?lang=en'
    url = 'https://twitter.com/marswxreport?lang=en'
    twitter_resp = requests.get(url)
    twitter_soup = (bs(twitter_resp.text, 'html.parser').find(
        'div', class_='js-tweet-text-container')).text.strip()

    # Create a pandas dataframe containing facts scraped from 'https://space-facts.com/mars/'
    mars_facts_request = requests.get('https://space-facts.com/mars/')
    mars_facts_table = pd.read_html(mars_facts_request.text)
    mars_facts_table
    mars_facts_df = mars_facts_table[0]
    mars_facts_df

    # Visit the USGS Astrogeology site, 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' to obtain
    # high resolution images of each of Mars's hemispheres
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)

    # Find image url for full resolution image
    links = browser.find_by_css("a.product-item h3")

    # Save the image url string and hemisphere title containing hemisphere name in python dictionary
    hemisphere_img_urls = []
    # Iterate through the links
    for link in range(len(links)):
        browser.find_by_css("a.product-item h3")[link].click()
        # store the href string, not the element, so it survives browser.quit()
        image_url = browser.find_link_by_text('Sample')['href']
        image_title = browser.find_by_css('h2.title').text
        hemisphere_img_urls.append({
            "title": image_title,
            "image_url": image_url
        })
        browser.back()

    mars_dictionary = {
        "Nasa_Title": news_title,
        "Nasa_Paragraph": paragraph_text,
        "Feature_Image": feature_image_url,
        "Mars_Weather": twitter_soup,
        "Mars_Facts": mars_facts_df,
        "Hemispheres": hemisphere_img_urls
    }

    browser.quit()
    return mars_dictionary
typeaNameList = ['scotia_pub_', 'scotia_priv_', 'nonsco_pub_', 'nonsco_priv_']
typeaNameList2 = ['scotia_pub', 'scotia_priv', 'nonsco_pub', 'nonsco_priv']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION','BASCO DESEMBUAGE','A&T CORPORATION','asdf']
devgCompanyNameList = [countryList[1],countryList[1],countryList[1],'asdf']

# open browser, navigate to the right page, configure, 
from splinter import Browser
browser = Browser()

browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCOMM%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()

browser.find_link_by_text('International Banking').first.click()

browser.fill('clientSearchString', 'RF vs RFDA test cases')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()

for conIndex in range(2):

	for cType in range(4):

		if conIndex == 0:
			companyName = devdCompanyNameList[cType]
		else:
			companyName = devgCompanyNameList[cType]

		typeaName = typeaNameList[cType]
items = soup(html, 'html.parser')  # `soup` is assumed to be BeautifulSoup imported under that alias

# In[22]:

hemisphere_image_urls = []
# First, get a list of all of the hemispheres
links = browser.find_by_css("a.product-item h3")
# Next, loop through those links, click the link, find the sample anchor, return the href
for i in range(len(links)):
    hemisphere = {}
    # We have to find the elements on each loop to avoid a stale element exception
    browser.find_by_css("a.product-item h3")[i].click()
    # Next, we find the Sample image anchor tag and extract the href
    sample_elem = browser.find_link_by_text('Sample').first
    hemisphere['img_url'] = sample_elem['href']
    # Get Hemisphere title
    hemisphere['title'] = browser.find_by_css("h2.title").text
    # Append hemisphere object to list
    hemisphere_image_urls.append(hemisphere)
    # Finally, we navigate backwards
    browser.back()

# In[23]:

# 4. Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

# In[24]:
Example #32
def scrape():
    browser = init_browser()

    #Mars dict to hold info
    mars_data={}
    
    # Get Mars news
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(1)
    
    html = browser.html
    soup = bs(html, "html.parser")
    # find new news article titles
    news_title = soup.find("div",class_="content_title").text
    # find new news articles text
    news_text = soup.find("div", class_="article_teaser_body").text
    
    #Get Mars img from JPL
    jpl_images_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_images_url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, "html.parser")
    img_source = soup.find(class_="carousel_item")['style']
    # The style value looks like: background-image: url('/spaceimages/images/...jpg');
    # split on the quotes and keep the path between them
    image_split = img_source.split("'")[1]
    # Combine with the site root (not the search URL) to make a complete image url
    featured_image_url = "https://www.jpl.nasa.gov" + image_split
    
    #Twitter scrape
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path)
    twit_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(twit_url)
    
    # Use the find_by_css method with click to reach the tweet (BeautifulSoup alone couldn't in the notebook)
    # Resource website used https://www.seleniumeasy.com/selenium-tutorials/css-selectors-tutorial-for-selenium-with-examples
    browser.find_by_css('div[class="css-1dbjc4n r-1awozwy r-18u37iz r-1wtj0ep"]').first.click()
    # Find and save the tweet text; the target span ends up at index [6] on this page
    target_tweet = browser.find_by_css('span[class="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0"]')[6].text
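    # Note: div[class="..."] / span[class="..."] are CSS attribute-equality selectors, so they
    # match only when the class attribute is exactly that string; the [6] index is tied to this
    # particular page layout and will break if Twitter changes its markup.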
    
    #Mars Facts Scrape
    facts_url = "https://space-facts.com/mars/"
    browser.visit(facts_url)
    mars_facts_df = pd.read_html(facts_url)[0]
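    # pd.read_html parses every <table> on the page into a list of DataFrames; [0] takes the first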
    mars_facts_df.columns=["Facts", "Values"]
    mars_facts_df.set_index("Facts", inplace=True)
    mars_facts_html = mars_facts_df.to_html()
    
    # Mars Hemispheres
    mars_hemi_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemi_url)
    hemi_img_url = []
    links = browser.find_by_css("a.product-item h3")
    n = len(links)
    
    for row in range(n):
        hemi_dict = {}
        browser.find_by_css("a.product-item h3")[row].click()
    
        sample_element = browser.find_link_by_text("Sample").first
    
        # Update dictionary with image url and title
        hemi_dict["img_url"] = sample_element["href"]
        hemi_dict["title"] = browser.find_by_css("h2.title").text
    
        # Append it to the list of hemisphere image urls
        hemi_img_url.append(hemi_dict)
    
        # Need to send browser back each time in order to click each product-item.
        browser.back()

    # Update mars_data with the scraped information
    mars_data = {
            "mars_news_title": news_title,
            "mars_news_teaser": news_text,
            "mars_tweet": target_tweet,
            "mars_image": featured_image_url,
            "mars_table": mars_facts_html,
            "hemi_image_title_1": hemi_img_url[0]["title"],
            "hemi_image_url_1": hemi_img_url[0]["img_url"],
            "hemi_image_title_2": hemi_img_url[1]["title"],
            "hemi_image_url_2": hemi_img_url[1]["img_url"],
            "hemi_image_title_3": hemi_img_url[2]["title"],
            "hemi_image_url_3": hemi_img_url[2]["img_url"],
            "hemi_image_title_4": hemi_img_url[3]["title"],
            "hemi_image_url_4": hemi_img_url[3]["img_url"]      
        }
    browser.quit()

    return mars_data
Exemple #33
0
def scrape():
    ############ defining the urls needed

    ####---------------------------------------
    ##url for the Nasa news site
    nasa_news_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"

    ####---------------------------------------
    ##urls for the jpl site

    #base url for jpl site
    jpl_base_url = "https://www.jpl.nasa.gov"

    #url for "Mars" search for jpl
    jpl_mars_url = jpl_base_url + "/spaceimages/?search=&category=Mars"

    ####---------------------------------------
    ##url for space facts page
    mars_facts_url = "https://space-facts.com/mars/"

    ####---------------------------------------
    ##urls for hemisphere data

    #base url for hemisphere site
    hem_base_url = "https://astrogeology.usgs.gov"

    #page we'll use to access each hemisphere page
    hem_url = hem_base_url + "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

    #using splinter since requests is only returning partial results
    executable_path = {
        'executable_path':
        "C:/Program Files (x86)/Chrome Driver/chromedriver.exe"
    }

    browser = Browser('chrome', **executable_path, headless=False)

    ########### in this block, we're going to try to run all the browser.visit(x) and html=browser.html statements so we can close the browser out more quickly

    #opening browser to the nasa news website
    browser.visit(nasa_news_url)
    time.sleep(2)
    #extracting html from the nasa news website
    nasa_news_html = browser.html

    #opening browser to jpl site
    browser.visit(jpl_mars_url)
    time.sleep(2)
    #extracting html
    jpl_html = browser.html
    # browser.quit()

    #visiting mars facts page and grabbing html
    browser.visit(mars_facts_url)
    time.sleep(2)
    ### defining the html
    mars_facts_html = browser.html

    ####------------------------------------------------------------------------------
    ####visiting the mars hemispheres site and grabbing the html

    browser.visit(hem_url)
    time.sleep(2)
    hem_html = browser.html

    #parsing hemisphere site
    hem_soup = bs(hem_html, 'html.parser')

    ### get the image url string and the hemisphere title containing the hemisphere name

    ##a results set that contain the links to each hemisphere
    hem_items = hem_soup.find_all(class_="item")

    hem_list = []

    ### visit each hemisphere page and...
    for result in hem_items:
        print("--------")

        #opens to the browser to the current hemisphere page
        browser.visit(hem_base_url + result.a["href"])
        time.sleep(1)

        ##opening the image to view it full size
        browser.find_link_by_text("Open").first.click()
        time.sleep(1)

        #defines the current html
        current_html = browser.html
        cur_soup = bs(current_html, 'html.parser')

        # # adds to the dictionary. The key is the hemisphere title and the value is the image link

        current_dict = {}
        current_dict["title"] = cur_soup.find('title').text
        current_dict["img_url"] = hem_base_url + cur_soup.find(
            'img', class_="wide-image")['src']

        hem_list.append(current_dict)

    browser.quit()

    #parsing the html
    nasa_news_soup = bs(nasa_news_html, 'html.parser')

    #finds the list in the html which contains the article titles and paragraphs we seek

    first_art = nasa_news_soup.find(class_="item_list")

    # print(first_art.text)

    ## gets the articles title
    first_art.find(class_="content_title").text
    first_art.find(class_="article_teaser_body").text

    art_dict = {
        "news_title": first_art.find(class_="content_title").text,
        "news_p": first_art.find(class_="article_teaser_body").text
    }

    #parsing the html for the jpl site
    jpl_soup = bs(jpl_html, 'html.parser')

    ## finds the partial url for the first image
    partial_jpl_image_url = jpl_soup.find(
        class_="slide").a["data-fancybox-href"]

    ## joins the partial url to the initial url to get the full image url
    featured_image_url = jpl_base_url + partial_jpl_image_url

    #grabbing the first table from the mars facts html and turning it into a dataframe

    fact_df = pd.read_html(mars_facts_html)[0]

    fact_table = fact_df.to_html(header=False, index=False)

    ##final dictionary
    final_dict = {
        "Hemispheres": hem_list,
        "Articles": art_dict,
        "FeaturedImage": featured_image_url,
        "Facts": fact_table
    }
    print("scraping done")
    return final_dict
Exemple #34
0
File : buy.py Project : lsoftp/jd
    #         bbh=buyh-1
    #     else:
    #         bbm=buym-1
    #         bbh=buyh
    #
    # for test
    print("start.....")
    #
    t3 = datetime.strptime(buyd + ' ' + buyt, '%Y-%m-%d %H:%M:%S')
    #t3=datetime.strptime('2017-03-02 00:00:00','%Y-%m-%d %H:%M:%S')
    #b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://cart.jd.com/order/orderInfoMain.html')
    #b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/4325034.html')
    url = 'https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/' + item + '.html'
    #    b.visit('https://passport.jd.com/new/login.aspx?ReturnUrl=https://item.jd.com/4390094.html')
    b.visit(url)
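    # '账户登录' is the "account login" tab text on the JD login page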
    uu = b.find_link_by_text('账户登录')
    uu.click()
    b.fill('loginname', user)
    #    b.fill('loginname','18371542519')
    b.fill('nloginpwd', ppp)
    b.find_by_id('loginsubmit').click()
    #b.visit('https://item.jd.com/3763103.html')
    tt = datetime.now()
    tttt = t3 - tt
    ttt = tttt.days * 24 * 60 * 60 + tttt.seconds
    while (ttt > -60 * 15):

        tt = datetime.now()
        tttt = t3 - tt

        ttt = tttt.days * 24 * 60 * 60 + tttt.seconds
countryList = ['australia','chile']

countryTypeList = ['developed_', 'developing_']

typeaNameList = ['Existing_Scotia_Public', 'Existing_Scotia_Private', 'Non_Scotia_Public', 'Non_Scotia_Private']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION','BASCO DESEMBUAGE','A&T CORPORATION','asdf']
devgCompanyNameList = [countryList[1],countryList[1],countryList[1],'asdf']

# open the browser, navigate to the right page, and configure
from splinter import Browser
browser = Browser()

browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCORP%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()

browser.fill('clientSearchString', 'jason\'s client')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()


while len(browser.find_link_by_text('Delete'))>0:
	browser.find_link_by_text('Delete').first.click()
	browser.get_alert().accept()
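# Re-query the Delete links on each pass: the page re-renders after every delete,
# so previously located elements would be stale; looping until none remain clears the list.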

def scrape_all():
    # Initiate the chrome driver (headless=False keeps the browser window visible while scraping)
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # Dictionary to hold all of the scraped data
    data = {}

    # Go to the NASA Mars News Site
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)

    # Create a Beautiful Soup object
    soup = bs(browser.html, 'lxml')

    news_title = soup.find_all('div', class_='content_title')
    news_articles = []
    for news in news_title:
        if (news.a):
            if (news.a.text):
                news_articles.append(news.a.text)

    # Print paragraph for the latest news article
    news_story = soup.find_all('div', class_='article_teaser_body')
    news_paragraph = []
    for paragraph in news_story:
        if (paragraph.text):
            news_paragraph.append(paragraph.text)

    # Create variables for our latest news article and paragraph
    first_article = news_articles[0]
    news_p = news_paragraph[0]
    data["news_title"] = first_article
    data["news_paragraph"] = news_p

    # Visit the url for JPL Featured Space Image
    url_2 = 'https://www.jpl.nasa.gov/images?search=&category=Mars'
    browser.visit(url_2)

    # Create a Beautiful Soup object
    soup2 = bs(browser.html, 'lxml')

    # Find and append the links(href) for each image featured on the page
    article_images = soup2.find_all('a', class_="group cursor-pointer block")
    image_links = []
    for image in article_images:
        image_links.append(image['href'])

    # Scrape through the first href and find the full sized image url
    soup2 = bs(browser.html, 'lxml')

    domain_url = 'https://' + browser.url.replace('http://', '').replace(
        'https://', '').split('/', 1)[0]
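    # e.g. (illustrative) 'https://www.jpl.nasa.gov/images?search=' -> 'https://www.jpl.nasa.gov'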

    browser.visit(domain_url + image_links[0])
    soup3 = bs(browser.html, 'lxml')
    img_url = soup3.find_all('div', class_="lg:w-auto w-full")
    img_href = []
    for i in img_url:
        if (i.a):
            if (i.a['href']):
                img_href.append(i.a['href'])

    featured_image_url = img_href[0]

    data["featured_image"] = featured_image_url

    # Visit the Mars Facts webpage
    url_3 = 'https://space-facts.com/mars/'
    browser.visit(url_3)

    # Create a Beautiful Soup object
    soup3 = bs(browser.html, 'lxml')

    # Scrape the table containing facts about the planet including Diameter, Mass, etc.
    mars_facts = pd.read_html(browser.html)
    table_df = mars_facts[0]

    # Use Pandas to convert the data to a HTML table string.

    table_df.columns = ["description", "value"]
    data["facts"] = table_df.to_html(index=False)

    # Bring in the USGS Astrogeology site for our web scrapping
    url_4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_4)

    # Create a Beautiful Soup object
    soup4 = bs(browser.html, 'lxml')

    hemisphere_image_urls = []

    # Run a for loop to click through our hemisphere links in order to
    # append the titles & urls for the full resolution hemisphere images
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}

        browser.find_by_css("a.product-item h3")[item].click()

        # find urls for the full resolution hemisphere images
        aref_list = browser.find_link_by_text("Sample").first
        hemisphere["img_url"] = aref_list["href"]

        # find the titles for our hemisphere images
        hemisphere["title"] = browser.find_by_css("h2.title").text

        # append titles & urls for our hemisphere images
        hemisphere_image_urls.append(hemisphere)

        browser.back()

    data["hemispheres"] = hemisphere_image_urls

    browser.quit()

    return data
countryTypeList = ['developed_', 'developing_']

typeaNameList = ['Existing_Scotia_Public', 'Existing_Scotia_Private', 'Non_Scotia_Public', 'Non_Scotia_Private']
devdCompanyNameList = ['TOYOTA TSUSHO CORPORATION','BASCO DESEMBUAGE','A&T CORPORATION','asdf']
devgCompanyNameList = [countryList[1],countryList[1],countryList[1],'asdf']

# open the browser, navigate to the right page, and configure
from splinter import Browser
browser = Browser()

browser.visit('https://clientnet-uat.gbm.bns:8090/CNETADMIN/login.jsp?ssosrc=http%3A%2F%2Fclientnet-uat.gbm.bns%2FCNETCOMM%2Findex.do')
browser.fill('uid', 'ychoe')
browser.fill('pwd', 'Winter15')
browser.find_by_name('signin').first.click()

browser.find_link_by_text('International Banking').first.click()

browser.fill('clientSearchString', 'RF vs RFDA test cases')
browser.find_by_name('search').first.click()
browser.find_by_value('GO').first.click()


while len(browser.find_link_by_text('Delete'))>0:
	browser.find_link_by_text('Delete').first.click()
	browser.get_alert().accept()

Exemple #38
0
def scrape():
    mars = mongo.db.mars

    # Put everything from Jupyter Notebook Here
    # Set the executable path and initialize the chrome browser in splinter

    executable_path = {
        'executable_path': 'C:\\Users\\enere\\Desktop\\chromedriver'
    }
    browser = Browser('chrome', **executable_path)

    ##### MARS NEWS Scrape #####
    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)
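    # is_element_present_by_css returns True/False after waiting up to wait_time seconds,
    # so it doubles as a soft delay while the news list renders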

    # Convert the browser html to a soup object
    html = browser.html
    news_scraper = BeautifulSoup(html, 'html.parser')

    # Use the parent element to find the first a tag and save it as `news_title`
    title_element = news_scraper.find('div', {'class': 'content_title'})
    news_title = title_element.get_text()

    # Use the parent element to find the paragraph text
    teaser_element = news_scraper.find('div', {'class': 'article_teaser_body'})
    teaser_text = teaser_element.get_text()

    ##### JPL Space Images Featured Image #####
    # Visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_scraper = BeautifulSoup(html, 'html.parser')

    # find the relative image url
    img_element = img_scraper.find('img', {'class': 'main_image'})

    # find the relative image url
    img_src = img_element.get('src')

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_src}'

    ##### Mars Weather Scrape
    # Visit URL
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)

    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div',
                                           attrs={
                                               "class": "tweet",
                                               "data-name": "Mars Weather"
                                           })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()
    mars_weather

    ##### Mars Facts Scrape

    # Visit URL
    url = 'https://space-facts.com/mars/'
    browser.visit(url)

    # read_html returns a list of DataFrames, one per <table> on the page
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Mars - Earth Comparison', 'Mars', 'Earth']

    # Set the index to the Mars - Earth Comparison column
    df.set_index('Mars - Earth Comparison', inplace=True)

    # Convert the DataFrame back to an HTML table string using the to_html function
    html_table = df.to_html()
    df.to_html('table.html')

    ##### Mars Hemisphere Scrape
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}

        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()

        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']

        # Get Hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text

        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)

        # Finally, we navigate backwards
        browser.back()

    browser.quit()
    ##### Create a dictionary to store our scraped data
    scraped_data = {
        'News Title': news_title,
        'Teaser Text': teaser_text,
        'Image URL': img_url,
        'Mars Weather': mars_weather,
        'Mars Hemisphere': hemisphere_image_urls,
        'Mars Facts': html_table
    }

    ##### Put into MongoDB
    mars.update({}, scraped_data, upsert=True)

    return jsonify(scraped_data)
Exemple #39
0
# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.

links = browser.find_by_css('a.product-item h3')

# Loop through the links 
for i in range(len(links)):
    hemisphere = {}
    
    # find elements on each loop and click on link
    browser.find_by_css('a.product-item h3')[i].click()
    
    # find Sample image anchor tag and extract the href
    samp_element = browser.find_link_by_text('Sample').first
    img_url = samp_element['href']
    hemisphere["img_url"] = img_url
    
    # Get hemisphere title
    title = browser.find_by_css("h2.title").text
    hemisphere["title"] = title
    
    # Append hemisphere object to list hemisphere_image_urls
    hemisphere_image_urls.append(hemisphere)
    # Navigate back
    browser.back()


# In[138]:
def main(argv):
	email = None
	txtipt = None
	socks = None
	socksPort = None
	try:
		opts, args = getopt.getopt(argv, "hi:m:S:P:",["port=","socks=","input=","mail=","help"])
	except:
		print "Use --help for help"
		sys.exit(2)

	for opt, arg in opts:
		if opt in ("-h", "--help"):
			print 'Usage %s options \n' % (os.path.basename(__file__))
			print '      -h, --help           This help'
			print '      -m, --mail           Your facebook login email'
			print '      -i, --input          Your input file name'
			print '      -S, --socks          Socks Proxy Address for Tor use'
			print '      -P, --port           Port Socks for Tor use'
			sys.exit()
		elif opt in ("-i","--input"):
			txtipt = arg
		elif opt in ("-m","--mail"):
			email = arg
		elif opt in ("-S","--socks"):
			socks = arg
		elif opt in ("-P","--port"):
			socksPort = arg
	if not email or not txtipt:
		print 'Use --help for help'
		sys.exit()

	password = getpass.getpass()

	if socks and socksPort:
		proxy_settings = {
		'network.proxy.type':1,
		'network.proxy.socks': socks,
		'network.proxy.socks_port': int(socksPort)
		}
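		# network.proxy.type = 1 selects Firefox's manual proxy configuration, routing
		# traffic through the given SOCKS host/port (e.g. Tor's default 127.0.0.1:9050)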

		browser = Browser('firefox', profile_preferences=proxy_settings)
	else:
		browser = Browser()

	browser.visit('https://m.facebook.com/')
	browser.fill("email", email)
	browser.fill("pass", password)
	browser.find_by_name("login").click()

	if browser.is_element_present_by_css('.login_error_box'):
		print 'The email and password didn\'t work.'
		sys.exit()

	try:
		fileipt = open(txtipt, 'r')
	except:
		sys.exit('Unable to open file %s' % txtipt)

	for line in fileipt:
		browser.visit(line)
		addButton = browser.find_link_by_text('Add Friend')
		if len(addButton) > 0:
			addButton[0].click()
Exemple #41
0
import config
from splinter import Browser

browser = Browser()

browser.visit('https://studentemployment.neu.edu/tsx_studentjobs.aspx')

browser.fill('Skin$ctl08$LoginNameText', config.username)
browser.fill('Skin$ctl08$LoginPasswordText', config.password)

browser.find_by_name('Skin$ctl08$ctl14').click()
browser.click_link_by_text(config.jobTitle)

browser.find_link_by_text('Go to time sheet').first.click()

# browser.find_link_by_text('Start time sheet').first.click()

# alert = browser.get_alert()
# alert.accept()


def addShift(shift):
    browser.click_link_by_text('Add New Entry')
    browser.find_by_id('Skin_body_ctl01_WDL').find_by_css('option')[
        shift.day].click()
    browser.find_by_id('Skin_body_ctl01_StartDateTime1').select(shift.start)
    browser.find_by_id('Skin_body_ctl01_EndDateTime1').select(shift.end)
    browser.find_by_value('Add').first.click()


for shift in config.shifts:
    addShift(shift)
Exemple #42
0
class DownPatent(object):
    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        self.browser = Browser("phantomjs", wait_time=10)
        #self.browser = Browser()

    # Download a patent document
    def download(self, patentno):
        # Visit the page (the page load may time out)
        # down_flag: 0 = not downloaded, 1 = patent not found, 2 = download failed, 3 = link found
        download_link = ""
        down_flag = 0
        if True:
            print "opening page"
            self.browser.visit(self.down_url)
            # "查询" is the search button's value on the page
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):
                # Fill in the patent number and submit
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print "patent number submitted"
                # The connection may time out or return a 404
                if self.browser:
                    print "opening captcha page"
                    # Retry the captcha at most 20 times
                    code_handler = CodeHandler()
                    # Captcha texts that were filled in
                    list_fill_text = []
                    # Captcha image paths
                    list_code_path = []
                    # Captcha segmentation flags
                    list_split_flag = []
                    # Captcha recognition flags
                    list_reg_flag = []
                    for code_num in xrange(20):
                        print code_num
                        # Look for the captcha element
                        if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                            print "captcha element found"
                            # Take a screenshot of the page
                            #self.browser.driver.maximize_window()
                            self.browser.driver.save_screenshot("screenshot.png")
                            # Crop the captcha image out of the screenshot
                            image = Image.open("screenshot.png")
                            image_location = self.find_location(image)
                            image_code = image.crop((image_location[0], image_location[1], image_location[0]+52, image_location[1]+21))
                            save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                            save_path_temp = "../%s" % save_path
                            image_code.save(save_path_temp)
                            list_code_path.append(save_path)

                            # Segment the captcha image into single characters
                            list_split_image = self.deal_split(code_handler, image_code)
                            
                            # Recognize the captcha only if it segmented into exactly 4 characters;
                            # otherwise fetch a new one
                            if len(list_split_image) == 4:
                                print "segmented correctly"
                                list_split_flag.append(1)
                                reg_plain_text = self.reg_code(list_split_image)
                                fill_text = "".join(reg_plain_text)
                                list_fill_text.append(fill_text)
                                # Fill in the captcha and confirm ("确定" is the confirm button)
                                #hand_fill_text = raw_input("Enter fill text:")
                                self.browser.fill("ValidCode", fill_text)
                                self.browser.find_by_value("确定").first.click()

                                # "验证码输入错误" means the captcha was entered incorrectly
                                if self.browser.html.encode("utf-8").find("验证码输入错误") == -1:
                                    list_reg_flag.append(1)
                                    # "没有找到该专利" means the patent was not found
                                    if self.browser.html.encode("utf-8").find("没有找到该专利") == -1:
                                        # Links to the published-spec image downloads (standard / fast editions)
                                        down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                        down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                        if down_link_one or down_link_two:
                                            print "spec image download link found"
                                            list_reg_flag.append(1)
                                            if down_link_one:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                            else:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")

                                            # Find the actual download link ("下载专利" = "download patent")
                                            download_a = self.browser.find_link_by_text("下载专利")
                                            if download_a:
                                                download_link = download_a["href"]

                                                # Download link found
                                                down_flag = 3
                                                break
                                            else:
                                                print "download failed"
                                                down_flag = 2
                                                break
                                        '''
                                        else:
                                            print "recognized correctly, but no download link found"
                                            list_reg_flag.append(0)
                                            self.browser.back()
                                            self.browser.reload()
                                        '''
                                    else:
                                        print "patent does not exist"
                                        down_flag = 1
                                        break
                                else:
                                    print "captcha rejected, reloading"
                                    list_reg_flag.append(0)
                                    self.browser.back()
                                    self.browser.reload()
                            else:
                                print "could not segment captcha"
                                list_fill_text.append("")
                                list_split_flag.append(0)
                                list_reg_flag.append(0)
                                self.browser.reload()

                    # Store each attempt in the onlinecode collection: patent number, captcha path,
                    # recognized text, segmentation flag, recognition flag, and timestamp
                    
                    for code_path, fill_text, split_flag, reg_flag in zip(list_code_path,list_fill_text, list_split_flag, list_reg_flag):
                        try:
                            self.db.onlinecode.insert({"indexflag": patentno, "codepath": code_path, "filltext": fill_text, \
                                                      "splitflag": split_flag, "regflag": reg_flag, "time": time.ctime()})
                        except: pass
        return download_link

    # Segment the captcha image
    def deal_split(self, code_handler, image):
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # Recognize the segmented captcha characters
    def reg_code(self, list_split_image):
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in xrange(each_split_image.size[1]):
                for y in xrange(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
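        # Assumed decoding scheme: each network output approximates (character index)/100,
        # so scaling by 100 and rounding recovers an index into "0123456789abcdef"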
        for each in out:
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # Locate the captcha image inside the screenshot (first non-background pixel)
    def find_location(self, image):
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        
        flag = image_width
        location = [0, 0]
        for y in xrange(image_width):
            for x in xrange(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break

        return location
def scrape():
    # https://splinter.readthedocs.io/en/latest/drivers/chrome.html
    # (check that chromedriver is installed and on PATH, e.g. via `which chromedriver`)


    # # NASA Mars News

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    html = requests.get(url)
    soup = bs(html.text, 'lxml')

    news_title = soup.find('div', class_="content_title").text.strip()
    news_p = soup.find('div', class_="rollover_description_inner").text.strip()


    # # JPL Mars Space Images

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')

    result = soup.find('div', class_="default floating_text_area ms-layer")

    featured_image = result.footer.a['data-fancybox-href']
    featured_image_url = f'https://www.jpl.nasa.gov{featured_image}'


    # # Mars Weather

    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)

    html = browser.html
    soup = bs(html, 'html.parser')

    current_weather = soup.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").text


    # # Mars Facts

    url = 'https://space-facts.com/mars/'
    table = pd.read_html(url)
    df = table[0]
    df.columns = ['Profile', 'Value']
    df.to_html('table.html',index=False, justify='center')


    # # Mars Hemisphere

    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    links = browser.find_by_css("h3")

    for i in range(len(links)):
        hemisphere = {}
        browser.find_by_css("h3")[i].click()
        sample = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample['href']
        hemisphere['title'] = browser.find_by_css("h2.title").text
        hemisphere_image_urls.append(hemisphere)
        browser.back()

    return render_template('index.html', news_p = news_p, news_title = news_title, featured_image_url = featured_image_url, current_weather = current_weather)
Exemple #44
0
def scrape_info():

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA NEWS

    url = 'https://mars.nasa.gov/news/'

    browser.visit(url)
    browser.is_element_present_by_css('li.slide', wait_time=10)

    html = browser.html

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', class_="article_teaser_body").text

    title = soup.find(class_="bottom_gradient")
    title = title.find('h3').text

    #JPL MARS SPACE IMAGE
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, 'lxml')
    footer = soup.find('footer')
    string_footer = str(footer)
    string_footer = string_footer.split('data-fancybox-href="')[1].split(
        '" data-link')[0]
    featured_image_url = 'https://www.jpl.nasa.gov' + string_footer

    #MARS WEATHER TWITTER

    url = 'https://twitter.com/MarsWxReport'
    driver = webdriver.Chrome()
    driver.implicitly_wait(5)  # seconds
    driver.get(url)

    element = driver.find_element_by_class_name("css-901oao")
    tweet = element.text

    # MARS FACTS

    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url, index_col=0)
    mars_facts_df = tables[0]
    html_table = mars_facts_df.to_html()

    # MARS HEMISPHERES

    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = soup.find_all('h3')
    hemi_list = []
    link_list = []

    for h in hemispheres:
        # str.strip('Enhanced') would strip characters, not the word; use replace instead
        hemi_text = h.text.replace('Enhanced', '')
        hemi_list.append(hemi_text)
        #Click on Hemisphere Link
        try:
            browser.click_link_by_partial_text(hemi_text)
        except:
            print("Scraping Complete")
        #Find image link
        link = browser.find_link_by_text('Original').first['href']
        link_list.append(link)
        #Go Back a Page
        browser.visit(url)

    #Hard coding values scraped earlier when the site was working
    hemi_list = [
        'Cerberus Hemisphere ', 'Schiaparelli Hemisphere ',
        'Syrtis Major Hemisphere ', 'Valles Marineris Hemisphere '
    ]
    link_list = [
        'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg',
        'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'
    ]

    #WRITE ALL TO DICT

    mars_dict = {
        "article_title": title,
        "article_excerpt": article,
        "feature_image": featured_image_url,
        "fact_table": html_table,
        "mars_weather": tweet,
        "hemisphere_list": hemi_list,
        "hemisphere_pic": link_list
    }

    return (mars_dict)