class LoginTestCase(unittest.TestCase):

  def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    self.browser = Browser('chrome')

  def tearDown(self):
    self.testbed.deactivate()

  def test_login(self):
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")

    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

  def test_logout(self):
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")

    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.find_link_by_text("Log out").first.click()
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
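# A minimal sketch of running the test case above (assumes the standard
# unittest import and that the App Engine dev server is already serving the
# app at http://127.0.0.1:8080/):
if __name__ == '__main__':
    unittest.main()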
Example #2
    def run(self):
        """Run the b0t"""
        browser = Browser()
        browser.visit(self.url)

        try:
            while browser.find_by_tag('button').first:
                self.process_elements(browser)
                browser.find_by_tag('button').first.click()
        except ElementDoesNotExist:
            pass
Example #3
class PlayPagesWebTests(StaticLiveServerTestCase):

    def setUp(self):
        self.user1 = UserFactory.build()
        self.user1.set_password('abc')
        self.user1.save()

        self.user2 = UserFactory.build()
        self.user2.set_password('123')
        self.user2.save()

        self.browser1 = Browser()

    def tearDown(self):
        self.browser1.quit()

    def login_helper(self, browser, username, password):
        browser.visit(
            '%s%s' % (self.live_server_url, '/accounts/login/')
        )

        browser.fill('username', username)
        browser.fill('password', password)
        browser.find_by_value('Log in').first.click()

    # Test 4
    # Check playing single player game
    def test_single_player(self):
        self.login_helper(self.browser1, self.user1.username, 'abc')
        self.browser1.visit(
            '%s%s' % (self.live_server_url, '/play/')
        )
        time.sleep(2)
        snippet = self.browser1.find_by_id('type').value
        try:
            snippet = snippet[:-(
                len(snippet.split()[-1]) + 1
            )]
            for c in snippet[:100]:
                self.browser1.type('typed', c)
                time.sleep(0.001)
            self.browser1.find_by_tag('input')[3].click()
            self.browser1.find_by_tag('input').last.click()
            self.assertEqual(
                self.browser1.url,
                '%s%s' % (self.live_server_url, '/scores/')
            )
        except ElementNotVisibleException:
            self.assertTrue(True)
Example #4
class UserUtils(object):
    def __init__(self):
        self.config = config.read_config()
        self.account = self.config['account']
        self.idp_server = self.config['nodes']['idp_node']

        # Abort test if esgf-web-fe is not reachable
        r = requests.get("https://{0}/esgf-web-fe".format(self.idp_server), verify=False, timeout=1)
        assert r.status_code == 200

        self.browser = Browser('firefox')

        # Mapping user data to fit the web-fe user creation form
        self.elements = {'firstName' : self.account['firstname'],
                         'lastName'  : self.account['lastname'],
                         'email'     : self.account['email'],
                         'userName'  : self.account['username'],
                         'password1' : self.account['password'],
                         'password2' : self.account['password']}

    def check_user_exists(self):
        URL = "https://{0}/esgf-web-fe/login".format(self.idp_server)
        OpenID = "https://{0}/esgf-idp/openid/{1}".format(self.idp_server, self.account['username'])

        # Try to log in
        self.browser.visit(URL)
        self.browser.find_by_id('openid_identifier').fill(OpenID)
        self.browser.find_by_value('Login').click()

        # User does not exist if unable to resolve OpenID
        if self.browser.is_text_present("Error: unable to resolve OpenID identifier"):
            self.user_exists = False
        else:
            self.user_exists = True

    def create_user(self):
        URL = "https://{0}/esgf-web-fe/createAccount".format(self.idp_server)
        self.browser.visit(URL)

        # Fill in the form
        for element_name in self.elements:
            self.browser.find_by_name(element_name).fill(self.elements[element_name])

        self.browser.find_by_value('Submit').click()

        # Parse the response
        self.response = []
        if self.browser.is_text_present("SUCCESS"):
            self.response.append("SUCCESS")
        else:
            self.response.append("FAILURE")
            selection = self.browser.find_by_tag('span')
            for sel in selection:
                if sel.has_class('myerror'):
                    self.response.append(sel.value)

    def exit_browser(self):
        self.browser.quit()
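# A minimal usage sketch for the UserUtils class above (assumes config.read_config()
# returns the account/node settings consumed in __init__):
utils = UserUtils()
utils.check_user_exists()
if not utils.user_exists:
    utils.create_user()
    print(utils.response)
utils.exit_browser()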
Example #5
def splinter():
    browser = Browser()
    url = "http://ehire.51job.com/MainLogin.aspx"
    browser.visit(url)
    time.sleep(1)
    browser.find_by_id('txtMemberNameCN').fill(u'安能聚业')
    browser.find_by_id('txtUserNameCN').fill(u'上海安能聚创供应链')
    browser.find_by_id('txtPasswordCN').fill('aneqc888')
    browser.find_by_id('Login_btnLoginCN').click()
    time.sleep(1)
    browser.find_by_tag('a').click()
    browser.find_by_id('hlResumeSearch').click()
    # id 85798642 not yet made public
    # 309554553 not yet downloaded
    #
    browser.find_by_id('txtUserID').fill('6098724')
    time.sleep(1)
    browser.find_by_id('btnSearchID_leftbtnSearchID').click()

    cvTarget = browser.find_by_xpath('//tr/td/p/span/a[@target="_blank"]')
    if len(cvTarget) == 0:
        print "can not find the cv from this id."
        return
    cvTarget.click()
    allwindows = browser.windows
    driver = browser.driver
    driver.switch_to_window(allwindows[-1].name)
    UndownloadLink = browser.find_by_id('UndownloadLink')
    if len(UndownloadLink) == 0:
        print "can not find the cv from this id."
    else:
        UndownloadLink.click()
        time.sleep(1)
        browser.find_by_id('btnCommonOK').click()
    selector = etree.HTML(browser.html)
    lines = selector.xpath('//title')
    if len(lines) != 0:
        print "name:", strip(lines[0].text)

    contents = browser.html.encode("utf-8")
    print re.findall(re.compile('''<td height="20">电 话:</td><td height="20" colspan="3">(.*?)<span'''), contents)[0]
    print re.findall(re.compile('''E-mail:</td><td height="20" colspan="3"><a href="mailto:(.*?)" class="blue">'''),
                     contents)[0]
    winNum = len(allwindows)
    for i in range(winNum):
        allwindows[winNum - 1 - i].close()
Example #6
def splinter(url):
    #"""""""""""""""""""""""""MySQL DEF**********************************************
    conn = MySQLdb.connect(host='192.168.1.8',user='******',passwd='123123',db='gwycf')
    cursor = conn.cursor()  # create a cursor to operate on the db
    #"""""""""""""""""""""""""MySQL DEF**********************************************
    data = xlrd.open_workbook('./chafen.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows 
    ncols = table.ncols
    print nrows
    
    browser = Browser('firefox')
#    browser = Browser('chrome')
    dir(browser)
    browser.visit(url)
    time.sleep(5)
    count = 0
    #<================================================>
    for i in range(nrows):
        #HaoMa = str(table.row_values(i)[1]).split(".")[0]
        name = table.row_values(i)[0]
        HaoMa = table.row_values(i)[1]
#        epost = table.row_values(i)[2]

        browser.find_by_name('TxtName').fill(name)
        browser.find_by_name('TxtHaoMa').fill(HaoMa)
        browser.find_by_id('btnSubmit').click()
        #================= get the page data =====================
        epost = browser.find_by_tag('td')[10].value
        ecode = browser.find_by_tag('td')[14].value
        xingce = browser.find_by_tag('td')[16].value
        shenlun = browser.find_by_tag('td')[18].value
        jiafen = browser.find_by_tag('td')[20].value
        zongfen = browser.find_by_tag('td')[22].value
        #================= get the page data ======================
        query = u"insert into info values('%s','%s','%s','%s','%s','%s','%s','%s',0)" % (name,HaoMa,epost,ecode,xingce,shenlun,jiafen,zongfen)
        print count,query
        cursor.execute(query.encode('utf-8'))  # the original data ran fine encoded as gbk; now switched to utf-8
        conn.commit()
        browser.back()
        count = count +1
    cursor.close()
    conn.commit()
    conn.close()
Example #7
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)
        

    def test_add_post(self):
        log= logging.getLogger("unittest.TestCase")
        
        ################################## Login as Alice
        #self.browser.visit("http://0.0.0.0:8080/login") # original line
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        #self.assertEqual(self.browser.url, "http://0.0.0.0:8080/") # original line
        # self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") # ask sam about this line
        
############################################ add a test post #####################
        self.browser.visit("http://127.0.0.1:5000")
        self.browser.click_link_by_partial_href('add')
        self.browser.fill("title", "post test1 title")
        self.browser.fill("content", "post test1 content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        post_found = self.browser.find_by_tag('h1').value #cheated here - made template title h2. how do we access? index?
        #post_found = self.browser.find_by_text('post test1 title').value - didnt work
        
        log.debug( "FIRSTH1= %r", post_found )
        
        self.assertEqual(post_found, "post test1 title")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
class Retreiver():
	def __init__(self, folder):
		self.aux = Auxiliary()
		self.folder = folder
		self.tickers = None
		
	def click(self, destination):
		try:
			self.browser.find_by_text(destination).first.click()
		except splinter.exceptions.ElementDoesNotExist:
			self.browser.click_link_by_text(destination)
		
	def retreive(self):
		print ('Please enter the period for retrieval.')
		raw_dates = input ('Dates in European format: dd/mm/yyyy\n>')
		eurodates = self.aux.date_parse(raw_dates)[0]
		dates = self.aux.european_dates_to_american(eurodates)
		raw_tickers = input ('Tickers:\n>')
		self.tickers = self.aux.parse_tickers(raw_tickers)

		self.browser = Browser('chrome')
		for ticker in self.tickers:
			self.browser.visit('https://beta.finance.yahoo.com/quote/%s/history' % ticker)
			time.sleep(5)
			input_boxes = self.browser.find_by_tag('input')
			for i in range(0,6):
				input_boxes[i+2].fill(dates[i]) #we need 3-8 inputs
			self.click('Apply')
			download_link = self.browser.find_link_by_text('Download data').first
			response = requests.get(download_link['href'])
			with open('%s//%s.csv' % (self.folder, ticker), 'wb') as f:
				f.write(response.content)		
		self.browser.quit()
		
	def put_together(self):
		if not self.tickers:
			self.tickers = []
			for f in os.listdir(self.folder):
				self.tickers.append(f[:-4])
		target = openpyxl.Workbook()
		sheet = target.active
		sheet.append(self.tickers)
		for filename in os.listdir(self.folder):
			source = open('%s//%s' %(self.folder, filename), 'r', encoding='utf-8')
			sheet = target.create_sheet()
			sheet.title = filename[:-4] #strip out the extension
			for line in source:
				sheet.append(self.aux.parse_comma_separated_line(line))
			source.close()
		target.save('Historical_data_together.xlsx')
	def __authorize(self):
		b = Browser('chrome')
		b.visit("http://box-token-generator.herokuapp.com/")

		if b.find_link_by_href('set_client_credentials'):
			b.visit('http://box-token-generator.herokuapp.com/set_client_credentials')
			time.sleep(2)
		
			b.find_by_id('login').first.fill('*****@*****.**')
			b.find_by_id('password').first.fill('dharit1250')
			b.find_by_name('login_submit').first.click()

			b.find_by_id('consent_accept_button').first.click()
	
		code = b.find_by_tag('h4')[1].text
		self.client = box.BoxClient(code)
		b.quit()
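# A minimal usage sketch for the Retreiver class defined above (the folder name
# is a placeholder; retreive() prompts for dates and tickers interactively and
# relies on the Auxiliary helper from the surrounding module):
r = Retreiver('history_csvs')
r.retreive()
r.put_together()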
Example #10
news_title

# Use the parent element to find the paragraph text
#For example, if we were to use .find_all() instead of .find() when pulling the summary,
#we would retrieve all of the summaries on the page instead of just the first one.
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p
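# A short illustration of the .find()/.find_all() difference described above
# (assumes slide_elem is the BeautifulSoup element selected earlier in this notebook):
first_teaser = slide_elem.find('div', class_="article_teaser_body").get_text()   # first summary only
all_teasers = [d.get_text() for d in slide_elem.find_all('div', class_="article_teaser_body")]  # every summary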

# ### JPL Space Images Featured Image

# Visit URL
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

# Use the base URL to create an absolute URL
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
img_url

# ### Mars Facts
Example #11
# Fill in the url
browser.visit('somePageURL')

# Find the username cell
browser.find_by_name('user')[1].fill(username)

# Find the password cell
browser.find_by_name('password')[1].fill(password)

# Find the submit button and click
browser.find_by_css('.loginsub').first.click()

# If there is a "full sign", we kick somebody out :) Sorry!!
try:
        browser.find_by_tag('input')[3].click()

except:
        pass

# I setup a progress bar here
bar = pyprind.ProgBar(handle_sheet_0.nrows, stream=1)

# I create the excel header
style1 = xlwt.easyxf('pattern: pattern solid, fore_colour gray40; align: horiz center')
style2 = xlwt.easyxf('align: horiz center')
style3 = xlwt.easyxf('pattern: pattern solid, fore_colour red; align: horiz center')
style4 = xlwt.easyxf('pattern: pattern solid, fore_colour green; align: horiz center')

cellWidths = [75, #0 Short company name in the excel file
              20, #1 Tax number in the excel file
def scrape():
    # NASA Mars News page to be scraped
    nasa_mars_url = 'https://mars.nasa.gov/news/'
    # Retrieve Nasa Mars News page with the requests module
    nasa_response = requests.get(nasa_mars_url)
    # nasa_response

    # Create BeautifulSoup object; parse with 'lxml'
    nasa_soup = BeautifulSoup(nasa_response.text, 'lxml')
    # print(nasa_soup.prettify())

    # NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text.
    # Assign the text to variables that you can reference later.

    news_title = nasa_soup.find('div', class_='content_title').find('a').text
    news_p = nasa_soup.find('div', class_='rollover_description_inner').text

    # print(news_title)
    # print(news_p)

    # JPL Mars Space Images - Featured Image
    # Visit the url for JPL Featured Space Image here.
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    jpl_mars_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_mars_url)

    # click through the page
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    browser.click_link_by_id('page')

    # assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image.
    # Make sure to save a complete url string for this image.

    jpl_html = browser.html
    jpl_soup = BeautifulSoup(jpl_html, 'html.parser')

    image = jpl_soup.find('img')
    featured_image_url = image.get('src')

    # featured_image_url

    # Mars Weather
    # Visit the Mars Weather twitter account here and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report as a variable called mars_weather.

    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'

    mars_weather_response = requests.get(mars_weather_url)
    # mars_weather_response

    mars_weather_soup = BeautifulSoup(mars_weather_response.text, 'lxml')
    # print(mars_weather_soup.prettify())

    mars_weather_tweets = mars_weather_soup.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text

    # print(mars_weather_tweets)

    # Mars Facts
    # Visit the Mars Facts webpage here and use Pandas to scrape the table
    # containing facts about the planet including Diameter,Mass, etc.
    # Use Pandas to convert the data to a HTML table string

    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts_tables = pd.read_html(mars_facts_url)
    # mars_facts_tables

    # type(mars_facts_tables)

    # put the table in the data frame
    mars_facts_df = mars_facts_tables[0]
    mars_facts_df.columns = ['description', 'value']
    mars_facts_df.set_index('description', inplace=True)
    # mars_facts_df.head()

    # convert data frame into html
    mars_facts_html_table = mars_facts_df.to_html()
    # mars_facts_html_table

    # drop line break \n
    mars_facts_html_table = mars_facts_html_table.replace('\n', '')

    # Mars Hemispheres
    # Visit the USGS Astrogeology site here to obtain high resolution images for each of Mar's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image,
    # and the Hemisphere title containing the hemisphere name.
    # Use a Python dictionary to store the data using the keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list.
    # This list will contain one dictionary for each hemisphere.

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemispheres_url)

    hemisphere_image_urls = []

    # find all h3 tag
    link_cnt = browser.find_by_tag('h3')

    # loop through all h3 tag
    for item in range(len(link_cnt)):
        mars_hemispheres = {}

        # click on each h3 item
        browser.find_by_tag('h3')[item].click()

        # Get Mars Hemispheres Title
        mars_hemispheres["title"] = browser.find_by_tag("h2.title").text

        # Find Sample Image Tag & get url
        sample = browser.find_link_by_text("Sample").first
        mars_hemispheres["img_url"] = sample["href"]

        # Append Mars Hemispheres to List
        hemisphere_image_urls.append(mars_hemispheres)

        # Navigate Backwards
        browser.back()

    # hemisphere_image_urls
    mars_data = {
        'News_Title': news_title,
        'News_Paragraph': news_p,
        'Feature_Image': featured_image_url,
        'Mars_Weather': mars_weather_tweets,
        'Mars_Facts': mars_facts_html_table,
        'Mars_Hemisphere': hemisphere_image_urls
    }

    return mars_data
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_dict = {}

    # Step One:
    browser.visit(
        "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    )
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find_all('div', class_="content_title")[1].text
    description = soup.find('div', class_="article_teaser_body").text
    mars_dict["news_title"] = title
    mars_dict["news_description"] = description

    # Step Two:
    browser.visit(
        "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    browser.find_by_id('full_image').click()
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all('img', class_="fancybox-image")
    link = str(results)
    link = link.split("src=")[1]
    link = link.split("style")[0]
    link = link.replace('"', "")
    link = link.replace(" ", "")
    featured_image_url = f"https://www.jpl.nasa.gov{link}"
    mars_dict["featured_image"] = featured_image_url

    # Step Three:
    browser.visit("https://twitter.com/marswxreport?lang=en")
    time.sleep(1)
    browser.find_by_xpath(
        "/html/body/div/div/div/div[2]/main/div/div/div/div/div/div/div/div/div[2]/section/div/div/div/div[1]/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span"
    ).click()
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    weather = soup.find_all(
        'div',
        class_=
        'css-901oao r-hkyrab r-1qd0xha r-1blvdjr r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'
    )
    weather = str(weather)
    weather = weather.split("r-ad9z0x r-bcqeeo r-qvutc0")[1]
    weather = weather.split("/span")[0]
    weather = weather.replace('>', '')
    weather = weather.replace('<', '')
    current_mars_weather = weather.replace('"', '')
    mars_dict["mars_weather"] = current_mars_weather

    # Step Four:
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    mars_df = mars_table[0]
    mars_html = mars_df.to_html()
    mars_dict["mars_facts"] = mars_html

    # Step Five:
    hemisphere_list = []
    for i in range(0, 4):
        links = []
        browser.visit(
            "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        )
        browser.find_by_tag('h3')[i].click()
        browser.find_by_id('wide-image-toggle').click()
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find('h2', class_="title").text
        results = soup.find_all('li')
        for result in results:
            link = result.find('a')['href']
            links.append(link)
        hemisphere_dict = {"title": title, "img_url": links[0]}
        hemisphere_list.append(hemisphere_dict)
        mars_dict["hemispheres"] = hemisphere_list

    return mars_dict
# In[485]:

html_table = mars_df.to_html()
html_table

# In[645]:

sf_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(sf_url)
html = browser.html
soup = bs(html)

# In[646]:

images = soup.section.find_all('img', class_='thumb')

img_urls = []
for i in range(len(images)):
    img_dict = {}
    browser.find_by_css('img.thumb')[i].click()
    img_dict['title'] = browser.find_by_tag('h2').text
    img_dict['img_url'] = browser.find_link_by_text("Sample")['href']
    img_urls.append(img_dict)
    browser.back()

# In[648]:

img_urls

# In[ ]:
				browser.find_by_name('addScenario').first.click()
				browser.fill('scName', countryTypeList[conIndex]+typeaName+igType)
				browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
				browser.type('scEffDate', '2015-10-31')
				browser.find_by_name('update').first.click()

				browser.find_link_by_text('Obligor').first.click()

				# choose the companyType type
				element = browser.find_by_name('companyType').first
				element.select(str(cType))

				browser.fill('obligorName', companyName)
				browser.find_by_name('ObligorSearch').first.click()
			
				browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_tag('a')[0].click()

				# select "B-III counterpaty type" to be "corporate"
				element = browser.find_by_name('counterPartyType').first
				element.select('1')
				# select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
				element = browser.find_by_name('avc').first
				element.select('4')

				# select proper IG according to the IG type
				element = browser.find_by_name('obligorIgCode').first
				if igType == 'cap':
					element.select('99')
					browser.find_by_name('UpdateButton').first.click()
				elif igType == 'floor':
					element.select('30')
def scrape():
    # Create dictionary to return
    return_dict = {}

    # Create initial browser object
    executable_path = {'executable_path': '/Users/joshchung/Bootcamp/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scrape NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find('li', class_="slide")
    article_date = results.find('div', class_="list_date").text
    article_title = results.find('div', class_="content_title").text
    article_teaser = results.find('div', class_="article_teaser_body").text
    return_dict.update({'article_date':article_date,
                        'article_title':article_title,
                        'article_teaser':article_teaser})

    # Scrape JPL image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find_all('article', class_="carousel_item")
    url_string = results[0].get('style')
    url_string = url_string.split("url('")
    url_string = url_string[1].split("');")
    url_string = url_string[0]
    img_url = 'https://www.jpl.nasa.gov' + url_string
    return_dict.update({'img_url':img_url})

    # Scrape Twitter
    url = 'https://twitter.com/marswxreport'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    last_tweet = soup.find('p', class_="tweet-text").text
    last_tweet = last_tweet.replace('\n', ' ')
    return_dict.update({'last_tweet':last_tweet})

    # Scrape Mars facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    mars_df.columns = ['Statistic','Values']
    mars_df = mars_df.set_index('Statistic')
    mars_table = mars_df.to_html()
    mars_table = mars_table.replace('\n', '')
    return_dict.update({'mars_table':mars_table})

    # Scrape Mars hemisphere images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    mars_urls = {}
    for x in range(0,4):
        browser.visit(url)
        links = browser.find_by_tag('h3')
        links[x].click()
        html = browser.html
        soup = bs(html, 'lxml')
        downloads = soup.find('div', class_="downloads")
        dl_links = downloads.find_all('a')
        img_link = dl_links[0].get('href')
        dld_link = dl_links[1].get('href')
        title = soup.find('h2', class_="title").text
        mars_urls.update({
            f"marsimg_{x}" : img_link,
            f"marstitle_{x}": title,
            f"marsdld_{x}": dld_link
        })
        browser.back()
    return_dict.update(mars_urls)

    # Return dictionary when function is run
    return return_dict
Example #17
# Go to each of the 4 hemisphere websites and scrape the link for the Sample Image
usgs_url = "https://astrogeology.usgs.gov"
image_url = []
titles = []
for x in range(len(mars4hemis)):
    # Go to the hemisphere website
    browser.visit(usgs_url + mars4hemis[x])
    browser.click_link_by_text("Open")
    time.sleep(2)
    # CLick Sample link to get the image
    sample = browser.find_by_text('Sample')
    image = sample['href']
    image_url.append(image)
    # Search the h2 tags to get the title
    headers = browser.find_by_tag('h2')
    full_title = headers.text
    title = full_title.strip('Enhanced')
    titles.append(title)
    #    print(browser.url)
    print(title, image)

# Show the two newly created lists: titles and image_url
print(titles)
print(image_url)

# Create the list of 4 Dictionaries with the image url string and the hemisphere title to a list
hemisphere_image_urls = []
for x in range((len(titles))):
    one_hemisphere = {"title": titles[x], "img_url": image_url[x]}
    hemisphere_image_urls.append(one_hemisphere)
class InstaLiker():

	# constructor
	def __init__(self):
		self.mUrl = "https://www.instagram.com/"
		self.cycles = 4
		self.browser = Browser()
		self.username = "******"
		self.pw = 'xxxxxxxxxxxxxxxx\r'
		self.totalLikes = 0
		self.blackList = ["make a list of users to exclude", "including your own username" ]

	# scroll the page and
	# do the liking
	def launchPage(self):
		self.browser.visit(self.mUrl)
		self.login()

		self.scrollBy()
		for i in range(0, self.cycles):
			self.likePosts()

		print("just liked " + str(self.totalLikes) + " pix...Yay!")		

	def login(self):
		print("login")
		print("logging in as " + self.username)
		self.browser.click_link_by_text('Log in')
		self.browser.fill('username', self.username)
		self.browser.fill('password', self.pw)
		
		form = self.browser.find_by_tag('form')
		inputs = form.find_by_tag('button')
		inputs[0].click()

		# need to sleep a few seconds here
		time.sleep(5)

	def likePosts(self):
		print("liking posts")
		likeList = self.browser.find_by_text("Like")
		
		if len(likeList) == 0:
			print("nothing left to like. attempt to scroll farther to load more posts.")
			self.scrollBy()
			time.sleep(3)
			likeList = self.browser.find_by_text("Like")
			print("likeList is now: " + str(len(likeList)))

		if (len(likeList) > 0):
			print("found " + str(len(likeList)) + " posts to like")
			
			for foo in likeList:
				tmpParentNode = foo.find_by_xpath("ancestor::article/header")
				print(tmpParentNode["innerText"])
				if self.checkBlackList(tmpParentNode["innerText"]) == 0:
					foo.click()
					self.totalLikes += 1
					time.sleep(1)

	def checkBlackList(self, pString):
		for foo in self.blackList:
			if foo in pString:
				print("found blacklisted item '" + foo + "'")
				return 1		
		return 0

	def scrollBy(self):
		print("scrolling down.")
		self.browser.execute_script( "window.scrollBy(0,30000);" )
		time.sleep(1) 

	def boneyard(self):
		print('boneyard')
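# A minimal usage sketch for the InstaLiker class above (the username, password
# and blackList values in __init__ are placeholders to fill in before running):
liker = InstaLiker()
liker.launchPage()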
Example #19
import json

import bs4
import flask
import pandas as pd
import requests
import urllib.parse
from flask import Flask, request, jsonify
from splinter import Browser

#################################################
# Web Scraping for Georgia Income Data
#################################################
executable_path={'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit('http://datausa.io/api/data?Geography=04000US13:children&measure=Household Income by Race,Household Income by Race Moe&drilldowns=Race')

GA_Income_Data = json.loads((browser.find_by_tag('body').first.text))
browser.quit()

#################################################
# EV Station Data API - Acquisiton
#################################################
url ='https://developer.nrel.gov/api/alt-fuel-stations/v1.json?fuel_type=ELEC,ELEC&state=GA&limit=all&api_key=FHhxl7HnTsc9tm4X9CwUBVDNmbQFFu4uZXKJeO59&format=JSON'
response = requests.get(url).json()
#print((json.dumps(response, indent = 4, sort_keys =True)))
response_string=(json.dumps(response ['fuel_stations'], indent = 4, sort_keys =True))
#############################
# creating dataframe from the ev response
ev_df = pd.read_json(response_string)
##########################
# selecting relevant data
ev_df.head()
from splinter import Browser
from selenium import webdriver

driver = webdriver.Chrome('C:/Users/phillipparamirez/Downloads/chromedriver')

browser = Browser('chrome')
url = 'http://localhost:5000'
browser.visit(url)
assert 'Todo' in browser.title
header = browser.find_by_tag('h1').first
assert 'Todo list' in header.text

browser.quit()
Example #21
def scrape_info():
    # Configure settings for splinter
    executable_path = {"executable_path": ChromeDriverManager().install()}
    browser = Browser("chrome", **executable_path, headless=False)

    ### Part 1: NASA Mars News_________________________________________________________________________________________________________________________________
    # Visit mars.nasa.gov/news site
    # Get html and parse it from mars events website using requests
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")
    # Find most recent news article title and description
    a = soup.find("body", id="news")
    b = a.find("ul", class_="item_list")
    c = b.find("li", class_="slide")
    d = c.find("div", class_="image_and_description_container")
    e = d.find("div", class_="list_text")
    f = e.find("div", class_="content_title")

    news_title = f.find("a").text
    news_description = e.find("div", class_="article_teaser_body").text

    ###Part 2: JPL Mars Space Images Featured Image - Get html and parse it________________________________________________________________________________________________________________________________
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")
    # Find the featured image url
    # Click on FULL IMAGE button
    link1 = browser.find_by_tag('a').links.find_by_partial_text("FULL IMAGE")
    link2 = link1.click()
    time.sleep(10)
    # Click on More Info button
    link3 = browser.links.find_by_partial_text("more info")
    link3.click()
    time.sleep(10)
    #Save full size of featured image
    link4 = browser.find_by_tag('figure[class="lede"]')
    link5 = link4.find_by_tag('a')
    for image in link5:
        featured_image_url = image["href"]

    ###Part 3: Mars Facts_______________________________________________________________________________________________________________________________________________________________________--
    #Scrape page into soup
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")

    a = soup.find("table", id="tablepress-p-mars-no-2")
    b = a.find_all("tr")
    metric = list()
    value = list()
    for row in b:
        metric.append(row.find("td", class_="column-1").text)
        value.append(row.find("td", class_="column-2").text)

    mars_facts = pd.DataFrame({
        'Metric': metric,
        'Value': value
    },
                              columns=['Metric', 'Value'])
    mars_facts = mars_facts.set_index('Metric')
    # convert dataframe to html
    mars_facts_html = mars_facts.to_html()

    ###Part 4: Mars Hemispheres___________________________________________________________________________________________________________________________________________________________________________

    #Scrape page into soup
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")

    # Find the body of the page that includes the necessary links for the hemispheres
    link1 = browser.find_by_tag('div[id="product-section"]')
    h_count = 0
    hemisphere_image_urls = list()

    # Within the div: product-section, there are 4 class items. Each one corresponds to a different hemisphere
    while h_count < 4:

        # Initialize dictionary to store each hemisphere's information
        temp = dict()

        # Navigate to the hemisphere's section
        link2 = link1.find_by_tag('div[class="item"]')[h_count]
        link3 = link2.find_by_tag("a")
        link4 = link3.last
        link5 = link4.click()
        time.sleep(1)

        # Get the hemisphere name, store in a list
        hlink1 = browser.find_by_tag('section[class="block metadata"]')
        key = hlink1.find_by_tag('h2[class="title"]').text

        # Get the full size image url for each hemisphere
        hlink2 = browser.find_by_tag('div[id="wide-image"]')
        hlink3 = hlink2.find_by_tag('img[class="wide-image"]')
        value = hlink3['src']

        # Save the hemisphere's data in the temp dictionary, append the dictionary to the list
        temp[key] = value
        hemisphere_image_urls.append(temp)

        # Go back to the main page and start again for the next hemisphere
        link6 = browser.back()
        link1 = browser.find_by_tag('div[id="product-section"]')
        h_count += 1

    browser.quit()

    #Create a single dictionary of all the items
    # Create a single dictionary of the information

    mars_info = {
        "Recent_news_title": news_title,
        "Recent_news_description": news_description,
        "Featured_image_url": featured_image_url,
        "Mars_facts_html": mars_facts_html,
        "Hemispheres_images": hemisphere_image_urls
    }

    return (mars_info)
def scrape():

    #initialize the large dictionary to store all scraped information
    mars_dictionary = {}

    # create path and open browser window
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # establish url
    url = 'https://mars.nasa.gov/news/'

    # visit site
    browser.visit(url)

    #time delay 1 sec
    time.sleep(1)

    # create soup object
    soup = BeautifulSoup(browser.html,'html.parser')

    # find title for the latest one, which is the one in the first box
    news_title = soup.find_all('div',class_='content_title')
    news_title = news_title[1].text

    # pulling the text from the paragraph
    news_p = soup.find_all('div', class_= 'article_teaser_body')
    news_p = news_p[0].text

    #append the title and paragrapah text to the larger mars_dictionary
    mars_dictionary['current_title'] = news_title
    mars_dictionary['current_p'] = news_p

    #############################################################
    #Visit the site ('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    #and scrape for the current featured image (the full size version)
    #############################################################

    # establish url
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

    # visit site
    browser.visit(jpl_url)

    # grab page html
    jpl_html = browser.html

    # create soup object
    soup = BeautifulSoup(jpl_html,'html.parser')

    #Find the image at the top and click 
    target = 'a[class="group  cursor-pointer block"]'
    browser.find_by_tag(target).click()

    #time delay of 1 second
    time.sleep(1)

    # grab page html
    target_html = browser.html

    # create soup object
    soup = BeautifulSoup(target_html,'html.parser')
    
    #find all anchors and loop through to find href by using get_text for JPG
    anchors = soup.find_all('a')
    for a in anchors:
        if 'JPG' in a.get_text():
            featured_image_url=a['href']

    #append the featured image url to the larger mars_dictionary
    mars_dictionary['featured_image_url'] = featured_image_url

    #############################################################################
    #Visit the site ('https://space-facts.com/mars/') and scrape the table with 
    #the mars data and convert back to html
    ##############################################################################

    # establish url
    facts_url = 'https://space-facts.com/mars/'

    # visit site
    browser.visit(facts_url)

    #pull the table from the site
    tables =pd.read_html('https://space-facts.com/mars/')
    #pull the specific table for just the Mars data
    mars_tables = tables[0]
    #rename columns
    mars_tables.columns = ['Fact','Value']
    #convert dataframe back to html
    mars_tables_html = mars_tables.to_html()

    #append the mars_table to the larger mars_dictionary
    mars_dictionary['mars_tables_html'] =  mars_tables_html

    ################################################################
    # Visit the site ('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    #scrape  for the title and image url for each of the hemisphere  and create a list with 
    #a mini dictionary for each hemisphere
    ################################################################

    # establish url
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    # visit site
    browser.visit(hemi_url)

    # grab page html
    hemi_html = browser.html

    # create soup object
    soup = BeautifulSoup(hemi_html,'html.parser')

    #Find the all the titles to use for the click function 
    title_list = soup.find_all('div', class_='description')

    hemisphere_image_urls = []
    #loop through the title_list and pull title and imagine url
    for t in title_list:
        title = t.h3.text
        browser.find_by_text(title).click()

        # grab page html
        title_html = browser.html

        # create soup object
        soup = BeautifulSoup(title_html,'html.parser')

        image = soup.find_all('div', class_='downloads')
        image_url = image[0].li.a['href']

        #create mini-dictionary
        mini_dictionary = {'title': title, 'img_url':image_url}
        hemisphere_image_urls.append(mini_dictionary)

        # click back button
        browser.back()

    #quit browser
    browser.quit()

    #append the hemisphere list to the larger mars_dictionary
    mars_dictionary['hemisphere_image_urls'] = hemisphere_image_urls

    #returns the mars_dictionary
    return mars_dictionary
def run(filename=True):

    ########################################################################
    # read data

    #filename='MARTA Breeze card numbers.xlsx'
    #filename='MARTA Breeze card numbers_1.xls'
    dataset = pd.read_excel(filename, sheetname=0, header=None)

    ########################################################################
    # input params

    url = 'https://balance.breezecard.com/breezeWeb/jsp/web/cardnumberweb.jsp'

    columns = [
        'cardnumber', 'protected_balance', 'expiration_date', 'product_name',
        'remaining_rides', 'stored_value', 'pending_autoload_transactions'
    ]
    cardinformation = pd.DataFrame(columns=columns)

    n = dataset.shape[0]

    for i in range(n):

        cardnumber = dataset.ix[i].values[0]
        #cardnumer='0164 1487 1502 5743 2323'
        if (len(cardnumber) < 16):
            print "invalid card length: %s\n" % cardnumber
            temp_df = pd.DataFrame(
                [[cardnumber, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']],
                columns=columns)
            cardinformation = cardinformation.append(temp_df)
            continue

        browser = Browser('chrome')
        browser.visit(url)
        browser.fill('cardnumber', cardnumber)
        browser.find_by_name('submitButton').click()
        text = browser.find_by_tag('tr')
        # 2. breezecard bulk of information
        temp_txt = text[2].value.split('\n')

        if len(temp_txt) == 11:

            txt = [cardnumber]

            # 5. is our card balance protected?
            temp_val = temp_txt[2].split(':')[1]
            txt.append(temp_val)

            # 6. card expiration date
            temp_val = temp_txt[3].split(':')[1]
            txt.append(temp_val)

            # 7. product name
            temp_val = temp_txt[5]
            txt.append(temp_val)

            # 9. remaining rides
            temp_val = temp_txt[6]
            txt.append(temp_val)

            # 11. store value
            temp_val = temp_txt[7].split(':')[1]
            txt.append(temp_val)

            # pending autoload transactions
            temp_val = temp_txt[9]
            txt.append(temp_val)

            temp_df = pd.DataFrame([txt], columns=columns)
            cardinformation = cardinformation.append(temp_df)

        elif len(temp_txt) == 10:
            txt = [cardnumber]

            # 5. is our card balance protected?
            temp_val = temp_txt[2].split(':')[1]
            txt.append(temp_val)

            # 6. card expiration date
            temp_val = temp_txt[3].split(':')[1]
            txt.append(temp_val)

            # 7. product name
            temp_val = temp_txt[5]
            txt.append(temp_val)

            # 9. remaining rides
            temp_val = 0
            txt.append(temp_val)

            # 11. store value
            temp_val = temp_txt[6].split(':')[1]
            txt.append(temp_val)

            # pending autoload transactions
            temp_val = temp_txt[8]
            txt.append(temp_val)

            temp_df = pd.DataFrame([txt], columns=columns)
            cardinformation = cardinformation.append(temp_df)

        else:
            temp_df = pd.DataFrame(
                [[cardnumber, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']],
                columns=columns)
            cardinformation = cardinformation.append(temp_df)

        browser.quit()

    output = 'output_' + time.strftime("%H_%M_%S") + '.xlsx'
    cardinformation.to_excel(output, header=True, index=False)
Example #24
class HomePageWebTests(StaticLiveServerTestCase):

    def setUp(self):
        self.user1 = UserFactory.build()
        self.user1.set_password('abc')
        self.user1.save()
        self.browser = Browser()

    def tearDown(self):
        self.browser.quit()

    def login_helper(self, username, password):
        self.browser.visit('%s%s' % (self.live_server_url, '/accounts/login/'))

        self.browser.fill('username', username)
        self.browser.fill('password', password)
        self.browser.find_by_value('Log in').first.click()

    # Test 2
    # Check for login link from anonymous get of homepage
    def test_anon_login(self):
        self.browser.visit('%s%s' % (self.live_server_url, '/'))
        login_link = self.browser.find_by_tag('a')[2]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/login/'),
            login_link['href']
        )

    # Test 3
    # Check for register link from anonymous get of homepage
    def test_anon_register(self):
        self.browser.visit('%s%s' % (self.live_server_url, '/'))
        register_link = self.browser.find_by_tag('a')[3]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/register/'),
            register_link['href']
        )

    # Test 4
    # Check for user login success
    def test_login_success(self):
        self.login_helper(self.user1.username, 'abc')
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/profile/')
        )
        logout_link = self.browser.find_by_tag('a')[6]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/logout/?next=/'),
            logout_link['href']
        )
        greeting = self.browser.find_by_tag('h1')[0]
        self.assertEqual(
            '%s%s%s' % ('Well howdy there, ', self.user1.username, '!'),
            greeting.text
        )

    # Test 5
    # Check for user logout success
    def test_logout_success(self):
        self.login_helper(self.user1.username, 'abc')

        self.browser.find_by_tag('a')[6].click()

        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/')
        )

    # Test 6
    # Register brand new user
    def test_registration(self):
        self.browser.visit(
            '%s%s' % (self.live_server_url, '/accounts/register/')
        )

        self.browser.fill('username', 'joseph')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password1', '123')
        self.browser.fill('password2', '123')
        self.browser.find_by_value('Submit').first.click()

        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/accounts/register/complete/')
        )

        link_end = mail.outbox[0].body.split('days:')[1].split()[0][18:]
        link = '%s%s' % (self.live_server_url, link_end)
        self.browser.evaluate_script('document.location="%s"' % link)
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/accounts/activate/complete/')
        )
        self.login_helper('joseph', '123')
        greeting = self.browser.find_by_tag('h1')[0]
        self.assertEqual('Well howdy there, joseph!', greeting.text)
Example #25
# %%
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

# %%
### Featured Images

# %%
# Visit URL
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

# %%
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

# %%
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

# %%
# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

# %%
# Use the base URL to create an absolute URL
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
Example #26
    if browser.is_text_present('Congratulations. This browser is configured to use Tor.'):
        print "Yes, using TOR :)"
    else:
        print "No, not using TOR :("
        raise EnvironmentError('Not using TOR')
    browser.visit('https://tasonline.gotilc.com/GTPublicWeb/MainWeb/GageSelect.aspx')
    # assert_that(browser.is_text_present('Select Car Mark', wait_time=5))

    car_mark_select_list = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')
    assert_that(len(car_mark_select_list), equal_to(1))

    car_mark_select = car_mark_select_list[0]
    print car_mark_select
    assert_that(car_mark_select.tag_name, equal_to(u'select'))

    options = browser.find_by_tag('option')
    for option in options:
        car_mark_select.select(option.text)
        car_mark_filter = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')

        print option.text

    car_mark_select.select('ACTX')
    # assert browser.is_text_present('Loading...'), u'Text not found'
    assert_that(browser.is_text_not_present('Loading...', wait_time=6))

    car_mark_select_list = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')
    assert_that(len(car_mark_select_list), equal_to(1))

    car_mark_select = car_mark_select_list[0]
    assert_that(car_mark_select.tag_name, equal_to(u'select'))
Example #27
class OOP():
    def __init__(self):
        self.win = tk.Tk()
        self.win.geometry('700x700')
        self.win.title('Python GUI')
        self.create_settings()
        self.create_login()
        self.create_query()
        self.create_records()
        self.create_menu()

        self.valid_sites = []
        self.select_row = -1

        self.valid_td = []
        self.select_record_row = -1
        self.browser_state = 'log in'
        self.today = datetime.date.today()
        self.username = ''
        self.password = ''
        self.mobile = ''

    def create_menu(self):
        def msgbox():
            msg.showinfo(
                'Info Box',
                'This is a python GUI designed by Wenliang Zhang\n Use for fun : )'
            )

        menu_bar = Menu(self.win)
        self.win.config(menu=menu_bar)

        file_menu = Menu(menu_bar, tearoff=0)
        file_menu.add_command(label='About', command=msgbox)
        menu_bar.add_cascade(label='Menu', menu=file_menu)

    def create_settings(self):
        settings = ttk.LabelFrame(self.win, text='Settings')
        settings.grid(row=0, column=0, sticky='w' + 'e', padx=50, pady=10)

        driver_label = ttk.Label(settings,
                                 text='Driver name',
                                 width=15,
                                 anchor='center')
        driver_label.grid(row=0, column=0, padx=5, pady=5)

        self.driver_name = tk.StringVar()
        driver_option = ttk.Combobox(settings,
                                     textvariable=self.driver_name,
                                     width=17,
                                     state='readonly')
        driver_option['values'] = ('firefox', 'chrome')
        driver_option.current(0)
        driver_option.grid(row=0, column=1, padx=5)

    def log_in(self):

        try:
            driver = self.driver_name.get()
            self.browser = Browser(driver_name=driver, headless=True)
            self.browser.visit('https://elife.fudan.edu.cn/')
            self.browser.find_by_xpath("//div/input[@class='xndl']").click()
            self.browser.fill("username", self.username)
            self.browser.fill("password", self.password)
            self.browser.find_by_value(u'登录').click()
            self.note.configure(text=('Hello, ' + self.browser.find_by_xpath(
                "//div[@class='person_a']").first.text))
            self.search_button.configure(state='normal')
            self.search_button2.configure(state='normal')
            self.info_button1.configure(state='normal')
            self.info_button2.configure(state='normal')
            self.browser.cookies.all()
        except:
            self.note.configure(
                text='Failed, please check your input or Internet access')

    def search(self):

        self.browser_state = 'search'

        def select(event, row):
            self.select_row = int(row) - 1
            self.avail_scr.tag_raise('tag_all')
            self.avail_scr.tag_configure('tag_all',
                                         background='white',
                                         foreground='black')
            self.avail_scr.tag_raise('tag' + row)
            self.avail_scr.tag_configure('tag' + row,
                                         background='blue',
                                         foreground='white')

        self.avail_scr.configure(state='normal')
        self.avail_scr.delete('1.0', 'end')
        self.valid_sites = []
        self.select_row = -1

        urlcode = self.court_var.get()
        user_start_time = int(self.start_time.get()[0:2])
        user_end_time = int(self.end_time.get()[0:2])

        dtime = Timedict[self.date.get()]
        reserve_date = (self.today + timedelta(dtime, 0)).strftime('%Y-%m-%d')
        self.browser.visit(Urldict[urlcode] + '&currentDate=' + reserve_date)

        found_sites = self.browser.find_by_xpath(
            "//td[@class='site_td1']/font")
        sites = []
        for site in found_sites:
            if site.text != '':
                sites.append(site.text)
        has_reversed = self.browser.find_by_xpath(
            "//td[@class='site_td4']/font")
        all_for_reservation = self.browser.find_by_xpath(
            "//td[@class='site_td4']/span")

        if len(has_reversed) == 0:
            self.avail_scr.insert('insert', '您好,当天没有场地可以预约')
        else:

            for i in range(len(has_reversed)):
                site_time = int(sites[i][0:2])
                remain = int(all_for_reservation[i].text) - int(
                    has_reversed[i].text)
                if (site_time >= user_start_time) and (
                        site_time <= (user_end_time - 1)) and remain > 0:
                    self.valid_sites.append(i)

            if len(self.valid_sites) == 0:
                self.avail_scr.insert('insert', '该时段场地未开放或已预定完,请适当放宽筛选条件。')
            else:
                for valid_site_num in self.valid_sites:
                    self.avail_scr.insert(
                        'insert', Weekdict[int(
                            (self.today + timedelta(dtime, 0)).strftime('%w'))]
                        + ' ' + sites[valid_site_num] + ' ' +
                        namedict[urlcode] + ' \n')

                self.avail_scr.tag_add('tag_all', '1.0', 'end')

                self.avail_scr.tag_raise('tag_all')
                self.avail_scr.tag_configure('tag_all',
                                             background='white',
                                             foreground='black')  # reset the blue highlight back to white when refreshing

                for j in range(len(self.valid_sites)):
                    row = str(j + 1)
                    self.avail_scr.tag_add('tag' + row, row + '.0',
                                           row + '.end')
                    self.avail_scr.tag_bind(
                        'tag' + str(j + 1),
                        '<Button-1>',
                        lambda event, row=row: select(event, row))

        self.avail_scr.configure(state='disable')

    def make_appointment(self):
        dtime = Timedict[self.date.get()]
        reserve_date = (self.today + timedelta(dtime, 0)).strftime('%Y-%m-%d')
        url = Urldict[self.court_var.get()] + '&currentDate=' + reserve_date

        def wait_for_the_midnight():
            while ((self.today.strftime('%d')
                    == datetime.date.today().strftime('%d'))
                   or (datetime.datetime.now().hour < 6)):
                print('current time: ' +
                      str(datetime.datetime.now().hour).zfill(2) + ':' +
                      str(datetime.datetime.now().minute).zfill(2))
                sleep(1800)
            while ((datetime.datetime.now().hour +
                    datetime.datetime.now().minute / 60) < 6.8):
                sleep(300)
                print('current time: ' +
                      str(datetime.datetime.now().hour).zfill(2) + ':' +
                      str(datetime.datetime.now().minute).zfill(2))
            while (datetime.datetime.now().hour < 7):
                sleep(15)
                print('current time: ' +
                      str(datetime.datetime.now().hour).zfill(2) + ':' +
                      str(datetime.datetime.now().minute).zfill(2))

            try:
                self.browser.visit(url)
                self.browser.find_by_tag('img')[self.valid_sites[
                    self.select_row]].click()
                self.browser.fill('mobile', self.mobile)
                self.browser.find_by_value(u' 预 约 ').click()
                self.note2.configure(text='Job done')
            except:
                print("重新登陆")
                self.browser.visit('https://elife.fudan.edu.cn/')
                self.browser.find_by_xpath(
                    "//div/input[@class='xndl']").click()
                self.browser.fill("username", self.username)
                self.browser.fill("password", self.password)
                self.browser.find_by_value(u'登录').click()
                self.browser.cookies.all()
                print("登陆成功")
                self.browser.visit(url)
                self.browser.find_by_tag('img')[self.valid_sites[
                    self.select_row]].click()
                self.browser.fill('mobile', self.mobile)
                self.browser.find_by_value(u' 预 约 ').click()
                self.note2.configure(text='Job done')
                print("抢票成功")

        if self.select_row == -1 or self.browser_state != 'search':
            if self.browser_state != 'search':
                self.note2.configure(text='Please update the search result')

            if self.select_row == -1:
                self.note2.configure(text='Please choose a court first')

        else:
            if Timedict[self.date.get()] <= 2:
                try:
                    self.browser.visit(url)
                    self.browser.find_by_tag('img')[self.valid_sites[
                        self.select_row]].click()
                    self.browser.fill('mobile', self.mobile)
                    self.browser.find_by_value(u' 预 约 ').click()
                except:
                    self.note2.configure(text='You cannot book the court')
                else:
                    self.note2.configure(text='Job done')

            else:
                confirm_msg = msg.askokcancel(
                    '提示', '确定执行抢场功能吗,这可能需要一点时间。(场地晚上12点刷新,请保持程序运行)')
                if confirm_msg == True:
                    self.note2.configure(
                        text='Job has been queued, hold on please.')
                    _thread.start_new_thread(wait_for_the_midnight, ())

    def update(self):
        self.browser_state = 'record'
        self.record_scr.configure(state='normal')
        self.record_scr.delete('1.0', 'end')
        self.valid_td = []
        self.select_record_row = -1

        def select(event, row):
            self.select_record_row = int(row) - 1
            self.record_scr.tag_raise('tag_all')
            self.record_scr.tag_configure('tag_all',
                                          background='white',
                                          foreground='black')
            self.record_scr.tag_raise('tag' + row)
            self.record_scr.tag_configure('tag' + row,
                                          background='blue',
                                          foreground='white')

        self.valid_td = []
        self.browser.visit(
            'https://elife.fudan.edu.cn/public/userbox/index.htm?userConfirm=&orderstateselect='
        )
        record_tr_num = len(
            self.browser.find_by_xpath("//table[@class='table3']/tbody/tr"))
        record_td = self.browser.find_by_xpath(
            "//table[@class='table3']/tbody/tr/td")
        for i in range(record_tr_num):
            if record_td[5 + 7 * i].text == '待签到':
                self.valid_td.append(i)

        if len(self.valid_td) != 0:
            for j in self.valid_td:
                valid_record_name = record_td[
                    3 + 7 * j].text + '  ' + record_td[
                        4 + 7 * j].text + '  ' + record_td[2 +
                                                           7 * j].text + '\n'
                self.record_scr.insert('insert', valid_record_name)

            self.record_scr.tag_add('tag_all', '1.0', 'end')

            self.record_scr.tag_raise('tag_all')
            self.record_scr.tag_configure('tag_all',
                                          background='white',
                                          foreground='black')  # reset the blue highlight back to white when refreshing

            for p in range(len(self.valid_td)):
                row = str(p + 1)
                self.record_scr.tag_add('tag' + row, row + '.0', row + '.end')
                self.record_scr.tag_bind(
                    'tag' + row,
                    '<Button-1>',
                    lambda event, row=row: select(event, row))
        self.record_scr.configure(state='disable')

    def cancel(self):
        self.browser.visit(
            'https://elife.fudan.edu.cn/public/userbox/index.htm?userConfirm=&orderstateselect='
        )
        if self.select_record_row != -1:
            self.browser.find_by_xpath("//table[@class='table3']/tbody/tr/td")[
                6 + (self.valid_td[self.select_record_row]) * 7].click()
            self.browser.get_alert().accept()

            self.record_scr.configure(state='normal')
            self.record_scr.tag_raise('tag_all')
            self.record_scr.tag_configure('tag_all',
                                          background='white',
                                          foreground='black')  # reset the blue highlight back to white when refreshing
            self.record_scr.delete('1.0', 'end')
            self.record_scr.insert('insert', '取消预约成功,请刷新')
            self.record_scr.configure(state='disable')
        else:
            self.record_scr.configure(state='normal')
            self.record_scr.tag_raise('tag_all')
            self.record_scr.tag_configure('tag_all',
                                          background='white',
                                          foreground='black')  # reset the blue highlight back to white when refreshing
            self.record_scr.delete('1.0', 'end')
            self.record_scr.insert('insert', '您未选择需要取消的预约,请刷新后重试')
            self.record_scr.configure(state='disable')

    def create_login(self):
        def confirm():
            login_button_1.configure(text='modify', command=modify)
            student_ID_enter.configure(state='readonly')
            mobile_enter.configure(state='readonly')
            password_enter.configure(state='readonly')
            self.username = student_ID_var.get()
            self.password = password_var.get()
            self.mobile = mobile_var.get()

        def modify():
            login_button_1.configure(text='OK', command=confirm)
            student_ID_enter.configure(state='normal')
            mobile_enter.configure(state='normal')
            password_enter.configure(state='normal')

        login = ttk.LabelFrame(self.win, text=' Log in')
        login.grid(row=1, column=0, padx=50, pady=10, sticky='w' + 'e')

        student_ID_label = ttk.Label(login,
                                     text='Student ID',
                                     width=15,
                                     anchor='center')
        student_ID_label.grid(row=0, column=0, padx=5, pady=5)
        student_ID_var = tk.StringVar()
        student_ID_enter = ttk.Entry(login,
                                     textvariable=student_ID_var,
                                     width=20)
        student_ID_enter.grid(row=0, column=1, padx=5)

        password_label = ttk.Label(login,
                                   text='Password',
                                   width=15,
                                   anchor='center')
        password_label.grid(row=1, column=0, padx=5, pady=5)
        password_var = tk.StringVar()
        password_enter = ttk.Entry(login, textvariable=password_var)
        password_enter.grid(row=1, column=1, padx=5)

        mobile_label = ttk.Label(login,
                                 text='Mobile',
                                 width=15,
                                 anchor='center')
        mobile_label.grid(row=2, column=0, pady=5, padx=5)
        mobile_var = tk.StringVar()
        mobile_enter = ttk.Entry(login, textvariable=mobile_var)
        mobile_enter.grid(row=2, column=1, padx=5)

        img = Image.open(r"./logo.jpg")
        global tk_img
        tk_img = ImageTk.PhotoImage(img)
        logo_frame = tk.Label(login, image=tk_img)
        logo_frame.grid(row=0,
                        column=3,
                        rowspan=3,
                        columnspan=5,
                        padx=30,
                        pady=5)

        login_button_1 = ttk.Button(login,
                                    text='OK',
                                    command=confirm,
                                    width=10)
        login_button_1.grid(row=3, column=6, pady=10, padx=20, sticky='e')

        login_button_2 = ttk.Button(login,
                                    text='Log in',
                                    command=self.log_in,
                                    width=10)
        login_button_2.grid(row=3, column=7, pady=10, padx=5)

        self.note = ttk.Label(login, text='Please verify your identity')
        self.note.grid(row=3, column=0, padx=100, columnspan=4, sticky='w')

    def create_query(self):
        query = ttk.LabelFrame(self.win, text=' Query ')
        query.grid(row=2, column=0, padx=50, pady=10, sticky='w' + 'e')

        court_label = ttk.Label(query, text='Court', width=14, anchor='center')
        court_label.grid(row=0, column=0, padx=5, pady=5)
        self.court_var = tk.IntVar()
        self.court_var.set(0)
        courtRad1 = ttk.Radiobutton(query,
                                    text='正大',
                                    variable=self.court_var,
                                    value=0,
                                    width=8)
        courtRad1.grid(column=1, row=0, sticky='w', padx=5)
        courtRad2 = ttk.Radiobutton(query,
                                    text='北区',
                                    variable=self.court_var,
                                    value=1,
                                    width=8)
        courtRad2.grid(column=2, row=0, sticky='w')
        courtRad3 = ttk.Radiobutton(query,
                                    text='江湾',
                                    variable=self.court_var,
                                    value=2,
                                    width=8)
        courtRad3.grid(column=1, row=1, sticky='w', padx=5)

        date_label = ttk.Label(query, text='Day of Week', anchor='center')
        date_label.grid(row=2, column=0, padx=5, pady=5)
        self.date = tk.StringVar()
        date_option = ttk.Combobox(query,
                                   textvariable=self.date,
                                   width=17,
                                   state='readonly')
        date_option['values'] = ('Today', 'Tomorrow', '+2', '+3(rush mode)')
        #        date_option.current(int((datetime.date.today()).strftime('%w')))
        date_option.current(1)  # default to 'Tomorrow'
        date_option.grid(row=2, column=1, padx=5, columnspan=2)

        start_time_label = ttk.Label(query, text='Start time', anchor='center')
        start_time_label.grid(row=3, column=0, padx=5, pady=5)
        self.start_time = tk.StringVar()
        start_time_option = ttk.Combobox(query,
                                         textvariable=self.start_time,
                                         width=17,
                                         state='readonly')
        start_time_option['values'] = ('08:00', '09:00', '10:00', '11:00',
                                       '12:00', '13:00', '14:00', '15:00',
                                       '16:00', '17:00', '18:00', '19:00',
                                       '20:00', '21:00')
        start_time_option.current(0)
        start_time_option.grid(row=3, column=1, padx=5, columnspan=2)

        end_time_label = ttk.Label(query, text='End time', anchor='center')
        end_time_label.grid(row=4, column=0, padx=5, pady=5)
        self.end_time = tk.StringVar()
        end_time_option = ttk.Combobox(query,
                                       textvariable=self.end_time,
                                       width=17,
                                       state='readonly')
        end_time_option['values'] = ('09:00', '10:00', '11:00', '12:00',
                                     '13:00', '14:00', '15:00', '16:00',
                                     '17:00', '18:00', '19:00', '20:00',
                                     '22:00')
        end_time_option.current(12)
        end_time_option.grid(row=4, column=1, padx=5, columnspan=2)

        avail_label = ttk.Label(query, text='Available', width=10)
        avail_label.grid(row=0, column=3, padx=30, pady=5, sticky='w')

        self.avail_scr = scrolledtext.ScrolledText(query, width=33, height=7)
        self.avail_scr.grid(row=1,
                            column=3,
                            padx=30,
                            pady=5,
                            rowspan=4,
                            columnspan=4)
        self.avail_scr.bind('<Enter>',
                            self.avail_scr.configure(cursor='arrow'))
        self.avail_scr.configure(state='disable')

        self.search_button = ttk.Button(query,
                                        text='Search',
                                        width=10,
                                        command=self.search,
                                        state='disable')
        self.search_button.grid(row=5, column=4, padx=15, pady=10, sticky='w')

        self.search_button2 = ttk.Button(query,
                                         text='Reserve',
                                         command=self.make_appointment,
                                         width=10,
                                         state='disable')
        self.search_button2.grid(row=5, column=6, padx=30, pady=10, sticky='w')

        self.note2 = ttk.Label(query, text='Please set your preference')
        self.note2.grid(row=5, column=0, padx=100, columnspan=4, sticky='w')

    def create_records(self):
        info = ttk.LabelFrame(self.win, text=' Info ')
        info.grid(row=3, column=0, padx=50, pady=10, sticky='w' + 'e')
        record_label = ttk.Label(info,
                                 text='Records',
                                 width=14,
                                 anchor='center')
        record_label.grid(row=0, column=0, padx=5, pady=5, sticky='w')

        self.record_scr = tk.Text(info, width=50, height=3)
        self.record_scr.grid(row=1,
                             column=0,
                             padx=50,
                             pady=10,
                             rowspan=2,
                             columnspan=2,
                             sticky='e')
        self.record_scr.bind('<Enter>',
                             self.record_scr.configure(cursor='arrow'))
        self.record_scr.configure(state='disable')

        self.info_button1 = ttk.Button(info,
                                       text='Update',
                                       width=10,
                                       command=self.update,
                                       state='disable')
        self.info_button1.grid(row=1, pady=5, padx=10, column=2)

        self.info_button2 = ttk.Button(info,
                                       text='Cancel',
                                       width=10,
                                       command=self.cancel,
                                       state='disable')
        self.info_button2.grid(row=2, pady=5, column=2)
Exemple #28
0
class ScoresWebTests(StaticLiveServerTestCase):

    def setUp(self):
        self.user1 = UserFactory.build()
        self.user1.set_password('abc')
        self.user1.save()

        self.user2 = UserFactory.build()
        self.user2.set_password('123')
        self.user2.save()

        self.userscore1 = UserScores(
            user=self.user1,
            wpm_gross=110,
            wpm_net=100,
            mistakes=8
        )
        self.userscore1.save()

        self.userscore2 = UserScores(
            user=self.user2,
            wpm_gross=100,
            wpm_net=90,
            mistakes=10
        )
        self.userscore2.save()

        self.match = Matches(winner=self.userscore1, loser=self.userscore2)
        self.match.save()

        self.browser = Browser()

    def tearDown(self):
        self.browser.quit()

    def login_helper(self, username, password):
        self.browser.visit(
            '%s%s' % (self.live_server_url, '/accounts/login/')
        )
        self.browser.fill('username', username)
        self.browser.fill('password', password)
        self.browser.find_by_value('Log in').first.click()

    # Test 11
    # Check anon get of /scores/
    def test_anon_get_scores(self):
        self.browser.visit('%s%s' % (self.live_server_url, '/scores/'))
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/accounts/login/?next=/scores/')
        )

    # Test 12
    # Check anon get of /scores/match_score
    def test_anon_get_match_score(self):
        self.browser.visit('%s%s' % (
            self.live_server_url,
            '/scores/match_score')
        )
        self.assertEqual(
            self.browser.url,
            '%s%s' % (
                self.live_server_url,
                '/accounts/login/?next=/scores/match_score'
            )
        )

    # Test 13
    # Check scores for user
    def test_user_for_scores(self):
        self.login_helper(self.user1.username, 'abc')
        self.browser.visit('%s%s' % (self.live_server_url, '/scores/'))
        self.assertEqual(
            self.browser.find_by_tag('strong')[2].text, self.user1.username
        )
        self.assertEqual(
            self.browser.find_by_tag('strong')[3].text, str(
                self.userscore1.wpm_net
            )
        )
        self.assertEqual(
            self.browser.find_by_tag('strong')[4].text, self.user2.username
        )
        self.assertEqual(
            self.browser.find_by_tag('strong')[5].text, str(
                self.userscore2.wpm_net
            )
        )
def scrape():

    mars_data = {}
    # browser = init_browser()
    # mars_dict = {}
    #import pdb;pdb.set_trace()
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # # NASA Mars News
    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #News Title
    news_title = soup.find('div', class_="bottom_gradient").text
    print(news_title)
    #Paragraph text
    news_p = soup.find('div', class_='article_teaser_body').text
    # print('--------------------------------------------------')
    print(news_p)

    # Add the news title and summary to the dictionary
    mars_data["news_title"] = news_title
    mars_data["new_p"] = news_p

    # # Featured Image
    #import pdb; pdb.set_trace()
    Image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(Image_url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting image
    Image_path = soup.find('figure', class_='lede').a['href']
    featured_image_url = 'https://www.jpl.nasa.gov/' + Image_path
    print(featured_image_url)

    # Add the featured image url to the dictionary
    mars_data["featured_image_url"] = featured_image_url

    # # Mars Weather
    mars_tweet = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_tweet)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting tweet
    mars_weather = soup.find('div',
                             class_='js-tweet-text-container').text.replace(
                                 '\n', '')
    print(mars_weather)

    # Add the weather to the dictionary
    mars_data["mars_weather"] = mars_weather

    # #  Mars Facts
    mars_fact = 'https://space-facts.com/mars/'
    browser.visit(mars_fact)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Extracting mars table
    # grab every table row from the facts table
    trs = soup.find_all('tr')
    #set up lists to hold td elements which alternate between label and value
    labels = []
    values = []

    #for each tr element append the first td element to labels and the second to values
    for tr in trs:
        td_elements = tr.find_all('td')
        labels.append(td_elements[0].text)
        values.append(td_elements[1].text)
    print(labels, values)

    mars_fact_table = pd.DataFrame({"Label": labels, "Values": values})
    # mars_fact_table

    # convert the data to a HTML table string
    fact_table = mars_fact_table.to_html(header=False, index=False)
    print(fact_table)

    # Add the Mars facts table to the dictionary
    mars_data["mars_table"] = fact_table

    # # Mars Hemispheres
    USGS_site = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    browser.visit(USGS_site)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Get the div element that holds the images.
    images = soup.find('div', class_='collapsible results')
    #Loop through the class="item" by clicking the h3 tag and getting the title and url.

    hemispheres_image_urls = []

    # print(len(images.find_all("div", class_="item")))
    for i in range(len(images.find_all("div", class_="item"))):
        # print(i)
        time.sleep(5)
        image = browser.find_by_tag('h3')
        image[i].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find("h2", class_="title").text
        # print(title)
        div = soup.find("div", class_="downloads")
        # for li in div:
        link = div.find('a')
        # print(link)
        url = link.attrs['href']

        # print(url)
        hemispheres = {'title': title, 'img_url': url}
        hemispheres_image_urls.append(hemispheres)
        browser.back()

    print(hemispheres_image_urls)

    # Add the hemispheres data to the dictionary
    mars_data["hemispheres_image_urls"] = hemispheres_image_urls

    # Return the dictionary
    return mars_data
conn = sqlite3.connect('petfinder.db')
c = conn.cursor()
# c.execute('''CREATE TABLE dogs
#         (id text, name text,age text, breed text, animal text, shelterId text, sex text, website text)''')
# c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
zippers = [90001, 90211, 90608]
# url = "https://zogzmiller.github.io/"
for z in zippers:
    browser = Browser('chrome')
    conn = sqlite3.connect('petfinder.db')
    c = conn.cursor()
    browser.visit('https://zogzmiller.github.io/')
    browser.fill('zip', z)
    browser.find_by_id('submitZip').click()
    time.sleep(10)
    listings = browser.find_by_tag('li')

    for i in listings:
        dogs = []
        text = i.text.split('^ ')
        for x in range(8):
            value = text[x].split(': ')[1]
            if '"' in value:
                standin = 'Doug Funny'
                dogs.append(standin)
            else:
                dogs.append(value)
        # use a parameterized query (8 columns) instead of building the SQL string by hand
        c.execute('INSERT INTO dogs VALUES (?,?,?,?,?,?,?,?)', dogs)
    # commit after each zip code so the inserted rows are actually saved
    conn.commit()
def scrape():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    mars = {}

    #-----------NASA MARS NEWS---------------
    url = "https://redplanetscience.com/"
    browser.visit(url)
    # Parse the results HTML with BeautifulSoup

    html = browser.html
    news_soup = BeautifulSoup(html, "html.parser")

    #News
    news_title = news_soup.find("div", class_="content_title").text
    news_p = news_soup.find("div", class_="article_teaser_body").text
    mars["news_titles"] = news_title
    mars["news_p"] = news_p

    #-------------JPL MARS SPACE IMAGES - FEATURED IMAGE-------------
    url = "https://spaceimages-mars.com/"
    browser.visit(url)
    browser.find_by_tag("button")[1].click()
    html = browser.html
    jplimage = BeautifulSoup(html, "html.parser")
    image = jplimage.find('img', class_="fancybox-image").get('src')
    image
    featimgurl = "https://spaceimages-mars.com/" + image
    featimgurl
    mars["featured_image_url"] = featimgurl

    #-------------- MARS FACTS---------------
    url = "https://galaxyfacts-mars.com/"
    df = pd.read_html(url)
    df = df[0]
    df
    df.columns = ["Description", "Mars", "Earth"]
    df.set_index("Description", inplace=True)
    df
    df_html = df.to_html()
    df_html
    mars["facts"] = df_html

    #----------- MARS HEMISPHERES---------------
    url = "https://marshemispheres.com/"
    browser.visit(url)
    result = browser.find_by_css("a.product-item img")

    hemisphere_image_url = []

    for i in range(len(result)):
        hemisphere = {}
        browser.find_by_css("a.product-item img")[i].click()
        element = browser.links.find_by_text('Sample').first
        img_url = element["href"]
        hemisphere["img_url"] = img_url
        hemisphere["title"] = browser.find_by_css("h2.title").text
        hemisphere_image_url.append(hemisphere)
        browser.back()
    hemisphere_image_url
    mars["hemispheres"] = hemisphere_image_url

    # close the browser
    browser.quit()

    # Return one python dictionary containing all of the scraped data
    mars = {'news_title': news_title}
    mars['news_p'] = news_p
    mars['featured_image_url'] = featimgurl
    mars['facts'] = df_html
    mars['hemispheres'] = hemisphere_image_url

    return mars
Exemple #32
0
class ChopeBrowser:
    def __init__(self, headless=False):
        self.chrome = Browser('chrome', headless=headless)

    def time_delay(self, time):
        # Crude delay: wait for an element name that never exists, so splinter
        # blocks for up to `time` seconds before returning.
        self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!',
                                               wait_time=time)

    def login(self, usr, pwd, domain='STUDENT'):
        url = 'https://ntupcb.ntu.edu.sg'
        url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs'
        self.chrome.visit(url)
        dropdown = self.chrome.find_by_tag('option')

        for option in dropdown:
            if option.text == domain:
                option.click()

        self.chrome.fill('Username', usr)
        self.chrome.fill('Password', pwd + '\n')

    def first_setup(self):
        button = self.chrome.find_by_id('tdFacilityBook')
        button.click()
        self.chrome.click_link_by_href('#8')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69')
        self.chrome.click_link_by_id('book')
        self.chrome.click_link_by_id('changeResource')
        self.chrome.click_link_by_href('#-1')
        self.chrome.click_link_by_id('book')

    def is_registered(self, event):
        if event.has_class('noShowWhite'):
            return False
        if event.has_class('currentEvent'):
            return False
        return True

    def check_facility(self, evFacilities):
        columnWeek = self.chrome.find_by_css('.wc-event-column')
        evWeek = []
        for columnDay in columnWeek:
            evToday = []
            evList = columnDay.find_by_css('.ui-corner-all')
            for event in evList:
                if not event.has_class('noShowWhite'):
                    if not event.has_class('currentEvent'):
                        event = event.text
                        if not event.find('—') == -1:
                            if event == '':
                                continue
                            evToday.append(event.split('—'))
            evWeek.append(evToday)
        evFacilities.append(evWeek)

    def click_next(self, counter, evFacilities):
        # Works recursively with check_facility:
        # selects the facility option indicated by counter.
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        if counter < len(options):
            nextOption = options[counter]
            nextOption.click()
            self.check_facility(evFacilities)
            return self.click_next(counter + 1, evFacilities)
        else:
            return evFacilities

    def scrape_seats(self, usr, pwd):
        self.login(usr, pwd)
        self.first_setup()
        evFacilities = []
        dropdown = self.chrome.find_by_id('ResourceId')
        options = dropdown.find_by_tag('option')
        for opt in options:
            nextOption = opt
            nextOption.click()
            self.time_delay(0.1)
            # while loadingTitle.visible:
            #     pass
            evFacilities.append(opt.text)
            self.check_facility(evFacilities)
        return evFacilities

    def quit(self):
        self.chrome.quit()
Exemple #33
0
def scrape():
    # Create a library that holds all the Mars' Data
    mars_library = {}

    #### PART 1: NASA Mars News ####
    # Initiate ChromeDriver
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Target URL
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Create a Beautiful Soup object
    html = browser.html
    soup = bs(html, "html.parser")

    # Latest News Title and Paragraph
    news_title = soup.find("div", class_="content_title").text
    news_paragraph = soup.find("div", class_="article_teaser_body").text
    print(f"Title: {news_title}")
    print(f"Para: {news_p}")

    # put infos into Library
    mars_library['news_title'] = news_title
    mars_library['news_paragraph'] = news_paragraph

    #### PART 2: JPL Mars Space Images ####

    # Target URL
    image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(image_url)

    # Create a Beautiful Soup object
    html = browser.html
    soup2 = bs(html, "html.parser")

    # Featured Image
    image_url_route = soup2.find_all(
        'a', class_='fancybox')[0].get('data-fancybox-href').strip()

    # Full Address
    featured_image_url = "https://www.jpl.nasa.gov/" + image_url_route
    print(featured_image_url)

    # put infos into Library
    mars_library['featured_image_url'] = featured_image_url

    #### PART 3: Mars Weather ####

    # Target Twitter URL
    twitter_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(twitter_url)

    # Create a Beautiful Soup object
    html = browser.html
    soup3 = bs(html, "html.parser")

    # Latest Mars Weather Tweet
    mars_tweet = soup3.find_all(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    )[0].text
    print(mars_tweet)

    # put infos into Library
    mars_library['mars_weather'] = mars_tweet

    #### PART 4: Mars Fact ####

    # Target URL
    url = "https://space-facts.com/mars/"

    # Import URL to Panda
    table = pd.read_html(url)

    mars_fact_df = table[0]
    mars_fact_df.columns = ["Category", "Stats"]
    mars_fact_df = mars_fact_df.set_index(["Category"])

    # Exporting as HTML Table
    mars_fact_html = mars_fact_df.to_html()
    mars_fact_html = mars_fact_html.replace("\n", "")

    mars_fact_df.to_html('mars_fact_df.html')

    # Put infos into Library
    mars_library['mars_facts'] = mars_fact_html

    #### PART 5: Mars Hemispheres ####

    # Target URL
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    # Use splinter to loop through the 4 images and load them into a dictionary
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_hemis = []

    # loop through the four tags and load the data to the dictionary

    for i in range(4):
        time.sleep(5)
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial
        dictionary = {"title": img_title, "img_url": img_url}
        mars_hemis.append(dictionary)
        browser.back()

    # Put infos into Library
    mars_library['hemisphere_image_urls'] = mars_hemis

    # Return Library
    return mars_library
#https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars

# In[20]:

usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(usgs_url)

# In[21]:

usgs_html = browser.html
usgs_soup = BeautifulSoup(usgs_html, 'html.parser')
mars_hemis = []

# In[22]:

for i in range(4):
    images = browser.find_by_tag('h3')
    images[i].click()
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    partial = soup.find("img", class_="wide-image")["src"]
    img_title = soup.find("h2", class_="title").text
    img_url = 'https://astrogeology.usgs.gov' + partial
    dictionary = {"title": img_title, "img_url": img_url}
    mars_hemis.append(dictionary)
    browser.back()

# In[23]:

print(mars_hemis)
Exemple #35
0
def scrape():
    #Scrape the NASA Mars News Site and assign to variables for later reference
    url = "https://mars.nasa.gov/news/"
    page = requests.get("https://mars.nasa.gov/news/")
    soup = BeautifulSoup(page.text, "html.parser")

    # format the HTML content nicely using the prettify method on the BeautifulSoup object
    print(soup.prettify())
    html = list(soup.children)[2]
    #Scrape title
    news_title = html.find("title").get_text()
    paragraphs = soup.find_all("p")
    for paragraph in paragraphs:
        new_p = paragraph.text
#set up splinter and navigate to the site
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)

    # extracting the current featured image
    browser.find_by_id("full_image").click()
    featured_image_url = browser.find_by_css(".fancybox-image").first["src"]
    #Mars Weather
    url_mars = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_mars)
    tweet_text = browser.find_by_css(".tweet-text")
    for tweet in tweet_text:
        if tweet.text.partition(" ")[0] == "Sol":
            mars_weather = tweet.text
            break


#Mars Facts
    url_mars_fact = "https://space-facts.com/mars/"
    browser.visit(url_mars_fact)
    tables = pd.read_html(url_mars_fact)
    df = tables[0]

    mars_df = df.set_index(0).rename(columns={1: "Value"})
    mars_df.index.names = ["Planet Profile"]

    # converting to html data
    mars_facts = mars_df.to_html()
    # strip unwanted newlines to clean up the table.
    mars_facts = mars_facts.replace('\n', '')
    # print the table to check it reads cleanly
    print(mars_facts)
    #Mars Hemispheres
    url_mars_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_mars_hemi)
    Cerberus = browser.find_by_tag("h3")[0].text
    Schiaparelli = browser.find_by_tag("h3")[1].text
    Syrtis = browser.find_by_tag("h3")[2].text
    Valles = browser.find_by_tag("h3")[3].text
    browser.find_by_css(".thumb")[0].click()
    Cerberus_img = browser.find_by_text("Sample")["href"]
    browser.back()
    browser.find_by_css(".thumb")[1].click()
    Schiaparelli_img = browser.find_by_text("Sample")["href"]
    browser.back()
    browser.find_by_css(".thumb")[2].click()
    Syrtis_img = browser.find_by_text("Sample")["href"]
    browser.back()
    browser.find_by_css(".thumb")[3].click()
    Valles_img = browser.find_by_text("Sample")["href"]
    browser.back()

    hemisphere_image_urls = [{
        'title': Cerberus,
        'img_url': Cerberus_img
    }, {
        'title': Schiaparelli,
        'img_url': Schiaparelli_img
    }, {
        'title': Syrtis,
        'img_url': Syrtis_img
    }, {
        'title': Valles,
        'img_url': Valles_img
    }]
    hemisphere_image_urls

    mars_data = {
        "news_title": news_title,
        "news_p": new_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    return mars_data
Exemple #36
0
def scrape():

    # Pointing to the directory where chromedriver exists
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)


    ### NASA Mars News
    # There may be a delay at run time; wait up to a few seconds
    # URL of page to be scraped
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)
    # Create a Beautiful Soup object
    html1= browser.html
    soup1 = bs(html1, 'html.parser')
    # type(soup1)

    news_title = soup1.find("div",class_="content_title").text
    news_paragraph = soup1.find("div", class_="article_teaser_body").text
    print(f"* TITLE: {news_title}\n")
    print(f"* PARAGRAPH: {news_paragraph}\n")



    ### JPL Mars Space Images - Featured Image
    # There may be a delay at run time; wait up to a few seconds
    # URL of page to be scraped
    url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)
    # Finding id full_image 
    browser.find_by_id("full_image").click()
    time.sleep(5)  
    # Create a Beautiful Soup object
    html2= browser.html
    soup2 = bs(html2, 'html.parser')
    #type(soup2)

    # Setting featured_image_url
    img_url = soup2.find('img', class_='fancybox-image')['src']
    # print(img_url)
    featured_image_url = "https://www.jpl.nasa.gov" + img_url
    print(f"* FEATURED IMAGE URL: {featured_image_url}\n")



    ### Mars Weather
    # There may be a delay at run time; wait up to a few seconds
    url3 = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url3)
    html3= browser.html
    soup3 = bs(html3, 'html.parser')
    #type(soup3)

    # Store the latest match for class_='TweetTextSize  TweetTextSize--normal js-tweet-text tweet-text
    mars_weather = soup3.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text                    
    print(f"* MARS WEATHER: {mars_weather}\n")



    ### Mars Facts
    # There may be a delay at run time; wait up to a few seconds
    url4 = "http://space-facts.com/mars/"
    browser.visit(url4)
    html4= browser.html
    soup4 = bs(html4, 'html.parser')
    #type(soup4)

    mars_facts = pd.read_html(url4)
    # mars_facts

    df_mars_facts = mars_facts[0]
    df_mars_facts.columns = ['Mars_Profile', 'Mars_ProfileValue']
    df_mars_facts.set_index('Mars_Profile', inplace=True)
    # df_mars_facts

    # mars_facts_html =df_mars_facts.to_html("mars_facts.html",justify='left')
    mars_facts_html =df_mars_facts.to_html(justify='left')
    print(f"* MARS FACTS HTML: {mars_facts_html}\n")    
    # !open mars_facts.html



    ### Mars Hemispheres
    # There may be a delay at run time; wait up to a few seconds
    url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url5)
    html5= browser.html
    soup5 = bs(html5, 'html.parser')
    #type(soup5)

    # Create an empty list for storing image URLs and titles
    mars_hemisphere_dict = []
    for i in range(4):   # we have 4 images
        time.sleep(5)
        imgs = browser.find_by_tag('h3')   # looking for all h3 tags where we have to click
        imgs[i].click()
        html5 = browser.html
        soup5 = bs(html5, 'html.parser')
        url_part = soup5.find("img", class_="wide-image")["src"]
        title = soup5.find("h2",class_="title").text
        iurl = 'https://astrogeology.usgs.gov'+ url_part
        mars_dict={"title": title,"img_url":iurl}
        mars_hemisphere_dict.append(mars_dict)
        browser.back()

    print(f"* MARS HEMISPHERE: {mars_hemisphere_dict}\n")

    # Consolidating all scraped data into one dictionary.
     # mars_mission_data = {
    #     'LATEST_MARS_NEWS_TITLE': news_title,
    #     'LATEST_MARS_NEWS_TEXT' : news_paragraph,
    #     'MARS_FEATURED_IMAGE'   : featured_image_url,
    #     'MARS_WEATHER'          : mars_weather,
    #     'MARS_FACTS'            : mars_facts_html,
    #     'MARS_HEMISPHERE'       : mars_hemisphere_dict
    # }
    mars_mission_data = {
        'news_title'            : news_title,
        'news_paragraph'        : news_paragraph,
        'featured_image_url'    : featured_image_url,
        'mars_weather'          : mars_weather,
        'mars_facts_html'       : mars_facts_html,
        'mars_hemisphere_dict'  : mars_hemisphere_dict
    }
    print(f"** MARS MISSION DATA : {mars_mission_data}\n")
    return mars_mission_data
Exemple #37
0
def scrape():
    from bs4 import BeautifulSoup
    from splinter import Browser
    import requests
    import os
    import pandas as pd
    import lxml.html as LH
    import time

    # In[2]:

    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # In[3]:

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)

    # In[4]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    news_title = soup.find('div', class_='content_title').text

    news_p = soup.find('div', class_="article_teaser_body").text

    # In[5]:

    print(news_title)

    # In[6]:

    # news_p=news_p.text
    print(news_p)

    # In[7]:

    url_jpl = 'https://www.jpl.nasa.gov'
    url_pic = url_jpl + '/spaceimages/?search=&category=Mars'
    browser.visit(url_pic)

    # In[8]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # In[9]:

    test = soup.find_all('a', class_="fancybox")

    # In[10]:

    image = test[1].get('data-fancybox-href')

    # In[11]:

    featured_image_url = url_jpl + image

    print(featured_image_url)

    # In[12]:

    # for link in soup.find_all('a', class_="fancybox"):
    #     print(link.get('data-fancybox-href'))

    # In[13]:

    url_tweet = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_tweet)

    # In[14]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # In[15]:

    #tweets = [p.text for p in soup.find_al('p', class="tweet-text")]

    # In[16]:

    tweets = soup.find_all('p', class_="tweet-text")

    # In[17]:

    mars_weather = tweets[0].text

    # In[18]:

    facts_url = "https://space-facts.com/mars/"

    # In[19]:

    facts_df = pd.read_html(facts_url, header=None, index_col=None)

    # In[20]:

    facts_df = facts_df[0]

    # In[21]:

    facts_df.columns = ['Fact', 'Data']

    # In[22]:

    table = facts_df.to_html()

    # In[23]:

    table

    # In[24]:

    hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere_url)

    # In[25]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_hemis = []

    # In[ ]:

    # In[26]:

    for x in range(4):
        time.sleep(5)
        title = browser.find_by_tag('h3')
        title[x].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        link = soup.find("img", class_="wide-image")["src"]
        hem_title = soup.find('h2', class_="title").text
        mars_hemis.append({
            "img_url": "https://astrogeology.usgs.gov" + link,
            "title": hem_title
        })
        browser.back()

    # In[27]:

    mars_hemis

    # In[28]:

    mars_dict = {
        'hemis_pics': mars_hemis,
        'table': table,
        'weather': mars_weather,
        'feature_pic': featured_image_url,
        'title': news_title,
        'paragraph': news_p
    }
    return mars_dict
from splinter import Browser
import time

myseed = []
try:
	with open('seed.txt') as seed:
		myseed = seed.readlines()
except IOError as err:
	print("File error: ", str(err))

browser1 = Browser('chrome')
browser1.visit('http://app.scientificseller.com/keywordtool')
#browser1.reload()
browser1.find_by_tag('textarea').fill(''.join(myseed))  # join the seed lines into a single string
browser1.find_by_tag('button').click()
browser1.find_by_tag('input').fill('fafafa')
browser1.find_by_tag('button').click()

volume = 0
try:
	with open('volume.txt') as v:
		vol = v.readlines()
		volume = vol.pop()
#		print(volume)
except IOError as err:
	print('File error: ',str(err))

loop = True
while loop:
#	print(browser1.find_by_tag('strong').value + '\n')
	time.sleep(5)    
def scrape():

    mars_dict = {}

    # Mars News URL of page to be scraped
    url = 'https://mars.nasa.gov/news/'
    html = requests.get(url).text
    title_soup = BeautifulSoup(html, 'html.parser')
    # Retrieve the latest news title and paragraph
    news_title = title_soup.find('div', class_='content_title').text
    news_par = title_soup.find('div', class_='rollover_description_inner').text

    # Mars Image to be scraped
    browser = Browser('chrome', headless=False)
    mars_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(mars_image_url)
    time.sleep(1)
    # Move to a second page
    target1 = "a[class='button fancybox']"
    browser.find_by_css(target1).click()
    # Move to next page
    browser.find_by_text('more info     ').click()
    # Move to next page with image url
    target2 = "figure[class='lede']"
    browser.find_by_css(target2).click()
    time.sleep(1)
    image_soup = BeautifulSoup(browser.html, 'html.parser')
    image_link = image_soup.find('img', src=True)
    # Retrieve featured image link
    featured_image_url = image_link['src']
    browser.quit()

    time.sleep(1)
    # Mars weather to be scraped
    mars_weather_url = 'https://twitter.com/MarsWxReport?lang=en'
    browser = Browser('chrome', headless=False)
    browser.visit(mars_weather_url)
    time.sleep(3)
    weather_soup = BeautifulSoup(browser.html, 'html.parser')
    # Retrieve latest tweet with Mars weather info
    mars_weather = weather_soup.find(
        "div",
        class_=
        "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"
    ).text
    browser.quit()

    time.sleep(1)
    # Mars facts to be scraped, converted into html table
    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts_table = pd.read_html(mars_facts_url)
    mars_facts = mars_facts_table[2]
    mars_facts.columns = ["Description", "Value"]
    mars_html_table = mars_facts.to_html()
    mars_html_table = mars_html_table.replace('\n', '')

    time.sleep(1)
    # Mars hemisphere name and image to be scraped
    usgs_url = 'https://astrogeology.usgs.gov'
    hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser = Browser('chrome', headless=False)
    browser.visit(hemispheres_url)
    time.sleep(3)
    hemispheres_html = browser.html
    pics_soup = BeautifulSoup(hemispheres_html, 'html.parser')
    # Mars hemispheres products data
    all_mars_hemispheres = pics_soup.find('div', class_='collapsible results')
    mars_hemispheres = all_mars_hemispheres.find_all('div', class_='item')
    hemisphere_image_urls = []
    # Iterate through each hemisphere data
    time.sleep(3)
    for i in mars_hemispheres:
        # Collect Title
        hemisphere = i.find('div', class_="description")
        title = hemisphere.h3.text
        # Collect image link by browsing to hemisphere page
        hemisphere_link = hemisphere.a["href"]
        browser.visit(usgs_url + hemisphere_link)
        image_html = browser.html
        image_soup = BeautifulSoup(image_html, 'html.parser')
        image_link = image_soup.find('div', class_='downloads')
        image_url = image_link.find('li').a['href']
        # Create Dictionary to store title and url info
        image_dict = {}
        image_dict['title'] = title
        image_dict['img_url'] = image_url
        hemisphere_image_urls.append(image_dict)
        time.sleep(1)

    # Build the Mars dictionary once every hemisphere has been collected
    mars_dict = {
        "news_title": news_title,
        "news_par": news_par,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "fact_table": str(mars_html_table),
        "hemisphere_images": hemisphere_image_urls
    }

    return mars_dict
Exemple #40
0
def scrape():
    # Open browser
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    # URL of page to be scraped
    url = 'https://mars.nasa.gov/news'
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    url3 = 'https://twitter.com/marswxreport?lang=en'
    url4 = 'https://space-facts.com/mars/'
    url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    # Requests of the URLs
    news_request = requests.get(url)
    #spaceimages_request = requests.get(url2)
    #twitter_request = requests.get(url3)
    #facts_request = requests.get(url4)
    #hemisphere_request = requests.get(url5)

    # First Scraping
    news_request.text
    news = bs(news_request.text, "html.parser")

    ##Get the Title of the page
    TITLE = news.find("title")
    TITLE = TITLE.get_text().replace('\n', '').strip()

    ##Get the title of the first news
    content_title = news.find(class_="content_title")
    news_title = content_title.get_text().replace('\n', '').strip()

    ## Get the description of the first news.
    rollover_description_inner = news.find(class_="rollover_description_inner")
    news_p = rollover_description_inner.get_text().replace('\n', '').strip()

    # Second Scraping
    browser.visit(url2)
    browser.find_by_id('full_image').click()
    html = browser.html
    soup = bs(html, "html.parser")

    featured_image_url = soup.find("article",
                                   class_="carousel_item").get('style')
    featured_image_url = featured_image_url.split("'")[1]
    featured_image_url = f"https://www.jpl.nasa.gov{featured_image_url}"

    # Third Scraping
    browser.visit(url3)
    time.sleep(6)
    html = browser.html
    soup = bs(html, "html.parser")

    mars_weather = soup.find(text=re.compile('InSight'))

    #mars_weather = soup.find_all(class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0")
    #mars_weather = mars_weather[23].text

    # Fourth Scraping
    tables = pd.read_html(url4)
    tables[0]
    HTML_TABLE = tables[0].to_html()
    HTML_TABLE = bs(HTML_TABLE, "html.parser")

    # Fifth Scraping
    browser.visit(url5)
    browser.find_by_tag('h3')[0].click()
    html = browser.html
    soup = bs(html, "html.parser")

    ##First image data
    url5_1 = soup.find_all('li')
    url5_11 = url5_1[0]
    url5_11 = url5_11.a["href"]
    url5_12 = soup.find('h2', class_="title").text

    ##Second image data
    browser.visit(url5)
    browser.find_by_tag('h3')[1].click()
    html = browser.html
    soup = bs(html, "html.parser")
    url5_2 = soup.find_all('li')
    url5_21 = url5_2[0]
    url5_21 = url5_21.a["href"]
    url5_22 = soup.find('h2', class_="title").text

    ##Third image data
    browser.visit(url5)
    browser.find_by_tag('h3')[2].click()
    html = browser.html
    soup = bs(html, "html.parser")
    url5_3 = soup.find_all('li')
    url5_31 = url5_3[0]
    url5_31 = url5_31.a["href"]
    url5_32 = soup.find('h2', class_="title").text

    ##Fourth image data
    browser.visit(url5)
    browser.find_by_tag('h3')[3].click()
    html = browser.html
    soup = bs(html, "html.parser")
    url5_4 = soup.find_all('li')
    url5_41 = url5_4[0]
    url5_41 = url5_41.a["href"]
    url5_42 = soup.find('h2', class_="title").text

    hemisphere_image_urls = [
        {
            "title": url5_12,
            "img_url": url5_11
        },
        {
            "title": url5_22,
            "img_url": url5_21
        },
        {
            "title": url5_32,
            "img_url": url5_31
        },
        {
            "title": url5_42,
            "img_url": url5_41
        },
    ]

    listings = {}
    listings["title"] = TITLE
    listings["newsH"] = news_title
    listings["newsP"] = news_p
    listings["featuredImg"] = featured_image_url
    listings["weather"] = mars_weather
    #listings["HTMLtable"] = HTML_TABLE
    #listings["HemisphereDic"] = hemisphere_image_urls
    listings["url5_11"] = url5_11
    listings["url5_12"] = url5_12
    listings["url5_21"] = url5_21
    listings["url5_22"] = url5_22
    listings["url5_31"] = url5_31
    listings["url5_32"] = url5_32
    listings["url5_41"] = url5_41
    listings["url5_42"] = url5_42

    browser.quit()
    return listings
Exemple #41
0
class SplinterBrowserDriver(BaseBrowserDriver):
    """
        This is a BrowserDriver for splinter
        (http://splinter.cobrateam.info)
        that implements the BaseBrowserDriver API.

        To use it, you must have splinter installed on your env.

        Splinter itself is a browser driver that supports multiple browsing
        technologies such as selenium, phantomjs, zope, etc.
    """

    driver_name = 'splinter'

    def __init__(self):
        super(SplinterBrowserDriver, self).__init__()
        if not splinter_available:
            raise ImportError(
                "In order to use splinter Base Driver you have to install it. "
                "Check the instructions at http://splinter.cobrateam.info")
        self._browser = Browser(config.default_browser)

    def _handle_empty_element_action(self, element):
        if not element:
            raise ActionNotPerformableException(
                "The action couldn't be perfomed because the element couldn't "
                "be found; Try checking if your element"
                "selector is correct and if the page is loaded properly.")

    @property
    def page_url(self):
        return self._browser.url

    @property
    def page_source(self):
        return self._browser.html

    @property
    def page_title(self):
        return self._browser.title

    def open_url(self, url):
        self._browser.driver.get(url)

    def quit(self):
        return self._browser.quit()

    def is_element_visible(self, element):
        return element.visible

    def get_element_text(self, element):
        return element.text

    def get_element_by_xpath(self, selector):
        return self._browser.find_by_xpath(selector)

    def get_element_by_css(self, selector):
        return self._browser.find_by_css(selector)

    def get_element_by_id(self, selector):
        return self._browser.find_by_id(selector)

    def get_element_by_tag(self, selector):
        return self._browser.find_by_tag(selector)

    @element_action
    def type(self, element, text, slowly=False):
        return element.type(text, slowly)

    @element_action
    def fill(self, element, text):
        return element.fill(text)

    @element_action
    def clear(self, element):
        self.fill(element, '')

    @element_action
    def click(self, element):
        return element.click()

    @element_action
    def check(self, element):
        return element.check()

    @element_action
    def uncheck(self, element):
        return element.uncheck()

    @element_action
    def mouse_over(self, element):
        return element.mouse_over()

    @element_action
    def mouse_out(self, element):
        return element.mouse_out()

    def reload(self):
        return self._browser.reload()

    def go_back(self):
        return self._browser.back()

    def go_forward(self):
        return self._browser.forward()

    def execute_script(self, script):
        return self._browser.evaluate_script(script)

    def get_iframe(self, iframe_id):
        return self._browser.get_iframe(iframe_id)

    def get_alert(self):
        return self._browser.get_alert()

    def attach_file(self, input_name, file_path):
        return self._browser.attach_file(input_name, file_path)

    def wait_pageload(self, timeout=30):
        wait_interval = 0.05
        elapsed = 0

        while self.execute_script('document.readyState') != 'complete':
            self.wait(wait_interval)
            elapsed += wait_interval

            if elapsed > timeout:
                raise PageNotLoadedException

    def click_and_wait(self, element, timeout=30):
        self.click(element)
        self.wait_pageload(timeout)
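
# Hedged usage sketch for SplinterBrowserDriver above. The URL and CSS
# selectors are made-up placeholders, and `config.default_browser` is assumed
# to be configured as required by __init__.
def example_search_flow():
    driver = SplinterBrowserDriver()
    driver.open_url('http://example.com/search')          # placeholder URL
    driver.wait_pageload()
    box = driver.get_element_by_css('input[name="q"]')    # placeholder selector
    driver.fill(box, 'splinter')
    submit = driver.get_element_by_css('button[type="submit"]')
    driver.click_and_wait(submit)
    title = driver.page_title
    driver.quit()
    return title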
			browser.type('scEffDate', '2014-10-31')
			browser.find_by_name('update').first.click()

			browser.find_link_by_text('Obligor').first.click()

			# choose the companyType type
			element = browser.find_by_name('companyType').first
			element.select(str(cType))

			browser.fill('obligorName', countryList[conIndex])

			browser.find_by_name('ObligorSearch').first.click()

			if cType == 0 or cType == 1:
				# browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[1].find_by_xpath('//tr[td[@text="'+kmvCountryList[conIndex]+'"]]').first.find_by_tag('a')[0].click()
				browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[1].find_by_xpath('//tbody/tr[td[text()[contains(.,"' + kmvCountryList[conIndex] + '")]]]')[0].find_by_tag('a')[0].click()
			else:
				# element = browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_xpath('//tbody/tr[td[text()[contains(.,"' + kmvCountryList[conIndex] + '")]]]')[0].find_by_tag('a')[0].click()
				elementList = browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_tag('tr')
				for element in elementList:
					if kmvCountryList[conIndex] in element.text:
						element.find_by_tag('a')[0].click()
						break


			element = browser.find_by_name('counterPartyType').first
			element.select('1')

			element = browser.find_by_name('avc').first
			element.select('4')
Exemple #43
0
returnValue = messageBox(None,"Do you want to copy scenario, obligor or facility?","Copy scenario/obligor/facility",0x40 | 0x1)

while returnValue == 1:

	# browser.fill('clientSearchString', 'jason\'s client')
	# browser.find_by_name('search').first.click()
	# browser.find_by_value('GO').first.click()

	waitNavigation = ctypes.windll.user32.MessageBoxW(0, "Now navigate the scenario/obligor/facility page and then click OK.", "", 0)

	if waitNavigation == 2:
		break
	# noddd = browser.find_by_tag('form').first.find_by_tag('td').first.find_by_tag('table').first

	noddd = browser.find_by_tag('form').first

	noddStr = noddd.html.replace('\n','\t').replace('\'','\\\'')

	scenarioName = browser.find_by_xpath('//body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[@class="rowHeader"]')[2].text

	if noddStr.find('Scenario Status') != -1:

		file = open('scenario_' + scenarioName + '.xml', 'w+')
		file.write(noddStr)
		file.close()


	elif noddStr.find('Correlation Information') != -1:

		file = open('obligor_' + scenarioName + '.xml', 'w+')
class UploadTestCase(unittest.TestCase):

  def setUp(self):
    self.testbed = testbed.Testbed()
    self.testbed.activate()
    self.testbed.init_datastore_v3_stub()
    self.testbed.init_memcache_stub()
    self.browser = Browser('chrome')

  def tearDown(self):
    self.testbed.deactivate()

  def test_when_create_task_upload_file(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]', wait_time=10)

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()


    self.assertEqual(1, len(self.browser.find_by_css('.template-download.fade.in')))
    self.assertEqual(4, len(self.browser.find_by_css('.template-download.fade.in td')))

  def test_when_create_task_upload_many_files(self):
    #login
    self.browser.visit("http://127.0.0.1:8080/")
    self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
    self.browser.find_by_id("submit-login").first.click()
    self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    self.browser.visit("http://127.0.0.1:8080/tasks")

    self.browser.click_link_by_text('Create new task')

    self.browser.fill('title', 'title')
    self.browser.fill('text', 'text')

    self.browser.is_element_present_by_name('files[]')

    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))
    self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png'))

    #self.browser.attach_file('files[]', 'test/1.png')
    self.browser.find_by_css('.btn.btn-primary.start').first.click()
    sleep(3)

    self.assertEqual(3, len(self.browser.find_by_css('.files tr.template-download')))
browser.visit(url)
print u'You now have 20s to log in with your jaccount'
sleep(15)
print u'Please wait, this program is a bit slow. While you wait, you can open WeChat ^_^'
browser.visit('http://electsys.sjtu.edu.cn/edu/student/elect/warning.aspx?xklc=1&lb=3')
button=browser.find_by_id('CheckBox1')
if (browser.is_element_not_present_by_id('CheckBox1')):
	pass
else:
	button.click()
	browser.find_by_id('btnContinue').click()

# Get all course codes
pattern=re.compile(r'[A-Z]{2}[0-9]{3}')
classlist=[]
for ele in browser.find_by_tag('td'):
	if (re.match(pattern,ele.text)):
		classlist.append(ele.text)

# Open WeChat
wechaturl='http://wechat.shwilling.com/auth/qrcode/login?redirect=http%3A%2F%2Fwechat.shwilling.com%2Fsjtu%2Fcourse'
browser.visit(wechaturl)
print u'You now have 20s to scan the QR code and confirm login'
sleep(10)
print u'Please wait, this program is a bit slow... but the wait is worth it.'
myfile=open(u'scorelist.txt','w')
for classid in classlist:
	time=['/2014-2015-1','/2014-2015-2','/2015-2016-1']
	for i in range(3):
		class_str='http://wechat.shwilling.com/sjtu/course/detail/'+classid+time[i]
		browser.visit(class_str)
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def testLoginCorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")

    def testLoginIncorrect(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/login")
    
    
    def testAddEditPost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Edit Post')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/edit")
        self.browser.fill("title", "Edited First Post")
        self.browser.fill("content", "Hello Universe!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(self.browser.find_by_tag('h1').first.value, "Edited First Post")
        #divs = self.browser.find_by_tag("div")
        #myList = []
        #if "Hello Universe!" in divs:
            #myList.append("Hello Universe!")
        #self.assertEqual(myList[0], "Hello Universe!")
    
    def testAddDeletePost(self):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.visit('http://127.0.0.1:5000/post/add')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add")
        self.browser.fill("title", "First Post")
        self.browser.fill("content", "Hello World!")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.browser.click_link_by_text('Delete Post')
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/delete")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")
        self.assertEqual(len(self.browser.find_by_tag('h1')),0)
        #divs = self.browser.find_by_tag("div")
        #myList = []
        #if "Hello Universe!" in divs:
            #myList.append("Hello Universe!")
        #self.assertEqual(myList[0], "Hello Universe!")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
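
    # Hedged refactor sketch (not part of the original class): the tests above
    # repeat the same login steps, which could be factored into a helper like
    # this and called at the top of each test.
    def _login(self, email="*****@*****.**", password="test"):
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", email)
        self.browser.fill("password", password)
        self.browser.find_by_css("button[type=submit]").first.click()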
Exemple #47
0
def splinter(url):
    browser = Browser('chrome')
    #login 126 email websize
    browser.visit(url)
    #wait web element loading
    time.sleep(1)

    ##input username
    browser.find_by_tag('input')[0].fill('username')

    ##input password
    browser.find_by_tag('input')[1].fill('password')

    time.sleep(10)
    print('click now!')

    browser.find_by_id('loginSub').click()

    time.sleep(20)
    print('sleep')

    browser.find_by_xpath('//*[@id="selectYuding"]/a').first.click()

    time.sleep(0.5)

    print(browser.cookies.all())

    browser.cookies.add({'_jc_save_toStation': '%u90D1%u5DDE%2CZZF'})
    browser.cookies.add({'_jc_save_fromStation': '%u4E0A%u6D77%2CSHH'})

    browser.cookies.add({'_jc_save_fromDate': '2018-02-13'})
    browser.reload()
    print(browser.cookies.all())

    browser.find_by_xpath('//*[@id="query_ticket"]').first.click()

    time.sleep(0.5)

    target_line = [
        'G1952', 'G3600', 'G1866', 'G1806', 'G1920', 'G1924', 'G1956', 'G1810',
        'G1928', 'G1814', 'G1932', 'G1818'
    ]

    flag = False
    while True:

        a_list = browser.find_by_tag('a')

        for a in a_list:

            if not a['onclick']:
                continue

            if 'checkG1234' in a['onclick']:
                L = a['onclick'].split(',')
                s = L[2]
                line = re.findall(r'G[0-9]{4}', s)

                if (len(line) != 0):
                    line_nb = line[0]

                    print(s, '-----', line_nb)
                    if line_nb in target_line:

                        s = s.strip("'")
                        id_target = 'ZE_' + s
                        print(id_target)
                        print(id_target.strip("'"))

                        available = browser.find_by_xpath(
                            f'//*[@id="{id_target}"]/div')
                        print(available.text)

                        if (available.text == '无'):
                            continue
                        print(type(available.text))

                        print(line_nb)
                        print(a.text)
                        print('***')
                        a.click()
                        flag = True
                        break

        if flag == True:
            print('found')
            break
        else:
            print('not found')

        browser.reload()
        browser.find_by_xpath('//*[@id="query_ticket"]').first.click()

    browser.find_by_xpath('//*[@id="normalPassenger_0"]').first.click()
    browser.find_by_xpath('//*[@id="normalPassenger_4"]').first.click()
    browser.find_by_xpath('//*[@id="submitOrder_id"]').first.click()

    browser.click_link_by_id('query_ticket')

    time.sleep(8)
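
# Hedged helper sketch distilled from the polling loop above: given a booking
# link's onclick string, pull out the G-number and check it against the wanted
# trains. The function name and structure are illustrative, not original code.
import re  # re is already used above; repeated here so the sketch stands alone

def pick_train(onclick, target_line):
    parts = onclick.split(',')
    if len(parts) < 3:
        return None
    found = re.findall(r'G[0-9]{4}', parts[2])
    if found and found[0] in target_line:
        return found[0]
    return None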
Exemple #48
0
    def getCVContacts(self, cvID):
        cvID = "85798642"  # 不公开
        cvID = "307661274"
        #         cvID = "6098724"
        #         cvID = "318657201"
        from splinter import Browser
        browser = Browser()
        url = "http://ehire.51job.com/MainLogin.aspx"
        browser.visit(url)
        time.sleep(1)
        browser.find_by_id('txtMemberNameCN').fill(u'安能聚业')
        browser.find_by_id('txtUserNameCN').fill(u'上海安能聚创供应链')
        browser.find_by_id('txtPasswordCN').fill('aneqc888')

        browser.find_by_id('Login_btnLoginCN').click()
        time.sleep(1)
        browser.find_by_tag('a').click()
        selector = etree.HTML(browser.html)
        cvDownableNum = selector.xpath('//span[@id ="Navigate_AvalidResumes"]/a/b')[0].text
        if cvDownableNum == "0":
            self.log.fatal("id:%s can not be down, as to cvDownableNum == 0." % (cvID))
            browser.quit()
            return "0\n\ncvDownableNum is 0."

        browser.find_by_id('hlResumeSearch').click()
        browser.find_by_id('txtUserID').fill(cvID)
        time.sleep(1)
        browser.find_by_id('btnSearchID_leftbtnSearchID').click()

        cvTarget = browser.find_by_xpath('//tr/td/p/span/a[@target="_blank"]')
        if len(cvTarget) == 0:
            self.log.fatal("can not find the cv from this id:%s." % (cvID))
            browser.quit()
            return "0\n\ncan not find the cv from this id."
        cvTarget.click()
        allwindows = browser.windows
        browser.driver.switch_to_window(allwindows[-1].name)
        UndownloadLink = browser.find_by_id('UndownloadLink')
        if len(UndownloadLink) != 0:
            UndownloadLink.click()
            time.sleep(1)
            browser.find_by_id('btnCommonOK').click()
        selector = etree.HTML(browser.html)
        contents = browser.html.encode("utf-8")
        winNum = len(allwindows)
        for i in range(winNum):
            allwindows[winNum - 1 - i].close()
        browser.quit()
        lines = selector.xpath('//title')
        name = ""
        if len(lines) != 0:
            name = lines[0].text.strip()
        try:
            phone = \
            re.findall(re.compile('''<td height="20">电 话:</td><td height="20" colspan="3">(.*?)<span'''), contents)[0]
        except:
            phone = "not supplied"

        try:
            eMail = \
            re.findall(re.compile('''E-mail:</td><td height="20" colspan="3"><a href="mailto:(.*?)" class="blue">'''),
                       contents)[0]
        except:
            eMail = "not supplied"

        if not isinstance(name, unicode):
            name = name.decode("utf-8")
        if not isinstance(phone, unicode):
            phone = phone.decode("utf-8")

        result = "1\n\nname:%s\tphone:%s\teMail:%s" % (name, phone, eMail)
        self.log.fatal(result)
        return result
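
    # Hedged companion sketch: getCVContacts above returns either "0\n\n<reason>"
    # or "1\n\nname:...\tphone:...\teMail:...". A small parser for that format
    # (illustrative only, not part of the original class) could look like this.
    def parseCVContactResult(self, result):
        status, _, payload = result.split("\n", 2)
        if status != "1":
            return {"ok": False, "reason": payload}
        fields = dict(item.split(":", 1) for item in payload.split("\t"))
        return {"ok": True,
                "name": fields.get("name", ""),
                "phone": fields.get("phone", ""),
                "eMail": fields.get("eMail", "")}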
Exemple #49
0
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 15 19:14:26 2021
Downloads BIOS driver for ASRock Z490 Phantom Gaming-ITX/TB3
@author: Eric
"""
# Import Splinter and Chromedriver
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import time

# Wait for 5 seconds
time.sleep(5)

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
# =============================================================================
url = 'https://www.asrock.com/MB/Intel/Z490%20Phantom%20Gaming-ITXTB3/index.asp#BIOS'
browser.visit(url)
bios_box = browser.find_by_tag('tbody')
bios_box.find_by_tag("a")[1].click()

# Wait for 20 seconds so download completes before window closes
time.sleep(20)
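
# Hedged alternative to the fixed five-second sleep above: splinter can wait
# for the BIOS table to appear before clicking. This targets the same second
# <a> link the snippet above clicks; the helper name is illustrative only.
def click_bios_download(browser, wait_time=15):
    # Wait for the BIOS table instead of relying on a fixed sleep, then click
    if browser.is_element_present_by_tag('tbody', wait_time=wait_time):
        browser.find_by_tag('tbody').find_by_tag('a')[1].click()
        return True
    return False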
Exemple #50
0
parser.read('config.ini')

browser = Browser(parser.get('Config', 'Browser'))
browser.driver.maximize_window()

browser.visit('https://fsweb.no/studentweb/login.jsf?inst=' +  parser.get('Config', 'Institution'))
browser.find_by_text('Norwegian ID number and PIN').first.click()

browser.find_by_id('login-box')
browser.fill('j_idt129:j_idt131:fodselsnummer', parser.get('Config', 'Fodselsnummer'))
browser.fill('j_idt129:j_idt131:pincode',  parser.get('Config', 'Pin'))
browser.find_by_text('Log in').first.click()

browser.click_link_by_href('/studentweb/resultater.jsf')

tags = browser.find_by_tag('tr')

chars = []

for tag in tags:
	if tag.has_class('resultatTop') or tag.has_class('none'):
		inner_tags = tag.find_by_tag('td')
		course_id = inner_tags[1].text.split("\n")[0]
		course_name = inner_tags[1].text.split("\n")[1]
		grade = inner_tags[5].text
		if grade != 'passed':
			chars.append(grade) 
			print "%s\t%-30s\t%s" % (course_id, course_name, grade)

total = 0.0
for char in chars:
Exemple #51
0
def scrape():
    # Open a blank window of Google Chrome.
    chrome_exec_shim = os.environ.get("GOOGLE_CHROME_BIN", "chromedriver")
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "")
    print("google chrome bin = %s" % chrome_exec_shim)
    print("chromedriver_path = %s" % chromedriver_path)
    if (chrome_exec_shim):
        # chrome_options = Options()
        # chrome_options.binary_location = chrome_exec_shim
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--no-sandbox')
        # driver = webdriver.Chrome(executable_path=chromedriver_path, chrome_options=chrome_options)
        # self.selenium = webdriver.Chrome(executable_path=chrome_exec_shim)
        exec_path = {'executable_path': chromedriver_path}

    browser = Browser("chrome", headless=True, **exec_path)

    mars_facts_data = {}

    # Visit the NASA newspage using the blank Chrome window.
    nasa_news_url = "https://mars.nasa.gov/news/"
    browser.visit(nasa_news_url)

    # Get html code from the site and convert it into json.
    html = browser.html
    soup = bs(html, "html.parser")

    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="article_teaser_body").text
    mars_facts_data['news_title'] = news_title
    mars_facts_data['news_paragraph'] = paragraph_text
    # JPL Mars Space Images - Featured Image

    # Visit the JPL site which includes the featured image and extract the html code.
    #     jpl_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    #     browser.visit(jpl_image_url)

    #     html = browser.html
    #     soup = bs(html,"html.parser")

    #     featured_image_url = soup.find('a', {'id': 'full_image', 'data-fancybox-href': True}).get('data-fancybox-href')

    #     split_url = featured_image_url.split('/')

    #     pia_url = split_url[-1]

    #     base_image_url = "https://photojournal.jpl.nasa.gov/jpeg/"

    #     pia_final = pia_url.split('_')[0]+'.jpg'

    #     full_image_url = base_image_url + pia_final
    #     mars_facts_data["featured_image_url"] = full_image_url
    browser.visit(
        'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    browser.find_by_id('full_image').click()
    featured_image_url = browser.find_by_css('.fancybox-image').first['src']

    mars_facts_data['featured_image_url'] = featured_image_url

    # Mars Weather
    mars_weather_twitter_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(mars_weather_twitter_url)

    html = browser.html
    soup = bs(html, "html.parser")

    mars_weather = soup.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    mars_facts_data["mars_weather"] = mars_weather
    # Mars Facts
    mars_facts_url = "https://space-facts.com/mars/"

    mars_facts_tb1 = pd.read_html(mars_facts_url)[0]
    mars_facts_tb1.columns = ['Physical Properties', 'Values']
    mars_html_table = mars_facts_tb1.to_html(justify='left',
                                             index=False).replace('\n', '')
    mars_facts_data["mars_facts_table"] = mars_html_table

    # Mars Hemispheres
    mars_hemi_urls = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemi_urls)

    html = browser.html
    soup = bs(html, "html.parser")

    #Loop through the class="item" by clicking the h3 tag and getting the title and url.
    images = soup.find('div', class_='collapsible results')
    mars_hemi_urls = []

    for i in range(len(images.find_all("div", class_="item"))):
        time.sleep(5)
        image = browser.find_by_tag('h3')
        image[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        title = soup.find("h2", class_="title").text
        div = soup.find("div", class_="downloads")
        link = div.find('a')
        url = link.attrs['href']
        hemispheres = {'title': title, 'img_url': url}
        mars_hemi_urls.append(hemispheres)
        browser.back()
    mars_facts_data["mars_hemispheres"] = mars_hemi_urls
    browser.quit()
    return mars_facts_data
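
# Hedged usage sketch for the scrape() above: if this snippet were its own
# module, it could be run directly and a few of the returned keys inspected
# (illustrative only, not part of the original file).
if __name__ == "__main__":
    data = scrape()
    print(data["news_title"])
    print(data["mars_weather"])
    print(len(data["mars_hemispheres"]), "hemisphere entries")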
def scrape():
    driverPath = get_ipython().getoutput('which chromedriver')
    executable_path = {'executable_path': driverPath[0]}
    browser = Browser('chrome', **executable_path, headless=False)

    # In[164]:

    url = 'https://mars.nasa.gov/news'
    browser.visit(url)

    # In[166]:

    # Create BeautifulSoup object after the news page has loaded
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Find first article name and teaser text, then store them
    article_list = soup.find(class_='slide')
    news_title = article_list.find('h3').text
    news_p = article_list.find(class_='article_teaser_body').text
    # Check that it works
    print(news_title)
    print(news_p)

    # In[131]:

    space_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(space_image)
    image = browser.find_by_tag('article')

    # In[132]:

    featured_image_url = browser.find_by_id('full_image')

    # In[133]:

    #Splinter to pull tweet
    tweet_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(tweet_url)

    # In[134]:

    timeline = browser.find_by_id("timeline")
    timeline.click()

    # In[135]:

    # Python to scrape facts
    facts_url = requests.get('https://space-facts.com/mars/')
    bsfacts = BeautifulSoup(facts_url.text)
    print(bsfacts)

    # In[136]:

    facts_table = bsfacts.find(id='text-2').text
    print(facts_table)

    # In[148]:

    # In[141]:

    img_find = browser.find_by_css('img')
    img_click = img_find.first.find_by_css('.thumb')

    # In[7]:

    import requests
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    driver = webdriver.Chrome()
    driver.get(
        'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    )

    # In[27]:

    imgElement = driver.find_element_by_class_name('thumb')
    imgElement.click()
    # After the click the driver is already on the detail page, so keep using it
    downloads = driver.find_element_by_class_name('downloads')
    image = downloads.find_element_by_tag_name('a').get_attribute('href')
Exemple #53
0
# convert to soup
html = browser.html
hemisphere_soup = soup(html, 'html.parser')

hemisphere_elem = hemisphere_soup.find('div', class_='collapsible results')

# In[274]:

# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.
for i in range(len(hemisphere_elem.find_all('div', class_='item'))):
    # Use a fresh dict per hemisphere and click the i-th thumbnail
    hemisphere = {}
    full_image_elem = browser.find_by_css('img.thumb')[i]
    full_image_elem.click()
    # Parse the resulting html with soup
    html = browser.html
    download_soup = soup(html, 'html.parser')

    downloads_div = download_soup.find('div', class_='downloads')
    hemisphere['img_url'] = downloads_div.find('li').a.get('href')
    hemisphere['title'] = download_soup.find('h2', class_='title').get_text()
    hemisphere_image_urls.append(hemisphere)
    # Go back to the results page before the next hemisphere
    browser.back()

# In[275]:

hemisphere_image_urls
	
content = []	# Open file with list of BandCamp albums to download
with open(list_file) as f:
    content = f.readlines()

browser = Browser('phantomjs')
for album in content:
	url = album.replace("\n","")
	print('URL: '+album)
	browser.visit(url)
	#time.sleep(3)
	artistName = browser.find_by_id('band-name-location')[0].value.split('\n')[0] 	# Get artist name from top-right
	print('Getting artist name.')													# Will be used in the email portion

	# Check download's type (either says 'Buy Now (name your price)' or 'Free Download')
	for b in browser.find_by_tag('button'):					# Go through page's buttons
		if b.value == 'Buy Now':
			enterPrice = True
			print('Button is of type \'Buy Now\'')
			b.click()
			break
		elif b.value == 'Free Download':
			enterPrice = False
			print('Button is of type \'Free Download\'')
			b.click()
			break

	# If Buy Now (name your price)
	if (enterPrice == True):
		browser.find_by_id('userPrice').first.fill('0') # Fill $0 as price to pay
		
from splinter import Browser
import time
seed=open('seed.txt')
myseed=seed.readlines()
seed.close()

b=open('brandsfile.txt')
brands=list(set(b.read().split()))
b.close()

b=Browser('chrome')
b.visit('http://fanzle.com/amazon-longtail-keyword-scraper')
b.find_by_value('3').click()
b.find_by_tag('textarea').fill(myseed)
b.find_by_value('submit').click()
search=True
while search:
    time.sleep(2)
    if int(b.find_by_id('current').value)==int(b.find_by_id('total').value):
        search=False
keywords=b.find_by_tag('tbody').value.split()
diff_keywords=list(set(keywords))
diff_keywords.sort(key=keywords.index)
diff_keywords.pop(0)
final_keywords=[word for word in diff_keywords if word not in brands]

fw=open('fazleword.txt', 'w')
for i in final_keywords:
	fw.write(i+' ')
fw.close()
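
# Hedged mini-example of the order-preserving de-duplication idiom used above
# (set() for uniqueness, then sort by first appearance); sample data only.
words = ['case', 'phone', 'case', 'usb', 'phone', 'cable']
unique_words = list(set(words))
unique_words.sort(key=words.index)
assert unique_words == ['case', 'phone', 'usb', 'cable']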