class LoginTestCase(unittest.TestCase):
    def setUp(self):
        self.testbed = testbed.Testbed()
        self.testbed.activate()
        self.testbed.init_datastore_v3_stub()
        self.testbed.init_memcache_stub()
        self.browser = Browser('chrome')

    def tearDown(self):
        self.testbed.deactivate()

    def test_login(self):
        self.browser.visit("http://127.0.0.1:8080/")
        self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")

    def test_logout(self):
        self.browser.visit("http://127.0.0.1:8080/")
        self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
        self.browser.find_by_id("submit-login").first.click()
        self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance")
        self.browser.find_link_by_text("Log out").first.click()
        self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in")
def run(self):
    """Run the bot."""
    browser = Browser()
    browser.visit(self.url)
    try:
        while browser.find_by_tag('button').first:
            self.process_elements(browser)
            browser.find_by_tag('button').first.click()
    except ElementDoesNotExist:
        pass
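# A variant of the loop above that polls instead of relying on the
# ElementDoesNotExist escape: splinter's is_element_present_by_tag accepts a
# wait_time, which tolerates slow page loads. A minimal sketch under the same
# assumed self.url / self.process_elements attributes:
def run_with_wait(self):
    """Run the bot, waiting up to 5 seconds for each button to appear."""
    browser = Browser()
    browser.visit(self.url)
    # polls until a button shows up or the timeout expires
    while browser.is_element_present_by_tag('button', wait_time=5):
        self.process_elements(browser)
        browser.find_by_tag('button').first.click()
    browser.quit()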
class PlayPagesWebTests(StaticLiveServerTestCase):
    def setUp(self):
        self.user1 = UserFactory.build()
        self.user1.set_password('abc')
        self.user1.save()
        self.user2 = UserFactory.build()
        self.user2.set_password('123')
        self.user2.save()
        self.browser1 = Browser()

    def tearDown(self):
        self.browser1.quit()

    def login_helper(self, browser, username, password):
        browser.visit('%s%s' % (self.live_server_url, '/accounts/login/'))
        browser.fill('username', username)
        browser.fill('password', password)
        browser.find_by_value('Log in').first.click()

    # Test 4
    # Check playing single player game
    def test_single_player(self):
        self.login_helper(self.browser1, self.user1.username, 'abc')
        self.browser1.visit('%s%s' % (self.live_server_url, '/play/'))
        time.sleep(2)
        snippet = self.browser1.find_by_id('type').value
        try:
            # drop the final (possibly partial) word from the snippet
            snippet = snippet[:-(len(snippet.split()[-1]) + 1)]
            for c in snippet[:100]:
                self.browser1.type('typed', c)
                time.sleep(0.001)
            self.browser1.find_by_tag('input')[3].click()
            self.browser1.find_by_tag('input').last.click()
            self.assertEqual(
                self.browser1.url,
                '%s%s' % (self.live_server_url, '/scores/')
            )
        except ElementNotVisibleException:
            self.assertTrue(True)
class UserUtils(object):
    def __init__(self):
        self.config = config.read_config()
        self.account = self.config['account']
        self.idp_server = self.config['nodes']['idp_node']

        # Abort test if esgf-web-fe is not reachable
        r = requests.get("https://{0}/esgf-web-fe".format(self.idp_server),
                         verify=False, timeout=1)
        assert r.status_code == 200

        self.browser = Browser('firefox')

        # Mapping user data onto the web-fe user creation form
        self.elements = {'firstName': self.account['firstname'],
                         'lastName': self.account['lastname'],
                         'email': self.account['email'],
                         'userName': self.account['username'],
                         'password1': self.account['password'],
                         'password2': self.account['password']}

    def check_user_exists(self):
        URL = "https://{0}/esgf-web-fe/login".format(self.idp_server)
        OpenID = "https://{0}/esgf-idp/openid/{1}".format(self.idp_server,
                                                          self.account['username'])

        # Try to log in
        self.browser.visit(URL)
        self.browser.find_by_id('openid_identifier').fill(OpenID)
        self.browser.find_by_value('Login').click()

        # User does not exist if unable to resolve OpenID
        if self.browser.is_text_present("Error: unable to resolve OpenID identifier"):
            self.user_exists = False
        else:
            self.user_exists = True

    def create_user(self):
        URL = "https://{0}/esgf-web-fe/createAccount".format(self.idp_server)
        self.browser.visit(URL)

        # Filling the form
        for element_name in self.elements:
            self.browser.find_by_name(element_name).fill(self.elements[element_name])
        self.browser.find_by_value('Submit').click()

        # Parsing the response
        self.response = []
        if self.browser.is_text_present("SUCCESS"):
            self.response.append("SUCCESS")
        else:
            self.response.append("FAILURE")
            selection = self.browser.find_by_tag('span')
            for sel in selection:
                if sel.has_class('myerror'):
                    self.response.append(sel.value)

    def exit_browser(self):
        self.browser.quit()
def splinter():
    browser = Browser()
    url = "http://ehire.51job.com/MainLogin.aspx"
    browser.visit(url)
    time.sleep(1)
    browser.find_by_id('txtMemberNameCN').fill(u'安能聚业')
    browser.find_by_id('txtUserNameCN').fill(u'上海安能聚创供应链')
    browser.find_by_id('txtPasswordCN').fill('aneqc888')
    browser.find_by_id('Login_btnLoginCN').click()
    time.sleep(1)
    browser.find_by_tag('a').click()
    browser.find_by_id('hlResumeSearch').click()
    # id 85798642 not public
    # 309554553 not downloaded yet
    # browser.find_by_id('txtUserID').fill('6098724')
    time.sleep(1)
    browser.find_by_id('btnSearchID_leftbtnSearchID').click()
    cvTarget = browser.find_by_xpath('//tr/td/p/span/a[@target="_blank"]')
    if len(cvTarget) == 0:
        print("cannot find the cv for this id.")
        return
    cvTarget.click()
    allwindows = browser.windows
    driver = browser.driver
    driver.switch_to_window(allwindows[-1].name)
    UndownloadLink = browser.find_by_id('UndownloadLink')
    if len(UndownloadLink) == 0:
        print("cannot find the cv for this id.")
    else:
        UndownloadLink.click()
        time.sleep(1)
        browser.find_by_id('btnCommonOK').click()
    selector = etree.HTML(browser.html)
    lines = selector.xpath('//title')
    if len(lines) != 0:
        print("name:", lines[0].text.strip())
    contents = browser.html  # keep as text so the regexes below match str
    print(re.findall(re.compile('''<td height="20">电 话:</td><td height="20" colspan="3">(.*?)<span'''), contents)[0])
    print(re.findall(re.compile('''E-mail:</td><td height="20" colspan="3"><a href="mailto:(.*?)" class="blue">'''), contents)[0])
    winNum = len(allwindows)
    for i in range(winNum):
        allwindows[winNum - 1 - i].close()
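# Note on the window hop above: driver.switch_to_window() was removed in
# Selenium 4, and splinter can also manage windows itself. A minimal sketch of
# the same "jump to the newest window" step, assuming a splinter Browser object:
# Selenium 4 spelling
browser.driver.switch_to.window(browser.driver.window_handles[-1])
# or stay inside splinter's windows API
browser.windows.current = browser.windows[-1]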
def splinter(url):
    # ===== MySQL setup =====
    conn = MySQLdb.connect(host='192.168.1.8', user='******', passwd='123123', db='gwycf')
    cursor = conn.cursor()  # create cursor to operate on the db

    data = xlrd.open_workbook('./chafen.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    print(nrows)

    browser = Browser('firefox')
    # browser = Browser('chrome')
    browser.visit(url)
    time.sleep(5)
    count = 0

    for i in range(nrows):
        # HaoMa = str(table.row_values(i)[1]).split(".")[0]
        name = table.row_values(i)[0]
        HaoMa = table.row_values(i)[1]
        # epost = table.row_values(i)[2]
        browser.find_by_name('TxtName').fill(name)
        browser.find_by_name('TxtHaoMa').fill(HaoMa)
        browser.find_by_id('btnSubmit').click()

        # ===== scrape the result page =====
        epost = browser.find_by_tag('td')[10].value
        ecode = browser.find_by_tag('td')[14].value
        xingce = browser.find_by_tag('td')[16].value
        shenlun = browser.find_by_tag('td')[18].value
        jiafen = browser.find_by_tag('td')[20].value
        zongfen = browser.find_by_tag('td')[22].value

        query = u"insert into info values('%s','%s','%s','%s','%s','%s','%s','%s',0)" % (
            name, HaoMa, epost, ecode, xingce, shenlun, jiafen, zongfen)
        print(count, query)
        cursor.execute(query.encode('utf-8'))  # the original data ran fine as gbk; now using utf-8
        conn.commit()
        browser.back()
        count = count + 1

    cursor.close()
    conn.commit()
    conn.close()
class TestViews(unittest.TestCase):
    def setUp(self):
        """ Test setup """
        self.browser = Browser("phantomjs")

        # Set up the tables in the database
        Base.metadata.create_all(engine)

        # Create an example user
        self.user = models.User(name="Alice", email="*****@*****.**",
                                password=generate_password_hash("test"))
        session.add(self.user)
        session.commit()

        self.process = multiprocessing.Process(target=app.run)
        self.process.start()
        time.sleep(1)

    def test_add_post(self):
        log = logging.getLogger("unittest.TestCase")

        # Login as Alice
        # self.browser.visit("http://0.0.0.0:8080/login")  # original line
        self.browser.visit("http://127.0.0.1:5000/login")
        self.browser.fill("email", "*****@*****.**")
        self.browser.fill("password", "test")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        # self.assertEqual(self.browser.url, "http://0.0.0.0:8080/")  # original line
        # self.assertEqual(self.browser.url, "http://127.0.0.1:5000/")  # ask sam about this line

        # Add a test post
        self.browser.visit("http://127.0.0.1:5000")
        self.browser.click_link_by_partial_href('add')
        self.browser.fill("title", "post test1 title")
        self.browser.fill("content", "post test1 content")
        button = self.browser.find_by_css("button[type=submit]")
        button.click()
        post_found = self.browser.find_by_tag('h1').value
        # cheated here - made template title h2. how do we access? index?
        # post_found = self.browser.find_by_text('post test1 title').value - didn't work
        log.debug("FIRSTH1= %r", post_found)
        self.assertEqual(post_found, "post test1 title")

    def tearDown(self):
        """ Test teardown """
        # Remove the tables and their data from the database
        self.process.terminate()
        session.close()
        engine.dispose()
        Base.metadata.drop_all(engine)
        self.browser.quit()
class Retreiver():
    def __init__(self, folder):
        self.aux = Auxiliary()
        self.folder = folder
        self.tickers = None

    def click(self, destination):
        try:
            self.browser.find_by_text(destination).first.click()
        except splinter.exceptions.ElementDoesNotExist:
            self.browser.click_link_by_text(destination)

    def retreive(self):
        print('Please enter the period for retrieval.')
        raw_dates = input('Dates in European format: dd/mm/yyyy\n>')
        eurodates = self.aux.date_parse(raw_dates)[0]
        dates = self.aux.european_dates_to_american(eurodates)
        raw_tickers = input('Tickers:\n>')
        self.tickers = self.aux.parse_tickers(raw_tickers)
        self.browser = Browser('chrome')
        for ticker in self.tickers:
            self.browser.visit('https://beta.finance.yahoo.com/quote/%s/history' % ticker)
            time.sleep(5)
            input_boxes = self.browser.find_by_tag('input')
            for i in range(0, 6):
                input_boxes[i + 2].fill(dates[i])  # we need inputs 3-8
            self.click('Apply')
            download_link = self.browser.find_link_by_text('Download data').first
            response = requests.get(download_link['href'])
            with open('%s//%s.csv' % (self.folder, ticker), 'wb') as f:
                f.write(response.content)
        self.browser.quit()

    def put_together(self):
        if not self.tickers:
            self.tickers = []
            for f in os.listdir(self.folder):
                self.tickers.append(f[:-4])
        target = openpyxl.Workbook()
        sheet = target.active
        sheet.append(self.tickers)
        for filename in os.listdir(self.folder):
            source = open('%s//%s' % (self.folder, filename), 'r', encoding='utf-8')
            sheet = target.create_sheet()
            sheet.title = filename[:-4]  # strip out the extension
            for line in source:
                sheet.append(self.aux.parse_comma_separated_line(line))
            source.close()
        target.save('Historical_data_together.xlsx')
def __authorize(self):
    b = Browser('chrome')
    b.visit("http://box-token-generator.herokuapp.com/")
    if b.find_link_by_href('set_client_credentials'):
        b.visit('http://box-token-generator.herokuapp.com/set_client_credentials')
    time.sleep(2)
    b.find_by_id('login').first.fill('*****@*****.**')
    b.find_by_id('password').first.fill('dharit1250')
    b.find_by_name('login_submit').first.click()
    b.find_by_id('consent_accept_button').first.click()
    code = b.find_by_tag('h4')[1].text
    self.client = box.BoxClient(code)
    b.quit()
news_title

# Use the parent element to find the paragraph text
# For example, if we were to use .find_all() instead of .find() when pulling the summary,
# we would retrieve all of the summaries on the page instead of just the first one.
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

# ### JPL Space Images Featured Image

# Visit URL
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

# Use the base URL to create an absolute URL
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
img_url

# ### Mars Facts
# Fill in the url
browser.visit('somePageURL')
# Find the username cell
browser.find_by_name('user')[1].fill(username)
# Find the password cell
browser.find_by_name('password')[1].fill(password)
# Find the submit button and click
browser.find_by_css('.loginsub').first.click()
# If there is a "full sign", we kick somebody out :) Sorry!!
try:
    browser.find_by_tag('input')[3].click()
except:
    pass

# Set up a progress bar
bar = pyprind.ProgBar(handle_sheet_0.nrows, stream=1)

# Create the excel header styles
style1 = xlwt.easyxf('pattern: pattern solid, fore_colour gray40; align: horiz center')
style2 = xlwt.easyxf('align: horiz center')
style3 = xlwt.easyxf('pattern: pattern solid, fore_colour red; align: horiz center')
style4 = xlwt.easyxf('pattern: pattern solid, fore_colour green; align: horiz center')
cellWidths = [75,  # 0 Short company name in the excel
              20,  # 1 Tax number in the excel
def scrape():
    # NASA Mars News page to be scraped
    nasa_mars_url = 'https://mars.nasa.gov/news/'

    # Retrieve the NASA Mars News page with the requests module
    nasa_response = requests.get(nasa_mars_url)
    # nasa_response

    # Create BeautifulSoup object; parse with 'lxml'
    nasa_soup = BeautifulSoup(nasa_response.text, 'lxml')
    # print(nasa_soup.prettify())

    # NASA Mars News
    # Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text.
    # Assign the text to variables that you can reference later.
    news_title = nasa_soup.find('div', class_='content_title').find('a').text
    news_p = nasa_soup.find('div', class_='rollover_description_inner').text
    # print(news_title)
    # print(news_p)

    # JPL Mars Space Images - Featured Image
    # Visit the url for the JPL Featured Space Image.
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    jpl_mars_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_mars_url)

    # click through the page
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    browser.click_link_by_id('page')

    # Assign the url string to a variable called featured_image_url.
    # Make sure to find the image url to the full size .jpg image
    # and to save a complete url string for this image.
    jpl_html = browser.html
    jpl_soup = BeautifulSoup(jpl_html, 'html.parser')
    image = jpl_soup.find('img')
    featured_image_url = image.get('src')
    # featured_image_url

    # Mars Weather
    # Visit the Mars Weather twitter account and scrape the latest Mars weather tweet from the page.
    # Save the tweet text for the weather report as a variable called mars_weather.
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    mars_weather_response = requests.get(mars_weather_url)
    # mars_weather_response
    mars_weather_soup = BeautifulSoup(mars_weather_response.text, 'lxml')
    # print(mars_weather_soup.prettify())
    mars_weather_tweets = mars_weather_soup.find(
        'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    # print(mars_weather_tweets)

    # Mars Facts
    # Visit the Mars Facts webpage and use Pandas to scrape the table
    # containing facts about the planet including Diameter, Mass, etc.
    # Use Pandas to convert the data to a HTML table string
    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts_tables = pd.read_html(mars_facts_url)
    # mars_facts_tables
    # type(mars_facts_tables)

    # put the table in the data frame
    mars_facts_df = mars_facts_tables[0]
    mars_facts_df.columns = ['description', 'value']
    mars_facts_df.set_index('description', inplace=True)
    # mars_facts_df.head()

    # convert data frame into html
    mars_facts_html_table = mars_facts_df.to_html()
    # mars_facts_html_table

    # drop line breaks; str.replace returns a new string, so keep the result
    mars_facts_html_table = mars_facts_html_table.replace('\n', '')

    # Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high resolution images for each of Mars's hemispheres.
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    # Save both the image url string for the full resolution hemisphere image,
    # and the Hemisphere title containing the hemisphere name.
    # Use a Python dictionary to store the data using the keys img_url and title.
    # Append the dictionary with the image url string and the hemisphere title to a list.
    # This list will contain one dictionary for each hemisphere.
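    # Aside: click_link_by_partial_text and its click_link_by_* siblings are
    # deprecated in recent splinter releases in favor of the links finder. A
    # minimal sketch of the same three clicks under the newer API (assumes a
    # current splinter version):
    browser.links.find_by_partial_text('FULL IMAGE').first.click()
    browser.links.find_by_partial_text('more info').first.click()
    browser.find_by_id('page').first.click()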
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

mars_hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(mars_hemispheres_url)

hemisphere_image_urls = []

# find all h3 tags
link_cnt = browser.find_by_tag('h3')

# loop through all h3 tags
for item in range(len(link_cnt)):
    mars_hemispheres = {}

    # click on each h3 item (re-query each pass so the reference is not stale)
    browser.find_by_tag('h3')[item].click()

    # Get Mars Hemispheres Title ("h2.title" is a CSS selector, so use find_by_css)
    mars_hemispheres["title"] = browser.find_by_css("h2.title").text

    # Find Sample Image Tag & get url
    sample = browser.find_link_by_text("Sample").first
    mars_hemispheres["img_url"] = sample["href"]

    # Append Mars Hemispheres to List
    hemisphere_image_urls.append(mars_hemispheres)

    # Navigate Backwards
    browser.back()

# hemisphere_image_urls

mars_data = {
    'New_Title': news_title,
    'News_Paragraph': news_p,
    'Feature_Image': featured_image_url,
    'Mars_Weather': mars_weather_tweets,
    'Mars_Facts': mars_facts_html_table,
    'Mars_Hemisphere': hemisphere_image_urls
}

return mars_data
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_dict = {}

    # Step One:
    browser.visit(
        "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    )
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find_all('div', class_="content_title")[1].text
    description = soup.find('div', class_="article_teaser_body").text
    mars_dict["news_title"] = title
    mars_dict["news_description"] = description

    # Step Two:
    browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    browser.find_by_id('full_image').click()
    time.sleep(1)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all('img', class_="fancybox-image")
    link = str(results)
    link = link.split("src=")[1]
    link = link.split("style")[0]
    link = link.replace('"', "")
    link = link.replace(" ", "")
    featured_image_url = f"https://www.jpl.nasa.gov{link}"
    mars_dict["featured_image"] = featured_image_url

    # Step Three:
    browser.visit("https://twitter.com/marswxreport?lang=en")
    time.sleep(1)
    browser.find_by_xpath(
        "/html/body/div/div/div/div[2]/main/div/div/div/div/div/div/div/div/div[2]/section/div/div/div/div[1]/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span"
    ).click()
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    weather = soup.find_all(
        'div',
        class_='css-901oao r-hkyrab r-1qd0xha r-1blvdjr r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'
    )
    weather = str(weather)
    weather = weather.split("r-ad9z0x r-bcqeeo r-qvutc0")[1]
    weather = weather.split("/span")[0]
    weather = weather.replace('>', '')
    weather = weather.replace('<', '')
    current_mars_weather = weather.replace('"', '')
    mars_dict["mars_weather"] = current_mars_weather

    # Step Four:
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    mars_df = mars_table[0]
    mars_html = mars_df.to_html()
    mars_dict["mars_facts"] = mars_html

    # Step Five:
    hemisphere_list = []
    for i in range(0, 4):
        links = []
        browser.visit(
            "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        )
        browser.find_by_tag('h3')[i].click()
        browser.find_by_id('wide-image-toggle').click()
        html = browser.html
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find('h2', class_="title").text
        results = soup.find_all('li')
        for result in results:
            link = result.find('a')['href']
            links.append(link)
        hemisphere_dict = {"title": title, "img_url": links[0]}
        hemisphere_list.append(hemisphere_dict)
    mars_dict["hemispheres"] = hemisphere_list

    return mars_dict
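# The split()/replace() chains in Steps Two and Three above parse HTML with
# string surgery; since the page is already parsed with BeautifulSoup, reading
# attributes and text directly is less brittle. A sketch under the assumption
# that the same class names are still present on the pages:
soup = BeautifulSoup(browser.html, "html.parser")

# featured image: read the src attribute instead of slicing str(results)
img = soup.find('img', class_="fancybox-image")
if img is not None:
    featured_image_url = f"https://www.jpl.nasa.gov{img['src']}"

# weather: take the element's text instead of splitting raw markup
tweet = soup.find('div', class_='css-901oao r-hkyrab r-1qd0xha r-1blvdjr r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0')
if tweet is not None:
    current_mars_weather = tweet.get_text()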
# In[485]:

html_table = mars_df.to_html()
html_table

# In[645]:

sf_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(sf_url)
html = browser.html
soup = bs(html)

# In[646]:

images = soup.section.find_all('img', class_='thumb')
img_urls = []
for i in range(len(images)):
    img_dict = {}
    browser.find_by_css('img.thumb')[i].click()
    img_dict['title'] = browser.find_by_tag('h2').text
    img_dict['img_url'] = browser.find_link_by_text("Sample")['href']
    img_urls.append(img_dict)
    browser.back()

# In[648]:

img_urls

# In[ ]:
browser.find_by_name('addScenario').first.click()
browser.fill('scName', countryTypeList[conIndex] + typeaName + igType)
browser.type('scEffDate', '\b\b\b\b\b\b\b\b\b\b')
browser.type('scEffDate', '2015-10-31')
browser.find_by_name('update').first.click()
browser.find_link_by_text('Obligor').first.click()

# choose the companyType element
element = browser.find_by_name('companyType').first
element.select(str(cType))
browser.fill('obligorName', companyName)
browser.find_by_name('ObligorSearch').first.click()
browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_tag('a')[0].click()

# select "B-III counterparty type" to be "corporate"
element = browser.find_by_name('counterPartyType').first
element.select('1')

# select "Classification re Asset Value Correlation" to be "Non-Financial Institution (N)"
element = browser.find_by_name('avc').first
element.select('4')

# select the proper IG according to the IG type
element = browser.find_by_name('obligorIgCode').first
if igType == 'cap':
    element.select('99')
    browser.find_by_name('UpdateButton').first.click()
elif igType == 'floor':
    element.select('30')
def scrape():
    # Create dictionary to return
    return_dict = {}

    # Create initial browser object
    executable_path = {'executable_path': '/Users/joshchung/Bootcamp/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scrape NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find('li', class_="slide")
    article_date = results.find('div', class_="list_date").text
    article_title = results.find('div', class_="content_title").text
    article_teaser = results.find('div', class_="article_teaser_body").text
    return_dict.update({'article_date': article_date,
                        'article_title': article_title,
                        'article_teaser': article_teaser})

    # Scrape JPL image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find_all('article', class_="carousel_item")
    url_string = results[0].get('style')
    url_string = url_string.split("url('")
    url_string = url_string[1].split("');")
    url_string = url_string[0]
    img_url = 'https://www.jpl.nasa.gov' + url_string
    return_dict.update({'img_url': img_url})

    # Scrape Twitter
    url = 'https://twitter.com/marswxreport'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    last_tweet = soup.find('p', class_="tweet-text").text
    last_tweet = last_tweet.replace('\n', ' ')
    return_dict.update({'last_tweet': last_tweet})

    # Scrape Mars facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    mars_df.columns = ['Statistic', 'Values']
    mars_df = mars_df.set_index('Statistic')
    mars_table = mars_df.to_html()
    mars_table = mars_table.replace('\n', '')
    return_dict.update({'mars_table': mars_table})

    # Scrape Mars hemisphere images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    mars_urls = {}
    for x in range(0, 4):
        browser.visit(url)
        links = browser.find_by_tag('h3')
        links[x].click()
        html = browser.html
        soup = bs(html, 'lxml')
        downloads = soup.find('div', class_="downloads")
        dl_links = downloads.find_all('a')
        img_link = dl_links[0].get('href')
        dld_link = dl_links[1].get('href')
        title = soup.find('h2', class_="title").text
        mars_urls.update({
            f"marsimg_{x}": img_link,
            f"marstitle_{x}": title,
            f"marsdld_{x}": dld_link
        })
        browser.back()
    return_dict.update(mars_urls)

    # Return dictionary when function is run
    return return_dict
# Go to each of the 4 hemisphere websites and scrape the link for the Sample Image
usgs_url = "https://astrogeology.usgs.gov"
image_url = []
titles = []

for x in range(len(mars4hemis)):
    # Go to the hemisphere website
    browser.visit(usgs_url + mars4hemis[x])
    browser.click_link_by_text("Open")
    time.sleep(2)

    # Click the Sample link to get the image
    sample = browser.find_by_text('Sample')
    image = sample['href']
    image_url.append(image)

    # Search the h2 tags to get the title
    headers = browser.find_by_tag('h2')
    full_title = headers.text
    # str.strip('Enhanced') strips *characters*, not the word; drop the suffix explicitly instead
    title = full_title.replace('Enhanced', '').strip()
    titles.append(title)
    # print(browser.url)
    print(title, image)

# Show the two newly created lists: titles and image_url
print(titles)
print(image_url)

# Create the list of 4 dictionaries pairing each hemisphere title with its image url
hemisphere_image_urls = []
for x in range(len(titles)):
    one_hemisphere = {"title": titles[x], "img_url": image_url[x]}
    hemisphere_image_urls.append(one_hemisphere)
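# The final pairing loop can also be written as a comprehension over zip, which
# walks the two lists together without index bookkeeping; an equivalent sketch:
hemisphere_image_urls = [
    {"title": title, "img_url": url}
    for title, url in zip(titles, image_url)
]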
class InstaLiker():
    # constructor
    def __init__(self):
        self.mUrl = "https://www.instagram.com/"
        self.cycles = 4
        self.browser = Browser()
        self.username = "******"
        self.pw = 'xxxxxxxxxxxxxxxx\r'
        self.totalLikes = 0
        self.blackList = ["make a list of users to exclude",
                          "including your own username"]

    # scroll the page and do the liking
    def launchPage(self):
        self.browser.visit(self.mUrl)
        self.login()
        self.scrollBy()
        for i in range(0, self.cycles):
            self.likePosts()
        print("just liked " + str(self.totalLikes) + " pix...Yay!")

    def login(self):
        print("login")
        print("logging in as " + self.username)
        self.browser.click_link_by_text('Log in')
        self.browser.fill('username', self.username)
        self.browser.fill('password', self.pw)
        form = self.browser.find_by_tag('form')
        inputs = form.find_by_tag('button')
        inputs[0].click()
        # need to sleep a few seconds here
        time.sleep(5)

    def likePosts(self):
        print("liking posts")
        likeList = self.browser.find_by_text("Like")
        if len(likeList) == 0:
            print("nothing left to like. attempt to scroll farther to load more posts.")
            self.scrollBy()
            time.sleep(3)
            likeList = self.browser.find_by_text("Like")
            print("likeList is now: " + str(len(likeList)))
        if len(likeList) > 0:
            print("found " + str(len(likeList)) + " posts to like")
            for foo in likeList:
                tmpParentNode = foo.find_by_xpath("ancestor::article/header")
                print(tmpParentNode["innerText"])
                if self.checkBlackList(tmpParentNode["innerText"]) == 0:
                    foo.click()
                    self.totalLikes += 1
                    time.sleep(1)

    def checkBlackList(self, pString):
        for foo in self.blackList:
            if foo in pString:
                print("found blacklisted item '" + foo + "'")
                return 1
        return 0

    def scrollBy(self):
        print("scrolling down.")
        self.browser.execute_script("window.scrollBy(0,30000);")
        time.sleep(1)

    def boneyard(self):
        print('boneyard')
import json

import flask
from flask import Flask, request, jsonify
import bs4
import urllib.parse
import requests
import pandas as pd
from splinter import Browser

#################################################
# Web Scraping for Georgia Income Data
#################################################
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit('http://datausa.io/api/data?Geography=04000US13:children&measure=Household Income by Race,Household Income by Race Moe&drilldowns=Race')
GA_Income_Data = json.loads(browser.find_by_tag('body').first.text)
browser.quit()

#################################################
# EV Station Data API - Acquisition
#################################################
url = 'https://developer.nrel.gov/api/alt-fuel-stations/v1.json?fuel_type=ELEC,ELEC&state=GA&limit=all&api_key=FHhxl7HnTsc9tm4X9CwUBVDNmbQFFu4uZXKJeO59&format=JSON'
response = requests.get(url).json()
# print(json.dumps(response, indent=4, sort_keys=True))
response_string = json.dumps(response['fuel_stations'], indent=4, sort_keys=True)

#############################
# creating a dataframe from the ev response
ev_df = pd.read_json(response_string)

##########################
# selecting relevant data
ev_df.head()
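# Launching a full Chrome session just to read a JSON endpoint is heavyweight;
# the same Georgia income data can be fetched the way the EV-station block
# already does. A minimal sketch (same URL, no browser):
import requests

ga_income_url = ('http://datausa.io/api/data?Geography=04000US13:children'
                 '&measure=Household Income by Race,Household Income by Race Moe'
                 '&drilldowns=Race')
GA_Income_Data = requests.get(ga_income_url).json()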
from splinter import Browser
from selenium import webdriver

# NOTE: this raw selenium driver is never used below; splinter manages its own driver
driver = webdriver.Chrome('C:/Users/phillipparamirez/Downloads/chromedriver')

browser = Browser('chrome')
url = 'http://localhost:5000'
browser.visit(url)

assert 'Todo' in browser.title

header = browser.find_by_tag('h1').first
assert 'Todo list' in header.text

browser.quit()
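# The asserts above run top to bottom as a script and die on the first failing
# line. Wrapped as a test function they can be collected by a runner; a minimal
# pytest-style sketch, assuming the Todo app is already serving on port 5000:
from splinter import Browser

def test_todo_homepage():
    browser = Browser('chrome')
    try:
        browser.visit('http://localhost:5000')
        assert 'Todo' in browser.title
        assert 'Todo list' in browser.find_by_tag('h1').first.text
    finally:
        browser.quit()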
def scrape_info():
    # Configure settings for splinter
    executable_path = {"executable_path": ChromeDriverManager().install()}
    browser = Browser("chrome", **executable_path, headless=False)

    ### Part 1: NASA Mars News ###
    # Visit the mars.nasa.gov/news site, get the html, and parse it
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)

    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Find most recent news article title and description
    a = soup.find("body", id="news")
    b = a.find("ul", class_="item_list")
    c = b.find("li", class_="slide")
    d = c.find("div", class_="image_and_description_container")
    e = d.find("div", class_="list_text")
    f = e.find("div", class_="content_title")
    news_title = f.find("a").text
    news_description = e.find("div", class_="article_teaser_body").text

    ### Part 2: JPL Mars Space Images Featured Image ###
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Find the featured image url
    # Click on FULL IMAGE button
    link1 = browser.find_by_tag('a').links.find_by_partial_text("FULL IMAGE")
    link1.click()
    time.sleep(10)

    # Click on More Info button
    link3 = browser.links.find_by_partial_text("more info")
    link3.click()
    time.sleep(10)

    # Save full size of featured image
    link4 = browser.find_by_tag('figure[class="lede"]')
    link5 = link4.find_by_tag('a')
    for image in link5:
        featured_image_url = image["href"]

    ### Part 3: Mars Facts ###
    # Scrape page into soup
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")

    a = soup.find("table", id="tablepress-p-mars-no-2")
    b = a.find_all("tr")
    metric = list()
    value = list()
    for row in b:
        metric.append(row.find("td", class_="column-1").text)
        value.append(row.find("td", class_="column-2").text)

    mars_facts = pd.DataFrame({'Metric': metric, 'Value': value},
                              columns=['Metric', 'Value'])
    # set_index returns a new frame, so keep the result
    mars_facts = mars_facts.set_index('Metric')

    # convert dataframe to html
    mars_facts_html = mars_facts.to_html()

    ### Part 4: Mars Hemispheres ###
    # Scrape page into soup
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    html = browser.html
    soup = bs(html, "html.parser")

    # Find the body of the page that includes the necessary links for the hemispheres
    link1 = browser.find_by_tag('div[id="product-section"]')
    h_count = 0
    hemisphere_image_urls = list()

    # Within the div: product-section, there are 4 class items.
    # Each one corresponds to a different hemisphere.
    while h_count < 4:
        # Initialize dictionary to store each hemisphere's information
        temp = dict()

        # Navigate to the hemisphere's section
        link2 = link1.find_by_tag('div[class="item"]')[h_count]
        link3 = link2.find_by_tag("a")
        link4 = link3.last
        link4.click()
        time.sleep(1)

        # Get the hemisphere name, store in a list
        hlink1 = browser.find_by_tag('section[class="block metadata"]')
        key = hlink1.find_by_tag('h2[class="title"]').text

        # Get the full size image url for each hemisphere
        hlink2 = browser.find_by_tag('div[id="wide-image"]')
        hlink3 = hlink2.find_by_tag('img[class="wide-image"]')
        value = hlink3['src']

        # Save the hemisphere's data in the temp dictionary, append the dictionary to the list
        temp[key] = value
        hemisphere_image_urls.append(temp)

        # Go back to the main page and start again for the next hemisphere
        browser.back()
        link1 = browser.find_by_tag('div[id="product-section"]')
        h_count += 1

    browser.quit()

    # Create a single dictionary of all the scraped items
    mars_info = {
        "Recent_news_title": news_title,
        "Recent_news_description": news_description,
        "Featured_image_url": featured_image_url,
        "Mars_facts_html": mars_facts_html,
        "Hemispheres_images": hemisphere_image_urls
    }

    return mars_info
def scrape():
    # initialize the large dictionary to store all scraped information
    mars_dictionary = {}

    # create path and open browser window
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # establish url and visit site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # time delay 1 sec
    time.sleep(1)

    # create soup object
    soup = BeautifulSoup(browser.html, 'html.parser')

    # find the title for the latest article, which is the one in the first box
    news_title = soup.find_all('div', class_='content_title')
    news_title = news_title[1].text

    # pulling the text from the paragraph
    news_p = soup.find_all('div', class_='article_teaser_body')
    news_p = news_p[0].text

    # append the title and paragraph text to the larger mars_dictionary
    mars_dictionary['current_title'] = news_title
    mars_dictionary['current_p'] = news_p

    #############################################################
    # Visit the site ('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    # and scrape for the current featured image (the full size version)
    #############################################################

    # establish url and visit site
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)

    # grab page html and create soup object
    jpl_html = browser.html
    soup = BeautifulSoup(jpl_html, 'html.parser')

    # Find the image at the top and click
    target = 'a[class="group cursor-pointer block"]'
    browser.find_by_tag(target).click()

    # time delay of 1 second
    time.sleep(1)

    # grab page html and create soup object
    target_html = browser.html
    soup = BeautifulSoup(target_html, 'html.parser')

    # find all anchors and loop through to find the JPG href via get_text
    anchors = soup.find_all('a')
    for a in anchors:
        if 'JPG' in a.get_text():
            featured_image_url = a['href']

    # append the featured image url to the larger mars_dictionary
    mars_dictionary['featured_image_url'] = featured_image_url

    #############################################################################
    # Visit the site ('https://space-facts.com/mars/') and scrape the table with
    # the mars data and convert back to html
    #############################################################################

    # establish url and visit site
    facts_url = 'https://space-facts.com/mars/'
    browser.visit(facts_url)

    # pull the tables from the site
    tables = pd.read_html('https://space-facts.com/mars/')

    # pull the specific table for just the Mars data
    mars_tables = tables[0]

    # rename columns
    mars_tables.columns = ['Fact', 'Value']

    # convert dataframe back to html
    mars_tables_html = mars_tables.to_html()

    # append the mars_table to the larger mars_dictionary
    mars_dictionary['mars_tables_html'] = mars_tables_html

    ################################################################
    # Visit the site ('https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars')
    # and scrape the title and image url for each hemisphere, creating a list with
    # a mini dictionary for each hemisphere
    ################################################################

    # establish url and visit site
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_url)

    # grab page html and create soup object
    hemi_html = browser.html
    soup = BeautifulSoup(hemi_html, 'html.parser')

    # Find all the titles to use for the click function
    title_list = soup.find_all('div', class_='description')
    hemisphere_image_urls = []

    # loop through the title_list and pull the title and image url
    for t in title_list:
        title = t.h3.text
        browser.find_by_text(title).click()

        # grab page html and create soup object
        title_html = browser.html
        soup = BeautifulSoup(title_html, 'html.parser')

        image = soup.find_all('div', class_='downloads')
        image_url = image[0].li.a['href']

        # create mini-dictionary and append it to the list
        mini_dictionary = {'title': title, 'img_url': image_url}
        hemisphere_image_urls.append(mini_dictionary)

        # click back button
        browser.back()

    # quit browser
    browser.quit()

    # append the hemisphere list to the larger mars_dictionary
    mars_dictionary['hemisphere_image_urls'] = hemisphere_image_urls

    # return the mars_dictionary
    return mars_dictionary
def run(filename=True):
    ########################################################################
    # read data
    # filename = 'MARTA Breeze card numbers.xlsx'
    # filename = 'MARTA Breeze card numbers_1.xls'
    dataset = pd.read_excel(filename, sheet_name=0, header=None)

    ########################################################################
    # input params
    url = 'https://balance.breezecard.com/breezeWeb/jsp/web/cardnumberweb.jsp'
    columns = [
        'cardnumber', 'protected_balance', 'expiration_date', 'product_name',
        'remaining_rides', 'stored_value', 'pending_autoload_transactions'
    ]
    cardinformation = pd.DataFrame(columns=columns)

    n = dataset.shape[0]
    for i in range(n):
        cardnumber = dataset.iloc[i].values[0]
        # cardnumber = '0164 1487 1502 5743 2323'
        if len(cardnumber) < 16:
            print("invalid card length: %s\n" % cardnumber)
            temp_df = pd.DataFrame(
                [[cardnumber, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']],
                columns=columns)
            cardinformation = cardinformation.append(temp_df)
            continue

        browser = Browser('chrome')
        browser.visit(url)
        browser.fill('cardnumber', cardnumber)
        browser.find_by_name('submitButton').click()
        text = browser.find_by_tag('tr')

        # 2. breezecard bulk of information
        temp_txt = text[2].value.split('\n')
        if len(temp_txt) == 11:
            txt = [cardnumber]
            # 5. is our card balance protected?
            temp_val = temp_txt[2].split(':')[1]
            txt.append(temp_val)
            # 6. card expiration date
            temp_val = temp_txt[3].split(':')[1]
            txt.append(temp_val)
            # 7. product name
            temp_val = temp_txt[5]
            txt.append(temp_val)
            # 9. remaining rides
            temp_val = temp_txt[6]
            txt.append(temp_val)
            # 11. store value
            temp_val = temp_txt[7].split(':')[1]
            txt.append(temp_val)
            # 11. store value
            temp_val = temp_txt[9]
            txt.append(temp_val)
            temp_df = pd.DataFrame([txt], columns=columns)
            cardinformation = cardinformation.append(temp_df)
        elif len(temp_txt) == 10:
            txt = [cardnumber]
            # 5. is our card balance protected?
            temp_val = temp_txt[2].split(':')[1]
            txt.append(temp_val)
            # 6. card expiration date
            temp_val = temp_txt[3].split(':')[1]
            txt.append(temp_val)
            # 7. product name
            temp_val = temp_txt[5]
            txt.append(temp_val)
            # 9. remaining rides
            temp_val = 0
            txt.append(temp_val)
            # 11. store value
            temp_val = temp_txt[6].split(':')[1]
            txt.append(temp_val)
            # 11. store value
            temp_val = temp_txt[8]
            txt.append(temp_val)
            temp_df = pd.DataFrame([txt], columns=columns)
            cardinformation = cardinformation.append(temp_df)
        else:
            temp_df = pd.DataFrame(
                [[cardnumber, 'NA', 'NA', 'NA', 'NA', 'NA', 'NA']],
                columns=columns)
            cardinformation = cardinformation.append(temp_df)

        browser.quit()

    output = 'output_' + time.strftime("%H_%M_%S") + '.xlsx'
    cardinformation.to_excel(output, header=True, index=False)
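# Opening and closing a Chrome instance per card dominates the runtime above;
# one browser can serve the whole loop if construction moves outside it. A
# sketch of that restructuring (parsing elided, same names assumed):
browser = Browser('chrome')
for i in range(n):
    cardnumber = dataset.iloc[i].values[0]
    if len(cardnumber) < 16:
        continue  # append the 'NA' row as before
    browser.visit(url)  # a fresh visit resets the form between cards
    browser.fill('cardnumber', cardnumber)
    browser.find_by_name('submitButton').click()
    # ...parse browser.find_by_tag('tr') as above...
browser.quit()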
class HomePageWebTests(StaticLiveServerTestCase):
    def setUp(self):
        self.user1 = UserFactory.build()
        self.user1.set_password('abc')
        self.user1.save()
        self.browser = Browser()

    def tearDown(self):
        self.browser.quit()

    def login_helper(self, username, password):
        self.browser.visit('%s%s' % (self.live_server_url, '/accounts/login/'))
        self.browser.fill('username', username)
        self.browser.fill('password', password)
        self.browser.find_by_value('Log in').first.click()

    # Test 2
    # Check for login link from anonymous get of homepage
    def test_anon_login(self):
        self.browser.visit('%s%s' % (self.live_server_url, '/'))
        login_link = self.browser.find_by_tag('a')[2]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/login/'),
            login_link['href']
        )

    # Test 3
    # Check for register link from anonymous get of homepage
    def test_anon_register(self):
        self.browser.visit('%s%s' % (self.live_server_url, '/'))
        register_link = self.browser.find_by_tag('a')[3]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/register/'),
            register_link['href']
        )

    # Test 4
    # Check for user login success
    def test_login_success(self):
        self.login_helper(self.user1.username, 'abc')
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/profile/')
        )
        logout_link = self.browser.find_by_tag('a')[6]
        self.assertEqual(
            '%s%s' % (self.live_server_url, '/accounts/logout/?next=/'),
            logout_link['href']
        )
        greeting = self.browser.find_by_tag('h1')[0]
        self.assertEqual(
            '%s%s%s' % ('Well howdy there, ', self.user1.username, '!'),
            greeting.text
        )

    # Test 5
    # Check for user logout success
    def test_logout_success(self):
        self.login_helper(self.user1.username, 'abc')
        self.browser.find_by_tag('a')[6].click()
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/')
        )

    # Test 6
    # Register brand new user
    def test_registration(self):
        self.browser.visit(
            '%s%s' % (self.live_server_url, '/accounts/register/')
        )
        self.browser.fill('username', 'joseph')
        self.browser.fill('email', '*****@*****.**')
        self.browser.fill('password1', '123')
        self.browser.fill('password2', '123')
        self.browser.find_by_value('Submit').first.click()
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/accounts/register/complete/')
        )
        link_end = mail.outbox[0].body.split('days:')[1].split()[0][18:]
        link = '%s%s' % (self.live_server_url, link_end)
        self.browser.evaluate_script('document.location="%s"' % link)
        self.assertEqual(
            self.browser.url,
            '%s%s' % (self.live_server_url, '/accounts/activate/complete/')
        )
        self.login_helper('joseph', '123')
        greeting = self.browser.find_by_tag('h1')[0]
        self.assertEqual('Well howdy there, joseph!', greeting.text)
# %%
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_="article_teaser_body").get_text()
news_p

# %%
### Featured Images

# %%
# Visit URL
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

# %%
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

# %%
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

# %%
# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

# %%
# Use the base URL to create an absolute URL
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
if browser.is_text_present('Congratulations. This browser is configured to use Tor.'):
    print("Yes, using TOR :)")
else:
    print("No, not using TOR :(")
    raise EnvironmentError('Not using TOR')

browser.visit('https://tasonline.gotilc.com/GTPublicWeb/MainWeb/GageSelect.aspx')
# assert_that(browser.is_text_present('Select Car Mark', wait_time=5))

car_mark_select_list = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')
assert_that(len(car_mark_select_list), equal_to(1))
car_mark_select = car_mark_select_list[0]
print(car_mark_select)
assert_that(car_mark_select.tag_name, equal_to(u'select'))

options = browser.find_by_tag('option')
for option in options:
    car_mark_select.select(option.text)
    car_mark_filter = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')
    print(option.text)

car_mark_select.select('ACTX')
# assert browser.is_text_present('Loading...'), u'Text not found'
assert_that(browser.is_text_not_present('Loading...', wait_time=6))

car_mark_select_list = browser.find_by_id('TRNContentPlaceHolder_ddCarMark_FilterSec')
assert_that(len(car_mark_select_list), equal_to(1))
car_mark_select = car_mark_select_list[0]
assert_that(car_mark_select.tag_name, equal_to(u'select'))
class OOP(): def __init__(self): self.win = tk.Tk() self.win.geometry('700x700') self.win.title('Python GUI') self.create_settings() self.create_login() self.create_query() self.create_records() self.create_menu() self.valid_sites = [] self.select_row = -1 self.valid_td = [] self.select_record_row = -1 self.browser_state = 'log in' self.today = datetime.date.today() self.username = '' self.password = '' self.mobile = '' def create_menu(self): def msgbox(): msg.showinfo( 'Info Box', 'This is a python GUI designed by Wenliang Zhang\n Use for fun : )' ) menu_bar = Menu(self.win) self.win.config(menu=menu_bar) file_menu = Menu(menu_bar, tearoff=0) file_menu.add_command(label='About', command=msgbox) menu_bar.add_cascade(label='Menu', menu=file_menu) def create_settings(self): settings = ttk.LabelFrame(self.win, text='Settings') settings.grid(row=0, column=0, sticky='w' + 'e', padx=50, pady=10) driver_label = ttk.Label(settings, text='Driver name', width=15, anchor='center') driver_label.grid(row=0, column=0, padx=5, pady=5) self.driver_name = tk.StringVar() driver_option = ttk.Combobox(settings, textvariable=self.driver_name, width=17, state='readonly') driver_option['values'] = ('firefox', 'chrome') driver_option.current(0) driver_option.grid(row=0, column=1, padx=5) def log_in(self): try: driver = self.driver_name.get() self.browser = Browser(driver_name=driver, headless=True) self.browser.visit('https://elife.fudan.edu.cn/') self.browser.find_by_xpath("//div/input[@class='xndl']").click() self.browser.fill("username", self.username) self.browser.fill("password", self.password) self.browser.find_by_value(u'登录').click() self.note.configure(text=('Hello, ' + self.browser.find_by_xpath( "//div[@class='person_a']").first.text)) self.search_button.configure(state='normal') self.search_button2.configure(state='normal') self.info_button1.configure(state='normal') self.info_button2.configure(state='normal') self.browser.cookies.all() except: self.note.configure( text='Failed, please check your input or Internet access') def search(self): self.browser_state = 'search' def select(event, row): self.select_row = int(row) - 1 self.avail_scr.tag_raise('tag_all') self.avail_scr.tag_configure('tag_all', background='white', foreground='black') self.avail_scr.tag_raise('tag' + row) self.avail_scr.tag_configure('tag' + row, background='blue', foreground='white') self.avail_scr.configure(state='normal') self.avail_scr.delete('1.0', 'end') self.valid_sites = [] self.select_row = -1 urlcode = self.court_var.get() user_start_time = int(self.start_time.get()[0:2]) user_end_time = int(self.end_time.get()[0:2]) dtime = Timedict[self.date.get()] reserve_date = (self.today + timedelta(dtime, 0)).strftime('%Y-%m-%d') self.browser.visit(Urldict[urlcode] + '¤tDate=' + reserve_date) found_sites = self.browser.find_by_xpath( "//td[@class='site_td1']/font") sites = [] for site in found_sites: if site.text != '': sites.append(site.text) has_reversed = self.browser.find_by_xpath( "//td[@class='site_td4']/font") all_for_reservation = self.browser.find_by_xpath( "//td[@class='site_td4']/span") if len(has_reversed) == 0: self.avail_scr.insert('insert', '您好,当天没有场地可以预约') else: for i in range(len(has_reversed)): site_time = int(sites[i][0:2]) remain = int(all_for_reservation[i].text) - int( has_reversed[i].text) if (site_time >= user_start_time) and ( site_time <= (user_end_time - 1)) and remain > 0: self.valid_sites.append(i) if len(self.valid_sites) == 0: self.avail_scr.insert('insert', '该时段场地未开放或已预定完,请适当放宽筛选条件。') else: for 
valid_site_num in self.valid_sites: self.avail_scr.insert( 'insert', Weekdict[int( (self.today + timedelta(dtime, 0)).strftime('%w'))] + ' ' + sites[valid_site_num] + ' ' + namedict[urlcode] + ' \n') self.avail_scr.tag_add('tag_all', '1.0', 'end') self.avail_scr.tag_raise('tag_all') self.avail_scr.tag_configure('tag_all', background='white', foreground='black') #刷新的时候把蓝色漂白 for j in range(len(self.valid_sites)): row = str(j + 1) self.avail_scr.tag_add('tag' + row, row + '.0', row + '.end') self.avail_scr.tag_bind( 'tag' + str(j + 1), '<Button-1>', lambda event, row=row: select(event, row)) self.avail_scr.configure(state='disable') def make_appointment(self): dtime = Timedict[self.date.get()] reserve_date = (self.today + timedelta(dtime, 0)).strftime('%Y-%m-%d') url = Urldict[self.court_var.get()] + '¤tDate=' + reserve_date def wait_for_the_midnight(): while ((self.today.strftime('%d') == datetime.date.today().strftime('%d')) or (datetime.datetime.now().hour < 6)): print('current time: ' + str(datetime.datetime.now().hour).zfill(2) + ':' + str(datetime.datetime.now().minute).zfill(2)) sleep(1800) while ((datetime.datetime.now().hour + datetime.datetime.now().minute / 60) < 6.8): sleep(300) print('current time: ' + str(datetime.datetime.now().hour).zfill(2) + ':' + str(datetime.datetime.now().minute).zfill(2)) while (datetime.datetime.now().hour < 7): sleep(15) print('current time: ' + str(datetime.datetime.now().hour).zfill(2) + ':' + str(datetime.datetime.now().minute).zfill(2)) try: self.browser.visit(url) self.browser.find_by_tag('img')[self.valid_sites[ self.select_row]].click() self.browser.fill('mobile', self.mobile) self.browser.find_by_value(u' 预 约 ').click() self.note2.configure(text='Job done') except: print("重新登陆") self.browser.visit('https://elife.fudan.edu.cn/') self.browser.find_by_xpath( "//div/input[@class='xndl']").click() self.browser.fill("username", self.username) self.browser.fill("password", self.password) self.browser.find_by_value(u'登录').click() self.browser.cookies.all() print("登陆成功") self.browser.visit(url) self.browser.find_by_tag('img')[self.valid_sites[ self.select_row]].click() self.browser.fill('mobile', self.mobile) self.browser.find_by_value(u' 预 约 ').click() self.note2.configure(text='Job done') print("抢票成功") if self.select_row == -1 or self.browser_state != 'search': if self.browser_state != 'search': self.note2.configure(text='Please update the search result') if self.select_row == -1: self.note2.configure(text='Please choose a court first') else: if Timedict[self.date.get()] <= 2: try: self.browser.visit(url) self.browser.find_by_tag('img')[self.valid_sites[ self.select_row]].click() self.browser.fill('mobile', self.mobile) self.browser.find_by_value(u' 预 约 ').click() except: self.note2.configure(text='You cannot book the court') else: self.note2.configure(text='Job done') else: confirm_msg = msg.askokcancel( '提示', '确定执行抢场功能吗,这可能需要一点时间。(场地晚上12点刷新,请保持程序运行)') if confirm_msg == True: self.note2.configure( text='Job has been queued, hold on please.') _thread.start_new_thread(wait_for_the_midnight, ()) def update(self): self.browser_state = 'record' self.record_scr.configure(state='normal') self.record_scr.delete('1.0', 'end') self.valid_td = [] self.select_record_row = -1 def select(event, row): self.select_record_row = int(row) - 1 self.record_scr.tag_raise('tag_all') self.record_scr.tag_configure('tag_all', background='white', foreground='black') self.record_scr.tag_raise('tag' + row) self.record_scr.tag_configure('tag' + row, background='blue', 
foreground='white') self.valid_td = [] self.browser.visit( 'https://elife.fudan.edu.cn/public/userbox/index.htm?userConfirm=&orderstateselect=' ) record_tr_num = len( self.browser.find_by_xpath("//table[@class='table3']/tbody/tr")) record_td = self.browser.find_by_xpath( "//table[@class='table3']/tbody/tr/td") for i in range(record_tr_num): if record_td[5 + 7 * i].text == '待签到': self.valid_td.append(i) if len(self.valid_td) != 0: for j in self.valid_td: valid_record_name = record_td[ 3 + 7 * j].text + ' ' + record_td[ 4 + 7 * j].text + ' ' + record_td[2 + 7 * j].text + '\n' self.record_scr.insert('insert', valid_record_name) self.record_scr.tag_add('tag_all', '1.0', 'end') self.record_scr.tag_raise('tag_all') self.record_scr.tag_configure('tag_all', background='white', foreground='black') #刷新的时候把蓝色漂白 for p in range(len(self.valid_td)): row = str(p + 1) self.record_scr.tag_add('tag' + row, row + '.0', row + '.end') self.record_scr.tag_bind( 'tag' + row, '<Button-1>', lambda event, row=row: select(event, row)) self.record_scr.configure(state='disable') def cancel(self): self.browser.visit( 'https://elife.fudan.edu.cn/public/userbox/index.htm?userConfirm=&orderstateselect=' ) if self.select_record_row != -1: self.browser.find_by_xpath("//table[@class='table3']/tbody/tr/td")[ 6 + (self.valid_td[self.select_record_row]) * 7].click() self.browser.get_alert().accept() self.record_scr.configure(state='normal') self.record_scr.tag_raise('tag_all') self.record_scr.tag_configure('tag_all', background='white', foreground='black') #刷新的时候把蓝色漂白 self.record_scr.delete('1.0', 'end') self.record_scr.insert('insert', '取消预约成功,请刷新') self.record_scr.configure(state='disable') else: self.record_scr.configure(state='normal') self.record_scr.tag_raise('tag_all') self.record_scr.tag_configure('tag_all', background='white', foreground='black') #刷新的时候把蓝色漂白 self.record_scr.delete('1.0', 'end') self.record_scr.insert('insert', '您未选择需要取消的预约,请刷新后重试') self.record_scr.configure(state='disable') def create_login(self): def confirm(): login_button_1.configure(text='modify', command=modify) student_ID_enter.configure(state='readonly') mobile_enter.configure(state='readonly') password_enter.configure(state='readonly') self.username = student_ID_var.get() self.password = password_var.get() self.mobile = mobile_var.get() def modify(): login_button_1.configure(text='OK', command=confirm) student_ID_enter.configure(state='normal') mobile_enter.configure(state='normal') password_enter.configure(state='normal') login = ttk.LabelFrame(self.win, text=' Log in') login.grid(row=1, column=0, padx=50, pady=10, sticky='w' + 'e') student_ID_label = ttk.Label(login, text='Student ID', width=15, anchor='center') student_ID_label.grid(row=0, column=0, padx=5, pady=5) student_ID_var = tk.StringVar() student_ID_enter = ttk.Entry(login, textvariable=student_ID_var, width=20) student_ID_enter.grid(row=0, column=1, padx=5) password_label = ttk.Label(login, text='Password', width=15, anchor='center') password_label.grid(row=1, column=0, padx=5, pady=5) password_var = tk.StringVar() password_enter = ttk.Entry(login, textvariable=password_var) password_enter.grid(row=1, column=1, padx=5) mobile_label = ttk.Label(login, text='Mobile', width=15, anchor='center') mobile_label.grid(row=2, column=0, pady=5, padx=5) mobile_var = tk.StringVar() mobile_enter = ttk.Entry(login, textvariable=mobile_var) mobile_enter.grid(row=2, column=1, padx=5) img = Image.open(r"./logo.jpg") global tk_img tk_img = ImageTk.PhotoImage(img) logo_frame = tk.Label(login, 
image=tk_img) logo_frame.grid(row=0, column=3, rowspan=3, columnspan=5, padx=30, pady=5) login_button_1 = ttk.Button(login, text='OK', command=confirm, width=10) login_button_1.grid(row=3, column=6, pady=10, padx=20, sticky='e') login_button_2 = ttk.Button(login, text='Log in', command=self.log_in, width=10) login_button_2.grid(row=3, column=7, pady=10, padx=5) self.note = ttk.Label(login, text='Please verify your identity') self.note.grid(row=3, column=0, padx=100, columnspan=4, sticky='w') def create_query(self): query = ttk.LabelFrame(self.win, text=' Query ') query.grid(row=2, column=0, padx=50, pady=10, sticky='w' + 'e') court_label = ttk.Label(query, text='Court', width=14, anchor='center') court_label.grid(row=0, column=0, padx=5, pady=5) self.court_var = tk.IntVar() self.court_var.set(0) courtRad1 = ttk.Radiobutton(query, text='正大', variable=self.court_var, value=0, width=8) courtRad1.grid(column=1, row=0, sticky='w', padx=5) courtRad2 = ttk.Radiobutton(query, text='北区', variable=self.court_var, value=1, width=8) courtRad2.grid(column=2, row=0, sticky='w') courtRad3 = ttk.Radiobutton(query, text='江湾', variable=self.court_var, value=2, width=8) courtRad3.grid(column=1, row=1, sticky='w', padx=5) date_label = ttk.Label(query, text='Day of Week', anchor='center') date_label.grid(row=2, column=0, padx=5, pady=5) self.date = tk.StringVar() date_option = ttk.Combobox(query, textvariable=self.date, width=17, state='readonly') date_option['values'] = ('Today', 'Tomorrow', '+2', '+3(rush mode)') # date_option.current(int((datetime.date.today()).strftime('%w'))) date_option.current = ('Tomorrow') date_option.grid(row=2, column=1, padx=5, columnspan=2) start_time_label = ttk.Label(query, text='Start time', anchor='center') start_time_label.grid(row=3, column=0, padx=5, pady=5) self.start_time = tk.StringVar() start_time_option = ttk.Combobox(query, textvariable=self.start_time, width=17, state='readonly') start_time_option['values'] = ('08:00', '09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00', '21:00') start_time_option.current(0) start_time_option.grid(row=3, column=1, padx=5, columnspan=2) end_time_label = ttk.Label(query, text='End time', anchor='center') end_time_label.grid(row=4, column=0, padx=5, pady=5) self.end_time = tk.StringVar() end_time_option = ttk.Combobox(query, textvariable=self.end_time, width=17, state='readonly') end_time_option['values'] = ('09:00', '10:00', '11:00', '12:00', '13:00', '14:00', '15:00', '16:00', '17:00', '18:00', '19:00', '20:00', '22:00') end_time_option.current(12) end_time_option.grid(row=4, column=1, padx=5, columnspan=2) avail_label = ttk.Label(query, text='Available', width=10) avail_label.grid(row=0, column=3, padx=30, pady=5, sticky='w') self.avail_scr = scrolledtext.ScrolledText(query, width=33, height=7) self.avail_scr.grid(row=1, column=3, padx=30, pady=5, rowspan=4, columnspan=4) self.avail_scr.bind('<Enter>', self.avail_scr.configure(cursor='arrow')) self.avail_scr.configure(state='disable') self.search_button = ttk.Button(query, text='Search', width=10, command=self.search, state='disable') self.search_button.grid(row=5, column=4, padx=15, pady=10, sticky='w') self.search_button2 = ttk.Button(query, text='Reserve', command=self.make_appointment, width=10, state='disable') self.search_button2.grid(row=5, column=6, padx=30, pady=10, sticky='w') self.note2 = ttk.Label(query, text='Please set your preference') self.note2.grid(row=5, column=0, padx=100, columnspan=4, sticky='w') def 
create_records(self): info = ttk.LabelFrame(self.win, text=' Info ') info.grid(row=3, column=0, padx=50, pady=10, sticky='w' + 'e') record_label = ttk.Label(info, text='Records', width=14, anchor='center') record_label.grid(row=0, column=0, padx=5, pady=5, sticky='w') self.record_scr = tk.Text(info, width=50, height=3) self.record_scr.grid(row=1, column=0, padx=50, pady=10, rowspan=2, columnspan=2, sticky='e') self.record_scr.bind('<Enter>', lambda e: self.record_scr.configure(cursor='arrow')) self.record_scr.configure(state='disable') self.info_button1 = ttk.Button(info, text='Update', width=10, command=self.update, state='disable') self.info_button1.grid(row=1, pady=5, padx=10, column=2) self.info_button2 = ttk.Button(info, text='Cancel', width=10, command=self.cancel, state='disable') self.info_button2.grid(row=2, pady=5, column=2)
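The refresh logic above addresses the reservation table by flat-index arithmetic (status at 5 + 7*i, cancel link at 6 + 7*i), which silently breaks if the site adds or removes a column. A hedged sketch that walks the same table3 markup row by row instead; the column meanings (2 = date, 3 = venue, 4 = time, 5 = status) are inferred from the code above, and a splinter browser already on the reservations page is assumed:

rows = browser.find_by_xpath("//table[@class='table3']/tbody/tr")
valid = []
for i, row in enumerate(rows):
    cells = row.find_by_tag('td')           # cells of this row only
    if cells[5].text == '待签到':            # "awaiting check-in"
        valid.append((i, cells[3].text, cells[4].text, cells[2].text))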
class ScoresWebTests(StaticLiveServerTestCase): def setUp(self): self.user1 = UserFactory.build() self.user1.set_password('abc') self.user1.save() self.user2 = UserFactory.build() self.user2.set_password('123') self.user2.save() self.userscore1 = UserScores( user=self.user1, wpm_gross=110, wpm_net=100, mistakes=8 ) self.userscore1.save() self.userscore2 = UserScores( user=self.user2, wpm_gross=100, wpm_net=90, mistakes=10 ) self.userscore2.save() self.match = Matches(winner=self.userscore1, loser=self.userscore2) self.match.save() self.browser = Browser() def tearDown(self): self.browser.quit() def login_helper(self, username, password): self.browser.visit( '%s%s' % (self.live_server_url, '/accounts/login/') ) self.browser.fill('username', username) self.browser.fill('password', password) self.browser.find_by_value('Log in').first.click() # Test 11 # Check anon get of /scores/ def test_anon_get_scores(self): self.browser.visit('%s%s' % (self.live_server_url, '/scores/')) self.assertEqual( self.browser.url, '%s%s' % (self.live_server_url, '/accounts/login/?next=/scores/') ) # Test 12 # Check anon get of /scores/match_score def test_anon_get_match_score(self): self.browser.visit('%s%s' % ( self.live_server_url, '/scores/match_score') ) self.assertEqual( self.browser.url, '%s%s' % ( self.live_server_url, '/accounts/login/?next=/scores/match_score' ) ) # Test 13 # Check scores for user def test_user_for_scores(self): self.login_helper(self.user1.username, 'abc') self.browser.visit('%s%s' % (self.live_server_url, '/scores/')) self.assertEqual( self.browser.find_by_tag('strong')[2].text, self.user1.username ) self.assertEqual( self.browser.find_by_tag('strong')[3].text, str( self.userscore1.wpm_net ) ) self.assertEqual( self.browser.find_by_tag('strong')[4].text, self.user2.username ) self.assertEqual( self.browser.find_by_tag('strong')[5].text, str( self.userscore2.wpm_net ) )
def scrape(): mars_data = {} # browser = init_browser() # mars_dict = {} executable_path = {"executable_path": "/usr/local/bin/chromedriver"} browser = Browser("chrome", **executable_path, headless=False) # # NASA Mars News # URL of page to be scraped url = 'https://mars.nasa.gov/news/' browser.visit(url) # Create BeautifulSoup object; parse with 'html.parser' html = browser.html soup = BeautifulSoup(html, 'html.parser') #News Title news_title = soup.find('div', class_="bottom_gradient").text print(news_title) #Paragraph text news_p = soup.find('div', class_='article_teaser_body').text # print('--------------------------------------------------') print(news_p) # Add the news title and summary to the dictionary mars_data["news_title"] = news_title mars_data["news_p"] = news_p # # Featured Image Image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(Image_url) browser.click_link_by_partial_text('FULL IMAGE') time.sleep(5) browser.click_link_by_partial_text('more info') time.sleep(5) html = browser.html soup = BeautifulSoup(html, 'html.parser') # Extracting image Image_path = soup.find('figure', class_='lede').a['href'] featured_image_url = 'https://www.jpl.nasa.gov/' + Image_path print(featured_image_url) # Add the featured image url to the dictionary mars_data["featured_image_url"] = featured_image_url # # Mars Weather mars_tweet = 'https://twitter.com/marswxreport?lang=en' browser.visit(mars_tweet) html = browser.html soup = BeautifulSoup(html, 'html.parser') # Extracting tweet mars_weather = soup.find('div', class_='js-tweet-text-container').text.replace( '\n', '') print(mars_weather) # Add the weather to the dictionary mars_data["mars_weather"] = mars_weather # # Mars Facts mars_fact = 'https://space-facts.com/mars/' browser.visit(mars_fact) html = browser.html soup = BeautifulSoup(html, 'html.parser') # Extracting mars table trs = soup.find_all('tr') #set up lists to hold td elements which alternate between label and value labels = [] values = [] #for each tr element append the first td element to labels and the second to values for tr in trs: td_elements = tr.find_all('td') labels.append(td_elements[0].text) values.append(td_elements[1].text) print(labels, values) mars_fact_table = pd.DataFrame({"Label": labels, "Values": values}) #mars_fact_table # convert the data to a HTML table string fact_table = mars_fact_table.to_html(header=False, index=False) print(fact_table) # Add the Mars facts table to the dictionary mars_data["mars_table"] = fact_table # # Mars Hemispheres USGS_site = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(USGS_site) html = browser.html soup = BeautifulSoup(html, 'html.parser') # Get the div element that holds the images. images = soup.find('div', class_='collapsible results') #Loop through the class="item" by clicking the h3 tag and getting the title and url.
hemispheres_image_urls = [] # print(len(images.find_all("div", class_="item"))) for i in range(len(images.find_all("div", class_="item"))): # print(i) time.sleep(5) image = browser.find_by_tag('h3') image[i].click() html = browser.html soup = BeautifulSoup(html, 'html.parser') title = soup.find("h2", class_="title").text # print(title) div = soup.find("div", class_="downloads") # for li in div: link = div.find('a') # print(link) url = link.attrs['href'] # print(url) hemispheres = {'title': title, 'img_url': url} hemispheres_image_urls.append(hemispheres) browser.back() print(hemispheres_image_urls) # Add the hemispheres data to the dictionary mars_data["hemispheres_image_urls"] = hemispheres_image_urls # Return the dictionary return mars_data
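The hemisphere loops above pause a fixed time.sleep(5) on every pass, which either wastes time or under-waits. Splinter's presence checks can poll instead, so the wait lasts only as long as the page needs; a sketch under the same four-thumbnail assumption:

for i in range(4):
    # Poll up to 10s for the thumbnails instead of sleeping a fixed 5s.
    if browser.is_element_present_by_tag('h3', wait_time=10):
        browser.find_by_tag('h3')[i].click()
        # ... parse the detail page as above ...
        browser.back()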
conn = sqlite3.connect('petfinder.db') c = conn.cursor() # c.execute('''CREATE TABLE dogs # (id text, name text,age text, breed text, animal text, shelterId text, sex text, website text)''') # c.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)") zippers = [90001, 90211, 90608] # url = "https://zogzmiller.github.io/" for z in zippers: browser = Browser('chrome') conn = sqlite3.connect('petfinder.db') c = conn.cursor() browser.visit('https://zogzmiller.github.io/') browser.fill('zip', str(z)) # fill expects a string browser.find_by_id('submitZip').click() time.sleep(10) listings = browser.find_by_tag('li') for i in listings: dogs = [] text = i.text.split('^ ') for x in range(8): value = text[x].split(': ')[1] if '"' in value: standin = 'Doug Funny' dogs.append(standin) else: dogs.append(value) # placeholders quote the values safely, one per column c.execute('INSERT INTO dogs VALUES (?,?,?,?,?,?,?,?)', dogs) conn.commit() browser.quit()
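Building on the parameterized insert above, the rows can also be collected and written in one batch per zip code; a minimal sketch, assuming the same eight-column dogs table:

import sqlite3

rows = []  # one 8-tuple per listing, in table-column order
# ... rows.append(tuple(dogs)) inside the scraping loop ...
conn = sqlite3.connect('petfinder.db')
c = conn.cursor()
c.executemany('INSERT INTO dogs VALUES (?,?,?,?,?,?,?,?)', rows)
conn.commit()
conn.close()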
def scrape(): executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) mars = {} #-----------NASA MARS NEWS--------------- url = "https://redplanetscience.com/" browser.visit(url) # Parse Results HTML with BeautifulSoup html = browser.html news_soup = BeautifulSoup(html, "html.parser") #News news_title = news_soup.find("div", class_="content_title").text news_p = news_soup.find("div", class_="article_teaser_body").text mars["news_title"] = news_title mars["news_p"] = news_p #-------------JPL MARS SPACE IMAGES - FEATURED IMAGE------------- url = "https://spaceimages-mars.com/" browser.visit(url) browser.find_by_tag("button")[1].click() html = browser.html jplimage = BeautifulSoup(html, "html.parser") image = jplimage.find('img', class_="fancybox-image").get('src') featimgurl = "https://spaceimages-mars.com/" + image mars["featured_image_url"] = featimgurl #-------------- MARS FACTS--------------- url = "https://galaxyfacts-mars.com/" df = pd.read_html(url) df = df[0] df.columns = ["Description", "Mars", "Earth"] df.set_index("Description", inplace=True) df_html = df.to_html() mars["facts"] = df_html #----------- MARS HEMISPHERES--------------- url = "https://marshemispheres.com/" browser.visit(url) result = browser.find_by_css("a.product-item img") hemisphere_image_url = [] for i in range(len(result)): hemisphere = {} browser.find_by_css("a.product-item img")[i].click() element = browser.links.find_by_text('Sample').first img_url = element["href"] hemisphere["img_url"] = img_url hemisphere["title"] = browser.find_by_css("h2.title").text hemisphere_image_url.append(hemisphere) browser.back() mars["hemispheres"] = hemisphere_image_url # close the browser browser.quit() # Return one python dictionary containing all of the scraped data return mars
class ChopeBrowser: def __init__(self, headless=False): self.chrome = Browser('chrome', headless=headless) def time_delay(self, time): # waits on a selector that never matches, so this is effectively a bounded sleep self.chrome.is_element_present_by_name('!@#$%^&*())(*&^%$#@!', wait_time=time) def login(self, usr, pwd, domain='STUDENT'): url = 'https://ntupcb.ntu.edu.sg' url += '/fbscbs/Account/SignIn?ReturnUrl=%2ffbscbs' self.chrome.visit(url) dropdown = self.chrome.find_by_tag('option') for option in dropdown: if option.text == domain: option.click() self.chrome.fill('Username', usr) self.chrome.fill('Password', pwd + '\n') def first_setup(self): button = self.chrome.find_by_id('tdFacilityBook') button.click() self.chrome.click_link_by_href('#8') self.chrome.click_link_by_href('#-1') self.chrome.click_link_by_href('/fbscbs/Booking/Create?resourceId=69') self.chrome.click_link_by_id('book') self.chrome.click_link_by_id('changeResource') self.chrome.click_link_by_href('#-1') self.chrome.click_link_by_id('book') @staticmethod def is_registered(event): if event.has_class('noShowWhite'): return False if event.has_class('currentEvent'): return False return True def check_facility(self, evFacilities): columnWeek = self.chrome.find_by_css('.wc-event-column') evWeek = [] for columnDay in columnWeek: evToday = [] evList = columnDay.find_by_css('.ui-corner-all') for event in evList: if not event.has_class('noShowWhite'): if not event.has_class('currentEvent'): event = event.text if not event.find('—') == -1: if event == '': continue evToday.append(event.split('—')) evWeek.append(evToday) evFacilities.append(evWeek) def click_next(self, counter, evFacilities): # Works together with check_facility. # Picks the facility option based on the counter. dropdown = self.chrome.find_by_id('ResourceId') options = dropdown.find_by_tag('option') if counter < len(options): nextOption = options[counter] nextOption.click() self.check_facility(evFacilities) else: return evFacilities def scrape_seats(self, usr, pwd): self.login(usr, pwd) self.first_setup() evFacilities = [] dropdown = self.chrome.find_by_id('ResourceId') options = dropdown.find_by_tag('option') for opt in options: nextOption = opt nextOption.click() self.time_delay(0.1) # while loadingTitle.visible: # pass evFacilities.append(opt.text) self.check_facility(evFacilities) return evFacilities def quit(self): self.chrome.quit()
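A hypothetical usage sketch for the class above; the credentials are placeholders, and scrape_seats returns the nested availability list that check_facility builds:

bot = ChopeBrowser(headless=True)
try:
    # 'user' / 'password' are placeholders for real NTU credentials.
    seats = bot.scrape_seats('user', 'password')
    print(seats)
finally:
    bot.quit()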
def scrape(): # Create a library that holds all the Mars' Data mars_library = {} #### PART 1: NASA Mars News #### # Initiate ChromeDriver executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) # Target URL url = 'https://mars.nasa.gov/news/' browser.visit(url) # Create a Beautiful Soup object html = browser.html soup = bs(html, "html.parser") # Latest News Title and Paragraph news_title = soup.find("div", class_="content_title").text news_paragraph = soup.find("div", class_="article_teaser_body").text print(f"Title: {news_title}") print(f"Para: {news_paragraph}") # put infos into Library mars_library['news_title'] = news_title mars_library['news_paragraph'] = news_paragraph #### PART 2: JPL Mars Space Images #### # Target URL image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(image_url) # Create a Beautiful Soup object html = browser.html soup2 = bs(html, "html.parser") # Featured Image image_url_route = soup2.find_all( 'a', class_='fancybox')[0].get('data-fancybox-href').strip() # Full Address featured_image_url = "https://www.jpl.nasa.gov/" + image_url_route print(featured_image_url) # put infos into Library mars_library['featured_image_url'] = featured_image_url #### PART 3: Mars Weather #### # Target Twitter URL twitter_url = "https://twitter.com/marswxreport?lang=en" browser.visit(twitter_url) # Create a Beautiful Soup object html = browser.html soup3 = bs(html, "html.parser") # Latest Mars Weather Tweet mars_tweet = soup3.find_all( 'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text' )[0].text print(mars_tweet) # put infos into Library mars_library['mars_weather'] = mars_tweet #### PART 4: Mars Fact #### # Target URL url = "https://space-facts.com/mars/" # Import URL to Panda table = pd.read_html(url) mars_fact_df = table[0] mars_fact_df.columns = ["Category", "Stats"] mars_fact_df.set_index(["Category"], inplace=True) # Exporting as HTML Table mars_fact_html = mars_fact_df.to_html() mars_fact_html = mars_fact_html.replace("\n", "") mars_fact_df.to_html('mars_fact_df.html') # Put infos into Library mars_library['mars_facts'] = mars_fact_html #### PART 5: Mars Hemispheres #### # Target URL url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url) # Use splinter to loop through the 4 images and load them into a dictionary html = browser.html soup = bs(html, 'html.parser') mars_hemis = [] # loop through the four tags and load the data to the dictionary for i in range(4): time.sleep(5) images = browser.find_by_tag('h3') images[i].click() html = browser.html soup = bs(html, 'html.parser') partial = soup.find("img", class_="wide-image")["src"] img_title = soup.find("h2", class_="title").text img_url = 'https://astrogeology.usgs.gov' + partial dictionary = {"title": img_title, "img_url": img_url} mars_hemis.append(dictionary) browser.back() # Put infos into Library mars_library['hemisphere_image_urls'] = mars_hemis # Return Library return mars_library
#https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars # In[20]: usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(usgs_url) # In[21]: usgs_html = browser.html usgs_soup = BeautifulSoup(usgs_html, 'html.parser') mars_hemis = [] # In[22]: for i in range(4): images = browser.find_by_tag('h3') images[i].click() html = browser.html soup = BeautifulSoup(html, 'html.parser') partial = soup.find("img", class_="wide-image")["src"] img_title = soup.find("h2", class_="title").text img_url = 'https://astrogeology.usgs.gov' + partial dictionary = {"title": img_title, "img_url": img_url} mars_hemis.append(dictionary) browser.back() # In[23]: print(mars_hemis)
def scrape(): #Scrape the NASA Mars News Site and assign to variables for later reference url = "https://mars.nasa.gov/news/" page = requests.get("https://mars.nasa.gov/news/") soup = BeautifulSoup(page.text, "html.parser") # format the HTML content nicely using the prettify method on the BeautifulSoup object print(soup.prettify()) html = list(soup.children)[2] #Scrape title news_title = html.find("title").get_text() paragraphs = soup.find_all("p") for paragraph in paragraphs: new_p = paragraph.text # keeps the last paragraph on the page #set up splinter and navigate to the site executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url_jpl) # extracting the current featured image browser.find_by_id("full_image").click() featured_image_url = browser.find_by_css(".fancybox-image").first["src"] #Mars Weather url_mars = "https://twitter.com/marswxreport?lang=en" browser.visit(url_mars) tweet_text = browser.find_by_css(".tweet-text") for tweet in tweet_text: if tweet.text.partition(" ")[0] == "Sol": mars_weather = tweet.text break #Mars Facts url_mars_fact = "https://space-facts.com/mars/" browser.visit(url_mars_fact) tables = pd.read_html(url_mars_fact) df = tables[0] mars_df = df.set_index(0).rename(columns={1: "Value"}) mars_df.index.names = ["Planet Profile"] # converting to html data mars_facts = mars_df.to_html() # strip unwanted newlines to clean up the table mars_facts = mars_facts.replace('\n', '') # print it to confirm it is readable print(mars_facts) #Mars Hemispheres url_mars_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_mars_hemi) Cerberus = browser.find_by_tag("h3")[0].text Schiaparelli = browser.find_by_tag("h3")[1].text Syrtis = browser.find_by_tag("h3")[2].text Valles = browser.find_by_tag("h3")[3].text browser.find_by_css(".thumb")[0].click() Cerberus_img = browser.find_by_text("Sample")["href"] browser.back() browser.find_by_css(".thumb")[1].click() Schiaparelli_img = browser.find_by_text("Sample")["href"] browser.back() browser.find_by_css(".thumb")[2].click() Syrtis_img = browser.find_by_text("Sample")["href"] browser.back() browser.find_by_css(".thumb")[3].click() Valles_img = browser.find_by_text("Sample")["href"] browser.back() hemisphere_image_urls = [{ 'title': Cerberus, 'img_url': Cerberus_img }, { 'title': Schiaparelli, 'img_url': Schiaparelli_img }, { 'title': Syrtis, 'img_url': Syrtis_img }, { 'title': Valles, 'img_url': Valles_img }] mars_data = { "news_title": news_title, "news_p": new_p, "featured_image_url": featured_image_url, "mars_weather": mars_weather, "mars_facts": mars_facts, "hemisphere_image_urls": hemisphere_image_urls } return mars_data
def scrape(): # Pointing to the directory where chromedriver exists executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) ### NASA Mars News # There is a delay at run time; wait up to a few seconds # URL of page to be scraped url1 = "https://mars.nasa.gov/news/" browser.visit(url1) # Create a Beautiful Soup object html1 = browser.html soup1 = bs(html1, 'html.parser') # type(soup1) news_title = soup1.find("div", class_="content_title").text news_paragraph = soup1.find("div", class_="article_teaser_body").text print(f"* TITLE: {news_title}\n") print(f"* PARAGRAPH: {news_paragraph}\n") ### JPL Mars Space Images - Featured Image # There is a delay at run time; wait up to a few seconds # URL of page to be scraped url2 = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(url2) # Finding id full_image browser.find_by_id("full_image").click() time.sleep(5) # Create a Beautiful Soup object html2 = browser.html soup2 = bs(html2, 'html.parser') #type(soup2) # Setting featured_image_url img_url = soup2.find('img', class_='fancybox-image')['src'] # print(img_url) featured_image_url = "https://www.jpl.nasa.gov" + img_url print(f"* FEATURED IMAGE URL: {featured_image_url}\n") ### Mars Weather # There is a delay at run time; wait up to a few seconds url3 = "https://twitter.com/marswxreport?lang=en" browser.visit(url3) html3 = browser.html soup3 = bs(html3, 'html.parser') #type(soup3) # Store the latest match for class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text' mars_weather = soup3.find('p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text').text print(f"* MARS WEATHER: {mars_weather}\n") ### Mars Facts # There is a delay at run time; wait up to a few seconds url4 = "http://space-facts.com/mars/" browser.visit(url4) html4 = browser.html soup4 = bs(html4, 'html.parser') #type(soup4) mars_facts = pd.read_html(url4) # mars_facts df_mars_facts = mars_facts[0] df_mars_facts.columns = ['Mars_Profile', 'Mars_ProfileValue'] df_mars_facts.set_index('Mars_Profile', inplace=True) # df_mars_facts # mars_facts_html = df_mars_facts.to_html("mars_facts.html", justify='left') mars_facts_html = df_mars_facts.to_html(justify='left') print(f"* MARS FACTS HTML: {mars_facts_html}\n") # !open mars_facts.html ### Mars Hemispheres # There is a delay at run time; wait up to a few seconds url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url5) html5 = browser.html soup5 = bs(html5, 'html.parser') #type(soup5) # Create an empty list for storing image urls and titles mars_hemisphere_dict = [] for i in range(4): # we have 4 images time.sleep(5) imgs = browser.find_by_tag('h3') # looking for all h3 tags where we have to click imgs[i].click() html5 = browser.html soup5 = bs(html5, 'html.parser') url_part = soup5.find("img", class_="wide-image")["src"] title = soup5.find("h2", class_="title").text iurl = 'https://astrogeology.usgs.gov' + url_part mars_dict = {"title": title, "img_url": iurl} mars_hemisphere_dict.append(mars_dict) browser.back() print(f"* MARS HEMISPHERE: {mars_hemisphere_dict}\n") # Consolidating all scraped data into one dictionary.
# mars_mission_data = { # 'LATEST_MARS_NEWS_TITLE': news_title, # 'LATEST_MARS_NEWS_TEXT' : news_paragraph, # 'MARS_FEATURED_IMAGE' : featured_image_url, # 'MARS_WEATHER' : mars_weather, # 'MARS_FACTS' : mars_facts_html, # 'MARS_HEMISPHERE' : mars_hemisphere_dict # } mars_mission_data = { 'news_title' : news_title, 'news_paragraph' : news_paragraph, 'featured_image_url' : featured_image_url, 'mars_weather' : mars_weather, 'mars_facts_html' : mars_facts_html, 'mars_hemisphere_dict' : mars_hemisphere_dict } print(f"** MARS MISSION DATA : {mars_mission_data}\n") return mars_mission_data
def scrape(): from bs4 import BeautifulSoup from splinter import Browser import requests import os import pandas as pd import lxml.html as LH import time # In[2]: executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # In[3]: url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' browser.visit(url) # In[4]: html = browser.html soup = BeautifulSoup(html, 'html.parser') news_title = soup.find('div', class_='content_title').text news_p = soup.find('div', class_="article_teaser_body").text # In[5]: print(news_title) # In[6]: # news_p=news_p.text print(news_p) # In[7]: url_jpl = 'https://www.jpl.nasa.gov' url_pic = url_jpl + '/spaceimages/?search=&category=Mars' browser.visit(url_pic) # In[8]: html = browser.html soup = BeautifulSoup(html, 'html.parser') # In[9]: test = soup.find_all('a', class_="fancybox") # In[10]: image = test[1].get('data-fancybox-href') # In[11]: featured_image_url = url_jpl + image print(featured_image_url) # In[12]: # for link in soup.find_all('a', class_="fancybox"): # print(link.get('data-fancybox-href')) # In[13]: url_tweet = "https://twitter.com/marswxreport?lang=en" browser.visit(url_tweet) # In[14]: html = browser.html soup = BeautifulSoup(html, 'html.parser') # In[15]: #tweets = [p.text for p in soup.find_al('p', class="tweet-text")] # In[16]: tweets = soup.find_all('p', class_="tweet-text") # In[17]: mars_weather = tweets[0].text # In[18]: facts_url = "https://space-facts.com/mars/" # In[19]: facts_df = pd.read_html(facts_url, header=None, index_col=None) # In[20]: facts_df = facts_df[0] # In[21]: facts_df.columns = ['Fact', 'Data'] # In[22]: table = facts_df.to_html() # In[23]: table # In[24]: hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(hemisphere_url) # In[25]: html = browser.html soup = BeautifulSoup(html, 'html.parser') mars_hemis = [] # In[ ]: # In[26]: for x in range(4): time.sleep(5) title = browser.find_by_tag('h3') title[x].click() html = browser.html soup = BeautifulSoup(html, 'html.parser') link = soup.find("img", class_="wide-image")["src"] hem_title = soup.find('h2', class_="title").text mars_hemis.append({ "img_url": "https://astrogeology.usgs.gov" + link, "title": hem_title }) browser.back() # In[27]: mars_hemis # In[28]: mars_dict = { 'hemis_pics': mars_hemis, 'table': table, 'weather': mars_weather, 'feature_pic': featured_image_url, 'title': news_title, 'paragraph': news_p } return mars_dict
from splinter import Browser import time myseed = [] try: with open('seed.txt') as seed: myseed = seed.readlines() except IOError as err: print("File error: ", str(err)) browser1 = Browser('chrome') browser1.visit('http://app.scientificseller.com/keywordtool') #browser1.reload() browser1.find_by_tag('textarea').fill(''.join(myseed)) # fill expects a string, not a list of lines browser1.find_by_tag('button').click() browser1.find_by_tag('input').fill('fafafa') browser1.find_by_tag('button').click() volume = 0 try: with open('volume.txt') as v: vol = v.readlines() volume = vol.pop() # print(volume) except IOError as err: print('File error: ',str(err)) loop = True while loop: # print(browser1.find_by_tag('strong').value + '\n') time.sleep(5)
def scrape(): mars_dict = {} # Mars News URL of page to be scraped url = 'https://mars.nasa.gov/news/' html = requests.get(url).text title_soup = BeautifulSoup(html, 'html.parser') # Retrieve the latest news title and paragraph news_title = title_soup.find('div', class_='content_title').text news_par = title_soup.find('div', class_='rollover_description_inner').text # Mars Image to be scraped browser = Browser('chrome', headless=False) mars_image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(mars_image_url) time.sleep(1) # Move to a second page; these are CSS selectors, so use find_by_css target1 = "a[class='button fancybox']" browser.find_by_css(target1).click() # Move to next page browser.find_by_text('more info ').click() # Move to next page with image url target2 = "figure[class='lede']" browser.find_by_css(target2).click() time.sleep(1) image_soup = BeautifulSoup(browser.html, 'html.parser') image_link = image_soup.find('img', src=True) # Retrieve featured image link featured_image_url = image_link['src'] browser.quit() time.sleep(1) # Mars weather to be scraped mars_weather_url = 'https://twitter.com/MarsWxReport?lang=en' browser = Browser('chrome', headless=False) browser.visit(mars_weather_url) time.sleep(3) weather_soup = BeautifulSoup(browser.html, 'html.parser') # Retrieve latest tweet with Mars weather info mars_weather = weather_soup.find( "div", class_= "css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0" ).text browser.quit() time.sleep(1) # Mars facts to be scraped, converted into html table mars_facts_url = 'https://space-facts.com/mars/' mars_facts_table = pd.read_html(mars_facts_url) mars_facts = mars_facts_table[2] mars_facts.columns = ["Description", "Value"] mars_html_table = mars_facts.to_html() mars_html_table = mars_html_table.replace('\n', '') time.sleep(1) # Mars hemisphere name and image to be scraped usgs_url = 'https://astrogeology.usgs.gov' hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser = Browser('chrome', headless=False) browser.visit(hemispheres_url) time.sleep(3) hemispheres_html = browser.html pics_soup = BeautifulSoup(hemispheres_html, 'html.parser') # Mars hemispheres products data all_mars_hemispheres = pics_soup.find('div', class_='collapsible results') mars_hemispheres = all_mars_hemispheres.find_all('div', class_='item') hemisphere_image_urls = [] # Iterate through each hemisphere data time.sleep(3) for i in mars_hemispheres: # Collect Title hemisphere = i.find('div', class_="description") title = hemisphere.h3.text # Collect image link by browsing to hemisphere page hemisphere_link = hemisphere.a["href"] browser.visit(usgs_url + hemisphere_link) image_html = browser.html image_soup = BeautifulSoup(image_html, 'html.parser') image_link = image_soup.find('div', class_='downloads') image_url = image_link.find('li').a['href'] # Create Dictionary to store title and url info image_dict = {} image_dict['title'] = title image_dict['img_url'] = image_url hemisphere_image_urls.append(image_dict) time.sleep(1) # Mars mars_dict = { "news_title": news_title, "news_par": news_par, "featured_image_url": featured_image_url, "mars_weather": mars_weather, "fact_table": str(mars_html_table), "hemisphere_images": hemisphere_image_urls } return mars_dict
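The tweet selector above leans on an autogenerated CSS class string (css-901oao r-hkyrab ...) that changes whenever Twitter redeploys. A looser sketch that matches on tweet content instead, assuming the weather tweets still begin with 'InSight' or 'Sol' as the other scrapers in this collection expect:

import re
from bs4 import BeautifulSoup

def find_weather_text(html):
    # Match the tweet text itself rather than volatile class names.
    soup = BeautifulSoup(html, 'html.parser')
    node = soup.find(string=re.compile(r'^(InSight|Sol)\b'))
    return node.strip() if node else None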
def scrape(): # Open browser executable_path = {"executable_path": "/usr/local/bin/chromedriver"} browser = Browser("chrome", **executable_path, headless=False) # URL of page to be scraped url = 'https://mars.nasa.gov/news' url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' url3 = 'https://twitter.com/marswxreport?lang=en' url4 = 'https://space-facts.com/mars/' url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' # Requests of the URLs news_request = requests.get(url) #spaceimages_request = requests.get(url2) #twitter_request = requests.get(url3) #facts_request = requests.get(url4) #hemisphere_request = requests.get(url5) # First scraping news = bs(news_request.text, "html.parser") ##Get the Title of the page TITLE = news.find("title") TITLE = TITLE.get_text().replace('\n', '').strip() ##Get the title of the first news content_title = news.find(class_="content_title") news_title = content_title.get_text().replace('\n', '').strip() ## Get the description of the first news. rollover_description_inner = news.find(class_="rollover_description_inner") news_p = rollover_description_inner.get_text().replace('\n', '').strip() # Second Scraping browser.visit(url2) browser.find_by_id('full_image').click() html = browser.html soup = bs(html, "html.parser") featured_image_url = soup.find("article", class_="carousel_item").get('style') featured_image_url = featured_image_url.split("'")[1] featured_image_url = f"https://www.jpl.nasa.gov{featured_image_url}" # Third Scraping browser.visit(url3) time.sleep(6) html = browser.html soup = bs(html, "html.parser") mars_weather = soup.find(text=re.compile('InSight')) #mars_weather = soup.find_all(class_="css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0") #mars_weather = mars_weather[23].text # Fourth Scraping tables = pd.read_html(url4) HTML_TABLE = tables[0].to_html() HTML_TABLE = bs(HTML_TABLE, "html.parser") # Fifth Scraping browser.visit(url5) browser.find_by_tag('h3')[0].click() html = browser.html soup = bs(html, "html.parser") ##First image data (url5_x1 holds the image href, url5_x2 the title) url5_1 = soup.find_all('li') url5_11 = url5_1[0] url5_11 = url5_11.a["href"] url5_12 = soup.find('h2', class_="title").text ##Second image data browser.visit(url5) browser.find_by_tag('h3')[1].click() html = browser.html soup = bs(html, "html.parser") url5_2 = soup.find_all('li') url5_21 = url5_2[0] url5_21 = url5_21.a["href"] url5_22 = soup.find('h2', class_="title").text ##Third image data browser.visit(url5) browser.find_by_tag('h3')[2].click() html = browser.html soup = bs(html, "html.parser") url5_3 = soup.find_all('li') url5_31 = url5_3[0] url5_31 = url5_31.a["href"] url5_32 = soup.find('h2', class_="title").text ##Fourth image data browser.visit(url5) browser.find_by_tag('h3')[3].click() html = browser.html soup = bs(html, "html.parser") url5_4 = soup.find_all('li') url5_41 = url5_4[0] url5_41 = url5_41.a["href"] url5_42 = soup.find('h2', class_="title").text hemisphere_image_urls = [ { "title": url5_12, "img_url": url5_11 }, { "title": url5_22, "img_url": url5_21 }, { "title": url5_32, "img_url": url5_31 }, { "title": url5_42, "img_url": url5_41 }, ] listings = {} listings["title"] = TITLE listings["newsH"] = news_title listings["newsP"] = news_p listings["featuredImg"] = featured_image_url listings["weather"] = mars_weather #listings["HTMLtable"] = HTML_TABLE #listings["HemisphereDic"] = hemisphere_image_urls listings["url5_11"] = url5_11 listings["url5_12"] = url5_12 listings["url5_21"] = url5_21
listings["url5_22"] = url5_22 listings["url5_31"] = url5_31 listings["url5_32"] = url5_32 listings["url5_41"] = url5_41 listings["url5_42"] = url5_42 browser.quit() return listings
class SplinterBrowserDriver(BaseBrowserDriver): """ This is a BrowserDriver for splinter (http://splinter.cobrateam.info) that implements the BaseBrowserDriver API. To use it, you must have splinter installed on your env. Splinter itself supports multiple browsing technologies such as selenium, phantomjs, zope, etc. """ driver_name = 'splinter' def __init__(self): super(SplinterBrowserDriver, self).__init__() if not splinter_available: raise ImportError( "In order to use splinter Base Driver you have to install it. " "Check the instructions at http://splinter.cobrateam.info") self._browser = Browser(config.default_browser) def _handle_empty_element_action(self, element): if not element: raise ActionNotPerformableException( "The action couldn't be performed because the element couldn't " "be found; try checking if your element " "selector is correct and if the page is loaded properly.") @property def page_url(self): return self._browser.url @property def page_source(self): return self._browser.html @property def page_title(self): return self._browser.title def open_url(self, url): self._browser.driver.get(url) def quit(self): return self._browser.quit() def is_element_visible(self, element): return element.visible def get_element_text(self, element): return element.text def get_element_by_xpath(self, selector): return self._browser.find_by_xpath(selector) def get_element_by_css(self, selector): return self._browser.find_by_css(selector) def get_element_by_id(self, selector): return self._browser.find_by_id(selector) def get_element_by_tag(self, selector): return self._browser.find_by_tag(selector) @element_action def type(self, element, text, slowly=False): return element.type(text, slowly) @element_action def fill(self, element, text): return element.fill(text) @element_action def clear(self, element): self.fill(element, '') @element_action def click(self, element): return element.click() @element_action def check(self, element): return element.check() @element_action def uncheck(self, element): return element.uncheck() @element_action def mouse_over(self, element): return element.mouse_over() @element_action def mouse_out(self, element): return element.mouse_out() def reload(self): return self._browser.reload() def go_back(self): return self._browser.back() def go_forward(self): return self._browser.forward() def execute_script(self, script): return self._browser.evaluate_script(script) def get_iframe(self, iframe_id): return self._browser.get_iframe(iframe_id) def get_alert(self): return self._browser.get_alert() def attach_file(self, input_name, file_path): return self._browser.attach_file(input_name, file_path) def wait_pageload(self, timeout=30): wait_interval = 0.05 elapsed = 0 while self.execute_script('document.readyState') != 'complete': self.wait(wait_interval) elapsed += wait_interval if elapsed > timeout: raise PageNotLoadedException def click_and_wait(self, element, timeout=30): self.click(element) self.wait_pageload(timeout)
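A hypothetical usage sketch of the driver above; it assumes config.default_browser points at an installed browser, that the URL and element ids are placeholders, and that the element_action decorator accepts the ElementLists the getters return (splinter delegates actions to the first match):

driver = SplinterBrowserDriver()
driver.open_url('http://example.com/login')   # placeholder URL
driver.fill(driver.get_element_by_id('username'), 'alice')
driver.click_and_wait(driver.get_element_by_css('button[type=submit]'))
print(driver.page_title)
driver.quit()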
browser.type('scEffDate', '2014-10-31') browser.find_by_name('update').first.click() browser.find_link_by_text('Obligor').first.click() # choose the companyType type element = browser.find_by_name('companyType').first element.select(str(cType)) browser.fill('obligorName', countryList[conIndex]) browser.find_by_name('ObligorSearch').first.click() if cType == 0 or cType == 1: # browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[1].find_by_xpath('//tr[td[@text="'+kmvCountryList[conIndex]+'"]]').first.find_by_tag('a')[0].click() browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[1].find_by_xpath('//tbody/tr[td[text()[contains(.,"' + kmvCountryList[conIndex] + '")]]]')[0].find_by_tag('a')[0].click() else: # element = browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_xpath('//tbody/tr[td[text()[contains(.,"' + kmvCountryList[conIndex] + '")]]]')[0].find_by_tag('a')[0].click() elementList = browser.find_by_tag('tbody').first.find_by_tag('form').first.find_by_tag('table')[2].find_by_tag('tr') for element in elementList: if kmvCountryList[conIndex] in element.text: element.find_by_tag('a')[0].click() break element = browser.find_by_name('counterPartyType').first element.select('1') element = browser.find_by_name('avc').first element.select('4')
returnValue = messageBox(None,"Do you want to copy scenario, obligor or facility?","Copy scenario/obligor/facility",0x40 | 0x1) while returnValue == 1: # browser.fill('clientSearchString', 'jason\'s client') # browser.find_by_name('search').first.click() # browser.find_by_value('GO').first.click() waitNavigation = ctypes.windll.user32.MessageBoxW(0, "Now navigate the scenario/obligor/facility page and then click OK.", "", 0) if waitNavigation == 2: break # noddd = browser.find_by_tag('form').first.find_by_tag('td').first.find_by_tag('table').first noddd = browser.find_by_tag('form').first noddStr = noddd.html.replace('\n','\t').replace('\'','\\\'') scenarioName = browser.find_by_xpath('//body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[@class="rowHeader"]')[2].text if noddStr.find('Scenario Status') != -1: file = open('scenario_' + scenarioName + '.xml', 'w+') file.write(noddStr) file.close() elif noddStr.find('Correlation Information') != -1: file = open('obligor_' + scenarioName + '.xml', 'w+')
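messageBox is not defined in the snippet above; a minimal sketch of the wrapper it implies, using the same Win32 flags already in use (0x1 = OK/Cancel buttons, 0x40 = information icon; MessageBoxW returns 1 for OK and 2 for Cancel):

import ctypes

def messageBox(hwnd, text, title, flags):
    # Thin wrapper over the Win32 message box used throughout this script.
    return ctypes.windll.user32.MessageBoxW(hwnd, text, title, flags)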
class UploadTestCase(unittest.TestCase): def setUp(self): self.testbed = testbed.Testbed() self.testbed.activate() self.testbed.init_datastore_v3_stub() self.testbed.init_memcache_stub() self.browser = Browser('chrome') def tearDown(self): self.testbed.deactivate() def test_when_create_task_upload_file(self): #login self.browser.visit("http://127.0.0.1:8080/") self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in") self.browser.find_by_id("submit-login").first.click() self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance") self.browser.visit("http://127.0.0.1:8080/tasks") self.browser.click_link_by_text('Create new task') self.browser.fill('title', 'title') self.browser.fill('text', 'text') self.browser.is_element_present_by_name('files[]', wait_time=10) self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png')) #self.browser.attach_file('files[]', 'test/1.png') self.browser.find_by_css('.btn.btn-primary.start').first.click() self.assertEqual(1, len(self.browser.find_by_css('.template-download.fade.in'))) self.assertEqual(4, len(self.browser.find_by_css('.template-download.fade.in td'))) def test_when_create_task_upload_many_files(self): #login self.browser.visit("http://127.0.0.1:8080/") self.assertEqual(self.browser.find_by_tag("h3").first.text, "Not logged in") self.browser.find_by_id("submit-login").first.click() self.assertEqual(self.browser.find_link_by_text("Insurance").first.text, "Insurance") self.browser.visit("http://127.0.0.1:8080/tasks") self.browser.click_link_by_text('Create new task') self.browser.fill('title', 'title') self.browser.fill('text', 'text') self.browser.is_element_present_by_name('files[]') self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png')) self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png')) self.browser.attach_file('files[]', os.path.join(os.path.dirname(__file__),'1.png')) #self.browser.attach_file('files[]', 'test/1.png') self.browser.find_by_css('.btn.btn-primary.start').first.click() sleep(3) self.assertEqual(3, len(self.browser.find_by_css('.files tr.template-download')))
browser.visit(url) print u'You now have 20s to enter your jaccount' sleep(15) print u'Please wait, this program is a bit slow. While you wait, you can open your WeChat ^_^' browser.visit('http://electsys.sjtu.edu.cn/edu/student/elect/warning.aspx?xklc=1&lb=3') button=browser.find_by_id('CheckBox1') if (browser.is_element_not_present_by_id('CheckBox1')): pass else: button.click() browser.find_by_id('btnContinue').click() # collect all course codes pattern=re.compile(r'[A-Z]{2}[0-9]{3}') classlist=[] for ele in browser.find_by_tag('td'): if (re.match(pattern,ele.text)): classlist.append(ele.text) # open WeChat wechaturl='http://wechat.shwilling.com/auth/qrcode/login?redirect=http%3A%2F%2Fwechat.shwilling.com%2Fsjtu%2Fcourse' browser.visit(wechaturl) print u'You now have 20s to scan the QR code and confirm login' sleep(10) print u'Please wait, this program is a bit slow... but the wait is worth it.' myfile=open(u'scorelist.txt','w') for classid in classlist: time=['/2014-2015-1','/2014-2015-2','/2015-2016-1'] for i in range(3): class_str='http://wechat.shwilling.com/sjtu/course/detail/'+classid+time[i] browser.visit(class_str)
class TestViews(unittest.TestCase): def setUp(self): """ Test setup """ self.browser = Browser("phantomjs") # Set up the tables in the database Base.metadata.create_all(engine) # Create an example user self.user = models.User(name="Alice", email="*****@*****.**", password=generate_password_hash("test")) session.add(self.user) session.commit() self.process = multiprocessing.Process(target=app.run) self.process.start() time.sleep(1) def testLoginCorrect(self): self.browser.visit("http://127.0.0.1:5000/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") def testLoginIncorrect(self): self.browser.visit("http://127.0.0.1:5000/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/login") def testAddEditPost(self): self.browser.visit("http://127.0.0.1:5000/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.browser.visit('http://127.0.0.1:5000/post/add') self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add") self.browser.fill("title", "First Post") self.browser.fill("content", "Hello World!") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.browser.click_link_by_text('Edit Post') self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/edit") self.browser.fill("title", "Edited First Post") self.browser.fill("content", "Hello Universe!") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.assertEqual(self.browser.find_by_tag('h1').first.value, "Edited First Post") #divs = self.browser.find_by_tag("div") #myList = [] #if "Hello Universe!" in divs: #myList.append("Hello Universe!") #self.assertEqual(myList[0], "Hello Universe!") def testAddDeletePost(self): self.browser.visit("http://127.0.0.1:5000/login") self.browser.fill("email", "*****@*****.**") self.browser.fill("password", "test") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.browser.visit('http://127.0.0.1:5000/post/add') self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/add") self.browser.fill("title", "First Post") self.browser.fill("content", "Hello World!") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.browser.click_link_by_text('Delete Post') self.assertEqual(self.browser.url, "http://127.0.0.1:5000/post/1/delete") button = self.browser.find_by_css("button[type=submit]") button.click() self.assertEqual(self.browser.url, "http://127.0.0.1:5000/") self.assertEqual(len(self.browser.find_by_tag('h1')),0) #divs = self.browser.find_by_tag("div") #myList = [] #if "Hello Universe!" 
in divs: #myList.append("Hello Universe!") #self.assertEqual(myList[0], "Hello Universe!") def tearDown(self): """ Test teardown """ # Remove the tables and their data from the database self.process.terminate() session.close() engine.dispose() Base.metadata.drop_all(engine) self.browser.quit()
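setUp above sleeps a fixed second after launching the Flask process, which can flake on a slow machine; a sketch that polls until the server actually answers (assumes requests is installed and the app responds on the visited URL):

import time
import requests

def wait_for_server(url, timeout=10):
    # Poll until the dev server accepts connections or give up.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            requests.get(url)
            return
        except requests.ConnectionError:
            time.sleep(0.1)
    raise RuntimeError('server did not start: ' + url)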
def splinter(url): browser = Browser('chrome') # log in to the 126 email website browser.visit(url) # wait for web elements to load time.sleep(1) ## input username browser.find_by_tag('input')[0].fill('username') ## input password browser.find_by_tag('input')[1].fill('password') time.sleep(10) print('click now!') browser.find_by_id('loginSub').click() time.sleep(20) print('sleep') browser.find_by_xpath('//*[@id="selectYuding"]/a').first.click() time.sleep(0.5) print(browser.cookies.all()) browser.cookies.add({'_jc_save_toStation': '%u90D1%u5DDE%2CZZF'}) browser.cookies.add({'_jc_save_fromStation': '%u4E0A%u6D77%2CSHH'}) browser.cookies.add({'_jc_save_fromDate': '2018-02-13'}) browser.reload() print(browser.cookies.all()) browser.find_by_xpath('//*[@id="query_ticket"]').first.click() time.sleep(0.5) target_line = [ 'G1952', 'G3600', 'G1866', 'G1806', 'G1920', 'G1924', 'G1956', 'G1810', 'G1928', 'G1814', 'G1932', 'G1818' ] flag = False while True: a_list = browser.find_by_tag('a') for a in a_list: if not a['onclick']: continue if 'checkG1234' in a['onclick']: L = a['onclick'].split(',') s = L[2] line = re.findall(r'G[0-9]{4}', s) if (len(line) != 0): line_nb = line[0] print(s, '-----', line_nb) if line_nb in target_line: s = s.strip("'") id_target = 'ZE_' + s print(id_target) print(id_target.strip("'")) available = browser.find_by_xpath( f'//*[@id="{id_target}"]/div') print(available.text) if (available.text == '无'): # '无' means no seats left continue print(type(available.text)) print(line_nb) print(a.text) print('***') a.click() flag = True break if flag == True: print('found') break else: print('not found') browser.reload() browser.find_by_xpath('//*[@id="query_ticket"]').first.click() browser.find_by_xpath('//*[@id="normalPassenger_0"]').first.click() browser.find_by_xpath('//*[@id="normalPassenger_4"]').first.click() browser.find_by_xpath('//*[@id="submitOrder_id"]').first.click() browser.click_link_by_id('query_ticket') time.sleep(8)
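The _jc_save_* cookies above pre-fill 12306's query form; each value is the station name as %uXXXX-escaped UTF-16 code units, then a %2C comma, then the station code. A sketch of a helper that builds them, with the name/code pairs taken from the cookies above:

def jc_station_value(name, code):
    # u'郑州', 'ZZF' -> '%u90D1%u5DDE%2CZZF'
    return ''.join('%%u%04X' % ord(ch) for ch in name) + '%2C' + code

browser.cookies.add({'_jc_save_fromStation': jc_station_value(u'上海', 'SHH')})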
def getCVContacts(self, cvID): # debug overrides below shadow the cvID argument cvID = "85798642" # not public cvID = "307661274" # cvID = "6098724" # cvID = "318657201" from splinter import Browser browser = Browser() url = "http://ehire.51job.com/MainLogin.aspx" browser.visit(url) time.sleep(1) browser.find_by_id('txtMemberNameCN').fill(u'安能聚业') browser.find_by_id('txtUserNameCN').fill(u'上海安能聚创供应链') browser.find_by_id('txtPasswordCN').fill('aneqc888') browser.find_by_id('Login_btnLoginCN').click() time.sleep(1) browser.find_by_tag('a').click() selector = etree.HTML(browser.html) cvDownableNum = selector.xpath('//span[@id ="Navigate_AvalidResumes"]/a/b')[0].text if cvDownableNum == "0": self.log.fatal("id:%s can not be downloaded because cvDownableNum == 0." % (cvID)) browser.quit() return "0\n\ncvDownableNum is 0." browser.find_by_id('hlResumeSearch').click() browser.find_by_id('txtUserID').fill(cvID) time.sleep(1) browser.find_by_id('btnSearchID_leftbtnSearchID').click() cvTarget = browser.find_by_xpath('//tr/td/p/span/a[@target="_blank"]') if len(cvTarget) == 0: self.log.fatal("can not find the cv from this id:%s." % (cvID)) browser.quit() return "0\n\ncan not find the cv from this id." cvTarget.click() allwindows = browser.windows browser.driver.switch_to_window(allwindows[-1].name) UndownloadLink = browser.find_by_id('UndownloadLink') if len(UndownloadLink) != 0: UndownloadLink.click() time.sleep(1) browser.find_by_id('btnCommonOK').click() selector = etree.HTML(browser.html) contents = browser.html.encode("utf-8") winNum = len(allwindows) for i in range(winNum): allwindows[winNum - 1 - i].close() browser.quit() lines = selector.xpath('//title') name = "" if len(lines) != 0: name = strip(lines[0].text) try: phone = \ re.findall(re.compile('''<td height="20">电 话:</td><td height="20" colspan="3">(.*?)<span'''), contents)[0] except: phone = "not supplied" try: eMail = \ re.findall(re.compile('''E-mail:</td><td height="20" colspan="3"><a href="mailto:(.*?)" class="blue">'''), contents)[0] except: eMail = "not supplied" if not isinstance(name, unicode): name = name.decode("utf-8") if not isinstance(phone, unicode): phone = phone.decode("utf-8") result = "1\n\nname:%s\tphone:%s\teMail:%s" % (name, phone, eMail) self.log.fatal(result) return result
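The window hop above drops to the raw selenium driver; splinter's own window API can do the same thing. A sketch, assuming the CV click opened a new window:

# Jump to the newest window, scrape, then return to the original one.
browser.windows.current = browser.windows[-1]
# ... parse browser.html here ...
browser.windows.current.close()
browser.windows.current = browser.windows[0]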
# -*- coding: utf-8 -*- """ Created on Mon Feb 15 19:14:26 2021 Downloads BIOS driver for ASRock Z490 Phantom Gaming-ITX/TB3 @author: Eric """ # Import Splinter and Chromedriver from splinter import Browser from webdriver_manager.chrome import ChromeDriverManager import time # Wait for 5 seconds time.sleep(5) executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) # ============================================================================= url = 'https://www.asrock.com/MB/Intel/Z490%20Phantom%20Gaming-ITXTB3/index.asp#BIOS' browser.visit(url) bios_box = browser.find_by_tag('tbody') bios_box.find_by_tag("a")[1].click() # Wait for 20 seconds so download completes before window closes time.sleep(20)
parser.read('config.ini') browser = Browser(parser.get('Config', 'Browser')) browser.driver.maximize_window() browser.visit('https://fsweb.no/studentweb/login.jsf?inst=' + parser.get('Config', 'Institution')) browser.find_by_text('Norwegian ID number and PIN').first.click() browser.find_by_id('login-box') browser.fill('j_idt129:j_idt131:fodselsnummer', parser.get('Config', 'Fodselsnummer')) browser.fill('j_idt129:j_idt131:pincode', parser.get('Config', 'Pin')) browser.find_by_text('Log in').first.click() browser.click_link_by_href('/studentweb/resultater.jsf') tags = browser.find_by_tag('tr') chars = [] for tag in tags: if tag.has_class('resultatTop') or tag.has_class('none'): inner_tags = tag.find_by_tag('td') course_id = inner_tags[1].text.split("\n")[0] course_name = inner_tags[1].text.split("\n")[1] grade = inner_tags[5].text if grade != 'passed': chars.append(grade) print "%s\t%-30s\t%s" % (course_id, course_name, grade) total = 0.0 for char in chars:
def scrape(): # Open a blank window of Google Chrome. chrome_exec_shim = os.environ.get("GOOGLE_CHROME_BIN", "chromedriver") chromedriver_path = os.environ.get("CHROMEDRIVER_PATH", "") print("google chrome bin = %s" % chrome_exec_shim) print("chromedriver_path = %s" % chromedriver_path) if (chrome_exec_shim): # chrome_options = Options() # chrome_options.binary_location = chrome_exec_shim # chrome_options.add_argument('--headless') # chrome_options.add_argument('--no-sandbox') # driver = webdriver.Chrome(executable_path=chromedriver_path, chrome_options=chrome_options) # self.selenium = webdriver.Chrome(executable_path=chrome_exec_shim) exec_path = {'executable_path': chromedriver_path} browser = Browser("chrome", headless=True, **exec_path) mars_facts_data = {} # Visit the NASA newspage using the blank Chrome window. nasa_news_url = "https://mars.nasa.gov/news/" browser.visit(nasa_news_url) # Get html code from the site and convert it into json. html = browser.html soup = bs(html, "html.parser") news_title = soup.find("div", class_="content_title").text paragraph_text = soup.find("div", class_="article_teaser_body").text mars_facts_data['news_title'] = news_title mars_facts_data['news_paragraph'] = paragraph_text # JPL Mars Space Images - Featured Image # Visit the JPL site which includes the featured image and extract the html code. # jpl_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" # browser.visit(jpl_image_url) # html = browser.html # soup = bs(html,"html.parser") # featured_image_url = soup.find('a', {'id': 'full_image', 'data-fancybox-href': True}).get('data-fancybox-href') # split_url = featured_image_url.split('/') # pia_url = split_url[-1] # base_image_url = "https://photojournal.jpl.nasa.gov/jpeg/" # pia_final = pia_url.split('_')[0]+'.jpg' # full_image_url = base_image_url + pia_final # mars_facts_data["featured_image_url"] = full_image_url browser.visit( 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars') browser.find_by_id('full_image').click() featured_image_url = browser.find_by_css('.fancybox-image').first['src'] mars_facts_data['featured_image_url'] = featured_image_url # Mars Weather mars_weather_twitter_url = "https://twitter.com/marswxreport?lang=en" browser.visit(mars_weather_twitter_url) html = browser.html soup = bs(html, "html.parser") mars_weather = soup.find( 'p', class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text' ).text mars_facts_data["mars_weather"] = mars_weather # Mars Facts mars_facts_url = "https://space-facts.com/mars/" mars_facts_tb1 = pd.read_html(mars_facts_url)[0] mars_facts_tb1.columns = ['Physical Properties', 'Values'] mars_html_table = mars_facts_tb1.to_html(justify='left', index=False).replace('\n', '') mars_facts_data["mars_facts_table"] = mars_html_table # Mars Hemispheres mars_hemi_urls = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(mars_hemi_urls) html = browser.html soup = bs(html, "html.parser") #Loop through the class="item" by clicking the h3 tag and getting the title and url. 
images = soup.find('div', class_='collapsible results') mars_hemi_urls = [] for i in range(len(images.find_all("div", class_="item"))): time.sleep(5) image = browser.find_by_tag('h3') image[i].click() html = browser.html soup = bs(html, 'html.parser') title = soup.find("h2", class_="title").text div = soup.find("div", class_="downloads") link = div.find('a') # first download link url = link.attrs['href'] hemispheres = {'title': title, 'img_url': url} mars_hemi_urls.append(hemispheres) browser.back() mars_facts_data["mars_hemispheres"] = mars_hemi_urls browser.quit() return mars_facts_data
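Clicking each h3 and backing out re-renders the results page on every pass; a sketch that collects the detail-page links up front and visits them directly, assuming the div.item markup and the wide-image/title selectors used above:

from bs4 import BeautifulSoup as bs

base = 'https://astrogeology.usgs.gov'
soup = bs(browser.html, 'html.parser')
# One pass over the results page gathers every detail-page href.
links = [base + item.a['href'] for item in soup.find_all('div', class_='item')]
hemis = []
for link in links:
    browser.visit(link)
    page = bs(browser.html, 'html.parser')
    hemis.append({'title': page.find('h2', class_='title').text,
                  'img_url': base + page.find('img', class_='wide-image')['src']})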
def scrape(): driverPath = get_ipython().getoutput('which chromedriver') executable_path = {'executable_path': driverPath[0]} browser = Browser('chrome', **executable_path, headless=False) # In[164]: url = 'https://mars.nasa.gov/news' browser.visit(url) # In[3]: #Create BeautifulSoup object once the page has been visited html = browser.html soup = BeautifulSoup(html, 'html.parser') # In[166]: #Find first article name and text. Store article_list = soup.find(class_='slide') news_title = article_list.find('h3').text news_p = article_list.find(class_='article_teaser_body').text # Check that it works print(news_title) print(news_p) # In[131]: space_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(space_image) image = browser.find_by_tag('article') # In[132]: featured_image_url = browser.find_by_id('full_image') # element handle; its image src still needs extracting # In[133]: #Splinter to pull tweet tweet_url = 'https://twitter.com/marswxreport?lang=en' browser.visit(tweet_url) # In[134]: timeline = browser.find_by_id("timeline") timeline.click() # In[135]: # Python to scrape facts facts_url = requests.get('https://space-facts.com/mars/') bsfacts = BeautifulSoup(facts_url.text, 'html.parser') print(bsfacts) # In[136]: facts_table = bsfacts.find(id='text-2').text print(facts_table) # In[148]: # In[141]: img_find = browser.find_by_css('img') img_click = img_find.find_by_css('.thumb') # In[7]: import requests from selenium import webdriver from selenium.webdriver.common.keys import Keys driver = webdriver.Chrome() driver.get( 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' ) # In[27]: imgElement = driver.find_element_by_class_name('thumb') imgElement.click() # after the click, the same driver points at the detail page downloads = driver.find_element_by_class_name('downloads') image = downloads.find_element_by_tag_name('a').get_attribute('href')
# convert to soup html = browser.html hemisphere_soup = soup(html, 'html.parser') hemisphere_elem = hemisphere_soup.select_one('div.collapsible.results') # In[274]: # 2. Create a list to hold the images and titles. hemisphere_image_urls = [] # 3. Write code to retrieve the image urls and titles for each hemisphere. for i, hemispheres in enumerate(hemisphere_elem.find_all('div', class_='item')): hemisphere = {} # a fresh dict each pass, so earlier entries are not overwritten full_image_elem = browser.find_by_css('img.thumb')[i] full_image_elem.click() # Parse the resulting html with soup html = browser.html download_soup = soup(html, 'html.parser') download_url = download_soup.select_one('div.downloads') hemisphere['img_url'] = download_url.find('a', string='Sample').get('href') hemisphere['title'] = download_soup.select_one('h2.title').get_text() hemisphere_image_urls.append(hemisphere) browser.back() # In[275]: hemisphere_image_urls
content = [] # Open file with list of BandCamp albums to download with open(list_file) as f: content = f.readlines() browser = Browser('phantomjs') for album in content: url = album.replace("\n","") print('URL: '+album) browser.visit(url) #time.sleep(3) artistName = browser.find_by_id('band-name-location')[0].value.split('\n')[0] # Get artist name from top-right print('Getting artist name.') # Will be used in the email portion # Check download's type (either says 'Buy Now (name your price)' or 'Free Download') enterPrice = None # set once a matching button is found for b in browser.find_by_tag('button'): # Go through page's buttons if b.value == 'Buy Now': enterPrice = True print('Button is of type \'Buy Now\'') b.click() break elif b.value == 'Free Download': enterPrice = False print('Button is of type \'Free Download\'') b.click() break # If Buy Now (name your price) if (enterPrice == True): browser.find_by_id('userPrice').first.fill('0') # Fill $0 as price to pay
from splinter import Browser import time seed=open('seed.txt') myseed=seed.readlines() seed.close() b=open('brandsfile.txt') brands=list(set(b.read().split())) b.close() b=Browser('chrome') b.visit('http://fanzle.com/amazon-longtail-keyword-scraper') b.find_by_value('3').click() b.find_by_tag('textarea').fill(''.join(myseed)) # fill expects a string, not a list of lines b.find_by_value('submit').click() search=True while search: time.sleep(2) if int(b.find_by_id('current').value)==int(b.find_by_id('total').value): search=False keywords=b.find_by_tag('tbody').value.split() diff_keywords=list(set(keywords)) diff_keywords.sort(key=keywords.index) diff_keywords.pop(0) final_keywords=[word for word in diff_keywords if word not in brands] fw=open('fazleword.txt', 'w') for i in final_keywords: fw.write(i+' ') fw.close()
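The set() + sort(key=keywords.index) combination above preserves first-seen order but rescans the list for every element, making it quadratic; a sketch of the usual linear-time idiom:

def unique_in_order(words):
    # dict preserves insertion order (Python 3.7+), so this dedupes in one pass.
    return list(dict.fromkeys(words))

final_keywords = [w for w in unique_in_order(keywords) if w not in set(brands)]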