def access_elements(driver, proj_type, matchup_list):
    # parses the HTML and gets us the elements we need
    # get today's games
    today_games = driver.find_element_by_css_selector('.day:nth-child(1)')
    # the parser is in a separate module; this will instantiate it
    p = HTMLTableParser()
    # the parser needs the raw HTML, not the WebElement objects
    p.feed(today_games.get_attribute('innerHTML'))
    # iterate through the parsed tables, one per matchup
    for matchup in p.tables:
        # each matchup table comes back in the following format:
        # [['', '7 p.m. Eastern', 'Elo spread', 'Win prob.', 'Score'], [''],
        #  ['', '', 'Nuggets', '', '40%', '', ''],
        #  ['', '', 'Pacers', '-2.5', '60%', '', ''], ['']]
        # each matchup has 5 rows; we care about rows 3 & 4.
        # within each row, the name and win probability are what we need
        away_team_long = team_name_formatting(matchup[2][2])
        away_team_win_prob = string_percent_to_integer(matchup[2][4])
        home_team_long = team_name_formatting(matchup[3][2])
        home_team_win_prob = string_percent_to_integer(matchup[3][4])
        game_dict = {
            "game_key": away_team_long + home_team_long,
            "away_team": {
                "team_name": away_team_long,
                "win_prob": away_team_win_prob
            },
            "home_team": {
                "team_name": home_team_long,
                "win_prob": home_team_win_prob
            }
        }
        # add each game to the tracking list
        matchup_list.append(game_dict)
    return matchup_list
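access_elements() leans on two helpers, team_name_formatting() and string_percent_to_integer(), that are defined elsewhere in that project. A minimal sketch of what they might look like, assuming the Elo table yields names like 'Nuggets' and probabilities like '40%'; both bodies are assumptions, not the original implementations:

def team_name_formatting(raw_name):
    """Hypothetical helper: normalize a scraped team-name cell."""
    # strip surrounding whitespace left over from the HTML cell
    return raw_name.strip()


def string_percent_to_integer(percent_string):
    """Hypothetical helper: turn a string such as '40%' into the integer 40."""
    return int(percent_string.strip().rstrip('%'))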
def main():
    url = 'https://finance.yahoo.com/quote/AAPL/history/'
    xhtml = url_get_contents(url).decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
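Several of the main() examples in this collection call url_get_contents() without showing it. A minimal sketch, assuming plain urllib with a browser-style User-Agent header; the header value and function body are assumptions, not the original helper:

import urllib.request


def url_get_contents(url):
    """Hypothetical helper: fetch a page and return its raw bytes for HTMLTableParser."""
    req = urllib.request.Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    # read() returns bytes, which callers decode('utf-8') before feeding the parser
    with urllib.request.urlopen(req) as response:
        return response.read()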
def check_new_data():
    date_from = (datetime.utcnow() - timedelta(days=5)).strftime("%d.%m.%Y")
    date_to = datetime.utcnow().strftime("%d.%m.%Y")
    resp_body = req.smsStat(date_from, date_to, config.child_id)
    print(resp_body)
    if resp_body != 0:
        soup = BeautifulSoup(resp_body, "html.parser")
        p = HTMLTableParser()
        try:
            sms_tab = soup.find('table').find("table")
            p.feed(str(sms_tab))
        except Exception:
            print("element not found on page")
            print("resp_body:")
            print(resp_body)
        try:
            sms_data = p.tables[0]
            print('sms_data = ', sms_data.pop(0))
        except IndexError:
            sms_data = []
            print('sorry, no data')
        for x in sms_data:
            db.ins_stat(x)
    else:
        print("Request error, skip...")
def dataInitialisation():
    xhtml = url_get_contents(
        'https://nssdc.gsfc.nasa.gov/planetary/factsheet/').decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    solarSystemData = p.tables[0]
    row_dict = {
        "mass": 1,
        "diameter": 2,
        "density": 3,
        "gravity": 4,
        "escape velocity": 5,
        "rotation period": 6,
        "length of day": 7,
        "distance from sun": 8,
        "perihelion": 9,
        "aphelion": 10,
        "orbital period": 11,
        "orbital velocity": 12,
        "orbital inclination": 13,
        "orbital eccentricity": 14,
        "obliquity to orbit": 15,
        "mean temperature": 16,
        "surface pressure": 17,
        "number of moons": 18,
        "ring system?": 19,
        "global magnetic field?": 20,
    }
    col_dict = {
        "mercury": 1,
        "venus": 2,
        "earth": 3,
        "moon": 4,
        "mars": 5,
        "jupiter": 6,
        "saturn": 7,
        "uranus": 8,
        "neptune": 9,
        "pluto": 10,
    }
    rocket_mass = {
        "pigeon 10": 549054,
        "pigeon 10 heavy": 549054,
        "big fat rocket": 3401942,
        "air captain": 14220,
        "air captain heavy": 733000,
    }
    rocket_thrust = {
        "pigeon 10": 7607000,
        "pigeon 10 heavy": 934000,
        "big fat rocket": 9900000,
        "air captain": 110000,
        "air captain heavy": 240000,
    }
    return [solarSystemData, row_dict, col_dict, rocket_mass, rocket_thrust]
def fillTournaments(self, myAPIServer):
    server = myAPIServer + 'data/tournament'
    page = 0
    while page <= 5400:
        try:
            xhtml = self.url_get_contents(
                'https://www.cagematch.net/?id=26&s=' + str(page)).decode('utf-8')
            page = page + 100
            p = HTMLTableParser()
            p.feed(xhtml)
            table = p.tables[0]
            table.pop(0)
            for title in table:
                data = self.Tournament(0, title[2], 'FIFA', None).toJSON()
                response = requests.post(server, data=data, headers=headers)
                FillSportAndTournaments.lastIdTournament = int(response.text)
                print('Tournament with id ' + response.text + ' was added.')
        except Exception:
            pass
def read_first_table(url):
    """
    Read the given URL, parse the result, and return the first table.
    """
    xhtml = read_url(url)
    parser = HTMLTableParser()
    parser.feed(xhtml)
    return parser.tables[0]  # Use first table on the page
def run(self, params={}):
    p = HTMLTableParser()
    p.feed(params.get(Input.TAP_ALERT))
    data = p.tables
    clean_data = TAP(data).data
    return {Output.RESULTS: clean_data}
def obtain_parse_wiki_stocks_sp500(url):
    """Download and parse the Wikipedia list of S&P500 constituents.

    Returns a list of ticker symbols to add to MySQL."""
    # Get S&P500 website content
    req = urllib.request.Request(url)
    response = urllib.request.urlopen(req)
    data = response.read().decode('utf-8')

    # Instantiate the parser and feed it
    p = HTMLTableParser()
    p.feed(data)
    table_list = p.tables
    table = table_list[0][1:]

    # Obtain the symbol information for each row in the S&P500 constituent table
    symbols = []
    for row in table:
        sd = {'ticker': row[0], 'name': row[1], 'sector': row[3]}
        # Append the ticker (the DB key) to the grand list
        symbols.append(sd['ticker'])
    return symbols
def main():
    url = 'https://www.investagrams.com/Stock/Analysis/TechnicalAnalysis'
    xhtml = url_get_contents(url).decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
def main():
    url = 'http://www.twitter.com'
    xhtml = url_get_contents(url).decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
def get_open_countries():
    parser = HTMLTableParser(data_separator=', ')
    parser.feed(response.text)
    table = parser.tables[0]
    open_date = 'I know when countries open borders! \n\n'
    for date, countries in table:
        open_date += f'{date}: {countries} \n'
    return open_date
def main():
    url = 'https://www.skysports.com/premier-league-table'
    xhtml = url_get_contents(url).decode('utf-8')
    # using html parser
    p = HTMLTableParser()
    p.feed(xhtml)
    pprint(p.tables)
def table_response(payload):
    response = requests.post(
        "https://wish.wis.ntu.edu.sg/webexe/owa/AUS_SCHEDULE.main_display1",
        data=payload)
    parser = HTMLTableParser()
    parser.feed(response.text)
    table = parser.tables
    return table
def test_singleTable(self):
    test_input = self.getTestInput("single_table")
    uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(1, len(actual))
    self.checkNumericTableValues(actual[0], 3, 10, 0)
def main():
    url = 'http://www.rich.co.ke/rcdata/nsestocks.php'
    xhtml = url_get_contents(url).decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)
    # pprint(p.tables[14])
    df = pd.DataFrame(p.tables[14])
    df.to_csv('stocks.csv', index=False, encoding='utf-8')
def test_decodeCharRefs(self):
    test_input = self.getTestInput("with_char_ref")
    uut = HTMLTableParser(decode_html_entities=True, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(1, len(actual))
    expected_values = [">", "<", "$", "&", "="]
    self.checkTableValues(actual[0], 1, 5, expected_values)
def test_keepCharRefs(self):
    test_input = self.getTestInput("with_char_ref")
    uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(1, len(actual))
    expected_values = ["", "", "", "", ""]
    self.checkTableValues(actual[0], 1, 5, expected_values)
def laneBarcodeHTML(path):
    """Retrieve data from the laneBarcode.html page; the data is per barcode/sample per lane."""
    try:
        samples_dict = {}
        dict_samples = {
            'Lane': {'column': 'Lane'},
            'Project': {'column': 'Project'},
            'Sample_name': {'column': 'Sample'},
            'Barcode_sequence': {'column': 'Barcode sequence'},
            'PF_Clusters': {'column': 'PF Clusters'},
            'PCT_of_lane': {'column': '% of the lane'},
            'PCT_perfect_barcode': {'column': '% Perfect barcode'},
            'PCT_one_mismatch_barcode': {'column': '% One mismatch barcode'},
            'Yield_Mbases': {'column': 'Yield (Mbases)'},
            'PCT_PF_Clusters': {'column': '% PF Clusters'},
            'PCT_Q30_bases': {'column': '% = Q30 bases'},
            'Mean_Quality_Score': {'column': 'Mean Quality Score'}
        }
        # locate the laneBarcode.html report on disk
        samplehtml = subprocess.getoutput(
            'find {path}/Data/Intensities/BaseCalls/Reports/html/*/all/all/all/ -iname \'laneBarcode.html\''.format(
                path=str(path)))
        with open(samplehtml, 'r') as sample:
            html = sample.read()
        tableParser = HTMLTableParser()
        tableParser.feed(html)
        tables = tableParser.tables
        # tables[1] == run table, tables[2] == sample table
        header_samplehtml = tables[2][0]
        for col in dict_samples:
            dict_samples[col]['index'] = header_samplehtml.index(dict_samples[col]['column'])
        for sample_lane in tables[2][1:]:
            data_sample_lane = {}
            if sample_lane[header_samplehtml.index('Project')].upper() != 'DEFAULT':
                stats = [convert_numbers(item.replace(',', '')) for item in sample_lane]
                lane = stats[header_samplehtml.index('Lane')]
                sample = stats[header_samplehtml.index('Sample')]
                lane_sample = str(lane) + '--' + str(sample)
                for col in dict_samples:
                    stat = stats[dict_samples[col]['index']]
                    data_sample_lane[col] = stat
                samples_dict[lane_sample] = data_sample_lane
        return samples_dict
    except Exception as e:
        print(e)
def test_twoTables(self):
    test_input = self.getTestInput("two_tables")
    uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(2, len(actual))
    self.checkNumericTableValues(actual[0], 3, 10, 0)
    self.checkNumericTableValues(actual[1], 3, 10, 100)
def get_match_players(html_page):
    match_players = set()
    players_tables_containers = html_page.find_class('col-soupisky-home') + html_page.find_class('col-soupisky-visitor')
    for tables_container in players_tables_containers:
        for table in tables_container.findall('table'):
            table_string = tostring(table, encoding='utf-8')
            table_parser = HTMLTableParser()
            table_parser.feed(table_string.decode('utf-8'))
            for row in table_parser.tables[0][1:]:
                if row[0]:
                    match_players.add(extract_player_name(row[2]))
    return match_players
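get_match_players() and the scoring/stats functions later in this collection rely on an extract_player_name() helper that is not shown. A hypothetical sketch, assuming roster cells carry a jersey number and an optional bracketed role around the name; the regular expressions are illustrative only:

import re


def extract_player_name(cell_text):
    """Hypothetical helper: strip jersey numbers and bracketed annotations from a roster cell."""
    name = re.sub(r'^\s*\d+\.?\s*', '', cell_text)  # drop a leading number such as '23' or '23.'
    name = re.sub(r'\s*\(.*?\)\s*$', '', name)      # drop a trailing '(A)', '(C)', '(G)' marker
    return name.strip()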
def main():
    url = 'https://w3schools.com/html/html_tables.asp'
    xhtml = url_get_contents(url).decode('utf-8')
    p = HTMLTableParser()
    p.feed(xhtml)

    # Get all tables
    pprint(p.tables)

    # Get tables with id attribute
    pprint(p.named_tables)
def main(ip):
    s = NeSession()
    try:
        url = "http://" + ip + ":20080/EMSRequest/VoltageStatistics"
        resp = s.get(url)
    except Exception:
        url = "https://" + ip + "/EMSRequest/VoltageStatistics"
        resp = s.get(url, verify=False)
    xhtml = resp.text.strip()
    p = HTMLTableParser()
    p.feed(xhtml)
    return p.tables
def test_withInternalTags(self):
    test_input = self.getTestInput("tags_inside_cells")
    uut = HTMLTableParser(decode_html_entities=False, data_separator='-')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(1, len(actual))
    expected_values = [
        "0-zero", "1-one", "2-two", "3-three", "4-four",
        "5-five", "6-six", "7-seven", "8-eight", "9-nine-nuevo"
    ]
    self.checkTableValues(actual[0], 1, 10, expected_values)
def data_web_update():
    from webb import webb
    from aplikace.models import Product
    # from locale import atof
    import re
    from html_table_parser import HTMLTableParser

    id = 0
    for polozka in Product.notKL():
        url = "http://www.vskprofi.cz/vyhledavani?type=sku&search=" + polozka.Obj + "&sku=OK"
        page = webb.download_page(url)
        p = HTMLTableParser()
        p.feed(page.decode('utf-8'))
        # print(p.tables)
        ar = p.tables
        try:
            data = Product.find_by_Obj(polozka.Obj)
            for i in range(6, 10):
                if re.search('technical-data', ar[0:1][0][1][i]):
                    data.TL = ar[0:1][0][1][i]
                if re.search('product-lists', ar[0:1][0][1][i]):
                    data.KL = ar[0:1][0][1][i]
                if re.search('pics', ar[0:1][0][1][i]):
                    data.Foto = ar[0:1][0][1][i]
            data.sklad = ar[0:1][0][1][3]
            if ar[0:1][0][1][11]:
                data.Poznamka = ar[0:1][0][1][11]
            data.update(commit=False)
            id = id + 1
            if id % 100 == 0:
                print("updating data")
                db.session.commit()
            # print(float(re.split(" ", ar[0:1][0][1][4])[0].replace(".", "").replace(",", ".")))
        except Exception:
            print("Error " + str(id) + " " + polozka.Obj)
            db.session.commit()
    # data_web_update.delay()
    db.session.commit()
    return True
def get_live_data(self, stock):
    url = 'https://in.finance.yahoo.com/quote/' + stock + '.NS/history/'
    html = self.get_contents(url).decode('utf-8')
    parser = HTMLTableParser()
    parser.feed(html)
    livedata = pd.DataFrame(parser.tables[0], columns=parser.tables[0][0])[[
        'Date', 'Close*'
    ]].rename(columns={'Close*': 'Close'})
    livedata.drop(livedata.index[[0, -1]], inplace=True)
    livedata['Date'] = pd.to_datetime(livedata['Date']).dt.date
    livedata = livedata.reindex(index=livedata.index[::-1])
    livedata.dropna(inplace=True)
    return livedata
def get_payers_scored(html_page):
    score_tables = html_page.find_class('table-last-right')
    players_scored = set()
    for table in score_tables:
        for redundant_el in table.find_class('row-plus-minus'):
            redundant_el.getparent().remove(redundant_el)
        table_string = tostring(table, encoding='utf-8')
        table_parser = HTMLTableParser()
        table_parser.feed(table_string.decode('utf-8'))
        for row in table_parser.tables[0][1:]:
            if len(row) > 3:
                players_scored.add(extract_player_name(row[2]))
    return players_scored
def parse_devicelist(data_str):
    """Parse the BT Home Hub 5 data format."""
    p = HTMLTableParser()
    p.feed(data_str)
    known_devices = p.tables[9]
    devices = {}
    for device in known_devices:
        if len(device) == 5 and device[2] != '':
            devices[device[2]] = device[1]
    return devices
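parse_devicelist() only covers the parsing step; the caller still has to fetch the hub's device page. A hedged usage sketch, assuming the page is reachable over plain HTTP without authentication (the URL path and fetch approach are assumptions, not part of the original component):

import requests


def get_hub_devices(hub_ip):
    """Hypothetical wrapper: download the device-list page and hand it to parse_devicelist()."""
    page = requests.get('http://' + hub_ip + '/', timeout=10)
    return parse_devicelist(page.text)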
def test_namedTables(self):
    test_input = self.getTestInput("named_tables")
    uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    actual_named = uut.named_tables
    self.assertEqual(2, len(actual))
    self.assertEqual(2, len(actual_named))
    self.checkNumericTableValues(actual[0], 3, 10, 0)
    self.checkNumericTableValues(actual[1], 3, 10, 100)
    self.checkNumericTableValues(actual_named["named_table_one"], 3, 10, 0)
    self.checkNumericTableValues(actual_named["named_table_two"], 3, 10, 100)
def test_nestedTable(self):
    test_input = self.getTestInput("nested_table")
    uut = HTMLTableParser(decode_html_entities=False, data_separator=' ')
    uut.feed(test_input)
    actual = uut.tables
    self.assertEqual(2, len(actual))

    nested_table = actual[0]
    self.checkNumericTableValues(nested_table, 2, 3, 0)

    outer_table = actual[1]
    expected_first_row = ['', '1', '2']
    expected_second_row = ['10', '11', '12']
    self.assertEqual(2, len(outer_table))
    self.assertListEqual(outer_table[0], expected_first_row)
    self.assertListEqual(outer_table[1], expected_second_row)
def run(self, params={}):
    p = HTMLTableParser()
    p.feed(params.get(Input.TAP_ALERT))
    data = p.tables
    clean_data = TAP(data).data

    # Get the Threat Details URL, which is NOT an HTML table element but the <a> link of the
    # table element
    extractor = URLExtract()
    cleaned_input_for_extractor = params.get(Input.TAP_ALERT)
    cleaned_input_for_extractor = cleaned_input_for_extractor.replace('\n', '')
    urls_from_input = extractor.find_urls(cleaned_input_for_extractor)
    threat_details_urls = list(filter(
        lambda u: r'threat/email' in u and r'threatinsight.proofpoint.com' in u[:40],
        urls_from_input))
    if threat_details_urls:
        clean_data['threat']['threat_details_url'] = threat_details_urls[0]

    return {Output.RESULTS: clean_data}
def update(self):
    _LOGGER.debug('Updating Waste collection dates using scraper')
    try:
        today = datetime.today()
        year = today.strftime('%Y')
        suffix_url = self.postcode + "/" + self.street_number + "/" + year + "/"
        url = "https://gemeente.groningen.nl/afvalwijzer/groningen/" + suffix_url
        req = urllib.request.Request(url=url)
        f = urllib.request.urlopen(req)
        xhtml = f.read().decode('utf-8')

        p = HTMLTableParser()
        p.feed(xhtml)

        waste_dict = {}
        fraction_name = ""
        if p.tables[0]:
            for table_row in p.tables[0][1:]:
                for i in range(13):
                    if table_row[i]:
                        if i == 0:
                            fraction_name = table_row[i].split(" ")[0]
                            if "Klein chemisch afval kunt u" in fraction_name:
                                fraction_name = "Klein chemisch afval"
                            waste_dict[fraction_name] = []
                        else:
                            for day in table_row[i].split(" "):
                                try:
                                    waste_dict[fraction_name].append(
                                        datetime.strptime(
                                            (day.replace("*", "") + " " + str(i) + " " + year),
                                            "%d %m %Y"))
                                except (ValueError, TypeError):
                                    pass
            self.data = waste_dict
        else:
            _LOGGER.error(
                'Error occurred while fetching data. Probably the postcode/street number is incorrect.')
            self.data = None
    except urllib.error.URLError as exc:
        _LOGGER.error('Error occurred while fetching data: %r', exc.reason)
        self.data = None
        return False
def get_tags(self, path, dump_html=False):
    """get tags."""
    input_tags = self.browser.find_by_xpath(
        "//input[contains(@id, 'ajax-upload-id')]")
    real_path = os.path.realpath(path)
    try:
        input_tags[0].type(real_path)
    except selenium.common.exceptions.ElementNotVisibleException as e:
        self.log.error('Error', e=e)
        if len(input_tags) > 1:
            input_tags[1].type(real_path)
        else:
            self.log.error('Input tag not found.')
    bb_tag = self.browser.find_by_css('.box-body')[1]
    if dump_html:
        dump_html_to_file(bb_tag.html)
    p = HTMLTableParser()
    p.feed(bb_tag.html.strip())
    return p.tables
def get_players_data_by_their_stats(players_data_url, headers):
    if isinstance(players_data_url, list):
        urls_to_process = players_data_url
    else:
        urls_to_process = [players_data_url]

    result_df = None
    for url in urls_to_process:
        response = requests.get(url, headers=headers)
        doc = html.fromstring(response.text)
        data_table_elements = doc.find_class('table-stats')
        table_string = html.tostring(data_table_elements[0], encoding='utf-8').decode()
        html_table_parser = HTMLTableParser()
        html_table_parser.feed(table_string)
        data_table = html_table_parser.tables[0]
        df = pd.DataFrame(data_table[1:], columns=data_table[0])
        if result_df is None:
            result_df = df
        else:
            merge_on = ['Jméno', 'Tým', 'Z']
            result_df = result_df.merge(df, left_on=merge_on, right_on=merge_on)
    return result_df
def get_players_data_from_team_stats(team_name, team_stats_url, headers):
    html_page = get_html_page(team_stats_url, headers)
    team_players_stats = html_page.find_class("tablehead")
    players_stats = []
    players_data_columns = []
    for table in team_players_stats:
        for redundant_el in table.find_class("stathead"):
            redundant_el.getparent().remove(redundant_el)
        table_string = tostring(table, encoding="utf-8")
        table_parser = HTMLTableParser()
        table_parser.feed(table_string.decode("utf-8"))
        if table_parser.tables[0][0][0] == "PLAYER" and "SOG" in table_parser.tables[0][0]:
            players_data_columns = table_parser.tables[0][0]
            for row in table_parser.tables[0][1:]:
                row[0] = extract_player_name(row[0])
                row.append(team_name)
                players_stats.append(row)
            break
    players_data_columns.append("TEAM")
    result_df = pd.DataFrame(columns=players_data_columns, data=players_stats)
    return result_df.apply(pd.to_numeric, errors="ignore")