def fetch_production(country_code='CR', session=None):
    # Do not reuse the passed-in session, as some caching takes place
    r = requests.session()
    url = 'https://appcenter.grupoice.com/CenceWeb/CencePosdespachoNacional.jsf'
    response = r.get(url)
    df_yesterday = pd.read_html(response.text, skiprows=1, index_col=0, header=0)[0]

    soup = BeautifulSoup(response.text, 'html.parser')
    yesterday_date = soup.select('#formPosdespacho:pickFechaInputDate')[0]['value']
    jsf_view_state = soup.select('#javax.faces.ViewState')[0]['value']

    yesterday = arrow.get(yesterday_date, 'DD/MM/YYYY', tzinfo=TIMEZONE)
    today = yesterday.shift(days=+1)

    data = [
        ('formPosdespacho', 'formPosdespacho'),
        ('formPosdespacho:pickFechaInputDate', today.format(DATE_FORMAT)),
        ('formPosdespacho:pickFechaInputCurrentDate', today.format(MONTH_FORMAT)),
        ('formPosdespacho:j_id35.x', ''),
        ('formPosdespacho:j_id35.y', ''),
        ('javax.faces.ViewState', jsf_view_state),
    ]
    response = r.post(url, cookies={}, data=data)
    df_today = pd.read_html(response.text, skiprows=1, index_col=0)[0]

    ydata = df_to_data(country_code, yesterday, df_yesterday)
    tdata = df_to_data(country_code, today, df_today)
    production = ydata + tdata
    unknown_plants()
    return production
def get_stats(year, level='pro'):
    # TODO: switch to regex patterns
    '''Scrapes draftexpress.com/stats for a given level and year'''
    front = 'http://www.draftexpress.com/stats.php?sort=8&q='
    pages = 2
    frontb = '&league=NBA&year=20'
    if level == 'col':
        frontb = '&league=NCAA&year=20'
        pages = 13
    midA = '&per=per40pace&qual=prospects&sort2=DESC&pos=all&stage=all&min=10&conference=All&pageno='
    back = '&sort=8'
    url = front + frontb + year + midA + '0' + back
    reg = pd.DataFrame()
    eff = pd.DataFrame()
    for n in xrange(pages):
        url = front + frontb + year + midA + str(n) + back
        eff_url = front + 'eff' + frontb + year + midA + str(n) + back
        reg_temps = pd.read_html(url, header=0)
        reg_temp = reg_temps[5]
        eff_temps = pd.read_html(eff_url)
        eff_temp = eff_temps[5]
        # workaround: round-trip through CSV to flatten the multi-row header
        eff_temp.to_csv('temp.csv')
        eff_temp = pd.read_csv('temp.csv', header=3)
        reg = reg.append(reg_temp)
        eff = eff.append(eff_temp)
    reg['year'] = 2000 + float(year)
    eff['year'] = 2000 + float(year)
    df = reg.merge(eff, how='inner', on='Name', suffixes=('', '_y'))
    df = df.drop(['Cmp', 'Team_y', 'year_y', 'Min_y', 'Cmp_y', 'GP_y'], 1)
    print df.shape
    return df
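# A hedged usage sketch for get_stats: the two-digit season string follows the URL
# construction above ('15' -> year=2015), and level='col' switches to the NCAA tables.
college_2015 = get_stats('15', level='col')
pro_2015 = get_stats('15')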
def get_html_dfs(stryear, strmonth):
    year = int(stryear)
    month = int(strmonth)
    monthly_file = "./" + stryear + "_" + strmonth + ".html"
    try:
        # opening the file only serves as an existence check; read_html takes the path
        with open(monthly_file, 'r') as mf:
            dfs = pd.read_html(monthly_file, encoding='utf-8')
            print("read html file successfully")
            return dfs
    except Exception as e:
        print(e)

    if year > 1990:
        year -= 1911
    url = 'http://mops.twse.com.tw/nas/t21/sii/t21sc03_' + str(year) + '_' + str(month) + '_0.html'
    if year <= 98:
        url = 'http://mops.twse.com.tw/nas/t21/sii/t21sc03_' + str(year) + '_' + str(month) + '.html'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(url, headers=headers)
    r.encoding = 'big5'
    print("fetch html file successfully")
    with codecs.open(monthly_file, mode='wb') as writefile:
        writefile.write(r.text.encode('utf8'))
    dfs = pd.read_html(StringIO(r.text), encoding='big5')
    return dfs
def get_box_stats(self, url):
    """
    INPUT: NCAAScraper, STRING
    OUTPUT: DATAFRAME, DATAFRAME, DATAFRAME

    Extract html from box stats page and convert to dataframes.

    url is a string linking to the box stats page
    """
    soup = self.page_opener.open_and_soup(url)
    tables = soup.findAll('table', {'class': 'mytable'})
    if len(tables) != 3:
        print 'Incorrect number of tables'
        return None
    htable = pd.read_html(str(tables[0]), header=0)[0]
    table1 = pd.read_html(str(tables[1]), skiprows=1, header=0, infer_types=False)[0]
    table2 = pd.read_html(str(tables[2]), skiprows=1, header=0, infer_types=False)[0]
    team1 = htable.iloc[0, 0]
    team2 = htable.iloc[1, 0]
    table1['Team'] = [team1] * table1.shape[0]
    table2['Team'] = [team2] * table2.shape[0]
    table1['game_id'] = [self.game_id(url)] * table1.shape[0]
    table2['game_id'] = [self.game_id(url)] * table2.shape[0]
    # older box stat page versions use different column names so
    # we must map them all to common column names (e.g. MIN vs. Min)
    table1 = self.rename_box_table(table1)
    table2 = self.rename_box_table(table2)
    table1 = self.format_box_table(table1)
    table2 = self.format_box_table(table2)
    return htable, table1, table2
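# A hedged usage sketch, assuming an NCAAScraper instance as implied by `self` above;
# the constructor call and the box-score URL below are hypothetical placeholders.
scraper = NCAAScraper()
box_url = 'http://stats.ncaa.org/game/box_score/1234567'  # hypothetical game URL
result = scraper.get_box_stats(box_url)
if result is not None:
    htable, team1_box, team2_box = result
    box = pd.concat([team1_box, team2_box], ignore_index=True)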
def make_USrepresentative_df(): representative_df = pd.DataFrame() df = pd.read_html(URLS['dem_USrepresentative'])[0] df.columns = ['county', 'candidate1', 'candidate2', 'candidate3', 'candidate4', 'candidate5', 'candidate6'] df['county'] = df['county'].fillna('') splits = df[df.county.str.startswith('DISTRICT')].index.tolist() splits.append(df.shape[0]) for split in range(len(splits) - 1): df_ = df.iloc[splits[split]:splits[split+1]] df_ = df_.drop(df_.index[0]) df_.columns = df_.iloc[0] df_ = df_.drop(df_.index[0]) df_.columns = ['county'] + list(df_.columns[1:]) df_ = df_.dropna(subset=[df_.columns.values[1]]) df_ = df_.dropna(axis=1) df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:])) df_.columns = ['county', 'candidate', 'votes'] df_ = df_[df_['county'] != ''] df_['party'] = 'Democratic' df_['candidate'] = df_['candidate'].str.lstrip('*') df_['candidate'] = df_['candidate'].str.replace('\((.*?)\)', '') df_['candidate'] = df_['candidate'].str.rstrip('()') df_['office'] = 'US Representative' representative_df = representative_df.append(df_) df = pd.read_html(URLS['rep_USrepresentative'])[0] df.columns = ['county', 'candidate1', 'candidate2', 'candidate3', 'candidate4', 'candidate5'] df['county'] = df['county'].fillna('') splits = df[df.county.str.startswith('DISTRICT')].index.tolist() splits.append(df.shape[0]) for split in range(len(splits) - 1): df_ = df.iloc[splits[split]:splits[split+1]] df_ = df_.drop(df_.index[0]) df_.columns = df_.iloc[0] df_ = df_.drop(df_.index[0]) df_.columns = ['county'] + list(df_.columns[1:]) df_ = df_.dropna(subset=[df_.columns.values[1]]) df_ = df_.dropna(axis=1) df_ = pd.melt(df_, id_vars=['county'], value_vars=list(df_.columns[1:])) df_.columns = ['county', 'candidate', 'votes'] df_ = df_[df_['county'] != ''] df_['party'] = 'Republican' df_['candidate'] = df_['candidate'].str.lstrip('*') df_['candidate'] = df_['candidate'].str.replace('\((.*?)\)', '') df_['candidate'] = df_['candidate'].str.rstrip('()') df_['office'] = 'US Representative' representative_df = representative_df.append(df_) return representative_df
def read_ema_ings():
    with open('all_ingredients.html', 'rb') as f_in:
        text_ings = f_in.read()
    df_sort = pd.read_html(text_ings)[0]
    df_sort.columns = ['name', 'sort_name', 'form']
    df_sort = df_sort.sort_values('name')
    df_sort.reset_index(drop=True, inplace=True)

    cols_ings = ['ingredient', 'function', 'form', 'id_test', 'assay']
    df_type = ['high_suscept', 'low_suscept', 'high_suscept_id', 'pending']
    with open('ema_ingredients.html', 'rb') as f_in:
        text_ings = f_in.read()
    dfs = pd.read_html(text_ings)
    for i, df_ in enumerate(dfs):
        df_.columns = cols_ings
        df_['type'] = df_type[i]
    all_dfs = pd.concat(dfs)
    all_dfs = all_dfs.sort_values('ingredient')
    all_dfs = all_dfs.drop_duplicates('ingredient')
    all_dfs.reset_index(drop=True, inplace=True)
    all_dfs['sort_name'] = df_sort['sort_name']
    all_dfs = all_dfs.sort_values('sort_name')
    all_dfs.reset_index(drop=True, inplace=True)
    return all_dfs
def pulldata():
    # Schedule, odds and TV listing DataFrame
    da = pd.read_html('https://www.teamrankings.com/ncaa-basketball/team/florida-gators')[1]
    db = pd.read_html('http://stats.gatorsports.com/cbk/teamstats.asp?team=210&report=schedule',
                      header=0, parse_dates=False,
                      attrs={'class': 'shsTable shsBorderTable'})[0]['TV']
    df = pd.concat([da, db], axis=1)
    df = df.set_index('Date')
    return df
def test_processed_table(self):
    pd.testing.assert_series_equal(
        pd.read_html(test_two_d.get_result().table)[0]["X-Coordinate"],
        two_d_result.table["X-Coordinate"]
    )
    pd.testing.assert_series_equal(
        pd.read_html(test_two_d.get_result().table)[0]["Y-Coordinate"],
        two_d_result.table["Y-Coordinate"]
    )
def get_fangraph_pitchers():
    # get AL pitchers
    al = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=all&stats=pit&type=sabersim&team=0&lg=al&players=0')
    fgpal = al[15]
    sleep(2)
    # get NL pitchers
    nl = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=all&stats=pit&type=sabersim&team=0&lg=nl&players=0')
    fgpnl = nl[15]
    # merge and return
    fgp = fgpal.append(fgpnl)
    return fgp
def get_cruz():
    cruzh = pd.read_html('https://rotogrinders.com/pages/c-r-u-z-mlb-model-792518')
    czh = cruzh[1]
    cruzp = pd.read_html('https://rotogrinders.com/pages/c-r-u-z-mlb-model-792521')
    czp = cruzp[1]
    czh['cruz'] = czh['\tRating\t']
    czp['cruz'] = czp['\tRating\t']
    return czh, czp
def get_household_income_from_orlando_sentinel():
    name = 'city_median_income'
    url = 'http://databases.sun-sentinel.com/Orlando/ftlaudOS2011income/income2011_list.php'
    df = pd.read_html(url)[6][:-1]
    for i in range(2, 28):
        print i
        df = df.append(pd.read_html(url + '?goto=%d' % i)[6][:-1])
    df.columns = ['0', 'City', 'State', 'Median_Income', '4']
    df = df[['City', 'State', 'Median_Income']]
    df.Median_Income = df.Median_Income.str.strip('$').apply(locale.atof)
    return df
def get_fangraph_batters():
    poslist = ['c', '1b', '2b', 'ss', '3b', 'rf', 'cf', 'lf', 'dh']
    df = pd.DataFrame()
    for pos in poslist:
        tmp = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=' + pos + '&stats=bat&type=sabersim&team=0&lg=al&players=0')
        df = df.append(tmp[15])
        sleep(2)
        tmp2 = pd.read_html('http://www.fangraphs.com/dailyprojections.aspx?pos=' + pos + '&stats=bat&type=sabersim&team=0&lg=nl&players=0')
        df = df.append(tmp2[15])
        sleep(2)
    return df
def main(): """Main execution.""" # Determine command line arguments. try: rawopts, _ = getopt.getopt(sys.argv[1:], 'i:o:') except getopt.GetoptError: usage() sys.exit(2) opts = {} # Process each command line argument. for o, a in rawopts: opts[o[1]] = a # The following arguments are required in all cases. for opt in ['i', 'o']: if not opt in opts: usage() sys.exit(2) # Make sure the output directory exists. if not os.path.exists(opts['o']): os.makedirs(opts['o']) # Traverse the root folder that contains sub folders # that represent each pitcher. for root, dirs, _ in os.walk(opts['i']): # Traverse each folder in the root. for pid in dirs: outfile = os.path.join(opts['o'], pid + ".csv") # Check if this pitcher was already processed. if os.path.isfile(outfile): continue for proot, _, files in os.walk(os.path.join(root, pid)): try: # Read in the first game for this pitcher. with open(os.path.join(proot, files[0]), 'r') as f: df = pd.read_html(f.read(), header=0)[0] # Read in the subsequent games and append to the # running DataFrame. for file in files[1:]: with open(os.path.join(proot, file), 'r') as f: df = df.append(pd.read_html(f.read(), header=0)[0]) # Save to disk as a csv file. df.to_csv(outfile) except ValueError: print("Error processing " + pid) continue
def predict_by_stats(games=[]): scores = get_team_scores(team_scores_url) num_scores = len(scores) team_stats = pd.read_html(team_misc_stats_url, header=1)[0].iloc[:-1, :] team_stats['Team'] = [t.strip('*') for t in team_stats['Team'].values] scores['home-away'] = scores['PTS.1'] - scores['PTS'] - home_court_advantage # home court adv = 2 pts param_columns = team_stats.columns[13:21].tolist() # starts from column eFG% param_columns.remove('FT/FGA') param_columns.remove('FT/FGA.1') num_params = len(param_columns) x = np.zeros([num_scores, num_params]) for idx, row in scores.iterrows(): home = row['Home/Neutral'] away = row['Visitor/Neutral'] x[idx] = team_stats.loc[team_stats['Team'] == home][param_columns].values - \ team_stats.loc[team_stats['Team'] == away][param_columns].values x = pd.DataFrame(x, columns=param_columns) y = scores['home-away'] model = sm.OLS(y, x) result = model.fit() print(result.summary()) print() team_ranking = pd.read_html(team_ranking_url, header=1)[0] game_spreads = {} #get_game_spreads() print('{:22s} - {:22s} = {:7s} | {:7s} | {:6s} | {:6s} | {:6s}'.format('home', 'away', 'fit mov', 'ref mov', 'spread', 'vs fit', 'vs mov')) for [home, away] in games: fit_mov = sum(result.params * ( team_stats.loc[team_stats['Team'] == home][param_columns].values - team_stats.loc[team_stats['Team'] == away][ param_columns].values)[0]) + home_court_advantage mov = team_ranking.loc[team_stats['Team'] == home]['MOV/A'].values - \ team_ranking.loc[team_stats['Team'] == away]['MOV/A'].values + 2 home_spread = -999 for k, v in game_spreads.items(): if home.find(k) > -1: home_spread = v * -1 print('{:22s} - {:22s} = {:7.1f} | {:7.1f} | {:6.1f} | {:>6s} | {:>6s}'.format(home, away, fit_mov, mov[0], home_spread, 'home' if fit_mov > home_spread else 'away', 'home' if mov > home_spread else 'away' ))
def getpe_google(stocks):
    pe_list = dict()
    pe_list['ticker'] = []
    pe_list['value'] = []
    for ticker in stocks:
        try:
            key_statistics = pd.read_html('https://www.google.com/finance?q=' + str(ticker) + '&ei')
        except:
            key_statistics = pd.read_html('https://www.google.com/finance?q=NYSEARCA%3A' + str(ticker) + '&ei')
        convert = key_statistics[0][1][5:6]
        values = convert.tolist()
        pe_list['ticker'].append(ticker)
        pe_list['value'].extend(values)
    return pe_list
def scrape_mock(year):
    '''Scrapes a mock draft off the web in a weird format'''
    url = MOCK_URL + str(year) + '/list/'
    crap = pd.read_html(url, header=0, match='First Round')
    first_round = crap[-1]
    crap = pd.read_html(url, header=0, match='Second Round')
    second_round = crap[-1]
    first_round.columns = ['pick', 'year', 'details']
    second_round.columns = ['pick', 'year', 'details']
    second_round['pick'] = second_round['pick'] + 30
    mock_draft = first_round.append(second_round)
    mock_draft['year'] = year
    mock_draft = mock_draft.set_index('pick')
    mock_draft['pick'] = mock_draft.index
    return mock_draft
def crawler(start, difference, maximum): try: result = pd.DataFrame() origin = "http://fraunhofer-repdose.de/repdose/" parameter1 = start parameter2 = parameter1 + difference if parameter2 > maximum: parameter2 = maximum with tqdm(total=math.ceil((maximum-start)/difference)) as pbar: while parameter1 < maximum: target = 'http://fraunhofer-repdose.de/repdose/query.php?cas_where=&cas_string=&cas_show=on&species=' \ '&species_show=on&organ=&organ_show=on&name=&name_show=on&s_sex=&ssex_show=on&effect=&effect_show=' \ 'on&route=&route_show=on&e_sex=&esex_show=on&boilingpoint_c=&boilingpoint_show=on&duration_from=' \ '&duration_to=&duration_show=on&eloel_mg_from=&eloel_mg_to=&eloel_mg_show=on&watersolubility_c=&watersolubility_show' \ '=on&noel_mg_from=&noel_mg_to=&noel_mg_show=on&logpow_c=&logpow_show=on&loel_mg_from=&loel_mg_to=&loel_mg_show' \ '=on&pressure_c=&pressure_show=on&reliabilityA=on&reliabilityB=on&mol_from='+str(parameter1)+'&mol_to='+str(parameter2)+'&molweight_show=on&reference_show=0' page = requests.get(target).text if "Please restrict query conditions." in page: print(str(parameter1)+":error") elif "Page" in page: lists = [] bsObj = BeautifulSoup(page, 'lxml') found_a = bsObj.find_all('a') for item in found_a: found_href = item.get('href') if "query.php" in found_href: lists.append(found_href) for i in lists: html = origin + i r_page = requests.get(html).text table = pd.read_html(r_page)[0] table.drop([0,1], inplace=True) result = pd.concat([result,table]) else: table = pd.read_html(page)[0] table.drop([0,1], inplace=True) result = pd.concat([result,table]) parameter1 = parameter2 parameter2 += difference if parameter2 > maximum: parameter2 = maximum time.sleep(0.5) pbar.update(1) finally: get_c_name = pd.read_html(page)[0] c_name = get_c_name.iloc[1,:] result.rename(columns=c_name, inplace=True) result.to_csv("result_" + str(maximum) + ".csv", index=False)
def dfmaker(pagetitle): # Get page HTML and find tables wikipage = wikipedia.page(title=pagetitle) alltables = BeautifulSoup(wikipage.html(), 'html.parser').find_all('table') # Keep tables that have show results rtabs = [pd.read_html(str(t), header=0, encoding='utf-8')[0] for t in alltables if t.th.text == ' No.\n'] yeares = pd.concat(rtabs) # Clean up dataframe newnames = {'Original air date[3]': 'airdate', 'Original air date': 'airdate', 'Runner-up': 'Runnerup', 'Last place': 'Lastplace', 'No.': 'shownum'} yeares.rename(columns=newnames, inplace=True) qrem = ['Winner', 'Runnerup', 'Lastplace'] yeares[qrem] = yeares[qrem].replace(regex='["]', value='') yeares.airdate = yeares.airdate.str[:-13] results = yeares[pd.notnull(yeares['Winner'])] # removes not results rows return results
def _today_ticks(symbol, tdate, pageNo, retry_count, pause): ct._write_console() for _ in range(retry_count): time.sleep(pause) try: html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['t_ticks'], symbol, tdate, pageNo )) res = html.xpath('//table[@id=\"datatbl\"]/tbody/tr') if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>'%sarr sarr = sarr.replace('--', '0') df = pd.read_html(StringIO(sarr), parse_dates=False)[0] df.columns = ct.TODAY_TICK_COLUMNS df['pchange'] = df['pchange'].map(lambda x : x.replace('%', '')) except Exception as e: print(e) else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _get_report_data(year, quarter, pageNo, dataArr, orderby): ct._write_console() try: request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1], orderby)) # 默认排序抓取的信息有重复和遗漏,增加排序功能参数orderby text = urlopen(request, timeout=10).read() text = text.decode('GBK') text = text.replace('--', '') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>' % sarr df = pd.read_html(sarr)[0] df = df.drop(11, axis=1) df.columns = ct.REPORT_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage) > 0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_report_data(year, quarter, pageNo, dataArr,orderby) else: return dataArr except Exception as e: print(e)
def _parse_fq_data(url, index, retry_count, pause): for _ in range(retry_count): time.sleep(pause) try: request = Request(url) text = urlopen(request, timeout=10).read() text = text.decode('GBK') html = lxml.html.parse(StringIO(text)) res = html.xpath('//table[@id=\"FundHoldSharesTable\"]') if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) df = pd.read_html(sarr, skiprows = [0, 1])[0] if len(df) == 0: return pd.DataFrame() if index: df.columns = ct.HIST_FQ_COLS[0:7] else: df.columns = ct.HIST_FQ_COLS if df['date'].dtypes == np.object: df['date'] = df['date'].astype(np.datetime64) df = df.drop_duplicates('date') except Exception as e: print(e) else: return df raise IOError(ct.NETWORK_URL_ERROR_MSG)
def _get_forecast_data(year, quarter, pageNo, dataArr): ct._write_console() try: gparser = etree.HTMLParser(encoding='GBK') html = lxml.html.parse(ct.FORECAST_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1]), parser=gparser) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = sarr.replace('--', '0') sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df = df.drop([4, 5, 8], axis=1) df.columns = ct.FORECAST_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+',nextPage[0])[0] return _get_forecast_data(year, quarter, pageNo, dataArr) else: return dataArr except Exception as e: print(e)
def _newstocks(data, pageNo, retry_count, pause): for _ in range(retry_count): time.sleep(pause) ct._write_console() try: html = lxml.html.parse(rv.NEW_STOCKS_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'], ct.PAGES['newstock'], pageNo)) res = html.xpath('//table[@id=\"NewStockTable\"]/tr') if len(res) == 0: return data if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = sarr.replace('<font color="red">*</font>', '') sarr = '<table>%s</table>'%sarr df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0] df = df.drop([df.columns[idx] for idx in [12, 13, 14]], axis=1) df.columns = rv.NEW_STOCKS_COLS df['code'] = df['code'].map(lambda x : str(x).zfill(6)) df['xcode'] = df['xcode'].map(lambda x : str(x).zfill(6)) res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()') tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8') hasNext = True if tag in res else False data = data.append(df, ignore_index=True) pageNo += 1 if hasNext: data = _newstocks(data, pageNo, retry_count, pause) except Exception as ex: print(ex) else: return data
def extract_coinmarketcap(self, coin, coin_col=False):
    """Retrieve basic historical information for a specific cryptocurrency from coinmarketcap.com

    Parameters
    ----------
    coin : the name of the cryptocurrency (e.g. 'bitcoin', 'ethereum', 'dentacoin')
    coin_col : whether to include the coin name as a column
        (default is False i.e. the column is not included)

    Returns
    -------
    pandas Dataframe
    """
    try:
        output = pd.read_html("https://coinmarketcap.com/currencies/{}/historical-data/?start={}&end={}".format(
            coin, self.from_date.replace("-", ""), self.to_date.replace("-", "")))[0]
    except Exception as e:
        return pd.DataFrame({"error": e}, index=[0])
    output = output.assign(Date=pd.to_datetime(output['Date']))
    for col in output.columns:
        if output[col].dtype == np.dtype('O'):
            output.loc[output[col] == "-", col] = 0
            output[col] = output[col].astype('int64')
    output.columns = [col.lower() for col in output.columns]
    if coin_col:
        output['coin'] = coin
    return output
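# A hedged usage sketch: the method above clearly belongs to a class that carries
# from_date/to_date attributes; the minimal holder class below is illustrative only,
# not the original class.
class _DateRange(object):
    def __init__(self, from_date, to_date):
        self.from_date = from_date  # 'YYYY-MM-DD'
        self.to_date = to_date      # 'YYYY-MM-DD'

btc = extract_coinmarketcap(_DateRange('2017-01-01', '2017-12-31'), 'bitcoin', coin_col=True)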
def get_quote_yahoojp(code, start=None, end=None, interval='d'): base = 'http://info.finance.yahoo.co.jp/history/?code={0}.T&{1}&{2}&tm={3}&p={4}' start, end = web._sanitize_dates(start, end) start = 'sy={0}&sm={1}&sd={2}'.format(start.year, start.month, start.day) end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day) p = 1 results = [] if interval not in ['d', 'w', 'm', 'v']: raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") while True: url = base.format(code, start, end, interval, p) tables = pd.read_html(url, header=0) if len(tables) < 2 or len(tables[1]) == 0: break results.append(tables[1]) p += 1 result = pd.concat(results, ignore_index=True) result.columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'] if interval == 'm': result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月') else: result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日') result = result.set_index('Date') result = result.sort_index() return result
def scrape_nba_results():
    '''Scrape recent NBA results'''
    url = 'http://www.betexplorer.com/basketball/usa/nba/results/'
    df = pd.read_html(get(url).text)[0]
    homeTeam = df[0].apply(lambda r: str.split(r, sep='-')[0].strip())
    homeTeam[homeTeam == 'Portland Trail Blazers'] = 'Portland Trailblazers'
    homePoints = df[1].apply(lambda r: str.split(r, sep=':')[0].strip())
    awayTeam = df[0].apply(lambda r: str.split(r, sep='-')[1].strip())
    awayTeam[awayTeam == 'Portland Trail Blazers'] = 'Portland Trailblazers'
    awayPoints = df[1].apply(lambda r: str.split(r, sep=':')[1].strip())
    awayPoints = awayPoints.apply(lambda r: str.split(r, sep='ET')[0].strip())
    dates = df[4].apply(lambda r: datetime.strptime(r, '%d.%m.%Y'))
    # The dates on this website are GMT so are one day advanced.
    dates = dates.apply(lambda r: datetime.strftime(r - timedelta(days=1), '%d/%m/%Y'))
    df['Date'] = dates
    df['HomeTeam'] = homeTeam
    df['AwayTeam'] = awayTeam
    df['HomePoints'] = homePoints
    df['AwayPoints'] = awayPoints
    df['HomeWin'] = homePoints > awayPoints
    df = df.ix[:, 5:11]
    teams = lookup_teams()
    df['HomeId'] = df.merge(df.merge(teams, left_on='HomeTeam', right_on='Franchise', sort=False))['TeamId']
    df['AwayId'] = df.merge(df.merge(teams, left_on='AwayTeam', right_on='Franchise', sort=False))['TeamId']
    return df
def scrape_best_odds():
    '''Scrape best odds offered for next round of matches'''
    url = 'http://www.oddschecker.com/basketball/nba'
    df = pd.read_html(get(url).text)[0]
    df = df[pd.notnull(df.ix[:, 1])].ix[:, 1:3]

    def parse_team(string):
        return str.split(string, sep='(')[0].strip()

    def parse_odds(string):
        s = string.split(sep='(')[1]
        s = s.replace(')', '')
        f = s.split(sep='/')
        d = 1
        if len(f) > 1:
            d = float(s.split(sep='/')[1])
        return (float(f[0]) / d) + 1

    df['Date'] = datetime.today().strftime('%d/%m/%Y')
    df['AwayTeam'] = df['1'].apply(parse_team)
    df['HomeTeam'] = df['2'].apply(parse_team)
    df['AwayOdds'] = df['1'].apply(parse_odds)
    df['HomeOdds'] = df['2'].apply(parse_odds)
    df['AwayOddsProb'] = 1 / df['AwayOdds']
    df['HomeOddsProb'] = 1 / df['HomeOdds']
    df = df[['Date', 'HomeTeam', 'AwayTeam', 'HomeOdds', 'HomeOddsProb', 'AwayOdds', 'AwayOddsProb']]
    return df
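# Note on the odds arithmetic above: parse_odds reads the bracketed fractional price,
# e.g. "Boston Celtics (7/2)" -> 7/2 = 3.5, then adds 1 for the returned stake, giving
# decimal odds of 4.5; a price without a slash such as "(4)" becomes 5.0. HomeOddsProb
# and AwayOddsProb are then the implied win probabilities (1 / decimal odds).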
def _get_cashflow_data(year, quarter, pageNo, dataArr, retry_count=3, pause=0.001): ct._write_console() for _ in range(retry_count): time.sleep(pause) try: request = Request(ct.CASHFLOW_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'], ct.PAGES['fd'], year, quarter, pageNo, ct.PAGE_NUM[1])) text = urlopen(request, timeout=10).read() text = text.decode('GBK') text = text.replace('--', '') html = lxml.html.parse(StringIO(text)) res = html.xpath("//table[@class=\"list_table\"]/tr") if ct.PY3: sarr = [etree.tostring(node).decode('utf-8') for node in res] else: sarr = [etree.tostring(node) for node in res] sarr = ''.join(sarr) sarr = '<table>%s</table>'%sarr df = pd.read_html(sarr)[0] df.columns = ct.CASHFLOW_COLS dataArr = dataArr.append(df, ignore_index=True) nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick') if len(nextPage)>0: pageNo = re.findall(r'\d+', nextPage[0])[0] return _get_cashflow_data(year, quarter, pageNo, dataArr) else: return dataArr except Exception as e: pass raise IOError(ct.NETWORK_URL_ERROR_MSG)
def scrape_model_and_odds(): model = scrape_model_probs() for i in range(len(model)): home_team = model.HomeTeam[i] away_team = model.AwayTeam[i] url = _odds_checker_page(home_team, away_team) df = pd.read_html(get(url).text)[0] df0 = df.loc[0:1, bookies.keys()] df0.columns = bookies.keys() df0.index = [_adjust_portland(df[2][0]), _adjust_portland(df[2][1])] odds = pd.DataFrame(data = df0.loc[home_team, :].apply(_parse_odds), index = bookies.keys()) odds[away_team] = pd.DataFrame(data = df0.loc[away_team, :].apply(_parse_odds), index = bookies.keys()) model.loc[i, 'HomeOdds'] = odds[home_team].max() model.loc[i, 'HomeBookie'] = bookies[odds.sort_values(by=home_team).index[-1]] model['HomeOddsProb'] = 1 / model.HomeOdds model.loc[i, 'AwayOdds'] = odds[away_team].max() model.loc[i, 'AwayBookie'] = bookies[odds.sort_values(by=away_team).index[-1]] model['AwayOddsProb'] = 1 / model.AwayOdds return model
def get(self, code, start=None, end=None, interval='d'): if code in {'N225', 'GSPC', 'IXIC', 'DJI'}: start = datetime.datetime.strptime(start, '%Y-%m-%d') result = data.DataReader("".join(['^', code]), 'yahoo', start, end) return result.asfreq('B') base = self._base_url() start, end = self._sanitize_dates(start, end) start = 'sy={0}&sm={1}&sd={2}'.format( start.year, start.month, start.day) end = 'ey={0}&em={1}&ed={2}'.format(end.year, end.month, end.day) p = 1 results = [] if interval not in ['d', 'w', 'm', 'v']: raise ValueError( "Invalid interval: valid values are 'd', 'w', 'm' and 'v'") while True: url = base.format(int(code), start, end, interval, p) tables = pd.read_html(url, header=0) if len(tables) < 2 or len(tables[1]) == 0: break results.append(tables[1]) p += 1 result = pd.concat(results, ignore_index=True) result.columns = [ 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'] result['Date'] = pd.to_datetime(result['Date'], format='%Y年%m月%d日') result = result.set_index('Date') result = result.sort_index() return result.asfreq('B')
# To avoid unwanted float values
pd.options.display.float_format = '{:,.0f}'.format

url = 'https://www.worldometers.info/coronavirus/'

# To avoid 403 Error
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
r = requests.get(url, headers=header)

df = pd.read_html(r.text)
df = df[0]
df = df[1:212]
df.columns = [
    'Country', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths',
    'TotalRecovered', 'ActiveCases', 'Critical', 'Tot Cases/1M pop',
    'Deaths/1M pop', 'TotalTests', 'Tests/1M pop'
]

# Replace a few country names
df = df.replace(to_replace="UK", value="United Kingdom")
df = df.replace(to_replace='S. Korea', value="South Korea")
df = df.replace(to_replace='UAE', value="United Arab Emirates")
df = df.replace(to_replace='0', value=0)
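# A short, hedged continuation sketch of the script above: coerce the count columns
# to numeric (the scraped table mixes strings and NaN) and list the ten countries
# with the most reported cases. Column names match those assigned above.
count_cols = ['TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths',
              'TotalRecovered', 'ActiveCases', 'Critical']
df[count_cols] = df[count_cols].apply(pd.to_numeric, errors='coerce')
print(df.sort_values('TotalCases', ascending=False).head(10)[['Country', 'TotalCases']])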
# Add player links to player_links list. for key, letter_soup in soups.items(): # First row is a header row, all the rest are players. Create a players list. player_rows=letter_soup.table.find_all("tr")[1:] counter=0 #Choose only centers that began playing after 1980 for idx, player in enumerate(player_rows): if int(player.td.string) >= 1980 and fnmatch.fnmatch(player.find_all("td")[2].string, "*C*"): player_links.append(base_link+player.th.a["href"]) counter+=1 """Collection of individual player URLs is done, now we begin creating the 'player data' dataframe.""" player_data_df= player_df=pd.read_html(str(BeautifulSoup(urlopen(player_links[0])).table))[0].iloc[:-1, 0:30] dataframe_index=1 countries=np.array(["albania","andorra","armenia","austria","azerbaijan","belarus","belgium", " bosnia and herzegovina","bulgaria","croatia","cyprus","czech republic", "denmark","estonia","finland","france","georgia","germany","greece","hungary", "iceland","ireland","italy","kazakhstan","kosovo","latvia","liechtenstein", "lithuania","luxembourg","macedonia","malta","moldova","monaco","montenegro", "netherlands","norway","poland","portugal","romania","russia","san marino", "serbia","slovakia","slovenia","spain","sweden","switzerland","turkey", "ukraine","united kingdom","vatican city"]) states=np.empty(shape=len(us_states), dtype=object) for index, state in enumerate(us_states): states[index]=str(state).lower() for player_link in player_links[1:]: # Creating and adding player dataframes together.
for link in soup_level1.find_all( 'a', id=re.compile("^MainContent_uxLevel2_JobTitles_uxJobTitleBtn_")): #Selenium visits each Job Title page python_button = driver.find_element_by_id( 'MainContent_uxLevel2_JobTitles_uxJobTitleBtn_' + str(x)) python_button.click() #click link #Selenium hands of the source of the specific job page to Beautiful Soup soup_level2 = BeautifulSoup(driver.page_source, 'html.parser') #Beautiful Soup grabs the HTML table on the page table = soup_level2.find_all('table')[0] #Giving the HTML table to pandas to put in a dataframe object df = pd.read_html(str(table), header=0) #Store the dataframe in a list datalist.append(df[0]) #Ask Selenium to click the back button driver.execute_script("window.history.go(-1)") #increment the counter variable before starting the loop over x += 1 #end loop block #loop has completed #end the Selenium browser session
def scrape(): # URL of page to be scraped url = "https://mars.nasa.gov/news/" response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') # print(response.text) # Find latest news title about Mars news_title = soup.find('div', class_="content_title").text news_title # Find latest news blurb news_p = soup.find('div', class_="rollover_description_inner").text news_p # * Use splinter to navigate the site and find the image url for the current Featured Mars Image executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) featured_image = browser.find_by_id('full_image') featured_image.click() time.sleep(5) more_info = browser.find_link_by_partial_text('more info') more_info.click() # Pull featured image url html = browser.html soupsearch = BeautifulSoup(html, 'html.parser') part_image_url = soupsearch.find('img', class_='main_image').get('src') featured_image_url = 'https://www.jpl.nasa.gov' + part_image_url featured_image_url # Exit browser browser.quit() # Visit the Mars Weather twitter account [here](https://twitter.com/marswxreport?lang=en) # and scrape the latest Mars weather tweet from the page. url = "https://twitter.com/marswxreport?lang=en" response = requests.get(url) soup = BeautifulSoup(response.text, 'html.parser') mars_weather = soup.find('div', class_='js-tweet-text-container').text mars_weather # # Pull Mars facts table from Space-Facts executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path) url = 'https://space-facts.com/mars/' marsFacts_df = pd.read_html(url) marsFacts_df = marsFacts_df[0] marsFacts_df # # * Use Pandas to convert the data to a HTML table string. 
# marsFacts_df.to_html('mars_facts.html', index=False) marsHTML = marsFacts_df.to_html() print(marsHTML) executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path) url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) cerberus = browser.find_link_by_partial_text('Cerberus') cerberus.click() html = browser.html soupsearch = BeautifulSoup(html, 'html.parser') astrogeology_url = 'https://astrogeology.usgs.gov/' #--------------------------------------- cerberus_url = soupsearch.find('img', class_='wide-image').get('src') cerberus_img_url = astrogeology_url + cerberus_url print('cerberus image') print(cerberus_img_url) back = browser.find_link_by_partial_text('Back') back.click() #--------------------------------------- schiaparelli = browser.find_link_by_partial_text('Schiaparelli') schiaparelli.click() html = browser.html soupsearch = BeautifulSoup(html, 'html.parser') schiaparelli_url = soupsearch.find('img', class_='wide-image').get('src') schiaparelli_img_url = astrogeology_url + schiaparelli_url back = browser.find_link_by_partial_text('Back') back.click() print('schiaparelli image') print(schiaparelli_img_url) #--------------------------------------- syrtis = browser.find_link_by_partial_text('Syrtis') syrtis.click() html = browser.html soupsearch = BeautifulSoup(html, 'html.parser') syrtis_url = soupsearch.find('img', class_='wide-image').get('src') syrtis_img_url = astrogeology_url + syrtis_url back = browser.find_link_by_partial_text('Back') back.click() valles = browser.find_link_by_partial_text('Valles') valles.click() html = browser.html soupsearch = BeautifulSoup(html, 'html.parser') valles_url = soupsearch.find('img', class_='wide-image').get('src') valles_img_url = astrogeology_url + valles_url valles_img_url print(cerberus_img_url, schiaparelli_img_url, syrtis_img_url, valles_img_url) # # Scrape Hemisphere image urls # executable_path = {'executable_path': 'chromedriver.exe'} # browser = Browser('chrome', **executable_path) # url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' # browser.visit(url) # cerberus = browser.find_link_by_partial_text('Cerberus') # cerberus.click() # html = browser.html # soupsearch = BeautifulSoup(html, 'html.parser') # astrogeology_url = 'https://astrogeology.usgs.gov/' # #--------------------------------------- # cerberus_url = soupsearch.find('img', class_='wide-image').get('src') # cerberus_img_url = astrogeology_url + cerberus_url # back = browser.find_link_by_partial_text('Back') # # back.click() # #--------------------------------------- # url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' # browser.visit(url) # schiaparelli = browser.find_link_by_partial_text('Schiaparelli') # schiaparelli.click() # time.sleep(2) # schiaparelli_url = soupsearch.find('img', class_='wide-image').get('src') # schiaparelli_img_url = astrogeology_url + schiaparelli_url # back = browser.find_link_by_partial_text('Back') # back.click() # #--------------------------------------- # syrtis = browser.find_link_by_partial_text('Syrtis') # syrtis.click() # time.sleep(2) # syrtis_url = soupsearch.find('img', class_='wide-image').get('src') # syrtis_img_url = astrogeology_url + syrtis_url # back = browser.find_link_by_partial_text('Back') # back.click() # valles = browser.find_link_by_partial_text('Valles') # valles.click() # time.sleep(2) # valles_url = soupsearch.find('img', 
class_='wide-image').get('src') # valles_img_url = astrogeology_url + valles_url # valles_img_url # # Exit browser # browser.quit() # print(cerberus_img_url, schiaparelli_img_url, syrtis_img_url, valles_img_url) # Save hemisphere image urls in a dictionary. hemisphere_image_urls = [ {"title": "Valles Marineris Hemisphere", "img_url": valles_img_url}, {"title": "Cerberus Hemisphere", "img_url": cerberus_img_url}, {"title": "Schiaparelli Hemisphere", "img_url": schiaparelli_img_url}, {"title": "Syrtis Major Hemisphere", "img_url": syrtis_img_url}, ] print(hemisphere_image_urls) # Save all variables in a dictionary mars_data = { "hemisphere_image_urls": hemisphere_image_urls, "news_p" : news_p, "news_title" : news_title, "featured_image_url": featured_image_url, "mars_weather": mars_weather, "mars_facts": marsHTML } return mars_data
html = browser.html
image_soup = BeautifulSoup(html, 'html.parser')

# use CSS selector in BeautifulSoup to extract img_url
image_relative_url = image_soup.select_one('figure.lede a img').get('src')
# combine with base URL to create an absolute img URL
img_url = f'https://www.jpl.nasa.gov{image_relative_url}'
img_url

# check if a valid img_url
# browser.visit(img_url)

# %% [markdown]
# ## Web Scrape TABLE from Mars facts website
# - (use Pandas functions to parse HTML Table)
# - (No BeautifulSoup used)
# - (No auto browser used)

# %%
fact_url = 'http://space-facts.com/mars/'  # only accepts http, not https

# extract the first one in the list of DFs
df = pandas.read_html(fact_url)[0]
df.columns = ['description', 'value']
df.set_index('description', inplace=True)
df

# convert back to HTML string
fact_table_html = df.to_html()
fact_table_html

# %%
browser.quit()

# %%
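# A small, hedged sanity check (not part of the original notebook): the HTML string
# produced above should itself parse back into a table with pandas.
df_roundtrip = pandas.read_html(fact_table_html)[0]
df_roundtrip.shape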
def scrape_all(): browser = init_browser() url = 'https://mars.nasa.gov/news' browser.visit(url) html = browser.html soup = bs(html, 'html.parser') titles = soup.find_all('div', class_='content_title') texts = soup.find_all('div', class_='article_teaser_body') title_text = [] text_only = [] #keep only the text for x in titles: title_text.append(x.text.strip()) for x in texts: text_only.append(x.text.strip()) # JPL Mars Space Image #These lines of code are needed to navigate to the next page image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html' browser.visit(image_url) html = browser.html soup = bs(html, 'html.parser') image_url_src = soup.find('img', class_='headerimage fade-in')['src'] url_short = image_url.split('/') #rearrange and concatenate URL featured_image_url = url_short[0] + '//' + url_short[1] + url_short[ 2] + '/' + url_short[3] + '/' + image_url_src # Mars Facts facts_url = 'https://space-facts.com/mars/' df = pd.read_html(facts_url)[0] mars_facts = df.to_html() # Mars Hemispheres pic_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(pic_url) hemi_url = [] links = browser.find_by_css('a.product-item h3') for i in range(len(links)): hemi = {} browser.find_by_css('a.product-item h3')[i].click() sample_image = browser.links.find_by_text('Sample').first hemi['img_url'] = sample_image['href'] hemi['title'] = browser.find_by_css('h2.title').text hemi_url.append(hemi) browser.back() browser.quit() # Store data in one dictionary mars_data = { "news_title": title_text, "news_paragraph": text_only, "featured_image": featured_image_url, "mars_facts": mars_facts, "hemispheres": hemi_url } return mars_data
import requests
import pandas as pd

wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(wiki)

df_raw = pd.read_html(wikipedia_page.content, header=0)[0]
df_new = df_raw[df_raw.Borough != 'Not assigned']
df_new.head()

df_new.loc[df_new.Neighborhood == 'Not assigned']
df_new.Neighborhood.replace('Not assigned', df_new.Borough, inplace=True)
df_new.head(8)

df_toronto = df_new.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(lambda x: ', '.join(x))
df_toronto = df_toronto.reset_index()
df_toronto.rename(columns={'Postal Code': 'PostCode'}, inplace=True)
df_toronto.rename(columns={'Neighborhood': 'Neighbourhood'}, inplace=True)
df_toronto.head()
df_toronto.shape
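# Quick, hedged sanity checks on the cleaned table: after the filtering and groupby
# above, both counts should normally be zero.
print(df_toronto['Borough'].eq('Not assigned').sum())
print(df_toronto['PostCode'].duplicated().sum())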
@author: Brad
"""

# Importing libraries
import sqlalchemy
import pandas as pd
import time

# Loading login credentials into Python
credentials = pd.read_csv('credentials.csv')
SQL_user = credentials.loc[0, 'SQL_user']
SQL_pass = credentials.loc[0, 'SQL_pass']
API_key = credentials.loc[0, 'API_key']

# Creating list of stocks for table
asx_200 = pd.read_html('https://en.wikipedia.org/wiki/S%26P/ASX_200')
asx_200 = asx_200[0][0]
asx_200 = asx_200[1:]
asx_200.columns = ['Symbol']
asx_200 = asx_200.str.lower()

# Setting up database
engine = sqlalchemy.create_engine('mysql+mysqlconnector://' + str(SQL_user) + ':' +
                                  str(SQL_pass) + '@localhost:3306')
con = engine.connect()
con.execute('CREATE database ASX_API')
con.close()

# Creating connection with MySQL server for API upload
engine = sqlalchemy.create_engine('mysql+mysqlconnector://' + str(SQL_user) + ':' + str(SQL_pass) +
def scrape(): browser = init_browser() # mars_scrapped_data = {} url= 'https://mars.nasa.gov/news/' browser.visit(url) html= browser.html soup = BeautifulSoup(html, 'html.parser') news_title = soup.find('div', class_='content_title').text news_p = soup.find('div', class_='article_teaser_body').text # just to check the output print(f"Title: {news_title}") print(f"Paragraph: {news_p}") url_image= 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url_image) browser.click_link_by_partial_text('FULL IMAGE') time.sleep(3) browser.click_link_by_partial_text('more info') new_html= browser.html imgsoup = BeautifulSoup(new_html, 'html.parser') temp_img = imgsoup.find('img', class_='main_image')['src'] featured_image_url= 'https://www.jpl.nasa.gov/' + temp_img print(featured_image_url) mars_twitter_url= 'https://twitter.com/marswxreport?lang=en' browser.visit(mars_twitter_url) mars_twitter= browser.html soup = BeautifulSoup(mars_twitter, 'html.parser') find_tweet = soup.find('p', class_='TweetTextSize').text mars_weather= find_tweet print(f"Latest Tweet: {mars_weather}") mars_facts_url = 'https://space-facts.com/mars/' tables = pd.read_html(mars_facts_url) tables df = tables[0] df.columns = ['Profile', 'Details'] df.head() df.set_index('Profile', inplace=True) df.head() html_table = df.to_html() html_table html_table.replace('\n', '') df.to_html('table.html') url_mars= 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url_mars) hemi_dicts = [] for i in range(1,9,2): hemi_dict = {} browser.visit(url_mars) # time.sleep(1) hemispheres_html = browser.html hemispheres_soup = BeautifulSoup(hemispheres_html, 'html.parser') hemi_name_links = hemispheres_soup.find_all('a', class_='product-item') hemi_name = hemi_name_links[i].text.strip('Enhanced') detail_links = browser.find_by_css('a.product-item') detail_links[i].click() time.sleep(1) browser.find_link_by_text('Sample').first.click() time.sleep(1) browser.windows.current = browser.windows[-1] hemi_img_html = browser.html browser.windows.current = browser.windows[0] browser.windows[-1].close() hemi_img_soup = BeautifulSoup(hemi_img_html, 'html.parser') hemi_img_path = hemi_img_soup.find('img')['src'] print(hemi_name) hemi_dict['title'] = hemi_name.strip() print(hemi_img_path) hemi_dict['img_url'] = hemi_img_path hemi_dicts.append(hemi_dict) mars_scrapped_data ={"news_title": news_title, "news_paragraph": news_p, "featured_image": featured_image_url, "Latest_Tweet": mars_weather, "Hemispheres_details": hemi_dicts # "Table": html_table } return mars_scrapped_data
def init_browser():
    executable_path = {'executable_path': 'chromedriver.exe'}
    return Browser('chrome', **executable_path, headless=False)


def scrape_info():
    browser = init_browser()

    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(url)
    time.sleep(5)

    # HTML
    html_news = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html_news, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    news_p
    # keep the browser open; it is still needed for the sections below

    # Feature Image
    featured_img_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(featured_img_url)
    browser.links.find_by_partial_text('FULL IMAGE')
    browser.links.find_by_partial_text('more info')
    browser.links.find_by_partial_text('jpg')

    # HTML Object
    html_image = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html_image, 'html.parser')
    featured_img_url = soup.find('img')['src']
    featured_img_url

    # Mars Weather
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(weather_url)
    time.sleep(5)
    html_weather = browser.html
    weather_soup = BeautifulSoup(html_weather, "html.parser")
    mars_weather = weather_soup.find('div', class_='css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0').text
    mars_weather

    # Mars Facts
    mars_df = pd.read_html("https://space-facts.com/mars/")[0]  # Pandas DataFrame
    #mars_df.columns=["Des", "MARS PLANET PROFILE"]
    #mars_df.set_index("Des", inplace=True)
    #mars_df
    mars_df.columns = ["Description", "Value"]
    mars_df.set_index("Description", inplace=True)
    mars_facts = mars_df.to_html(classes="table")
    mars_facts = mars_facts.replace("'", "")
    mars_facts

    # MARS HEMISPHERES
    # Cerberus Hemisphere
    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)
    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Cerberus Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')
    # Pull specific data from webpage
    hemi_title1_url = soup_mars_hemi.find('h2', class_='title').text
    img1_url = soup_mars_hemi.find('a', text='Sample').get("href")
    # Put data into a dictionary
    one = {'title': hemi_title1_url, "img_url": img1_url}
    one

    # Schiaparelli Hemisphere
    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)
    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Schiaparelli Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')
    # Pull specific data from webpage
    hemi_title2_url = soup_mars_hemi.find('h2', class_='title').text
    img2_url = soup_mars_hemi.find('a', text='Sample').get("href")
    # Put data into a dictionary
    two = {'title': hemi_title2_url, "img_url": img2_url}
    two

    # Syrtis Major Hemisphere
    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)
    # Click to URL of page to be scraped and extract data
    browser.click_link_by_partial_text('Syrtis Major Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')
    # Pull specific data from webpage
    hemi_title3_url = soup_mars_hemi.find('h2', class_='title').text
    img3_url = soup_mars_hemi.find('a', text='Sample').get("href")
    # Put data into a dictionary
    three = {'title': hemi_title3_url, "img_url": img3_url}
    three

    # Valles Marineris Hemisphere
    mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(mars_hemi_url)
    browser.click_link_by_partial_text('Valles Marineris Hemisphere Enhanced')
    html_mars_hemi = browser.html
    soup_mars_hemi = BeautifulSoup(html_mars_hemi, 'html.parser')
    # Pull specific data from webpage
    hemi_title4_url = soup_mars_hemi.find('h2', class_='title').text
    img4_url = soup_mars_hemi.find('a', text='Sample').get("href")
    # Put data into a dictionary
    four = {'title': hemi_title4_url, "img_url": img4_url}
    four

    # Summary
    hemisphere_url = [one, two, three, four]
    hemisphere_url

    # Store data in a dictionary
    mars_data = {
        "News_Title": news_title,
        "News_Paragraph": news_p,
        "Featured_Image": featured_img_url,
        "Mars_Weather": mars_weather,
        "Mars_Facts": mars_facts,
        "Hemisphere_Images": hemisphere_url
    }

    browser.quit()

    # Return results
    return mars_data
def scrape_info(): # Making space soup browser = init_browser() url = 'https://www.nasa.gov/missions/' html = browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'html.parser') # Stiring the soup links = [ a for a in soup.find('div', class_="static-landing-page").find_all( 'a', href=True) ] # Pouring out the list and title link_list = [] title_list = [] for i in range(len(links)): if links[i].get('href').find("mission_pages") == 1: link_list.append(links[i]['href']) title_list.append(links[i].text) else: print('No Mission Page') mission_dict = {} mission_dict["Mission"] = title_list mission_dict["Mission Link"] = link_list url = 'https://www.cdscc.nasa.gov/Pages/trackingtoday.html' html = browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'html.parser') abv_table = pd.read_html(url)[3] abv_table.columns = ['ABV', 'Name'] abv_dict = abv_table.to_dict("records") MADRID = {} GOLDSTONE = {} CANBERRA = {} browser.visit('https://eyes.jpl.nasa.gov/dsn/dsn.html') time.sleep(.3) for i in browser.find_by_tag('a'): if i['class'] == 'inactive' or None: pass elif i.text == '': pass elif i['id'] == '' or None: pass else: if i['id'][:2] == 'sp': ABV = i.text if i['id'][-5] == '0': MADRID[ABV] = {} elif i['id'][-5] == '1': GOLDSTONE[ABV] = {} elif i['id'][-5] == '2': CANBERRA[ABV] = {} # Platting the soup mission_data = { "Madrid": MADRID, "Goldstone": GOLDSTONE, "Canberra": CANBERRA, "Mission_titles": mission_dict, "Mission_Code": abv_dict } browser.quit() return mission_data
def get_family_info_from_mysqldb(save_dataset=True): # 建立连接 conn = pymysql.connect(host='172.16.201.103', port=3306, user='******', password='******', db='resource', charset='utf8') # 获取游标 cursor = conn.cursor(pymysql.cursors.DictCursor) # 执行sql语句 # 获取构件库中所有的族(包含各专业) sql = 'select i.id, i.name, i.resource_desc ' \ 'from resource_item i, resource_parameter_value pv ' \ 'where i.parameter_code=pv.code and i.resource_lib_id=54 and i.could_be_shown_in_front=1' rows = cursor.execute(sql) items = cursor.fetchall() # 获取所有族的参数 sql = 'select ip.resource_item_id , i.name, ip.resource_parameter_id, p.name as parameter, pv.id as param_id, pv.value ' \ 'from resource_item i, resource_item_parameter ip, resource_parameter p, resource_parameter_value pv ' \ 'where i.id=ip.resource_item_id and p.id=ip.resource_parameter_id and ip.resource_parameter_value_id=pv.id ' \ 'and (pv.resource_parameter_id=1 or pv.resource_parameter_id=51 or pv.resource_parameter_id=52 ' \ 'or pv.resource_parameter_id=53 or pv.resource_parameter_id=10004) ' rows = cursor.execute(sql) item_params = cursor.fetchall() # 获取结构化参数 sql = 'select pv.id, pv.code, pv.`value` from resource_parameter_value pv ' \ 'where (pv.resource_parameter_id=1 or pv.resource_parameter_id=51 or pv.resource_parameter_id=52 ' \ 'or pv.resource_parameter_id=53 or pv.resource_parameter_id=10004) ' \ 'order by code' rows = cursor.execute(sql) params = cursor.fetchall() # 关闭游标 cursor.close() # 关闭连接 conn.cursor() items_dict = {} for idx in range(len(items)): # if items[idx]["id"] not in items_dict: try: items[idx]["resource_desc"] = pd.read_html( items[idx]["resource_desc"], header=0)[0].to_dict(orient="records") except: pass if items[idx]["name"][0] not in ["A", "S"]: # 过滤土建专业 continue item_id = items[idx]["id"] items[idx].pop("id", None) items[idx].pop("resource_desc", None) items_dict[item_id] = items[idx] tmp1 = {c["code"]: c["id"] for c in params if c["code"]} tmp2 = {c["id"]: c["value"] for c in params} pid2v = {} for c in params: if not c["code"]: pid2v[c["id"]] = [c["value"]] else: ks = c["code"].split("_") pid2v[c["id"]] = [] for i in range(len(ks)): tmp_k = "_".join(ks[:i + 1]) if tmp1[tmp_k] in pid2v: pid2v[c["id"]].append(tmp2[tmp1[tmp_k]]) for idx in range(len(item_params)): p = item_params[idx] iid = p["resource_item_id"] if iid in items_dict: if p["parameter"] not in items_dict[iid]: items_dict[iid][item_params[idx]["parameter"]] = [] items_dict[iid][item_params[idx]["parameter"]] += pid2v[ p["param_id"]] # 保存结果 if save_dataset: with open( os.path.join( os.path.dirname(os.path.dirname( os.path.abspath(__file__))), "data/standard_vocab.json"), "w") as f: json.dump(list(items_dict.values()), f) if save_dataset: with open( os.path.join( os.path.dirname(os.path.dirname( os.path.abspath(__file__))), "data/standard_param.json"), "w") as f: json.dump(params, f) return items_dict
'Visitors': [43,34,65,56,29,76], 'Bounce Rate': [65,67,78,65,45,52] } df = pd.DataFrame(web_stats) df.set_index('Day', inplace=True) ''' ##read write csv dfx = pd.read_csv('/Users/zhangsicai/Desktop/Panda/grade.csv') dfx.set_index('Date', inplace=True) dfx.rename(columns={'math': 'shuxue'}, inplace=True) print(dfx['shuxue']) #df['math'].to_csv('/Users/zhangsicai/Desktop/Panda/grade1.csv') df = quandl.get('FMAC/HPI_TX', authtoken='HUg-EPXknoSxzbk26DMu') fiddy_states = pd.read_html( 'https://simple.wikipedia.org/wiki/List_of_U.S._states') print(df.head()) print(df[0:2]) print(df['NSA Value']) df.plot() plt.show() print(fiddy_states[0]['Name']) ''' for dd in df['NSA Value'][1:]: print(dd) ''' df1 = pd.DataFrame( {
def scrape(): executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) # NASA MARS NEWS news_url = ( 'https://mars.nasa.gov/news/?page=0&per_page=40' '&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' ) browser.visit(news_url) time.sleep(2) html = browser.html soup = BeautifulSoup(html, 'html.parser') news_title = soup.body.find_all('div', class_="content_title")[1].text.strip() news_p = soup.body.find_all('div', class_="article_teaser_body")[0].text.strip() #JPL MARS SPACE IMAGES - FEATURED IMAGE image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" browser.visit(image_url) time.sleep(2) my_xpath = '/html/body/div[1]/div/div[3]/section[1]/div/div/article/div[1]/footer/a' results = browser.find_by_xpath(my_xpath) img = results[0] img.click() browser.click_link_by_partial_text('more info') html1 = browser.html soup = BeautifulSoup(html1, 'html.parser') feat_img = soup.find_all('figure', class_='lede') feat_img_result = feat_img[0].a['href'] featured_image_url = 'https://www.jpl.nasa.gov' + feat_img_result # MARS FACTS facts_url = 'https://space-facts.com/mars/' facts_table = pd.read_html(facts_url) table_df = facts_table[0] # mars_table_df = table_df.rename(columns={0: 'Mars: Measurement', 1: 'Measurement: Value'}) mars_table_df = table_df.to_html(header=False, index=False) # mars_table_df.to_html(classes="table table-striped") print(mars_table_df) # MARS HEMISPHERES #Note the inconsistent url hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' #alternate site, if previous site is unavailable # hemispheres_url = 'https://web.archive.org/web/20181114171728/https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(hemispheres_url) time.sleep(2) hemisphere_image_urls = [] url_links = browser.find_by_css('a.product-item h3') for i in range(len(url_links)): # create an empty dictionary for each hemisphere hemisphere = {} browser.find_by_css('a.product-item h3')[i].click() #get hemisphere title hemisphere['title'] = browser.find_by_css("h2.title").text #next find the sample image anchor tag and get href sample_elem = browser.find_link_by_text('Sample').first hemisphere['img_url'] = sample_elem['href'] #Append hemisphere object to list hemisphere_image_urls.append(hemisphere) #Finally navigate back to start again on loop browser.back() #*************** CREATE A DICTOIONARY ********************* mars_info = {} mars_info['news_title'] = news_title mars_info['news_detail'] = news_p mars_info['featured_img_url'] = featured_image_url mars_info['mars_facts_html'] = mars_table_df mars_info['hemisphere_image_urls'] = hemisphere_image_urls # Close the browser browser.quit() # Return results return mars_info # # xpaths = [ # '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[1]/div/a', # '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[2]/div/a', # '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[3]/div/a', # '/html/body/div[1]/div[1]/div[2]/section/div/div[2]/div[4]/div/a' # ] # hem_title = [] # hem_url = [] # mars_hem_title_url = [] # for path in xpaths : # results = browser.find_by_xpath(path) # img = results[0] # img.click() # html = browser.html # soup = BeautifulSoup(html, 'html.parser') # title = soup.find('h2', class_ = 'title').text # hem_title.append(title) # hem = soup.find('div', class_='downloads') # hem_result = hem # img_url = 
hem_result.find('a')['href'] # hem_url.append(img_url) # mars_hem_title_url.append({'title': title, 'img_url': img_url}) # browser.visit(hemispheres_url) # browser.quit() # #Store results in dictionary # notebook_dict = {} # notebook_dict = { # 'article_title': news_title, # 'article_paragraph': news_p, # 'mars_image': featured_image_url, # 'mars_data_table': mars_table_df, # 'hemisphere_image_urls': mars_hem_title_url} # print(f"index 0 {notebook_dict['article_title']}") # print(f"index 1 {notebook_dict['article_paragraph']}") # print(f"index 2 {notebook_dict['mars_image']}") # print(f"index 3 {notebook_dict['mars_data_table']}") # print(f"index 4 {notebook_dict['hemisphere_image_urls']}") # return notebook_dict
def scrape(): browser = init_browser() url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' response = requests.get(url) time.sleep(1) soup = BeautifulSoup(response.text, 'lxml') #grabbling the 'slide' class element from the url results = soup.find_all(class_="slide") #creating a list to hold scraped data news_data = [] for result in results: # Error handling try: #loop thru and get the text within these classes, replace \n with blank space news_p = result.find( class_="rollover_description_inner").text.replace('\n', '') news_title = result.find(class_="content_title").text.replace( '\n', '') post = {"news_title": news_title, "news_p": news_p} news_data.append(post) print(post) except Exception as e: print(e) browser = Browser('chrome', headless=False) url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) time.sleep(1) #use splinter to click the "Full Image" button browser.click_link_by_partial_text('FULL IMAGE') time.sleep(1) #HTML Object html = browser.html # Parse HTML with Beautiful Soup soup = BeautifulSoup(html, 'html.parser') #find the class where pic is stored results = soup.find(class_='fancybox-image') #retrieve source attribute, i.e. the path url = results['src'] #attach the path to the main site link, this is the full image link featured_image_url = 'https://www.jpl.nasa.gov' + url post_two = {'featured_image': featured_image_url} news_data.append(post_two) print(post_two) #visit the mars twitter page to get the Weather url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) time.sleep(1) response = requests.get(url) #parse HTML with Beautiful soup, get the text soup = BeautifulSoup(response.text, 'html.parser') #get the text from the first p tag with appropriate class (from inspecting the site) mars_weather = soup.find( 'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" ).text post_three = {'mars_weather': mars_weather} print(post_three) news_data.append(post_three) browser = Browser('chrome', headless=False) #visit the mars space facts site url = 'https://space-facts.com/mars/' #read the table, put into list variable tables = pd.read_html(url) #convert the list to a dataframe mars_df = tables[0] #put column headers on mars_df.columns = ["Characteristic", "Value"] #convert the datframe to dictionary, using 'records' orientation, this does not neeed to be, nor should be, appended to news_data, as will create a list of a dictionary within the list, and not be able to be inserted to mongodb mars_dict = mars_df.to_dict('records') print(mars_dict) #Visit the site to get images of Mars Hemispheres url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) time.sleep(1) html = browser.html soup = BeautifulSoup(html, 'html.parser') results = soup.find_all(class_='item') #loop through the item class for result in results: #find the first a tag link = result.find('a') #assign the href to variable 'links' links = link['href'] #assign the link h3 title text to variable 'title' title = result.find('h3').text #concatenate the path with the main site link, assign to variable 'url' url = 'https://astrogeology.usgs.gov' + links #open brower, chromedriver browser = Browser('chrome', headless=False) #visit the concatenated url browser.visit(url) time.sleep(1) html = browser.html #parse the html with beautiful soup soup = BeautifulSoup(html, 'html.parser') #find all elemenst 
with class 'downloads', assign results to variable list 'infos' infos = soup.find_all(class_='downloads') #loop thru infos, pull out links to images, assign with title to dictionary post, and then append to list #mars_images for info in infos: link_two = info.find('a') img_url = link_two['href'] post_four = {'img_url': img_url, 'title': title} news_data.append(post_four) print(post_four) #return your data, so it can be accessed by flask app (where the insertion into mongodb will occur) return news_data + mars_dict
browser = Browser('chrome', headless=False)

weather_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(weather_url)
html = browser.html
weather_soup = BeautifulSoup(html, 'html.parser')
weather = weather_soup.find('div', class_='js-tweet-text-container')
mars_weather = weather.p.text.lstrip()
print(mars_weather)

facts_url = 'http://space-facts.com/mars/'
fact_table = pd.read_html(facts_url)
fact_table
df = fact_table[0]
df.columns = ['Mars', 'Value']
df
html_table = df.to_html()
df.to_html('table.html')
mars_facts = df.to_dict('records')
mars_facts
tem = list(mars_facts[0].values())
tem
def _read_change_from_url(self, url: str) -> pd.DataFrame: """read change from url Parameters ---------- url : str change url Returns ------- pd.DataFrame: symbol date type SH600000 2019-11-11 add SH600000 2020-11-10 remove dtypes: symbol: str date: pd.Timestamp type: str, value from ["add", "remove"] """ resp = retry_request(url) _text = resp.text date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text) if len(date_list) >= 2: add_date = pd.Timestamp("-".join(date_list[0])) else: _date = pd.Timestamp("-".join(re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0])) add_date = get_trading_date_by_shift(self.calendar_list, _date, shift=0) remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1) logger.info(f"get {add_date} changes") try: excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0] content = retry_request(f"http://www.csindex.com.cn{excel_url}", exclude_status=[404]).content _io = BytesIO(content) df_map = pd.read_excel(_io, sheet_name=None) with self.cache_dir.joinpath( f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.{excel_url.split('.')[-1]}" ).open("wb") as fp: fp.write(content) tmp = [] for _s_name, _type, _date in [("调入", self.ADD, add_date), ("调出", self.REMOVE, remove_date)]: _df = df_map[_s_name] _df = _df.loc[_df["指数代码"] == self.index_code, ["证券代码"]] _df = _df.applymap(self.normalize_symbol) _df.columns = [self.SYMBOL_FIELD_NAME] _df["type"] = _type _df[self.DATE_FIELD_NAME] = _date tmp.append(_df) df = pd.concat(tmp) except Exception as e: df = None _tmp_count = 0 for _df in pd.read_html(resp.content): if _df.shape[-1] != 4: continue _tmp_count += 1 if self.html_table_index + 1 > _tmp_count: continue tmp = [] for _s, _type, _date in [ (_df.iloc[2:, 0], self.REMOVE, remove_date), (_df.iloc[2:, 2], self.ADD, add_date), ]: _tmp_df = pd.DataFrame() _tmp_df[self.SYMBOL_FIELD_NAME] = _s.map(self.normalize_symbol) _tmp_df["type"] = _type _tmp_df[self.DATE_FIELD_NAME] = _date tmp.append(_tmp_df) df = pd.concat(tmp) df.to_csv( str( self.cache_dir.joinpath( f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.csv" ).resolve() ) ) break return df
def state_list():
    fiddy_states = pd.read_html(
        'https://simple.wikipedia.org/wiki/List_of_U.S._states')
    return fiddy_states[0][0][1:]
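# A minimal sketch (not part of the original) of the same lookup done by column header
# rather than by position, since the layout of the wiki table can shift. The 'Name'
# column is an assumption based on the snippet above that prints fiddy_states[0]['Name'].
import pandas as pd

def state_list_by_header(url='https://simple.wikipedia.org/wiki/List_of_U.S._states'):
    tables = pd.read_html(url)       # read_html returns every <table> on the page
    states = tables[0]               # the state listing is assumed to be the first table
    column = 'Name' if 'Name' in states.columns else states.columns[0]
    return states[column].dropna().tolist()

# Usage (network access required):
# print(state_list_by_header()[:5])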
# In[1]:
import os
import glob
import pandas as pd
import numpy as np
import time
from datetime import datetime

# In[2]:
dep_df, = pd.read_html(
    "https://www.dublinairport.com/flight-information/live-departures", header=0)
dep_df.tail(1)

# In[3]:
# Initial Cleaning
dep1_df = dep_df.dropna()
dep1_df = dep1_df.drop('Status', axis=1)
dep1_df.columns = ['Terminal', 'Destination', 'Airline', 'Flight No.',
                   'Scheduled DateTime', 'Actual Departure']

# Month Column
new2 = dep1_df["Scheduled DateTime"].str.split(" ", n=2, expand=True)
dep1_df["Month"] = new2[1]
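# An alternative sketch (not from the original notebook): derive the month with pandas
# datetime parsing instead of positional string splitting. The exact text format of the
# 'Scheduled DateTime' column on the live departures page is an assumption here, so
# errors='coerce' keeps unexpected values from raising.
import pandas as pd

def add_month_column(df, column='Scheduled DateTime'):
    parsed = pd.to_datetime(df[column], errors='coerce', dayfirst=True)
    df = df.copy()
    df['Month'] = parsed.dt.month_name()   # e.g. 'October'; unparseable rows become NaN
    return df

# Usage: dep1_df = add_month_column(dep1_df)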
def scrape_eng_pages(filename, sheet, check):
    print("SHEETNAME********:", sheet)
    book = load_workbook(filename)
    old_sheet = sheet
    sheet = book[sheet]
    dictionary = {}
    msg = ""
    # fill the dictionary with each city and its corresponding website
    first = False
    for row in sheet.rows:
        dictionary[row[2].value] = row[7].value
    keywords = ['Design', 'Professional ', 'Consult', 'Civil', 'Transportation',
                'Bike', 'Pedestrian', 'Sidewalk', 'Street', 'Road', 'Boulevard',
                'Blvd', 'Way', 'On-call']
    keywords_track = {}
    pattern = re.compile(
        r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|"
        r"Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|"
        r"Dec(ember)?)\s+\d{1,2},\s+\d{4}")
    regexp = re.compile(pattern)
    dates = []
    # Some websites either have no current RFPs or are broken; the checked/not-working
    # field can't be relied on because not all rows are updated, so the program would break.
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    for each in dictionary.keys():
        if each == "City":
            continue
        if each is not None and dictionary[each] is not None and each not in check:
            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
            url = dictionary[each]
            headers = {'User-Agent': user_agent, }
            request = urllib.request.Request(url, None, headers)  # the assembled request
            response = urllib.request.urlopen(request)
            html = response.read()
            soup = BeautifulSoup(html, 'html.parser')
            tables = soup.find_all('table')
            final_dates = []
            for table in tables:
                # process every table on the page
                try:
                    df = pd.read_html(str(table))
                    if len(df) == 0:
                        continue
                    else:
                        # convert the table from the website into a text block
                        a = tabulate(df[0], headers='keys', tablefmt='psql')
                        # run through keywords
                        for key in keywords:
                            if key in a:
                                # print("EACH IS IN KEY: ", each, key)
                                if each not in keywords_track:
                                    keywords_track[each] = [key]
                                else:
                                    num_occ = a.count(key)
                                    if not len(keywords_track[each]) == num_occ:
                                        for i in range(num_occ - 1):
                                            keywords_track[each].append(key)
                        if regexp.search(a):
                            print("FOUND DATE!")
                            dates.append((each,
                                          re.findall(r"[\d]{1,2}[/.-][\d]{1,2}[/.-][\d]{4}", a),
                                          dictionary[each], a))
                except:
                    continue
    print("KEY WORD DICT AFTER FILLING: ", keywords_track)
    array = build_dates(dates)
    print("Array", array)
    email_msg = build_email_msg(array, msg, keywords_track)
    return email_msg
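# Illustrative sketch (an addition, not an original helper): the function above tests the
# month-name pattern with regexp.search() but then extracts dates with a separate numeric
# pattern, so textual dates like "March 3, 2020" are detected without being captured.
# regexp.finditer() yields the full matched spans directly.
import re

pattern = re.compile(
    r"(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|"
    r"Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|"
    r"Dec(ember)?)\s+\d{1,2},\s+\d{4}")

sample = "Proposals due March 3, 2020; contract awarded 04/15/2020."
textual_dates = [m.group(0) for m in pattern.finditer(sample)]
numeric_dates = re.findall(r"\d{1,2}[/.-]\d{1,2}[/.-]\d{4}", sample)
print(textual_dates)   # ['March 3, 2020']
print(numeric_dates)   # ['04/15/2020']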
# Visit the Mars Weather twitter account and scrape the latest Mars weather tweet from the page.
# Save the tweet text for the weather report as a variable called mars_weather.
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
mars_weather_element = soup.find('div', class_="content")
mars_weather = mars_weather_element.p.text
print(mars_weather)

# Visit the Mars Facts webpage and use Pandas to scrape the table containing facts about
# the planet, including Diameter, Mass, etc.
# Use Pandas to convert the data to an HTML table string.
url = 'https://space-facts.com/mars/'
tables = pd.read_html(url)
tables

# Mars - Earth Comparison
mars_earth_comparison = tables[0]
mars_earth_comparison

# Mars Planet Profile
mars_planet_profile = tables[1].rename(columns={0: "Mars Category", 1: "Value"})
mars_planet_profile

# Visit the USGS Astrogeology site to obtain high-resolution images of each of Mars's hemispheres.
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
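# The comment above asks for an HTML table string, but the cell stops after renaming the
# columns. A minimal continuation (my assumption about the intended output, not original
# code) using DataFrame.to_html:
mars_profile_html = mars_planet_profile.to_html(index=False, classes='table table-striped')
mars_profile_html = mars_profile_html.replace('\n', '')
# mars_profile_html can then be embedded in a rendered page or stored with the other results.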
AIRTABLE_API_KEY = 'keybBQGNdYeJkRwcs'
base_key = 'appmCQ7CzGefKPdmu'
url = 'http://www.espn.com/golf/leaderboard?tournamentId=' + str(tournament_id)
table_name = 'PGA_Field'
entries_table = 'PGA2019Entries'
pd.options.mode.chained_assignment = None

page = urlopen(url)
soup = BeautifulSoup(page, "html.parser")
html = soup.find(
    "table",
    attrs={"class": "Table2__table-scroller Table2__right-aligned Table2__table"})
table = pd.read_html(html.prettify())
df = table[0]
df['PLAYER'] = df['PLAYER'].str.replace("'", "")
df = df.set_index("PLAYER")
df.to_csv('espnfield.csv')

airtable = Airtable(base_key, table_name, AIRTABLE_API_KEY)
field = airtable.get_all()
field = pd.DataFrame.from_dict(field)
field_data = [0] * len(field)
for (i, entry) in enumerate(field_data):
    entry = field.loc[i]['fields']
    field_data[i] = entry
field_data = pd.DataFrame.from_dict(field_data)
def scrape(): browser = init_browser() client = pymongo.MongoClient("mongodb://localhost:27017") db = client.mars_db # Retrieving news title and teaser browser.visit("https://mars.nasa.gov/news/") time.sleep(2) soup = bs(browser.html, "html.parser") items = soup.find("ul", class_="item_list") slides = items.find_all("li", class_="slide") news_titles = [] news_paragraphs = [] for slide in slides: news_title = slide.find("div", class_="content_title").text news_p = slide.find("div", class_="article_teaser_body").text news_titles.append(news_title) news_paragraphs.append(news_p) # Retrieving featured image url browser.visit( "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars") browser.find_by_id("full_image").click() time.sleep(2) soup = bs(browser.html, "html.parser") image_src = soup.find("img", class_="fancybox-image")["src"] featured_image_url = f"https://jpl.nasa.gov{image_src}" # Retriving mars facts table browser.visit("https://space-facts.com/mars/") df = pd.read_html(browser.html)[1] mars_facts_table_html = df.to_html(index=False, justify="center") mars_facts_table_html = mars_facts_table_html.replace("\n", "") browser.visit( "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" ) time.sleep(1) soup = bs(browser.html, "html.parser") # Retrieving hemishere page's urls hemisphere_urls = [] hemispheres = soup.find_all("div", class_="description") for hemisphere in hemispheres: url = hemisphere.find("a")["href"] url = f"https://astrogeology.usgs.gov{url}" hemisphere_urls.append(url) # Retrieving titles and image links of different hemispheres hemisphere_list = [] for hemisphere_url in hemisphere_urls: browser.visit(hemisphere_url) time.sleep(2) soup = bs(browser.html, "html.parser") title = soup.find("h2", class_="title").text title = re.sub(" Enhanced", "", title) image_url = soup.find_all("li")[0].find("a")["href"] hemisphere_list.append({"title": title, "image_url": image_url}) return_dict = {} return_dict["news_titles"] = news_titles return_dict["news_paragraphs"] = news_paragraphs return_dict["featured_image_url"] = featured_image_url return_dict["mars_facts_table_html"] = mars_facts_table_html return_dict["hemisphere_list"] = hemisphere_list return_dict["date"] = datetime.datetime.utcnow() db.mission_to_mars.update({}, return_dict, upsert=True) browser.quit() return return_dict
def scrape(): """ Scrapes all websites for Mars data """ # Create a python dictionary to store all data scrape_mars_dict = {} # Use requests and BeautifulSoup to scrape Nasa News for latest news url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest' response = requests.get(url) soup = bs(response.text, 'lxml') results = soup.find('div', class_='features') news_title = results.find('div', class_='content_title').text newsp = results.find('div', class_='rollover_description').text # Store scraped data into dictionary scrape_mars_dict['news_title'] = news_title scrape_mars_dict['newsp'] = newsp # Scrape Mars Weather twitter for latest weather report twitter_url = 'https://twitter.com/marswxreport?lang=en' twitter_response = requests.get(twitter_url) twitter_soup = bs(twitter_response.text, 'lxml') twitter_result = twitter_soup.find('div', class_='js-tweet-text-container') mars_weather = twitter_result.find('p', class_='js-tweet-text').text # Store scraped data into dictionary scrape_mars_dict['mars_weather'] = mars_weather # Scrape facts about Mars from space-facts.com using Pandas read_html function mars_facts_url = 'https://space-facts.com/mars/' tables = pd.read_html(mars_facts_url) df = tables[0] # Cleanup the Index df.rename({0:"Mars - Earth Comparison", 1:"Mars", 2: "Earth"}, axis=1, inplace=True) df.set_index("Mars - Earth Comparison", inplace=True) # Export scraped table into an html script mars_facts = df.to_html() mars_facts.replace("\n","") df.to_html('mars_facts.html') # Store html file to dictionary scrape_mars_dict['mars_facts'] = mars_facts # Call on chromedriver function to use for splinter browser = init_browser() # Scrape Nasa for url of latest featured image of Mars nasa_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(nasa_url) nasa_html = browser.html nasa_soup = bs(nasa_html, "lxml") featured_image = nasa_soup.find('div', class_='default floating_text_area ms-layer').find('footer') featured_image_url = 'https://www.jpl.nasa.gov'+ featured_image.find('a')['data-fancybox-href'] # Store url to dictionary scrape_mars_dict['featured_image_url'] = featured_image_url # Scrape astrogeology.usgs.gov for urls of hemisphere images of Mars hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(hemisphere_url) hemisphere_html = browser.html hemisphere_soup = bs(hemisphere_html, 'lxml') base_url ="https://astrogeology.usgs.gov" image_list = hemisphere_soup.find_all('div', class_='item') # Create a list to store dictionary of urls and image titles hemisphere_image_urls = [] # Loop through list of hemispheres and click on each one to find large resolution image for image in image_list: # Create a dicitonary to store urls and titles hemisphere_dict = {} # Find link to large image href = image.find('a', class_='itemLink product-item') link = base_url + href['href'] # Visit the link browser.visit(link) # Wait 1 second time.sleep(1) # Parse the html of the new page hemisphere_html2 = browser.html hemisphere_soup2 = bs(hemisphere_html2, 'lxml') # Find the title img_title = hemisphere_soup2.find('div', class_='content').find('h2', class_='title').text # Append to dict hemisphere_dict['title'] = img_title # Find image url img_url = hemisphere_soup2.find('div', class_='downloads').find('a')['href'] # Append to dict hemisphere_dict['url_img'] = img_url # Append dict to list 
hemisphere_image_urls.append(hemisphere_dict) # Store hemisphere image urls to dictionary scrape_mars_dict['hemisphere_image_urls'] = hemisphere_image_urls return scrape_mars_dict
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
# BeautifulSoup, Pandas, and Requests/Splinter
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

# In[2]:
# EIA page to grab table from
url = 'https://www.eia.gov/electricity/state/archive/2014/'

# In[3]:
# Grab table with pandas
table_list = pd.read_html(url)
table = table_list[0]
table.head()

# In[4]:
# Remove total row from list
remove_list = ['U.S. Total']
states_table = table[~table.Name.isin(remove_list)]

# remove non-price columns
ecost_df = states_table.iloc[:, [0, 1]]

# push to csv
ecost_df.to_csv('csv/eia_2014_scrape.csv')
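# A follow-on sketch (not in the original notebook): give the two retained columns explicit
# names and rank states by price. Treating column 1 of the EIA table as the average retail
# price in cents/kWh is an assumption about that year's table layout.
ranked = ecost_df.copy()
ranked.columns = ['State', 'AvgPrice_cents_per_kWh']
ranked['AvgPrice_cents_per_kWh'] = pd.to_numeric(
    ranked['AvgPrice_cents_per_kWh'], errors='coerce')
print(ranked.nsmallest(5, 'AvgPrice_cents_per_kWh'))   # five cheapest states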
def scrape(): browser = init_browser() listings = {} url = "https://mars.nasa.gov/news/" browser.visit(url) time.sleep(1) html = browser.html soup = BeautifulSoup(html, "html.parser") news_title = soup.find('div', class_='content_title').text news_p = soup.find('div', class_='article_teaser_body').text url_base = "https://www.jpl.nasa.gov" url_add = '/spaceimages/?search=&category=Mars' browser.visit(url_base + url_add) time.sleep(1) html = browser.html soup = BeautifulSoup(html, "html.parser") bttn_image_url = soup.find('article', class_='carousel_item').get('style') start = bttn_image_url.find("url('") end = bttn_image_url.find("');") featured_image_url = url_base + bttn_image_url[start + 3 + len("('"):end] url_base = "https://twitter.com/marswxreport?lang=en" browser.visit(url_base) time.sleep(1) html = browser.html soup = BeautifulSoup(html, "html.parser") mars_weather = soup.find('p', class_='TweetTextSize').text url_base = "https://space-facts.com/mars/" browser.visit(url_base) time.sleep(1) html = browser.html soup = BeautifulSoup(html, "html.parser") table = pd.read_html(url_base) htmltable = table[0].to_html() link2 = [] link3 = [] link4 = [] url_base = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url_base) time.sleep(2) links = browser.find_link_by_partial_text('Hemisphere') [link2.append(link['href']) for link in links] for link in link2: browser.visit(link) time.sleep(2) url_link = browser.find_link_by_partial_text('Sample') title_text = browser.find_by_css('.title') link3.append(url_link['href']) link4.append(title_text.html) hemisphere_image_urls = [] for i in range(len(link3)): hemisphere_image_urls.append({"title": link4[i], "img_url": link3[i]}) listings["news_p"] = news_p listings["news_title"] = news_title listings["featured_image_url"] = featured_image_url listings["mars_weather"] = mars_weather listings["html_table"] = htmltable listings["hemisphere_img_dict"] = hemisphere_image_urls return listings
    except Exception as e:
        # proxies that do not work are removed from the list
        print(f"{pick} did not work")
        proxies_list.remove(pick)
        print(f"{pick} removed")
        print(len(proxies_list))
        print(e)
else:
    # if proxies_list is empty, fetch the proxies without configuring urllib to use a proxy
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    sauce = urllib.request.urlopen(req).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    print(soup)

    # use pandas to get the tables and choose columns
    df = pd.read_html(sauce)  # read_html returns a list of all tables found in the page
    print(df, len(df))
    df = df[0]  # df is a list of tables; we want the first one
    df.to_csv("proxiesraw.csv", index=True)  # save df to csv
    # print(df[0].columns)  # check the columns; names may have odd spacing, etc.
    # df = df[0]  # set df to the data frame with the ip, port, etc.
    df = pd.read_csv("proxiesraw.csv")
    df = df[['IP Address', "Port", "Https"]]  # keep only the columns we want
    df = df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)  # drop rows with missing values
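# A hedged sketch of how one row of the cleaned frame could be wired into urllib
# (an illustration, not part of the original script): ProxyHandler expects a
# scheme -> "host:port" mapping, and install_opener makes it the default for urlopen.
import urllib.request

def opener_for_proxy(ip, port, https='yes'):
    scheme = 'https' if str(https).lower() == 'yes' else 'http'
    handler = urllib.request.ProxyHandler({scheme: f"{ip}:{int(port)}"})
    return urllib.request.build_opener(handler)

# Usage with the first cleaned row (column names as scraped above):
# row = df.iloc[0]
# urllib.request.install_opener(opener_for_proxy(row['IP Address'], row['Port'], row['Https']))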
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
import math

url = 'http://www.pangoo.it/coronavirus/?t=region&r=Lombardia&data=y#table'
data = pd.read_html(url)[0]
# data = pd.read_csv('cov19_data.csv')

count = data['Totale casi'][:-1]
n = len(count)
temp = np.array([])
for i in range(n):
    temp = np.append(temp, int(count[i]))
y = np.diff(temp)
time = np.linspace(1, n - 1, n - 1)

plt.figure(figsize=[10, 5])
plt.plot(time, y)
plt.xlabel('Index')
plt.ylabel('y')
plt.title('New Covid19 cases in Lombardia (Italy)')
plt.show()

obs = len(y) - 5
x = y[:obs]
a, p, k, v = [np.ones(n) for _ in range(4)]
a[0], p[0], v[0] = x[1], 10000, 0
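# Equivalent, more idiomatic construction of y (a sketch over the same data as above):
# convert the cumulative 'Totale casi' column to numbers in one call and difference it,
# instead of appending element by element.
counts = pd.to_numeric(data['Totale casi'][:-1], errors='coerce').to_numpy()
y_alt = np.diff(counts)
assert len(y_alt) == len(counts) - 1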
def getWorksheetsListFromExcelURL(downloadURL, isXLSX): print('downloading file : ' + downloadURL) # handles well formatted .xlsx files with dat reader bitch if isXLSX: urllib.request.urlretrieve(downloadURL, "temp.xlsx") xlsxFile = pd.ExcelFile("temp.xlsx") worksheets = [] print('file downloaded, tranforming into sheets') for xlsxSheet in xlsxFile.sheet_names: worksheets.append(pd.read_excel(xlsxFile, xlsxSheet)) print('file transformed, cleaning up') os.remove("temp.xlsx") # handles .xls schema which is stored string like - parses that bitch else: urllib.request.urlretrieve(downloadURL, "temp.xls") file1 = open('temp.xls', 'r') lines = file1.readlines() worksheets = [] worksheet = [] isWorksheet = False isFirstWorkSheet = True count = 0 print('file downloaded, tranforming into sheets') for line in lines: if '<html' in line: isWorksheet = True if '</html' in line: isWorksheet = False if isWorksheet: worksheet.append(line) else: if len(worksheet) > 0: worksheet.append(line) if not isFirstWorkSheet: temp = '\n'.join(worksheet) temp = temp.replace('3D1', '1') temp = temp.replace('3D2', '2') temp = temp.replace('3D3', '3') temp = temp.replace('3D4', '4') temp = temp.replace('3D5', '5') temp = temp.replace('3D6', '6') temp = temp.replace('3D7', '7') temp = temp.replace('3D8', '8') temp = temp.replace('3D9', '9') temp = temp.replace('3D10', '10') temp = pd.read_html(temp) temp = formatDF(temp, count) count += 1 worksheets.append(temp) worksheet = [] else: worksheet = [] isFirstWorkSheet = False else: worksheet = [] print('file transformed, cleaning up') os.remove("temp.xls") return worksheets
    'ultrasound nerve segmentation',
    'pneumothorax',
    '2021 prostate',
    '2021 pneumonia',
    '2021 intracrancial, INVERTED METRIC',  # NOTE: performance metric is inverted
    '2021 covid19',
    '2021 chest xray'
]

# Load the data
data = dict()
interesting_columns = ['Team Name', 'Score', 'Entries']
for i, name in enumerate(names):
    public = pd.read_html('kaggle/' + name + '_public.html')[0][interesting_columns]
    private = pd.read_html('kaggle/' + name + '_private.html')[0][interesting_columns]
    # Select teams that made two or more submissions (to avoid people who
    # didn't really participate)
    public = public.query('Entries >= 2')
    private = private.query('Entries >= 2')
    print(public.head())
    # Merge the two
    public = public.drop(columns='Entries').rename(columns=dict(Score='public'))
    private = private.drop(columns='Entries').rename(columns=dict(Score='private'))
    scores = pd.merge(public, private)
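# Illustrative follow-on (an assumption about how the merged frame might be used, not part
# of the original loop): Spearman rank correlation between public and private scores for
# the competition processed last in the loop above.
from scipy.stats import spearmanr

rho, pval = spearmanr(scores['public'], scores['private'])
print(f"{name}: public/private rank correlation = {rho:.3f} (p={pval:.2g})")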