def login(driver, credential_json, verbose=True):
    info = utils.load_json(credential_json)
    utils.open_url(driver, host + '/id?redirectTo=%2F', verbose=verbose)
    driver.find_element_by_id('Username').send_keys(info['username'])
    driver.find_element_by_id('Password').send_keys(info['password'])
    driver.find_element_by_id('login').click()
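# A minimal usage sketch for login() (hedged: assumes this module's `host`
# and `utils` helpers are in scope; the credentials path is hypothetical,
# and the JSON file is expected to hold "username" and "password" keys):
#
#   from selenium import webdriver
#   driver = webdriver.Chrome()
#   login(driver, 'credentials.json')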
def get_filter_options_url(driver, filt_name):
    # filt_name = 'SKILL LEVELS', 'ROLES', 'SUBJECTS TO LEARN',
    #             'TOOLS', 'CERTIFICATIONS', 'AUTHORS'
    # reload the search url
    utils.print_message('get options of filter {} ...'.format(filt_name))
    utils.open_url(driver, search_url, reopen=True)
    opt_url_dict = dict()
    for filt in driver.find_elements_by_xpath(
            '//li[starts-with(@class, "facet__section ")]'):
        if filt_name != filt.find_element_by_xpath('./div/h3').text:
            continue
        # expand option list
        if (filt.get_attribute('class') ==
                'facet__section l-search__facets-list--item'):
            filt.find_element_by_xpath('./div/div').click()
        # get all options
        for opt in filt.find_elements_by_xpath('.//a[@role="checkbox"]'):
            opt_name = opt.get_attribute('aria-label')
            opt_url = opt.get_attribute('href')
            opt_url_dict[opt_name] = opt_url
    utils.print_message('found urls of {} options'.format(len(opt_url_dict)))
    return opt_url_dict
def open(self, url):
    sentiments = VoiceAnalyzer().recognize()
    if sentiments:
        max_key = max(sentiments, key=sentiments.get)
        if max_key == 'neu' or max_key == 'pos':
            utils.speak(self.response)
            utils.open_url(url)
def get_content(driver, chpt_url, out_html_name, out_audio_name):
    utils.open_url(driver, chpt_url, verbose=True)
    elem = driver.find_element_by_xpath('//div[@id="bible_chapter_content"]')
    save_html(elem.get_attribute('outerHTML'), out_html_name)
    try:
        audio_url = driver.find_element_by_xpath('//audio').get_attribute('src')
        utils.download_file(audio_url, out_audio_name)
    except Exception:
        # some chapters have no <audio> element; skip the download silently
        pass
def league_elo():
    """Scrape MLB Elo ratings from fivethirtyeight.com.

    Table columns: Rank, Team, Rating, Playoffs %, Division %, World Series %
    """
    url = 'https://projects.fivethirtyeight.com/2018-mlb-predictions/'
    soup = open_url(url)

    tbody = soup.find('tbody')
    trows = tbody.find_all('tr')
    cols = ['elo_rating', 'playoff_pct', 'division_pct', 'worldseries_pct']
    for row in trows:
        team = row['data-str']
        rating = float(row.find('td', {'class': 'num rating'})['data-val'])
        pcts = [
            float(x['data-val'])
            for x in row.find_all('td', {'class': 'pct'})
        ]
        row_data = [rating] + pcts
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)

        # Clear existing elo document
        tm = convert_name(name=team, how='abbr')
        db.Teams.update({'Tm': tm}, {'$set': {'elo': []}})
        db.Teams.update({'Tm': tm}, {'$push': {'elo': db_data}})
def get_file(self, file_number=1, return_filename=False):
    """Gets a file that was pasted in, uploaded, or given by a URL.
    If multiple files are provided, specify the number of the desired
    file as file_number. Returns None if there is no file. If
    return_filename is True, returns a tuple: (desired_file, filename)."""
    paste_name = 'pfif_xml_' + str(file_number)
    upload_name = 'pfif_xml_file_' + str(file_number)
    url_name = 'pfif_xml_url_' + str(file_number)
    desired_file = None
    filename = None
    if self.request.POST.get(paste_name):
        desired_file = StringIO(self.request.POST[paste_name])
    elif upload_name in self.request.FILES:
        desired_file = StringIO(self.request.FILES[upload_name].read())
        filename = self.request.FILES[upload_name].name
    elif self.request.POST.get(url_name):
        url = self.request.POST[url_name]
        # make a file-like object out of the URL's xml so we can seek on it
        desired_file = StringIO(utils.open_url(url).read())
        filename = url
    # filename stays None for pasted input or when no file was provided,
    # so the branches below collapse to a single pair of returns
    if return_filename:
        return (desired_file, filename)
    return desired_file
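# A hedged usage sketch for get_file() (the handler object and form fields
# are illustrative; the method checks pasted text, then an uploaded file,
# then a URL, in that order, following the pfif_xml_* field names above):
#
#   xml_file = handler.get_file()                          # first file, stream only
#   xml_file, name = handler.get_file(2, return_filename=True)
#   if xml_file is not None:
#       xml_file.seek(0)   # StringIO supports seeking, per the comment above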
def get_video_url(driver, clip_url):
    # load the player
    utils.open_url(driver, clip_url)
    utils.wait(5)
    # pause the video
    control_bar = driver.find_element_by_class_name('hidden-bar')
    play_button = driver.find_element_by_id('play-control')
    ActionChains(driver).move_to_element(control_bar).click(
        play_button).perform()
    # get video url
    video_url = driver.find_element_by_tag_name('video').get_attribute('src')
    return video_url
def pitching_logs(team, year):
    """Scrape pitching logs from baseball-reference.com"""
    team = convert_name(name=team, how='abbr')
    url = "http://www.baseball-reference.com/teams/tgl.cgi?team={}&t=p&year={}".format(
        team, year)
    soup = open_url(url)

    table = soup.find_all('div', {'class': 'table_outer_container'})[-1]

    # Extract column headers
    cols = [x.text for x in table.find_all('th', {'scope': 'col'})]

    # Extract body of pitching logs table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing Pitlog document
    db_array = 'Pitlog.{}'.format(year)
    db.Teams.update({'Tm': team}, {'$set': {db_array: []}})

    # Extract pitching logs and push to database
    for row in trows:
        row_data = [
            x.text
            for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)

        # Insert row into database
        db.Teams.update({'Tm': team}, {'$push': {db_array: db_data}})
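# A hedged usage sketch for pitching_logs() (team abbreviation and year are
# illustrative; convert_name() above normalizes whatever name is passed):
#
#   pitching_logs('NYY', 2018)   # stores rows under Teams.Pitlog.2018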
def __init__(self, driver, cache_dir, course_id):
    # course_id e.g. embedded-systems-programming
    self.driver = driver
    self.cache_dir = cache_dir
    course_url = host + '/library/courses/' + course_id
    utils.open_url(driver, course_url)
    skip_ads(driver)
    self.meta = CourseMeta(driver, course_url, course_id)
    self.table_of_content = None
    self.transcript = None
    self.description = None
    self.exercise_files = None
def load_all_courses(driver, html_name, num_load=None):
    utils.open_url(driver, host + '/library/search')
    # switch to the Courses tab
    for elem in driver.find_elements_by_xpath('//li[@class="tab-list__item"]'):
        if elem.text == 'Courses':
            elem.click()
    # define target scraping section
    course_section = driver.find_element_by_xpath(
        '//div[@aria-selected="true"]')
    # expected number of courses
    ncourse_expect = int(
        course_section.find_element_by_xpath(
            './/*[@class="l-search__results-page-info"]').text.split()[1])
    nload = 0
    if num_load:
        nload_max = num_load
    else:
        nload_max = (ncourse_expect // 25) + 3
    while nload < nload_max:
        courses = course_section.find_elements_by_xpath(
            './/li[@class="courses-list__item"]')
        ncourses = len(courses)
        utils.print_message('#load={}, ncourses={}'.format(nload, ncourses))
        nload += 1
        buttons = course_section.find_elements_by_xpath(
            './/a[@class="button button--outlined"]')
        if len(buttons) == 0:
            break
        buttons[0].click()
        utils.wait(3)
    # save html
    utils.save_html(driver, html_name)
    course_list = course_section.find_elements_by_xpath(
        './/li[@class="courses-list__item"]')
    utils.print_message('expect {} courses, loaded {}.'.format(
        ncourse_expect, len(course_list)))
def get_all_courses_per_option(driver, opt_url, wait_time=5):
    utils.open_url(driver, opt_url, reopen=True, verbose=True)
    course.skip_ads(driver)
    switch_to_courses(driver, 'Courses')
    ncourse = find_number_courses(driver)
    utils.print_message('loading {} courses'.format(ncourse))
    load_all_courses(driver, wait_time=wait_time)
    course_id_list = get_course_ids(driver)
    if ncourse != len(course_id_list):
        msg = ('*ERROR*: number of courses mismatch, expected {}, loaded {}'
               .format(ncourse, len(course_id_list)))
        utils.print_message(msg)
        # a bare `raise` here has no active exception to re-raise,
        # so raise an explicit error instead
        raise RuntimeError(msg)
    return course_id_list
def get_city_house(user_in_nub, city):
    # creation timestamp
    create_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                time.localtime(int(time.time())))
    province_dict = get_province_dict()
    city_code_dict = get_citycode_dict()
    city_name_dict = city_dict()
    # open database connection
    db = pymysql.connect(host='172.20.206.28', port=3306, user='******',
                         password='******', db='autodata-roomprice',
                         charset='utf8')
    cursor = db.cursor()
    # generate city urls
    result_sum = []
    for i in generate_allurl(user_in_nub, city):
        print(i)
        # property names and property links
        contents, urls = get_allurl(i)
        # scrape data for each property
        for content in contents:
            results = []
            re_get = content[0]
            # data source (Anjuke)
            source = '安居客'
            # property name
            name = content[1]
            detail, longitude, latitude = open_url(re_get)
            city_name = list(city_name_dict.get(city))[0]
            province = list(province_dict.get(city_name, ''))[0]
            district = content[2]
            city_code = city_code_dict.get(city_name, '')
            province_code = city_code_dict.get(province, '')
            district_code = get_district_code(city_name, district,
                                              city_code_dict)
            for ele in detail:
                try:
                    house_type = ele[0]
                    area = str(ele[1]).replace('m', '')
                    total_price = int(ele[2]) * 10000
                    count = ele[3]
                    result = [name, longitude, latitude, province, city_name,
                              district, house_type, area, total_price, source,
                              create_time, count]
                    results.append(result)
                except Exception:
                    pass
            print(results)
            # update_house_price_db(db, cursor, results, table='''house_price_yyh''')
            try:
                average, metre_average, area_average = get_price(results)
                result_sum.append([name, longitude, latitude, province,
                                   city_name, district, province_code,
                                   city_code, district_code, average,
                                   metre_average, area_average, create_time,
                                   source])
            except Exception:
                pass
    # update_community_db(db, cursor, result_sum, table='''community_yyh_tmp''')
    print(result_sum)
    # close database connection
    db.close()
def fangraph_splits(year):
    # splitArr=5 is for left-handed batters, 6 for right-handed batters
    for hand in [5, 6]:
        url = """https://www.fangraphs.com/leaderssplits.aspx?splitArr={0}\
&strgroup=season&statgroup=1&startDate={1}-03-01\
&endDate={1}-11-01&filter=&position=P&statType=player\
&autoPt=true&players=&sort=19,-1&pg=0"""\
            .format(hand, year).replace(' ', '')
        soup = open_url(url)

        # Send POST request to get data in csv format
        params = {
            '__EVENTTARGET': 'SplitsLeaderboard$cmdCSV',
            '__EVENTARGUMENT': '',
            'SplitsLeaderboard$dataPlayerId': 'all',
            'SplitsLeaderboard$dataPos': 'P',
            'SplitsLeaderboard$dataSplitArr': '[{}]'.format(hand),
            'SplitsLeaderboard$dataGroup': 'season',
            'SplitsLeaderboard$dataType': '1',
            'SplitsLeaderboard$dataStart': '{}-03-01'.format(year),
            'SplitsLeaderboard$dataEnd': '{}-11-01'.format(year),
            'SplitsLeaderboard$dataSplitTeams': 'false',
            'SplitsLeaderboard$dataFilter': '[]',
            'SplitsLeaderboard$dataAutoPt': 'true',
            'SplitsLeaderboard$dataStatType': 'player',
            'SplitsLeaderboard$dataPlayers': ''
        }
        elems = ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']

        # Find dynamic parameters in the page html
        more_params = [soup.find('input', {'id': elem}) for elem in elems]
        for param in more_params:
            params.update({param['id']: param['value']})

        req = requests.post(url, data=params).text
        df = pd.read_csv(StringIO(req))

        # Push one row at a time into database
        df_data = df.to_dict(orient='index')
        for key in tqdm(df_data.keys()):
            name = df_data[key]['Name']
            season = df_data[key]['Season']
            player_data = {
                k: v
                for k, v in df_data[key].items()
                if k not in ['Name', 'Season']
            }
            handstr = 'vLHH' if hand == 5 else 'vRHH'
            db_path = 'fg.{}.{}'.format(handstr, season)
            db.Players.update({'Name': name}, {'$set': {db_path: player_data}})
def build_channels():
    url = utils.open_url('https://www.boxplus.com')
    match = re.compile(
        r'<p><a href = "https://www\.boxplus\.com/live-tv-guide\?channel=(.+?)">(.+?)</a></p></td></tr>'
    ).findall(url)
    for url, show in match:
        name = url.replace('-', ' ')
        name = name.title()
        if lines == 'false':
            name = '[B]%s[/B] - %s' % (name, show)
        else:
            name = '[B]%s[CR]NOW:[/B] %s' % (name, show)
        utils.add_stream(name, url, 'play', icon % url)
def download_package(self, mirror, package):
    mirror_folder = self.context.get_mirror_folder(mirror['path'])
    install_path = self.context.package_list.get_package_file(package)
    install_dir = os.path.join(mirror_folder, os.path.dirname(install_path))
    install_file = os.path.join(mirror_folder, install_path)
    # utils.LOG("Install Dir: ", install_dir)
    # utils.LOG("Install File: ", install_file)
    if not os.path.isdir(install_dir):
        os.makedirs(install_dir)

    package_url = mirror['path'] + "/" + install_path
    utils.LOG("Downloading: ", package_url)
    package['progress']['completed_bytes'] = 0
    package['progress']['completed_pct'] = 0
    timebefore = time.time()

    def update_package_progress(chunk, chunk_len, chunk_time=None):
        if chunk_len == 0:
            utils.LOG(" ++++++++++++++ Package %s download complete." %
                      package['name'])
            # and notify the post download queue that we are done...
            self.context.download_queue.task_done()
            self.context.post_download_queue.put({
                'mirror': mirror,
                'package': package,
                'time': time.time() - timebefore
            })
        else:
            package['progress']['completed_bytes'] += chunk_len
            package_size = self.context.package_list.get_package_size(package)
            completed_pct = (float(package['progress']['completed_bytes']) /
                             float(package_size))
            package['progress']['completed_pct'] = int(100 * completed_pct)
            utils.LOG("=== Package %s - %d out of %d downloaded..." %
                      (package['name'],
                       package['progress']['completed_bytes'],
                       package_size))

    # save url to file
    utils.open_url(package_url, constants.DEFAULT_CHUNK_SIZE,
                   utils.chunk_handler_to_file(install_file),
                   update_package_progress)
def _daily_forecast_from_location_info(location_info, start_date=None,
                                       num_days=6, metric=False):
    if not start_date:
        start_date = datetime.date.today()

    # NOTE: the order of the query-string parameters seems to matter; so,
    # we can't use a dictionary to hold the params
    params = location_info + [("format", "24 hourly"),
                              ("startDate", start_date.strftime("%Y-%m-%d")),
                              ("numDays", str(num_days)),
                              ("Unit", "m" if metric else "e")]

    FORECAST_BY_DAY_URL = ("http://www.weather.gov/forecasts/xml"
                           "/sample_products/browser_interface"
                           "/ndfdBrowserClientByDay.php")

    resp = utils.open_url(FORECAST_BY_DAY_URL, params)
    tree = utils.parse_xml(resp)

    if tree.getroot().tag == 'error':
        raise exceptions.NOAAException("Unable to retrieve forecast")

    time_layouts = _parse_time_layouts(tree)
    min_temp_tlk, min_temps = _parse_temperatures_for_type(tree, 'minimum')
    max_temp_tlk, max_temps = _parse_temperatures_for_type(tree, 'maximum')
    conditions_tlk, conditions = _parse_conditions(tree)

    # Time layout keys have to match for us to sequence and group by them
    assert (min_temp_tlk == max_temp_tlk == conditions_tlk)

    time_layout_key = min_temp_tlk
    time_layout = time_layouts[time_layout_key]
    dates = [dt.date() for dt, _ in time_layout]

    forecast = []
    for date, min_temp_value, max_temp_value, condition in zip(
            dates, min_temps, max_temps, conditions):
        # If we're missing any data, don't create the data point
        if utils.any_none([min_temp_value, max_temp_value, condition]):
            continue
        temp_unit = 'C' if metric else 'F'
        min_temp = models.Temperature(min_temp_value, unit=temp_unit)
        max_temp = models.Temperature(max_temp_value, unit=temp_unit)
        datapoint = models.ForecastedCondition(date, min_temp, max_temp,
                                               condition)
        forecast.append(datapoint)
    return forecast
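# A hedged usage sketch for _daily_forecast_from_location_info() (the
# ("lat", ...)/("lon", ...) pairs are illustrative placeholders for whatever
# lookup step produces `location_info` in this library; the function only
# requires a list of (name, value) pairs, per the params construction above):
#
#   location_info = [("lat", "40.71"), ("lon", "-74.01")]
#   for day in _daily_forecast_from_location_info(location_info, num_days=3):
#       print(day)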
def schedule(team):
    """Scrape team schedule with results from baseball-reference.com"""
    name = convert_name(team, how='abbr')
    url = ("http://www.baseball-reference.com/teams/{}/2018-schedule-scores.shtml"
           .format(name))
    soup = open_url(url)
    table = soup.find('table', {'id': 'team_schedule'})

    # Extract schedule columns
    thead = table.find('thead')
    cols = [
        x.text.replace('\xa0', 'Field').replace('.', '')
        for x in thead.find_all('th')
    ]
    upcoming_cols = cols[:6] + ['Time']

    # Extract schedule data (scoped to the schedule table, not the whole page)
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Throw out rows that are duplicates of column headers
    trows = [x for x in trows if 'Gm#' not in x.text]

    # Clear existing Schedule document
    db.Teams.update({'Tm': name}, {'$set': {'Schedule': []}})

    # Extract schedule data one row at a time
    for row in trows:
        row_data = [
            x.text
            for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        # Past game
        if row_data[2] == 'boxscore':
            game_num = row_data[0]
            db_data = {k: v for k, v in zip(cols, row_data)}
        # Upcoming game
        elif row_data[2] == 'preview':
            row_data = row_data[:7]
            game_num = row_data[0]
            db_data = {k: v for k, v in zip(upcoming_cols, row_data)}
        # Skip anything else so db_data is never left undefined
        else:
            continue

        db_data = parse_types(db_data)

        # Insert row into database
        db.Teams.update({'Tm': name}, {'$push': {'Schedule': db_data}})
def get_mirror_contents(self, mirror, refresh=False):
    """
    Gets the contents of (files hosted by) a mirror, refreshing or
    refetching the setup.bz2 file if necessary. Note that when this
    happens its throughput is also recorded.
    """
    mirror_folder = self.get_mirror_folder(mirror)

    # check if setup.ini at the mirror exists
    mirror_setup_ini = os.path.join(mirror_folder, "setup.ini")
    mirror_setup_exists = os.path.isfile(mirror_setup_ini)
    if refresh or (not mirror_setup_exists):
        utils.LOG("Mirror setup exists: ", mirror_setup_exists, refresh)
        utils.LOG("Downloading setup.bz2... ")
        try:
            setup_file_contents, time_taken = utils.open_url(mirror + "/setup.bz2")
            # TODO: Try downloading setup.ini file if setup.bz2 fails
            # open(mirror_folder + "/setup.bz2", "wb").write(setup_file_contents)
            # now decode it
            decoder = codecs.getdecoder("bz2")
            contents, length = decoder(setup_file_contents)
            open(mirror_setup_ini, "w").write(contents)
            # reset the mirror's throughput
            self.mirror_list.update_mirror(mirror,
                                           bytes_loaded=length,
                                           time_spent=time_taken,
                                           health=int(length / time_taken))
        except Exception:
            self.mirror_list.update_mirror(mirror, health="Down")
            return None

    # Load setup.ini and return its contents
    pkgs = packages.read_package_contents(mirror_setup_ini)
    self.mirror_list.update_mirror(mirror, num_packages=len(pkgs))

    # update the package contents - ie which packages are in
    # which mirrors etc
    self.package_list.disable_saves()
    for pkg_obj in pkgs:
        pkg_name = pkg_obj['main']['name']
        if not self.package_list.contains(pkg_name):
            self.package_list.add_package(pkg_name, pkg_obj)
    self.package_list.enable_saves()
    return pkgs
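# A hedged usage sketch for get_mirror_contents() (the mirror URL is
# illustrative, and `ctx` stands in for whatever context object owns
# mirror_list and package_list; a None return means the mirror was
# marked Down, per the except branch above):
#
#   pkgs = ctx.get_mirror_contents("http://example.com/cygwin", refresh=True)
#   if pkgs is None:
#       print("mirror marked Down")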
def get_all_htmls():
    '''Download all htmls of Bible in Chinese.
    Output data will be saved in ../data/chinese_cn/*.html
    '''
    driver = utils.start_driver('phantomjs', verbose=True)
    out_dir = '{}/data/chinese_cn'.format(work_dir)
    os.makedirs(out_dir, exist_ok=True)
    try:
        for i in range(1, 74):
            url = ('http://xiaozhushou.org/index.php/?m=bible&template={}'
                   .format(i))
            utils.open_url(driver, url, verbose=True)
            chpt_url_list = []
            for elem in driver.find_elements_by_xpath(
                    '//ul[@id="chapter_list"]/li/a'):
                chpt_url = elem.get_attribute('href')
                chpt_url_list.append(chpt_url)
            for chpt_url in chpt_url_list:
                book_id = str(i).zfill(3)
                chpt_id = chpt_url.split('=')[-1].zfill(3)
                out_html_name = ('{}/{}_{}_chapter.html'
                                 .format(out_dir, book_id, chpt_id))
                out_audio_name = ('{}/{}_{}_audio.mp3'
                                  .format(out_dir, book_id, chpt_id))
                get_content(driver, chpt_url, out_html_name, out_audio_name)
    except Exception:
        print('*ERROR* something went wrong')
        raise
    finally:
        utils.close_driver(driver, verbose=True)
def forty_man(team, year):
    """Extract 40-man roster from baseball-reference.com"""
    team = convert_name(name=team, how='abbr')
    base = "http://www.baseball-reference.com"
    url = base + "/teams/{}/{}-roster.shtml".format(team, year)
    soup = open_url(url)
    table = soup.find('table', {'id': 'the40man'})

    # Extract column headers and rename blank columns
    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]
    cols[3], cols[4] = 'Country', 'Pos'

    # Extract body of forty-man table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing Fortyman document
    db_array = 'Fortyman.{}'.format(year)
    db.Teams.update({'Tm': team}, {'$set': {db_array: []}})

    # Extract forty-man roster and push to database
    for row in tqdm(trows):
        bid = row.find('a')['href'].split('=')[-1]
        row_data = [
            x.text
            for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data.update({'bid': bid})
        db.Teams.update({'Tm': team}, {'$push': {db_array: db_data}})

        # Check if player exists in database
        player = db_data['Name']
        exists = dbc.player_exists(player)
        if not exists:
            try:
                print("Scraping br data for {}".format(player))
                br_player_stats(player, team)
            except Exception:
                print("Unable to scrape br data for {}".format(player))
def notify_lag_update(commit):
    global older_version
    global updated_version

    diff_string = utils.open_url(
        "https://github.com/ballerina-platform/ballerina-release/commit/"
        + commit + ".diff").read().decode("utf-8")

    for line in diff_string.splitlines():
        if line.startswith("-"):
            older_version.append(line[1:])
        elif line.startswith("+"):
            updated_version.append(line[1:])

    older_version = older_version[1:]
    updated_version = updated_version[1:]

    remove_statement_changes()
    create_message()
def current_injuries(team):
    """Extract current injuries table from baseball-reference.com"""
    current_year = datetime.date.today().strftime('%Y')
    team = convert_name(name=team, how='abbr')
    url = "http://www.baseball-reference.com/teams/{}/{}.shtml"\
        .format(team, current_year)
    soup = open_url(url)

    # Data is stored in html comment
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if 'Injuries Table' in x][-1].string
    table = BeautifulSoup(comment_html, "html.parser")

    # Extract column headers
    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]

    # Extract body from injuries table
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Clear existing injuries document
    db.Teams.update({'Tm': team}, {'$set': {'Injuries': []}})

    # Extract injuries table and push to database
    for row in trows:
        row_data = [
            x.text
            for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        db_data = {k: v for k, v in zip(cols, row_data)}
        db_data = parse_types(db_data)
        db.Teams.update({'Tm': team}, {'$push': {'Injuries': db_data}})
def geocode_location(location, api_key=None):
    """Use Google to geocode a location string.

    For high-volume traffic, you will need to specify an API-key.
    """
    GEOCODE_URL = "http://maps.google.com/maps/geo"
    params = [("q", location), ("sensor", "false"), ("output", "json")]
    if api_key:
        params += [("key", api_key)]
    resp = utils.open_url(GEOCODE_URL, params)
    data = json.loads(resp.read())
    if data["Status"]["code"] != 200:
        raise exceptions.GeocodeException("Unable to geocode this location")
    best_match = data["Placemark"][0]
    address = best_match["address"]
    lon, lat, _ = best_match["Point"]["coordinates"]
    location = models.Location(lat, lon, address)
    return location
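# A hedged usage sketch for geocode_location() (the query string and key
# are hypothetical; the returned models.Location wraps the lat/lon/address
# parsed above):
#
#   loc = geocode_location("Mountain View, CA")
#   loc = geocode_location("Mountain View, CA", api_key="YOUR_KEY")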
def standings():
    """Scrape MLB standings from baseball-reference.com"""
    url = "http://www.baseball-reference.com/leagues/MLB-standings.shtml"
    soup = open_url(url)

    # Scrape division identifier data
    div_data, gb_data = {}, {}
    divs = ['E', 'C', 'W']
    for div in divs:
        table = soup.find_all('div', {'id': 'div_standings_{}'.format(div)})
        trows = [x.find_all('tr') for x in table]
        trows_flat = [x for y in trows for x in y]
        teams_html = [x.find('a') for x in trows_flat if x.find('a')]
        team_names = [x['href'].split('/')[2] for x in teams_html]
        gbs = [
            x.find('td', {'data-stat': 'games_back'}).text
            for x in trows_flat if x.find('td')
        ]
        div_dict = {k: div for k in team_names}
        div_data.update(div_dict)
        gb_dict = {k: v for k, v in zip(team_names, gbs)}
        gb_data.update(gb_dict)

    # Scrape full league standings
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if '<td' in x][-1].string
    comment_soup = BeautifulSoup(comment_html, "html.parser")

    # Extract table column headers
    thead = comment_soup.find('thead')
    cols = [x.text.replace('.', '') for x in thead.find_all('th')]

    # Extract table body
    tbody = comment_soup.find('tbody')
    trows = tbody.find_all('tr')
    for row in trows:
        row_data = [
            x.text
            for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
        ]
        # Skip last row (league averages)
        if row_data[0]:
            team = row_data[1]
            db_data = {k: v for k, v in zip(cols, row_data)}

            # Add division and gb information
            division = '{}-{}'.format(db_data['Lg'], div_data[team])
            games_behind = gb_data[team]
            db_data.update({'div': division})
            db_data.update({'gb': games_behind})

            # Store int/float when possible
            db_data = parse_types(db_data)

            # Insert row into database
            db.Teams.update({'Tm': team}, {'$set': db_data}, upsert=True)
def fangraphs(state, year):
    """Scrape data from fangraphs.com"""
    tid = 0  # Scrape all teams for now, add individual teams later if needed
    url = """http://www.fangraphs.com/leaders.aspx?pos=all&stats={0}\
&lg=all&qual=0&type=8&season={1}\
&month=0&season1={1}\
&ind=0&team={2}&page=1_1000"""\
        .format(state, year, tid)\
        .replace(' ', '')
    soup = open_url(url)

    # Extract column headers
    thead = soup.find('thead')
    cols = [x.text for x in thead.find_all('th')]

    # Extract stats from table body
    tbody = soup.find_all('tbody')[-1]
    all_rows = tbody.find_all('tr')
    all_row_data = [x.find_all('td') for x in all_rows]

    for row in tqdm(all_row_data):
        row_data = [x.text for x in row]
        player = row_data[1]
        db_data = {k: v for k, v in zip(cols, row_data)}

        # Rename common keys with batting or pitching prefixes
        rank = '{}_rank'.format(state)
        db_data[rank] = db_data.pop('#')
        war = '{}_WAR'.format(state)
        db_data[war] = db_data.pop('WAR')
        games = '{}_G'.format(state)
        db_data[games] = db_data.pop('G')

        # Convert team name to abbreviation
        try:
            db_data['Team'] = convert_name(db_data['Team'])
        except Exception:
            pass
            # any need to pull team value from br here?
            # print("(fangraphs) No team listed for {}".format(player))

        # Store type as numeric if possible
        db_data = parse_types(db_data)

        # Insert row into database
        db_path = 'fg.{}.{}'.format(state, year)
        db.Players.update({'Name': player},
                          {'$set': {db_path: db_data}},
                          upsert=True)

        # Add current team to top level
        if year == dbc._year:
            db.Players.update({'Name': player},
                              {'$set': {'Team': db_data['Team']}})
def get_stream(url):
    url = utils.open_url('https://www.boxplus.com/live-tv-guide?channel=' + url)
    match = re.compile("src: '(.+?)',").findall(url)
    for url in match:
        utils.play_stream(url)
def check_pending_build_checks(index: int):
    module = current_level_modules[index]
    global status_completed_modules
    print("[Info] Checking the status of the timestamped build in module '"
          + module['name'] + "'")
    passing = True
    pending = False
    build_check_found = False  # This is to stop intermittent failures
    repo = github.get_repo(constants.BALLERINA_ORG_NAME + '/' + module['name'])
    pull_request = repo.get_pull(module[MODULE_CREATED_PR].number)
    sha = pull_request.merge_commit_sha
    failed_build_name, failed_build_html = [], []
    if module[MODULE_CONCLUSION] == MODULE_CONCLUSION_BUILD_PENDING:
        for build_check in repo.get_commit(sha=sha).get_check_runs():
            build_check_found = True
            # Ignore codecov checks temporarily due to bug
            if not build_check.name.startswith('codecov'):
                if build_check.status != 'completed':
                    pending = True
                    break
                elif build_check.conclusion == 'success':
                    continue
                else:
                    failed_build_name.append(build_check.name)
                    failed_build_html.append(build_check.html_url)
                    passing = False
        if build_check_found and not pending:
            if passing:
                current_level_modules[index][MODULE_CONCLUSION] = \
                    MODULE_CONCLUSION_BUILD_SUCCESS
            else:
                current_level_modules[index][MODULE_STATUS] = \
                    MODULE_STATUS_COMPLETED
                current_level_modules[index][MODULE_CONCLUSION] = \
                    MODULE_CONCLUSION_BUILD_FAILURE
                module_name = module['name']
                print("[Error] Dependency bump PR merge build checks have failed for '"
                      + module_name + "'")
                for name, html_url in zip(failed_build_name, failed_build_html):
                    print("[" + module_name + "] Build check '" + name
                          + "' failed for " + html_url)
                status_completed_modules += 1
    else:
        # Already successful and merged
        current_level_modules[index][MODULE_CONCLUSION] = \
            MODULE_CONCLUSION_BUILD_SUCCESS

    if current_level_modules[index][MODULE_CONCLUSION] == \
            MODULE_CONCLUSION_BUILD_SUCCESS:
        if current_level_modules[index]['name'] == 'ballerina-distribution':
            current_level_modules[index][MODULE_CONCLUSION] = \
                MODULE_CONCLUSION_BUILD_RELEASED
        else:
            try:
                packages_url = ('https://api.github.com/orgs/'
                                + constants.BALLERINA_ORG_NAME
                                + '/packages/maven/' + module['group_id']
                                + '.' + module['artifact_id'] + '/versions')
                packages_list_string = utils.open_url(packages_url).read()
                packages_list = json.loads(packages_list_string)
                latest_package = packages_list[0]['name']

                if retrigger_dependency_bump.lower() == 'true':
                    for package in packages_list:
                        sha_of_released_package = package['name'].split('-')[-1]
                        if sha_of_released_package in sha:
                            latest_package = package['name']
                            break

                current_level_modules[index][MODULE_CONCLUSION] = \
                    MODULE_CONCLUSION_BUILD_RELEASED
                current_level_modules[index][MODULE_TIMESTAMPED_VERSION] = \
                    latest_package
            except Exception as e:
                print("[Error] Failed to get latest timestamped version for module '"
                      + module['name'] + "'", e)
                current_level_modules[index][MODULE_STATUS] = \
                    MODULE_CONCLUSION_VERSION_CANNOT_BE_IDENTIFIED

        current_level_modules[index][MODULE_STATUS] = MODULE_STATUS_COMPLETED
        status_completed_modules += 1
def br_player_stats(name, team):
    brid = dbc.get_player_brid(name, team)
    base = 'https://www.baseball-reference.com/redirect.fcgi?player=1&mlb_ID='
    url = base + brid
    redirect = requests.get(url).url
    soup = open_url(redirect)

    # Extract Standard Batting/Pitching table
    table = soup.find('div', {'class': 'table_outer_container'})
    thead = table.find('thead')
    cols = [x.text for x in thead.find_all('th')]
    pit_or_bat = table.find('caption').text
    tbody = table.find('tbody')
    trows = tbody.find_all('tr')

    # Push to Players collection
    for row in trows:
        if row.find('th', {'data-stat': 'year_ID'}).text:
            row_data = [
                x.text
                for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)

            # Skip blank rows and don't collect minor league data
            if not row_data[0] or db_data['Lg'] not in ['AL', 'NL']:
                continue

            db_array = 'br.{}.{}'.format(pit_or_bat, db_data['Year'])
            db.Players.update({'Name': name},
                              {'$set': {'brID': brid, db_array: db_data}},
                              upsert=True)

    # Extract Player Value Table - stored in html comment
    comment = soup.find_all(string=lambda text: isinstance(text, Comment))
    comment_html = [x for x in comment if 'Player Value' in x]
    for c in comment_html:
        table = BeautifulSoup(c.string, "html.parser")
        table_name = table.find('caption').text.replace('--', ' ').split()
        title = '{} {}'.format(table_name[2], table_name[1])
        thead = table.find('thead')
        cols = [x.text for x in thead.find_all('th')]
        tbody = table.find('tbody')
        trows = tbody.find_all('tr')
        for row in trows:
            row_data = [
                x.text
                for x in row.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)

            # Rename stats to match fg data
            # (fixed: the loop originally referenced an undefined `rename`)
            renames = {'BA': 'AVG'}
            for stat in renames.keys():
                if stat in db_data:
                    db_data[renames[stat]] = db_data[stat]
                    db_data.pop(stat)

            # Skip blank rows and don't collect minor league data
            if not row_data[0] or db_data['Lg'] not in ['AL', 'NL']:
                continue

            db_array = 'br.{}.{}'.format(title, db_data['Year'])
            db.Players.update({'brID': brid},
                              {'$set': {db_array: db_data}},
                              upsert=True)
def load_p(dir_or_url):
    name = dir_or_url + '/params'
    return load_dict(open_url(name))
def load_model_params(load_from):
    filename = "trained_params.npz"
    f = open_url(load_from + '/' + filename)
    return numpy.load(f)
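# A hedged usage sketch for load_model_params() (the base path is
# hypothetical; numpy.load on an .npz archive returns a dict-like NpzFile
# keyed by array name):
#
#   params = load_model_params('/path/to/run1')
#   print(params.files)   # names of the stored arrays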
import csv
import sys
import os

from selenium import webdriver

from utils import pause, load_config, get_table, login, open_url

CONFIG = load_config("configuration.yml")
email_address = CONFIG["CREDENTIALS"]["USERNAME"]
password = CONFIG["CREDENTIALS"]["PASSWORD"]
url = CONFIG["URL"]

driver = webdriver.Chrome()
open_url(driver, url)
login(driver, email_address, password)
open_url(driver, url)
get_table(driver)
def boxscores(date, dbc=dbc):
    """Extract all boxscores"""
    # Delete games with postponed status to avoid conflict
    dbc.delete_duplicate_game_docs()

    year = datetime.date.today().strftime('%Y')
    if date == 'all':
        url = ('https://www.baseball-reference.com/leagues/MLB/{}-schedule.shtml'
               .format(year))
        soup = open_url(url)

        # Find links for each game this season
        all_game_urls = [
            x['href'] for x in soup.find_all('a', href=True)
            if x.text == 'Boxscore'
        ]

        dates = dbc.get_missing_array_dates('summary')

        # Format dates to match date in url
        datesf = [x.replace('-', '') for x in dates]

        # Filter games by missing dates
        game_urls = [
            game for game in all_game_urls
            if any(date in game for date in datesf)
        ]
    else:
        y, m, d = date.split('-')
        url = "http://www.baseball-reference.com/boxes/?year={}\
&month={}\
&day={}"\
            .format(y, m, d)\
            .replace(' ', '')
        soup = open_url(url)
        game_urls = [
            x.a['href']
            for x in soup.find_all('td', {'class': 'right gamelink'})
        ]

    # Collect boxscore stats on each game
    for game in tqdm(game_urls):
        url = 'http://www.baseball-reference.com' + game
        soup = open_url(url)

        html_date = ''.join(soup.find('h1').text.split(',')[-2:]).strip()
        date = str(datetime.datetime.strptime(html_date, '%B %d %Y').date())

        tdiv = soup.find_all('div', {'itemprop': 'performer'})
        away = tdiv[0].find('a', {'itemprop': 'name'})['href'].split('/')[2]
        home = tdiv[1].find('a', {'itemprop': 'name'})['href'].split('/')[2]
        away = convert_name(away, how='abbr')
        home = convert_name(home, how='abbr')
        teams = (away, home)

        # Find mlb game id and resolve double headers
        url_id = int(game.split('.')[0][-1])
        games = list(dbc.get_team_game_preview(away, date))
        gnums = [
            int(g['preview'][0]['gameData']['game']['gameNumber'])
            for g in games
        ]
        # !!! remove try/except here since postponed games are now removed?
        try:
            idx = gnums.index(url_id) if url_id > 0 else 0
        except ValueError:
            idx = 0
        gid = games[idx]['gid']

        # Extract summary stats
        summary = soup.find('table', {'class': 'linescore'})
        thead = summary.find('thead')
        cols = [x.text for x in thead.find_all('th')][1:]
        cols[0] = 'Team'
        tbody = summary.find('tbody')
        trows = tbody.find_all('tr')

        # Push summary stats to database
        db.Games.update({'gid': gid}, {'$set': {'summary': []}})
        for row in trows:
            row_data = [x.text for x in row.find_all('td')][1:]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db.Games.update({'gid': gid}, {'$push': {'summary': db_data}})

        # Extract batting box score
        comment = soup.find_all(string=lambda text: isinstance(text, Comment))
        bat_tables = [x for x in comment if '>Batting</th>' in x]
        for table in zip(teams, bat_tables):
            team = table[0]
            bat = BeautifulSoup(table[1], "html.parser")

            # Extract column headers
            thead = bat.find('thead')
            cols = [x for x in thead.find('tr').text.split('\n') if x]

            # Extract Team Totals
            tfoot = bat.find('tfoot')
            row_data = [
                x.text
                for x in tfoot.find_all(lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)
            db_array = '{}.batting'.format(team)
            db.Games.update({'gid': gid}, {'$set': {db_array: [db_data]}})

            # Extract stats on individual batters
            tbody = bat.find('tbody')
            trows = tbody.find_all('tr')
            for row in trows:
                try:
                    player = row.find('a').text
                except AttributeError:
                    continue
                stats = [
                    x.text
                    for x in row.find_all(
                        lambda tag: tag.has_attr('data-stat'))
                ]
                stats[0] = player
                db_data = {k: v for k, v in zip(cols, stats)}
                db_data = parse_types(db_data)
                db_array = '{}.batting'.format(team)
                db.Games.update({'gid': gid}, {'$push': {db_array: db_data}})

        # Extract pitching box score
        pit_tables = [x for x in comment if '>Pitching</th>' in x][0]
        pit = BeautifulSoup(pit_tables, "html.parser")

        # Extract column headers
        thead = pit.find('thead')
        cols = [x for x in thead.find('tr').text.split('\n') if x]

        # Extract Team Totals
        tfoots = pit.find_all('tfoot')
        for foot in zip(teams, tfoots):
            team = foot[0].replace('.', '')
            row_data = [
                x.text
                for x in foot[1].find_all(
                    lambda tag: tag.has_attr('data-stat'))
            ]
            db_data = {k: v for k, v in zip(cols, row_data)}
            db_data = parse_types(db_data)
            db_array = '{}.pitching'.format(team)
            db.Games.update({'gid': gid}, {'$set': {db_array: [db_data]}})

        # Extract stats on individual pitchers
        tbodies = pit.find_all('tbody')
        for tbody in zip(teams, tbodies):
            team = tbody[0].replace('.', '')
            trows = tbody[1].find_all('tr')
            for row in trows:
                player = row.find('th').text.split(',')[0]
                stats = [x.text for x in row.find_all('td')]
                stats.insert(0, player)
                db_data = {k: v for k, v in zip(cols, stats)}
                db_data = parse_types(db_data)
                db_array = '{}.pitching'.format(team)
                db.Games.update({'gid': gid}, {'$push': {db_array: db_data}})
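# A hedged usage sketch for boxscores() (the date is illustrative; per the
# branches above, the function accepts either a single 'YYYY-MM-DD' date or
# 'all' to backfill every date missing summary data this season):
#
#   boxscores('2018-07-15')   # one day's games
#   boxscores('all')          # backfill missing dates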