def get_race_distances(self):
    """Read the race distance from the page and derive its other units.

    Returns:
        A 4-tuple ``(dist, dist_y, dist_f, dist_m)``:

        * ``dist``   - text of the 'block-distanceInd' span, unmodified.
        * ``dist_y`` - text of the 'block-fullDistanceInd' span with
          surrounding parentheses removed, or a rounded value derived from
          ``dist_m`` when ``distance_to_metres`` returns 0.
        * ``dist_f`` - furlongs rendered as a string with an 'f' suffix.
        * ``dist_m`` - result of ``distance_to_metres``, or a value derived
          from the furlongs (see the fallbacks in the body).
    """
    dist = find(self.doc, 'span', 'block-distanceInd')
    dist_y = find(self.doc, 'span', 'block-fullDistanceInd').strip('()')

    try:
        dist_f = self.distance_to_furlongs(dist)
    except ValueError as err:
        # Report the caught exception and the real dist_y value; the previous
        # message printed the ValueError class and repeated dist.
        print(
            f'ERROR: distance_to_furlongs(). Error: {err}, dist: {dist}, dist_y: {dist_y}'
        )
        print('Race: ', self.url)
        # Sentinel for "unparseable".
        # NOTE(review): with this sentinel the metre fallback below yields a
        # negative number - confirm that downstream code expects that.
        dist_f = -1

    dist_m = self.distance_to_metres(dist_y)

    if dist_m == 0:
        # No usable full distance: derive metres from furlongs, then the
        # second figure from metres.
        dist_m = round(dist_f * 201.168)
        dist_y = round(dist_m * 1.0936)

    # Render e.g. 10.0 as '10f' and 10.5 as '10.5f'. Only a trailing '.0' is
    # dropped so that a '.0' elsewhere in the number cannot be removed.
    furlong_text = str(dist_f)
    if furlong_text.endswith('.0'):
        furlong_text = furlong_text[:-2]
    dist_f = furlong_text + 'f'

    if self.race_info['region'] not in {'GB', 'IRE', 'USA', 'CAN'}:
        # Other regions use a flat 200 units per furlong instead.
        dist_m = float(dist_f.strip('f')) * 200

    return dist, dist_y, dist_f, dist_m
def get_race_type(doc, race, distance):
    """Classify a race as 'Hurdle', 'Chase', 'NH Flat' or 'Flat'.

    Args:
        doc: parsed page passed through to ``find``.
        race: race name; the keyword literals are lower case, so callers are
            presumably expected to pass it lower-cased.
        distance: distance compared against 12 (presumably furlongs - verify
            against the caller).

    Returns:
        The race type string; 'Flat' when nothing else matches.
    """
    stalls_text = find(doc, 'div', 'RC-headerBox__stalls')

    # The header box text is the primary signal.
    if 'hurdle' in stalls_text.lower():
        return 'Hurdle'
    if 'fence' in stalls_text.lower():
        return 'Chase'

    def mentions(*keywords):
        return any(keyword in race for keyword in keywords)

    detected = ''
    # NOTE(review): all name-based checks are assumed to sit under the
    # distance gate - confirm against the original layout.
    if distance >= 12:
        # Highest priority first: chase wins over hurdle, hurdle over NH flat.
        if mentions(' chase', '(chase)', 'steeplechase', 'steeple-chase',
                    'steeplchase', 'steepl-chase'):
            detected = 'Chase'
        elif mentions(' hurdle', '(hurdle)'):
            detected = 'Hurdle'
        elif mentions('national hunt flat', 'nh flat race', 'mares flat race',
                      'inh bumper', ' sales bumper', 'kepak flat race',
                      'i.n.h. flat race'):
            detected = 'NH Flat'

    return detected if detected != '' else 'Flat'
def get_num_runners(self):
    """Return the runner-count text with the word 'ran' removed.

    Returns:
        The stripped text of the race-info value span, or None when ``find``
        yields None.
    """
    ran_text = find(self.doc, 'span',
                    'rp-raceInfo__value rp-raceInfo__value_black')

    if ran_text is None:
        return None

    without_label = ran_text.replace('ran', '')
    return without_label.strip()
def get_course(self, course_url):
    """Return the course name for this page.

    Tries the course header first, then the first matching course-name link,
    and finally falls back to the title-cased ``course_url`` argument.

    Args:
        course_url: string used as the last-resort name.
    """
    header_name = find(self.doc, 'h1', 'RC-courseHeader__name')
    if header_name != '':
        return header_name

    try:
        link_texts = self.doc.xpath(
            "//a[contains(@class, 'rp-raceTimeCourseName__name')]/text()")
        return link_texts[0].strip()
    except IndexError:
        # No matching link on the page.
        return course_url.title()
def get_race_type(self):
    """Classify this race as 'Flat', 'Hurdle', 'Chase' or 'NH Flat'.

    Reads ``race_info['race_name']`` and ``race_info['code']``; unless the
    flat shortcut applies it also reads the page via ``find`` and
    ``race_info['dist_m']``.

    Returns:
        The race type string; 'Flat' when nothing else matches.
    """
    name = self.race_info['race_name'].lower()

    # Flat-coded meetings are flat unless the name says national hunt flat.
    if self.race_info['code'] == 'flat' and 'national hunt flat' not in name:
        return 'Flat'

    obstacles = find(self.doc, 'span', 'rp-raceTimeCourseName_hurdles')
    if 'hurdle' in obstacles.lower():
        return 'Hurdle'
    if 'fence' in obstacles.lower():
        return 'Chase'

    def mentions(*keywords):
        return any(keyword in name for keyword in keywords)

    # NOTE(review): all name-based checks are assumed to sit under the
    # distance gate - confirm against the original layout.
    if self.race_info['dist_m'] >= 2400:
        # Highest priority first: chase wins over hurdle, hurdle over NH flat.
        if mentions(' chase', '(chase)', 'steeplechase', 'steeple-chase',
                    'steeplchase', 'steepl-chase'):
            return 'Chase'
        if mentions(' hurdle', '(hurdle)'):
            return 'Hurdle'
        if mentions('national hunt flat', 'nh flat race', 'mares flat race',
                    'inh bumper', ' sales bumper', 'kepak flat race',
                    'i.n.h. flat race'):
            return 'NH Flat'

    return 'Flat'
def parse_race_bands(self):
    """Split the rating-band / ages-allowed text into its two parts.

    The span text is stripped of surrounding parentheses and split on
    commas. A fragment containing 'yo' is taken as the age band and a
    fragment containing '-' as the rating band; if several fragments match,
    the last one wins.

    Returns:
        ``(band_age, band_rating)``; each is '' when no fragment matched.
    """
    band = find(self.doc,
                'span',
                'rp-raceTimeCourseName_ratingBandAndAgesAllowed',
                property='class')

    band_age = ''
    band_rating = ''

    # A single fragment goes through the same loop as several. Previously
    # the single-fragment case read the unstripped text, so a lone rating
    # band came back with its parentheses, e.g. '(0-100)'.
    for fragment in band.strip('()').split(','):
        fragment = fragment.strip()
        if 'yo' in fragment:
            band_age = fragment
        elif '-' in fragment:
            band_rating = fragment

    return band_age.strip('()'), band_rating
def __init__(self, url, document, code, fields):
    """Populate race-level and runner-level data from a parsed page.

    Fills ``self.race_info`` and ``self.runner_info`` in a fixed order
    (several helper methods are called only after the keys they may read
    have been set), then builds ``self.csv_data``.

    Args:
        url: page URL; it is split on '/' and segments 4-7 are read.
        document: parsed page stored as ``self.doc`` and handed to the
            ``find`` / ``xpath`` helpers.
        code: stored unchanged as ``race_info['code']``.
        fields: passed to ``create_csv_data``.
    """
    self.url = url
    self.doc = document
    self.race_info = {}
    self.runner_info = {}

    # NOTE(review): assumes the URL has at least 8 '/'-separated segments,
    # with course id, course name, date and race id at indices 4-7 -
    # confirm against callers.
    url_split = self.url.split('/')

    self.race_info['code'] = code
    self.race_info['date'] = convert_date(url_split[6])
    self.race_info['course'] = self.get_course(url_split[5])
    self.race_info['course_id'] = url_split[4]
    self.race_info['region'] = get_region(url_split[4])
    self.race_info['race_id'] = url_split[7]
    self.race_info['going'] = find(self.doc,
                                   'span',
                                   'rp-raceTimeCourseName_condition',
                                   property='class')
    self.race_info['surface'] = get_surface(self.race_info['going'])
    self.race_info['off'] = find(self.doc, 'span', 'text-raceTime')
    self.race_info['race_name'] = find(self.doc,
                                       'h2',
                                       'rp-raceTimeCourseName__title',
                                       property='class')
    self.race_info['class'] = find(self.doc,
                                   'span',
                                   'rp-raceTimeCourseName_class',
                                   property='class').strip('()')
    self.race_info['race_name'] = self.clean(self.race_info['race_name'])

    # No class on the page: ask get_race_class() for one.
    if self.race_info['class'] == '':
        self.race_info['class'] = self.get_race_class()

    self.race_info['pattern'] = self.get_race_pattern()
    # The name gets a second cleaning pass only after class and pattern have
    # been determined.
    self.race_info['race_name'] = self.clean_race_name(
        self.race_info['race_name'])
    self.race_info['age_band'], self.race_info[
        'rating_band'] = self.parse_race_bands()

    # Still no class: fall back to the rating band when there is one.
    if self.race_info[
            'class'] == '' and self.race_info['rating_band'] != '':
        self.race_info['class'] = self.get_class_from_rating()

    self.race_info['sex_rest'] = self.sex_restricted()
    self.race_info['dist'], self.race_info['dist_y'],\
        self.race_info['dist_f'], self.race_info['dist_m'] = self.get_race_distances()
    self.race_info['type'] = self.get_race_type()
    self.race_info['ran'] = self.get_num_runners()

    # Runner-level data, starting with the pedigree attributes.
    pedigree = Pedigree(
        xpath(self.doc, 'tr', 'block-pedigreeInfoFullResults', fn='/td'))
    self.runner_info['sire_id'] = pedigree.id_sires
    self.runner_info['sire'] = pedigree.sires
    self.runner_info['dam_id'] = pedigree.id_dams
    self.runner_info['dam'] = pedigree.dams
    self.runner_info['damsire_id'] = pedigree.id_damsires
    self.runner_info['damsire'] = pedigree.damsires
    self.runner_info['sex'] = self.get_sexs(pedigree.pedigrees)
    self.runner_info['comment'] = self.get_comments()
    self.runner_info['pos'] = self.get_positions()
    self.runner_info['prize'] = self.get_prizemoney()
    self.runner_info['draw'] = self.get_draws()
    self.runner_info['ovr_btn'], self.runner_info[
        'btn'] = self.get_distance_btn()
    self.runner_info['sp'] = self.get_starting_prices()
    self.runner_info['dec'] = self.get_decimal_odds()
    self.runner_info['num'] = self.get_numbers()

    # Use the parsed 'ran' figure when present, otherwise count the runner
    # numbers found on the page.
    if not self.race_info['ran']:
        self.race_info['ran'] = len(self.runner_info['num'])
    else:
        self.race_info['ran'] = int(self.race_info['ran'])

    self.runner_info['age'] = self.get_horse_ages()
    self.runner_info['horse'] = self.get_names_horse()
    self.runner_info['horse_id'] = self.get_ids_horse()
    self.runner_info['jockey'] = self.get_names_jockey()
    self.runner_info['jockey_id'] = self.get_ids_jockey()
    self.runner_info['trainer'] = self.get_names_trainer()
    self.runner_info['trainer_id'] = self.get_ids_trainer()
    self.runner_info['owner'] = self.get_names_owner()
    self.runner_info['owner_id'] = self.get_ids_owner()
    self.runner_info['hg'] = self.get_headgear()
    self.runner_info['wgt'], self.runner_info['lbs'] = self.get_weights()
    self.runner_info['or'] = xpath(self.doc,
                                   'td',
                                   'OR',
                                   'data-ending',
                                   fn='/text()')
    self.runner_info['rpr'] = xpath(self.doc,
                                    'td',
                                    'RPR',
                                    'data-ending',
                                    fn='/text()')
    self.runner_info['ts'] = xpath(self.doc,
                                   'td',
                                   'TS',
                                   'data-ending',
                                   fn='/text()')
    self.runner_info['silk_url'] = xpath(self.doc,
                                         'img',
                                         'rp-horseTable__silk',
                                         'class',
                                         fn='/@src')
    self.runner_info['time'] = self.get_finishing_times()
    self.runner_info['secs'] = self.time_to_seconds(
        self.runner_info['time'])

    self.clean_non_completions()

    self.csv_data = self.create_csv_data(fields)
def parse_races(session, race_urls, date):
    """Fetch and parse each racecard URL into nested race dictionaries.

    Args:
        session: object whose ``get(url, headers=...)`` returns a response
            with a ``content`` attribute; also passed to ``get_going_info``
            and ``get_runners``.
        race_urls: iterable of racecard URLs. Each is split on '/' and
            segments 4, 6 and 7 are read as course id, date and race id.
        date: passed to ``get_going_info``.

    Returns:
        A nested ``defaultdict`` keyed by region, then course, then off
        time, whose values are the per-race dicts. Each race dict carries
        its runners as a list under 'runners'.
    """
    # (key, selector, attribute) for per-runner values that are stored as an
    # int, or as None when the text is not a valid integer.
    optional_int_fields = (
        ('lbs', 'RC-cardPage-runnerWgt-carried', 'data-order-wgt'),
        ('ofr', 'RC-cardPage-runnerOr', 'data-order-or'),
        ('rpr', 'RC-cardPage-runnerRpr', 'data-order-rpr'),
        ('ts', 'RC-cardPage-runnerTs', 'data-order-ts'),
    )

    races = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    going_info = get_going_info(session, date)

    for url in race_urls:
        r = session.get(url, headers=random_header.header())
        doc = html.fromstring(r.content)

        race = {}
        url_split = url.split('/')

        race['race_id'] = int(url_split[7])
        race['date'] = url_split[6]
        race['course_id'] = int(url_split[4])
        race['course'] = find(doc, 'h1', 'RC-courseHeader__name')
        race['off_time'] = find(doc, 'span', 'RC-courseHeader__time')
        race['race_name'] = find(doc, 'span', 'RC-header__raceInstanceTitle')
        race['distance_round'] = find(doc, 'strong',
                                      'RC-header__raceDistanceRound')
        race['distance'] = find(doc, 'span', 'RC-header__raceDistance')
        # Fall back to the rounded distance when no exact one is shown.
        race['distance'] = race['distance_round'] if not race[
            'distance'] else race['distance'].strip('()')
        race['distance_f'] = distance_to_furlongs(race['distance_round'])
        race['region'] = get_region(str(race['course_id']))
        race['pattern'] = get_pattern(race['race_name'].lower())
        race['race_class'] = find(doc, 'span', 'RC-header__raceClass')
        race['race_class'] = race['race_class'].strip(
            '()') if race['race_class'] else ''
        race['type'] = get_race_type(doc, race['race_name'].lower(),
                                     race['distance_f'])

        # Races with a pattern but no explicit class are treated as Class 1.
        if not race['race_class'] and race['pattern']:
            race['race_class'] = 'Class 1'

        try:
            band = find(doc, 'span',
                        'RC-header__rpAges').strip('()').split()
            race['age_band'] = band[0] if band else None
            race['rating_band'] = band[1] if len(band) > 1 else None
        except AttributeError:
            race['age_band'] = None
            race['rating_band'] = None

        prize = find(doc, 'div', 'RC-headerBox__winner').lower()
        race['prize'] = prize.split(
            'winner:')[1].strip() if 'winner:' in prize else None

        field_size = find(doc, 'div', 'RC-headerBox__runners').lower()
        if field_size:
            race['field_size'] = int(
                field_size.split('runners:')[1].split('(')[0].strip())
        else:
            race['field_size'] = ''

        try:
            race['going_detailed'] = going_info[race['course_id']]['going']
            race['rail_movements'] = going_info[
                race['course_id']]['rail_movements']
            race['stalls'] = going_info[race['course_id']]['stalls']
            race['weather'] = going_info[race['course_id']]['weather']
        except KeyError:
            # Fix: this branch used to reset race['going'] (overwritten just
            # below) and never set 'going_detailed', so the key was missing
            # whenever no going info existed for the course.
            race['going_detailed'] = None
            race['rail_movements'] = None
            race['stalls'] = None
            race['weather'] = None

        going = find(doc, 'div', 'RC-headerBox__going').lower()
        race['going'] = going.split(
            'going:')[1].strip().title() if 'going:' in going else ''
        race['surface'] = get_surface(race['going'])

        profile_hrefs = doc.xpath(
            "//a[@data-test-selector='RC-cardPage-runnerName']/@href")
        profile_urls = [
            'https://www.racingpost.com' + a.split('#')[0] + '/form'
            for a in profile_hrefs
        ]

        runners = get_runners(session, profile_urls)

        for horse in doc.xpath("//div[contains(@class, ' js-PC-runnerRow')]"):
            horse_id = int(
                find(horse, 'a', 'RC-cardPage-runnerName',
                     attrib='href').split('/')[3])
            runner = runners[horse_id]

            # When the runner entry is flagged 'broken_url', take pedigree
            # and profile details from the racecard row instead.
            # NOTE(review): the fallback is assumed to cover everything from
            # 'sire' through 'trainer' - confirm the extent of this block.
            if 'broken_url' in runner:
                sire = find(horse, 'a', 'RC-pedigree__sire').split('(')
                dam = find(horse, 'a', 'RC-pedigree__dam').split('(')
                damsire = find(
                    horse, 'a',
                    'RC-pedigree__damsire').lstrip('(').rstrip(')').split('(')

                runner['sire'] = clean_name(sire[0])
                runner['dam'] = clean_name(dam[0])
                runner['damsire'] = clean_name(damsire[0])

                runner['sire_region'] = sire[1].replace(')', '').strip()
                runner['dam_region'] = dam[1].replace(')', '').strip()
                runner['damsire_region'] = damsire[1].replace(')',
                                                              '').strip()

                runner['age'] = find(horse,
                                     'span',
                                     'RC-cardPage-runnerAge',
                                     attrib='data-order-age')

                sex = find(horse, 'span', 'RC-pedigree__color-sex').split()
                runner['colour'] = sex[0]
                runner['sex_code'] = sex[1].capitalize()

                runner['trainer'] = find(horse,
                                         'a',
                                         'RC-cardPage-runnerTrainer-name',
                                         attrib='data-order-trainer')

            runner['number'] = int(
                find(horse,
                     'span',
                     'RC-cardPage-runnerNumber-no',
                     attrib='data-order-no'))

            try:
                runner['draw'] = int(
                    find(horse,
                         'span',
                         'RC-cardPage-runnerNumber-draw',
                         attrib='data-order-draw'))
            except ValueError:
                runner['draw'] = None

            runner['headgear'] = find(horse, 'span',
                                      'RC-cardPage-runnerHeadGear')
            runner['headgear_first'] = find(
                horse, 'span', 'RC-cardPage-runnerHeadGear-first')

            for key, selector, attrib in optional_int_fields:
                try:
                    runner[key] = int(
                        find(horse, 'span', selector, attrib=attrib))
                except ValueError:
                    runner[key] = None

            claim = find(horse, 'span', 'RC-cardPage-runnerJockey-allowance')
            jockey = find(horse,
                          'a',
                          'RC-cardPage-runnerJockey-name',
                          attrib='data-order-jockey')

            if jockey:
                # Append the allowance in parentheses when there is one.
                runner['jockey'] = jockey if not claim else jockey + f'({claim})'
            else:
                runner['jockey'] = None

            try:
                runner['last_run'] = find(horse, 'div',
                                          'RC-cardPage-runnerStats-lastRun')
            except TypeError:
                runner['last_run'] = None

            runner['form'] = find(horse, 'span', 'RC-cardPage-runnerForm')

            try:
                runner['trainer_rtf'] = find(horse, 'span',
                                             'RC-cardPage-runnerTrainer-rtf')
            except TypeError:
                runner['trainer_rtf'] = None

        race['runners'] = list(runners.values())
        races[race['region']][race['course']][race['off_time']] = race

    return races