import os
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

# getPage and getHorseRaceResults are assumed to be defined in (or imported
# into) this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--output_dir', action='store', type=str, default='.',
                        help='output directory')
    parser.add_argument('--ids', action='store', nargs='+', type=str,
                        required=True, help='target id list')
    args = parser.parse_args()
    output_dirname = args.output_dir
    target_ids = args.ids

    for horse_id in tqdm(target_ids):
        url = "https://db.netkeiba.com/horse/{}/".format(horse_id)
        html = getPage(url)
        result = getHorseRaceResults(html)
        sleep(0.2)  # throttle requests

        output_filename = "horse_race_result_{}.csv".format(horse_id)
        output_filename = os.path.join(output_dirname, output_filename)
        df_out = pd.DataFrame(
            result, columns=["date", "name", "place", "prize", "weight"])
        df_out.to_csv(output_filename, index=False)
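# Hypothetical usage; the filename get_horse_race_results.py is an assumption,
# and the ids come from this repo's own test pages:
#
#   python get_horse_race_results.py --output_dir ./data \
#       --ids 2014102565 2015102894
#
# Entry guard matching the repo's other scripts:
if __name__ == "__main__":
    main()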
from argparse import ArgumentParser
from time import sleep

import pandas as pd

# getPage and getJockeyResult are assumed to be defined in (or imported into)
# this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--year', '-y', action='store', type=int,
                        required=True, help='target year')
    parser.add_argument('-n', action='store', type=int, default=None,
                        help='max sample')
    parser.add_argument('--output', '-o', action='store', type=str,
                        default='jockey_leading.csv', help='output filename')
    args = parser.parse_args()
    year = args.year
    max_sample = args.n
    output_filename = args.output

    results = []
    page = 1
    while True:
        html = getPage(
            "http://db.netkeiba.com/?pid=jockey_leading&year=%d&page=%d" %
            (year, page))
        page_result = getJockeyResult(html, offset=len(results))
        print('get {}'.format(len(page_result)))
        results += page_result
        if len(page_result) == 0:
            # an empty page means we have walked past the last page
            break
        if max_sample and len(results) >= max_sample:
            results = results[:max_sample]
            break
        page += 1
        sleep(1)  # throttle requests

    print(len(results))
    df = pd.DataFrame(results)
    # note: the 'dart' (dirt) spelling follows the keys emitted by
    # getJockeyResult
    reordered_cols = [
        'id', 'name', 'stable', 'win_count', 'second_place_count',
        'third_place_count', 'unplaced_count', 'grade_race_count',
        'grade_race_win', 'stakes_race_count', 'stakes_race_win',
        'general_race_count', 'general_race_win', 'turf_race_count',
        'turf_race_win', 'dart_race_count', 'dart_race_win', 'win_ratio',
        'in_second_place_ratio', 'in_third_place_ratio', 'prize'
    ]
    df = df[reordered_cols]
    df.to_csv(output_filename)
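# Hypothetical usage (the filename and year are example values, not from the
# original source):
#
#   python get_jockey_leading.py --year 2019 -n 100
#
if __name__ == "__main__":
    main()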
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

# getPage and getHorseProfile are assumed to be defined in (or imported into)
# this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--input', '-i', action='store', type=str,
                        default=None, help='horse data csv')
    parser.add_argument('--output', '-o', action='store', type=str,
                        default='horse_data.additional.csv',
                        help='output filename')
    parser.add_argument('ids', type=str, nargs='+')
    args = parser.parse_args()
    input_filename = args.input
    output_filename = args.output

    # collect target ids from the optional input csv and the command line
    ids = []
    if input_filename:
        df = pd.read_csv(input_filename)
        ids.extend(df['id'].values)
    ids.extend(args.ids)

    results = []
    for horse_id in tqdm(ids):
        url = "http://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getHorseProfile(html)
        results.append(result)
        sleep(0.2)  # throttle requests

    cols = [
        'id', 'name', 'sire', 'sire_id', 'mare', 'mare_id', 'bms', 'bms_id',
        'hair', 'sex', 'birth_date', 'trainer', 'trainer_id', 'owner',
        'owner_id', 'breeder', 'breeder_id', 'prize', 'race_result',
        'debut_weight', 'sales_price', 'relatives', 'maruchi', 'kakuchi'
    ]
    df_out = pd.DataFrame(results, columns=cols)
    df_out.to_csv(output_filename, index=False)
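# Hypothetical usage; the ids are positional and required while the input csv
# is optional (filename assumed, ids taken from this repo's test pages):
#
#   python get_horse_profiles.py -o horse_data.additional.csv \
#       2001100925 2014106083
#
if __name__ == "__main__":
    main()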
import os
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

# getPage and getMareCrops are assumed to be defined in (or imported into)
# this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--output_dir', action='store', type=str, default='.',
                        help='output directory')
    parser.add_argument('--ids', action='store', nargs='+', type=str,
                        required=True, help='target id list')
    args = parser.parse_args()
    output_dirname = args.output_dir
    mare_ids = args.ids

    for mare_id in tqdm(mare_ids):
        url = "https://db.netkeiba.com/horse/mare/{}/".format(mare_id)
        html = getPage(url)
        result = getMareCrops(html)
        sleep(0.2)  # throttle requests

        output_filename = "mare_crop_{}.csv".format(mare_id)
        output_filename = os.path.join(output_dirname, output_filename)
        df_out = pd.DataFrame(
            result, columns=["year", "name", "horse_id", "sex", "sire"])
        df_out.to_csv(output_filename, index=False)
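# Hypothetical usage (filename assumed; the mare id is borrowed from a
# commented-out test elsewhere in this repo):
#
#   python get_mare_crops.py --output_dir ./data --ids 000a011df8
#
if __name__ == "__main__":
    main()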
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

# getPage and getHorseAdditionalInfo are assumed to be defined in (or
# imported into) this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--input', '-i', action='store', type=str,
                        default='horse_ranking.csv', help='horse data csv')
    parser.add_argument('--output', '-o', action='store', type=str,
                        default='horse_data.additional.csv',
                        help='output filename')
    args = parser.parse_args()
    input_filename = args.input
    output_filename = args.output

    df = pd.read_csv(input_filename)
    ids = df['id'].values

    results = []
    for horse_id in tqdm(ids):
        url = "http://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getHorseAdditionalInfo(html)
        results.append(result)
        sleep(0.2)  # throttle requests

    df_out = pd.DataFrame(results)
    reordered_cols = [
        'id', 'name', 'hair', 'birth_date', 'race_result', 'debut_weight',
        'sales_price', 'relatives'
    ]
    df_out = df_out[reordered_cols]
    df_out.to_csv(output_filename)
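# Hypothetical usage (filename assumed; the defaults already point at
# horse_ranking.csv and horse_data.additional.csv):
#
#   python get_horse_additional_info.py -i horse_ranking.csv
#
if __name__ == "__main__":
    main()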
import os
import pickle
from argparse import ArgumentParser
from time import sleep

import pandas as pd
from tqdm import tqdm

# getPage, getMareCropsResult, getHorseIdByName, getHorseIdByName2 and the
# name-conversion tables mare_conv_tbl / bms_conv_tbl are assumed to be
# defined in (or imported into) this module.


def main():
    parser = ArgumentParser()
    parser.add_argument('--input', '-i', action='store', type=str,
                        default='horse_ranking.csv', help='horse data csv')
    parser.add_argument('--output', '-o', action='store', type=str,
                        default='mare_crop_data.csv', help='output filename')
    parser.add_argument('--cache', action='store', type=str,
                        default='horse_id.pkl', help='horse_id cache file')
    args = parser.parse_args()
    input_filename = args.input
    output_filename = args.output
    cache_filename = args.cache

    df = pd.read_csv(input_filename)
    mare_names = df['mare'].values
    bms_names = df['bms'].values

    if os.path.exists(cache_filename):
        # load the name -> horse_id cache
        with open(cache_filename, 'rb') as f:
            horse_ids = pickle.load(f)
    else:
        horse_ids = {}

    results = []
    for mare, bms in tqdm(zip(mare_names, bms_names), total=len(mare_names)):
        if mare in horse_ids:
            horse_id = horse_ids[mare]
        else:
            if mare in mare_conv_tbl:
                mare = mare_conv_tbl[mare]
            horse_id = getHorseIdByName(mare, sire=bms, sex=[2])
            if not horse_id:
                # try in partial match mode
                horse_id = getHorseIdByName2(mare, sire=bms, sex=[2])
            if not horse_id and bms in bms_conv_tbl:
                horse_id = getHorseIdByName2(mare, sire=bms_conv_tbl[bms],
                                             sex=[2])
            if not horse_id:
                # last resort: try without the sire parameter
                horse_id = getHorseIdByName(mare, sex=[2])
                if horse_id:
                    print("WARNING: horse_id is found without sire parameter "
                          "({})".format(mare))
            if not horse_id:
                print("WARNING: horse_id is not found ({})".format(mare))
                continue
            horse_ids[mare] = horse_id
            # update the cache on every new lookup
            with open(cache_filename, 'wb') as f:
                pickle.dump(horse_ids, f)

        # skip if we already fetched this mare
        if mare in [res['name'] for res in results]:
            continue

        url = "https://db.netkeiba.com/horse/{id}/".format(id=horse_id)
        html = getPage(url)
        result = getMareCropsResult(html)
        results.append(result)
        sleep(0.2)  # throttle requests

    df_out = pd.DataFrame(results)
    reordered_cols = [
        'id', 'name', 'birth_date', 'race_result', 'crop_count',
        'crop_win_count', 'crop_grade_horse_count', 'crop_grade_win_count'
    ]
    df_out = df_out[reordered_cols]
    df_out.to_csv(output_filename)
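# Hypothetical usage (filename assumed); the pickle cache lets interrupted
# runs resume without repeating name lookups:
#
#   python get_mare_crop_data.py -i horse_ranking.csv --cache horse_id.pkl
#
if __name__ == "__main__":
    main()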
            'name': name,
            'horse_id': horse_id,
            'sex': sex,
            'sire': sire
        })
    return crops[::-1]  # reverse the on-page order


if __name__ == "__main__":
    # ad-hoc test harness; swap in any of the commented-out pages below
    # html = getPage("https://db.netkeiba.com/horse/2014102565/")
    # html = getPage("https://db.netkeiba.com/horse/2015102894/")
    # html = getPage("https://db.netkeiba.com/horse/2014106083/")
    # html = getPage("https://db.netkeiba.com/horse/2016100893/")
    # html = getPage("https://db.netkeiba.com/horse/2016103387/")
    # html = getPage("https://db.netkeiba.com/horse/2016104532/")
    html = getPage("https://db.netkeiba.com/horse/2001100925/")
    result = getHorseProfile(html)

    # html = getPage("https://db.netkeiba.com/horse/2004104258/")
    # html = getPage("https://db.netkeiba.com/horse/1992108561/")
    # html = getPage("https://db.netkeiba.com/horse/2000106445/")
    # html = getPage("https://db.netkeiba.com/horse/2004102429/")
    # html = getPage("https://db.netkeiba.com/horse/000a013c70")
    # html = getPage("https://db.netkeiba.com/horse/000a011df8/")
    # result = getMareCropsResult(html)

    # result = getHorseIdByName('オルフェーヴル')
    # result = getHorseIdByName('スティンガー')
    # result = getHorseIdByName('トリプレックス')
    # result = getHorseIdByName('ラッキーライラック', sex=[2])
    breeder_id = link.get("href").split('/')[-2]
    # strip the "の近走成績" ("recent results of ...") suffix from the link title
    name = link.get("title").replace('の近走成績', '')
    result = {'alt_id': breeder_id, 'name': name}
    return result


if __name__ == "__main__":
    # ad-hoc test harness; swap in any of the commented-out cases below
    # html = getPage("http://db.netkeiba.com/horse/2014102565/")
    # html = getPage("http://db.netkeiba.com/horse/2014106083/")
    # result = getHorseAdditionalInfo(html)

    # html = getPage("http://db.netkeiba.com/horse/2004104258/")
    # html = getPage("http://db.netkeiba.com/horse/1992108561/")
    # html = getPage("http://db.netkeiba.com/horse/2000106445/")
    # html = getPage("http://db.netkeiba.com/horse/2004102429/")
    # result = getMareCropsResult(html)

    # result = getHorseIdByName('オルフェーヴル')
    # result = getHorseIdByName('スティンガー')
    # result = getHorseIdByName('トリプレックス')
    # result = getHorseIdByName('ラッキーライラック', sex=[2])
    # result = getHorseIdByName2('ベラドーラII')
    # result = getHorseIdByName2('Debit Or Credit')

    html = getPage("http://db.netkeiba.com/breeder/373126/")
    result = getBreederId(html)
    print(result)