Ejemplo n.º 1
0
def Scrapping_Data_Day(RaceDay):
    """
    Scrape historical race data for one race day and load it into RaceDb.

    Parameters
    ----------
    RaceDay : raceday identifier, passed through to Get_data

    Returns
    -------
    DataFrame of formatted results for every race of the day
    """
    #Start Timer
    print(RaceDay)
    start_time = time.time()

    #Scrap Data
    All_race_Result = Get_data(RaceDay)

    #Format every race and stack them in one pass.
    #pd.concat replaces the per-iteration DataFrame.append, which was
    #deprecated and removed in pandas 2.0 (and was quadratic in the number
    #of races). ignore_index=True gives the same clean 0..n-1 index the
    #original reset_index(drop=True) produced.
    Dataset = pd.concat(
        [Formatting_Result(result) for result in All_race_Result],
        sort=False, ignore_index=True)

    #Loading to Dataset
    Load_Dataset_toDatabase('RaceDb', Dataset)

    print("---- %s Races are Scrapped in %s minutes ----" %
          (len(All_race_Result), str(round(
              (time.time() - start_time) / 60, 4))))

    return Dataset
Ejemplo n.º 2
0
def Get_Stable():
    """
    Scrape stable (trainer) names for trainer numbers 1-999 not yet on
    record, and load any newly found names into SNameDb.

    Returns
    -------
    None — new [SNAME, SNum] rows are written to the SNameDb table.
    """
    #Start Timer
    start_time = time.time()
    #Selenium Options — headless Chrome session
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--log-level=OFF")
    web_driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  options=chrome_options)

    #Collect rows in a plain list; building the DataFrame once at the end
    #replaces the deprecated DataFrame.append (removed in pandas 2.0).
    found = []
    try:
        #Get Current list of Stables on record
        Data = Extraction_Database("""
                                   Select Distinct SNum from SnameDb
                                   """)['SNum'].values.tolist()
        To_scrap = list(set(map(str, range(1, 1000))) - set(Data))
        for i in To_scrap:
            web_driver.get(
                'https://racing.appledaily.com.hk/search-horse/result?trainer=' +
                i)
            time.sleep(3)
            _Login(web_driver)
            #Wait for the page to load
            time.sleep(3)
            raw_html = web_driver.page_source
            try:
                name = str(raw_html).split('養馬數目')[0].split(
                    '<div data-v-6bfbe05a="">')[-2].split('</div>')[0]
                #Sanity check: genuine stable names are short strings
                if len(name) < 10:
                    found.append([name, i])
            except IndexError:
                #Page lacked the expected markup — no stable for this number
                pass
    finally:
        #Always close the browser session, even if scraping raised
        web_driver.close()

    Stable_DF = pd.DataFrame(found, columns=['SNAME', 'SNum'])
    if len(Stable_DF) != 0:
        #Loading to Dataset
        Load_Dataset_toDatabase('SNameDb', Stable_DF)
    print("---- %s Stable Names are Scrapped in %s minutes ----" %
          (len(Stable_DF), str(round((time.time() - start_time) / 60, 4))))

    return None
Ejemplo n.º 3
0
def Update_Running_Stat(Dataset):

    """
    Update all running-statistics ELO tables (horse / jockey / stable)
    after a raceday.

    Parameters
    ----------
    Dataset : DataFrame of post-match data for one raceday; must contain at
        least RARID, HNAME, JNAME, SNAME, RADIS, RATRA, RAGOG, RALOC columns.

    Returns
    -------
    None — RS_HORSE_ELO, RS_JOCKEY_ELO and RS_STABLE_ELO are rewritten
    in the database for every participant of the day.
    """
    # Each race is processed independently, in raceday order
    for RARID, Race in Dataset.groupby('RARID'):
        Horse_ELO = []
        # SQL-ready "(a, b, ...)" IN-lists of the names appearing in this race
        HNAME_List = '('+str(Race['HNAME'].tolist())[1:-1]+')'
        JNAME_List = '('+str(Race['JNAME'].tolist())[1:-1]+')'
        SNAME_List = '('+str(Race['SNAME'].tolist())[1:-1]+')'
        # Column names of the condition-specific horse-ELO variants:
        # distance, surface (turf vs all-weather), going, and full profile
        Dist_ELO = 'HELO_'+str(Race.loc[:,'RADIS'].values[0])
        Sur_ELO = 'HELO_' + Race.loc[:,'RATRA'].apply(lambda x : 'TURF' if x == 'T' else 'AW').values[0]
        GOG_ELO = 'HELO_' + Going_map[Race.loc[:,'RAGOG'].values[0].strip()]
        PFL_ELO = str(Race.loc[:,'RALOC'].values[0])+'_'+str(Race.loc[:,'RADIS'].values[0])+'_'\
            +str(Race.loc[:,'RATRA'].values[0])
        # Overall ELO plus each condition-specific variant, all with K = 128
        for Target in ['HELO',Dist_ELO,Sur_ELO,GOG_ELO,PFL_ELO]:
            Horse_ELO.append(Calculate_HELO(Target, Race, K = 128))
        HELO_DF = reduce(lambda x, y: pd.merge(x, y, on = 'HNAME'), Horse_ELO)
        JELO_DF = Calculate_JELO(Race, K = 128)
        SELO_DF = Calculate_SELO(Race, K = 128)

        #Update HELO Score to Database
        HPrior_DF = Extraction_Database("""
                                       Select * from RS_HORSE_ELO where HNAME in {HNAME_List}
                                       """.format(HNAME_List = HNAME_List))

        # Keep only the prior columns that were NOT recomputed this round,
        # then attach them to the fresh scores (left merge on HNAME)
        HPrior_DF = HPrior_DF.loc[:,[i for i in HPrior_DF.columns if i not in HELO_DF.columns[1:]]]
        HELO_DF = HELO_DF.merge(HPrior_DF, how='left')

        # Delete-then-reload: replaces each horse's row wholesale
        General_Query_Database("""
                               DELETE FROM RS_HORSE_ELO where HNAME in {HNAME_List}
                               """.format(HNAME_List = HNAME_List))
        Load_Dataset_toDatabase('RS_HORSE_ELO',HELO_DF)

        #Update JELO Score to Database — same merge-prior / delete / reload pattern
        JPrior_DF = Extraction_Database("""
                                       Select * from RS_JOCKEY_ELO where JNAME in {JNAME_List}
                                       """.format(JNAME_List = JNAME_List))

        JPrior_DF = JPrior_DF.loc[:,[i for i in JPrior_DF.columns if i not in JELO_DF.columns[1:]]]
        JELO_DF = JELO_DF.merge(JPrior_DF, how='left')

        General_Query_Database("""
                               DELETE FROM RS_JOCKEY_ELO where JNAME in {JNAME_List}
                               """.format(JNAME_List = JNAME_List))
        Load_Dataset_toDatabase('RS_JOCKEY_ELO',JELO_DF)

        #Update SELO Score to Database — same merge-prior / delete / reload pattern
        SPrior_DF = Extraction_Database("""
                                       Select * from RS_STABLE_ELO where SNAME in {SNAME_List}
                                       """.format(SNAME_List = SNAME_List))

        SPrior_DF = SPrior_DF.loc[:,[i for i in SPrior_DF.columns if i not in SELO_DF.columns[1:]]]
        SELO_DF = SELO_DF.merge(SPrior_DF, how='left')

        General_Query_Database("""
                               DELETE FROM RS_STABLE_ELO where SNAME in {SNAME_List}
                               """.format(SNAME_List = SNAME_List))
        Load_Dataset_toDatabase('RS_STABLE_ELO',SELO_DF)

    return None
Ejemplo n.º 4
0
def parallel_pace_charts(Raceday):
    """
    Build the sectional parallel pace chart table (Parallel_PaceDb).

    For every race profile (RALOC_RADIS_RATRA) and each section S1..S6, a
    finishing-time -> pace-figure mapping is built: the profile's historical
    average sectional time scores 80, and each second away from it shifts
    the figure by the profile's per-second increment.

    Parameters
    ----------
    Raceday : int, YYYYMMDD; only races strictly before this date feed the
        equivalent-time averages.

    Returns
    -------
    None — the chart is written to the Parallel_PaceDb table.
    """
    #For the 2012 season, give the model something to start with
    Raceday = max(Raceday, 20130710)

    """
    Time Candidates
    """
    Parallel_Chart = pd.DataFrame()
    #Fastest / slowest sectional times on record bound the time vector.
    #These queries take no parameters, so the previous no-op
    #.format(Raceday=Raceday) calls were removed.
    min_time = max(0, Extraction_Database("""
                                   Select * from
                                   (Select min(RESS1) from RaceDb where RESS1 > 0),
                                   (Select min(RESS2) from RaceDb where RESS2 > 0),
                                   (Select min(RESS3) from RaceDb where RESS3 > 0),
                                   (Select min(RESS4) from RaceDb where RESS4 > 0),
                                   (Select min(RESS5) from RaceDb where RESS5 > 0),
                                   (Select min(RESS6) from RaceDb where RESS6 > 0)
                                   """).values.min())
    max_time = Extraction_Database("""
                                   Select * from
                                   (Select max(RESS1) from RaceDb where RESS1 > 0),
                                   (Select max(RESS2) from RaceDb where RESS2 > 0),
                                   (Select max(RESS3) from RaceDb where RESS3 > 0),
                                   (Select max(RESS4) from RaceDb where RESS4 > 0),
                                   (Select max(RESS5) from RaceDb where RESS5 > 0),
                                   (Select max(RESS6) from RaceDb where RESS6 > 0)
                                   """).values.max()
    #Candidate times at 0.01s resolution, padded below and above the extremes
    Parallel_Chart.loc[:,'time_vector'] = [round(i,2) for i in np.arange(min_time-10, max_time+3, 0.01)]

    """
    Equivalent Time
    """
    #Average sectional time per race profile, from races before Raceday
    equivalent_time = Extraction_Database("""Select RADIS, RATRA, RALOC, round(avg(RESS1),2) S1_eq_time, round(avg(RESS2),2) S2_eq_time,
                                          round(avg(RESS3),2) S3_eq_time, round(avg(RESS4),2) S4_eq_time, round(avg(RESS5),2) S5_eq_time,
                                          round(avg(RESS6),2) S6_eq_time from RaceDb
                                          where RADAT < {Raceday}
                                          group by RADIS, RATRA, RALOC""".format(Raceday = Raceday))
    #Profile key e.g. "ST_1200_T"; RADIS round-trips through str for the join
    equivalent_time.loc[:,'RADIS'] = equivalent_time.loc[:,'RADIS'].map(str)
    equivalent_time.loc[:,'profile'] = equivalent_time.loc[:,['RALOC', 'RADIS', 'RATRA']].agg('_'.join, axis=1)
    equivalent_time.loc[:,'RADIS'] = equivalent_time.loc[:,'RADIS'].map(int)

    #Per-second figure increment: section length / average section time
    Distance_List = equivalent_time.loc[:,'RADIS'].unique()
    for distance in Distance_List:
        condition = equivalent_time.loc[:,'RADIS'] == distance
        equivalent_time.loc[condition,'S1_1s'] = round(Sectional_Dict[distance][0] / (equivalent_time.loc[condition,'S1_eq_time'] * 1), 3)
        equivalent_time.loc[condition,'S2_1s'] = round(Sectional_Dict[distance][1] / (equivalent_time.loc[condition,'S2_eq_time'] * 1), 3)
        equivalent_time.loc[condition,'S3_1s'] = round(Sectional_Dict[distance][2] / (equivalent_time.loc[condition,'S3_eq_time'] * 1), 3)
        try :
            #Shorter distances have fewer than 6 sections — missing entries
            #in Sectional_Dict simply leave S4-S6 unset for that profile
            equivalent_time.loc[condition,'S4_1s'] = round(Sectional_Dict[distance][3] / (equivalent_time.loc[condition,'S4_eq_time'] * 1), 3)
            equivalent_time.loc[condition,'S5_1s'] = round(Sectional_Dict[distance][4] / (equivalent_time.loc[condition,'S5_eq_time'] * 1), 3)
            equivalent_time.loc[condition,'S6_1s'] = round(Sectional_Dict[distance][5] / (equivalent_time.loc[condition,'S6_eq_time'] * 1), 3)
        except (KeyError, IndexError):
            pass

    """
    Parallel Charts
    """
    for index, row in equivalent_time.iterrows():
        for Section in ['S1', 'S2', 'S3', 'S4', 'S5', 'S6']:
            Time_DF = []
            eq_time = row[Section+'_eq_time']
            increment = row[Section+'_1s']
            if eq_time != 0:
                #Top Half: slower than equivalent time -> figure below 80
                for i in np.arange(eq_time, max_time, 0.01):
                    Time_DF.append([round(i,2), round(80 - (round(i,2) - eq_time) * increment, 3)])
                #Bottom Half: faster than equivalent time -> figure above 80
                for i in np.arange(min_time, eq_time, 0.01):
                    Time_DF.append([round(i,2), round(80 + (eq_time - round(i,2)) * increment, 3)])

                #Merging to Parallel Chart
                Parallel_Chart = Parallel_Chart.merge(pd.DataFrame(Time_DF, columns=['time', row['profile']+'_'+Section]), how='left',left_on='time_vector',right_on='time')
                Parallel_Chart.drop(columns=['time'], inplace = True)

    """
    Uploading to Database
    """
    #Recreate the destination table. Note: sqlite3's connection context
    #manager only wraps a transaction (commit/rollback) — it does NOT close
    #the connection — so the connection is closed explicitly in `finally`.
    db = sqlite3.connect('Data.db')
    try:
        sql = db.cursor()
        sql.execute("drop table if exists Parallel_PaceDb")
        sql.execute("""
                    Create Table Parallel_PaceDb(
                    time_vector real, HV_1000_T_S1 real, HV_1000_T_S2 real, HV_1000_T_S3 real, ST_1000_T_S1 real, ST_1000_T_S2 real, ST_1000_T_S3 real,
                    ST_1200_AW_S1 real, ST_1200_AW_S2 real, ST_1200_AW_S3 real, HV_1200_T_S1 real, HV_1200_T_S2 real, HV_1200_T_S3 real, ST_1200_T_S1 real,
                    ST_1200_T_S2 real, ST_1200_T_S3 real, ST_1400_T_S1 real, ST_1400_T_S2 real, ST_1400_T_S3 real, ST_1400_T_S4 real, ST_1600_T_S1 real,
                    ST_1600_T_S2 real, ST_1600_T_S3 real, ST_1600_T_S4 real, ST_1650_AW_S1 real, ST_1650_AW_S2 real, ST_1650_AW_S3 real, ST_1650_AW_S4 real,
                    HV_1650_T_S1 real, HV_1650_T_S2 real, HV_1650_T_S3 real, HV_1650_T_S4 real, ST_1800_AW_S1 real, ST_1800_AW_S2 real, ST_1800_AW_S3 real,
                    ST_1800_AW_S4 real, ST_1800_AW_S5 real, HV_1800_T_S1 real, HV_1800_T_S2 real, HV_1800_T_S3 real, HV_1800_T_S4 real, HV_1800_T_S5 real,
                    ST_1800_T_S1 real, ST_1800_T_S2 real, ST_1800_T_S3 real, ST_1800_T_S4 real, ST_1800_T_S5 real,ST_2000_T_S1 real, ST_2000_T_S2 real,
                    ST_2000_T_S3 real, ST_2000_T_S4 real, ST_2000_T_S5 real, HV_2200_T_S1 real, HV_2200_T_S2 real, HV_2200_T_S3 real, HV_2200_T_S4 real,
                    HV_2200_T_S5 real, HV_2200_T_S6 real, ST_2200_T_S1 real, ST_2200_T_S2 real, ST_2200_T_S3 real, ST_2200_T_S4 real, ST_2200_T_S5 real,
                    ST_2200_T_S6 real, ST_2400_T_S1 real, ST_2400_T_S2 real, ST_2400_T_S3 real, ST_2400_T_S4 real, ST_2400_T_S5 real, ST_2400_T_S6 real)
                    """)
        db.commit()
        sql.close()
    finally:
        db.close()

    Load_Dataset_toDatabase('Parallel_PaceDb', Parallel_Chart)

    return None
Ejemplo n.º 5
0
def parallel_speed_charts(Raceday):
    """
    Build the final-time parallel speed chart table (Parallel_SpeedDb).

    For every race profile (RALOC_RADIS_RATRA) a finishing-time -> speed
    figure mapping is built: the profile's historical average final time
    scores 80, and each second away from it shifts the figure by the
    profile's per-second increment (metres per second of the average run).

    Parameters
    ----------
    Raceday : int, YYYYMMDD; only races strictly before this date feed the
        equivalent-time averages.

    Returns
    -------
    None — the chart is written to the Parallel_SpeedDb table.
    """
    #For the 2012 season, give the model something to start with
    Raceday = max(Raceday, 20130710)

    """
    Time Candidates
    """
    #Candidate final times from 45s to 175s at 0.01s resolution
    Parallel_Chart = pd.DataFrame()
    Parallel_Chart['time_vector'] = [round(i,2) for i in np.arange(45, 175, 0.01)]

    """
    Equivalent Time
    """
    #Average final time per race profile, from races before Raceday
    equivalent_time = Extraction_Database("""Select round(avg(RESFT),2) eq_time, RADIS, RATRA, RALOC from RaceDb
                                          where RADAT < {Raceday}
                                          group by RADIS, RATRA, RALOC""".format(Raceday = Raceday))
    #Profile key e.g. "ST_1200_T"; RADIS round-trips through str for the join
    equivalent_time.loc[:,'RADIS'] = equivalent_time['RADIS'].map(str)
    equivalent_time.loc[:,'profile'] = equivalent_time.loc[:,['RALOC', 'RADIS', 'RATRA']].agg('_'.join, axis=1)
    equivalent_time.loc[:,'RADIS'] = equivalent_time.loc[:,'RADIS'].map(int)
    #Per-second figure increment: distance / average final time
    equivalent_time.loc[:,'1s'] = round(equivalent_time.loc[:,'RADIS'] / (equivalent_time.loc[:,'eq_time'] * 1), 3)

    """
    Parallel Charts
    """
    for index, row in equivalent_time.iterrows():

        Dist_DF = []
        eq_time = row['eq_time']
        increment = row['1s']

        #Top Half: slower than equivalent time -> figure below 80
        for i in np.arange(eq_time, 175, 0.01):
            Dist_DF.append([round(i,2), round(80 - (round(i,2) - eq_time) * increment, 3)])

        #Bottom Half: faster than equivalent time -> figure above 80
        for i in np.arange(45, eq_time, 0.01):
            Dist_DF.append([round(i,2), round(80 + (eq_time - round(i,2)) * increment, 3)])

        #Merging to Parallel Chart
        Parallel_Chart = Parallel_Chart.merge(pd.DataFrame(Dist_DF, columns=['time', row['profile']]), how='left',left_on='time_vector',right_on='time')
        Parallel_Chart.drop(columns=['time'], inplace = True)

    """
    Uploading to Database
    """
    #Recreate the destination table. Note: sqlite3's connection context
    #manager only wraps a transaction (commit/rollback) — it does NOT close
    #the connection — so the connection is closed explicitly in `finally`.
    db = sqlite3.connect('Data.db')
    try:
        sql = db.cursor()
        sql.execute("drop table if exists Parallel_SpeedDb")
        sql.execute("""
                    Create Table Parallel_SpeedDb(
                    time_vector real, HV_1000_T real, ST_1000_T real, ST_1200_AW real, HV_1200_T real, ST_1200_T real, ST_1400_T real, ST_1600_T real,
                    ST_1650_AW real, HV_1650_T real, ST_1800_AW real, HV_1800_T real, ST_1800_T real, ST_2000_T real, HV_2200_T real, ST_2200_T real, ST_2400_T real)
                    """)
        db.commit()
        sql.close()
    finally:
        db.close()

    Load_Dataset_toDatabase('Parallel_SpeedDb', Parallel_Chart)

    return None #print("---- Parallel_SpeedDb is Created ----")
Ejemplo n.º 6
0
def Update_Race_PosteriroDb(index):
    """
    Update Race_PosteriorDb with post-race figures for one raceday.

    Parameters
    ----------
    index : tuple of (Feature_DF, Result_DF) — the raceday's feature
        dataset and its post-race result dataset.

    Returns
    -------
    None — the combined figures are written to the Race_PosteriorDb table.
    """
    feature_df, result_df = index

    #Build the race profile key (e.g. "ST_1200_T") from location/distance/surface
    result_df.loc[:, 'RADIS'] = result_df.loc[:, 'RADIS'].map(str)
    result_df.loc[:, 'Profile'] = result_df.loc[:, ['RALOC', 'RADIS', 'RATRA']].agg('_'.join, axis=1)

    #Raceday is retained for the currently-disabled parallel-chart recalculation
    Raceday = int(result_df.loc[:, 'RADAT'].to_list()[0])
    # parallel_speed_charts(Raceday)
    # parallel_pace_charts(Raceday)

    #Identifier / condition columns carried through to the output
    identifiers = result_df.loc[:, ['RARID', 'HNAME', 'JNAME', 'SNAME', 'RADAT', 'Profile', 'RARAL', 'RALOC', 'RAGOG', 'RADIS', 'HDRAW', 'RATRA', 'RACLS', 'RESFP']]

    """
    Beyer Speed Figure, Beaten Length Figure, Sartin Pace Figures, Preference Residual
    """
    beyer_df = Beyer_Speed(result_df)
    pace_df = Pace_Figure(result_df)

    #Adjust both figure sets by the daily track variant
    beyer_df, pace_df = Track_Variant(beyer_df, pace_df, result_df)

    sartin_df = Sartin_Pace(pd.merge(pace_df, result_df, on=['HNAME', 'RARID']))
    beaten_df = Beaten_Length(result_df)
    residual_df = Preference_Residuals(feature_df, result_df)

    #Fold every figure set into one frame, keyed on horse + race id
    figures_df = identifiers
    for frame in (beyer_df, beaten_df, pace_df, sartin_df, residual_df):
        figures_df = pd.merge(figures_df, frame, on=['HNAME', 'RARID'])

    """
    Load to Race_PosteriorDb
    """
    figures_df = figures_df.loc[:,['HNAME','JNAME','SNAME','RARID','RADAT','RESFP','RARAL','Profile','RALOC','HDRAW','RADIS','RATRA','RAGOG',
                             'RACLS','RESFP','BEYER_SPEED','EARLY_PACE','FINAL_FRACTION_PACE','AVERAGE_PAGE','SUSTAINED_PACE',
                             'EARLY_ENERGY','BEATEN_FIGURE','PACE_S1','PACE_S2','PACE_S3','PACE_S4','PACE_S5','PACE_S6',
                             'HPRE_DIST_RES','HPRE_GO_RES','HPRE_SUR_RES','HPRE_PFL_RES','JPRE_DIST_RES','JPRE_GO_RES',
                             'JPRE_SUR_RES','JPRE_LOC_RES','JPRE_PFL_RES','SPRE_DIST_RES','SPRE_GO_RES','SPRE_SUR_RES',
                             'SPRE_LOC_RES','SPRE_PFL_RES']]

    Load_Dataset_toDatabase('Race_PosteriorDb', figures_df)

    return None
Ejemplo n.º 7
0
def Formatting_Result(data, Odds):
    """
    Format a scraped race-result page into a per-horse result DataFrame.

    Parameters
    ----------
    data : BeautifulSoup-parsed html of one race's result page
    Odds : unused in this implementation — TODO confirm whether callers rely on it

    Returns
    -------
    DataFrame of results, one row per horse that actually raced
    (non-starters are logged to Irregular_RecordDb and dropped).
    """
    """
    Set Up
    """
    Dataframe = pd.DataFrame()

    # Extract the raw result table text: one list of non-empty cell strings per row
    Result = []
    table = data.find('table', attrs={'class': 'race_entry'})
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        Result.append([ele for ele in cols if ele])
    # Drop the first row and the last two rows (presumably header/footer — verify against page layout)
    Result = pd.DataFrame(Result[1:-2])
    """
    If last row is missing horse number
    """
    # When column 0 of the last (or second-to-last) row is not numeric, that
    # row's cells are shifted; re-align by inserting a synthetic horse number
    # (200 / 201) at the front and dropping the overflow cell at the end.
    try:
        last_item = int(Result[0].tail(1).values[0])
    except:
        last_item = Result[0].tail(1).values[0]
    try:
        last_last_item = int(Result[0].tail(2).values[0])
    except:
        last_last_item = Result[0].tail(2).values[0]
    if type(last_item) == str:
        Result.iloc[-1] = pd.concat([pd.Series([200]), Result.iloc[-1][:-1]
                                     ]).reset_index(drop=True)
    if type(last_last_item) == str:
        Result.iloc[-2] = pd.concat([pd.Series([201]), Result.iloc[-2][:-1]
                                     ]).reset_index(drop=True)

    #HNUMI : horse number within the race (table column 0)
    Dataframe['HNUMI'] = Result[0].map(int)

    #RADAT : parse the race date (YYYYMMDD) from the page header sections
    Condition = str(data).split('race-day-race__content')[3:]
    # zero-pad single-digit month/day extracted from the 年/月/日 header text
    mon_day = list(map(lambda x: '0'+x if len(x) == 1 else x, \
                       [Condition[0].split('年')[1].split('月')[0], Condition[0].split('月')[1].split('日')[0]]))
    Date = Condition[0].split('年')[0][-4:] + mon_day[0] + mon_day[1]

    #Delete horses that did not race
    # Withdrawal markers in the position / finish / bet columns flag non-starters
    Irregular_List = ['退出', '00.00', '-']
    Irr_Horse = Result.index[Result[13].map(lambda x: x in Irregular_List)].tolist() \
    + Result.index[Result[14].map(lambda x: x in Irregular_List)].tolist() \
    + Result.index[Result[11].map(lambda x: x in Irregular_List)].tolist()

    #Load HNAME to Irregular_RecordDb so the incident is kept on record
    Irr_HNAME = pd.DataFrame()
    Irr_HNAME['HNAME'] = Result.loc[Irr_Horse][1].drop_duplicates()
    Irr_HNAME['INCIDENT_DATE'] = Date
    Load_Dataset_toDatabase('Irregular_RecordDb', Irr_HNAME)
    # Zero out all data columns of the non-starters before dropping them
    Result.loc[Irr_Horse, 1:] = 0

    # #Remove Duplicates
    Irr_Horse = list(set(Irr_Horse))
    # Irr_Horse = Result[0][list(set(Irr_Horse))].tolist()
    # Irr_Horse = [int(x) for x in Irr_Horse]

    #Delete horses that did not race
    try:
        # Irr_HNum = list(Result.loc[Irr_Horse,0].values)
        Dataframe.drop(Irr_Horse, inplace=True)
        Result.drop(Irr_Horse, inplace=True)
    except:
        pass
    """
    Results
    """
    #HBWEI : horse body weight (column 8)
    Dataframe['HBWEI'] = Result[8].map(int)

    #HAGEI : horse age (column 2)
    Dataframe['HAGEI'] = Result[2].map(int)

    #HDRAW : draw / gate number (column 5)
    Dataframe['HDRAW'] = Result[5].map(int)

    #HJRAT : rating (column 6); fall back to the header's 評分 value, else 0
    try:
        Dataframe['HJRAT'] = Result[6].map(int)
    except:
        try:
            if Condition[1].split('評分 (')[1].split(')<')[0][-2:] in ['', []]:
                Dataframe['HJRAT'] = 0
            else:
                Dataframe['HJRAT'] = Condition[1].split('評分 (')[1].split(
                    ')<')[0][-2:]
        except:
            Dataframe['HJRAT'] = 0
    #HNAME : horse name (column 1)
    Dataframe['HNAME'] = Result[1]

    #HWEIC : weight carried (column 4)
    Dataframe['HWEIC'] = Result[4].map(int)

    #JNAME : jockey name (column 3)
    Dataframe['JNAME'] = Result[3]

    #RARUN : number of runners (count of non-empty horse-name cells)
    Dataframe['RARUN'] = Result[1].astype(bool).sum(axis=0)

    #RESFP : finishing position, with dead-heat markers stripped
    def RESFP(x):
        # 併頭馬 / 平頭馬 mark dead heats; keep only the numeric position
        if '併頭馬' in str(x):
            x = x.replace('併頭馬', '')
        if '平頭馬' in str(x):
            x = x.replace('平頭馬', '')
        return x

    Dataframe['RESFP'] = Result[14].map(RESFP)

    #RESFT : finishing time in seconds
    def finishing_time(x):
        # 7-char "M.SS.ss" -> minutes + seconds; otherwise already in seconds.
        # Unparseable values fall through to None.
        try:
            if len(x) == 7:
                return round(int(x.split('.')[0]) * 60 + float(x[2:]), 2)
            else:
                return float(x)
        except:
            pass

    Dataframe['RESFT'] = Result[16].map(finishing_time)

    #RESP123456, RESS123456 : per-section positions and times
    #Remove Irr Races
    Res_copy = Result.copy()
    try:
        Res_copy.drop(Irr_Horse, inplace=True)
    except:
        pass

    #Remove front 0.00 (padding entries before the real sectional times)
    SS = []
    for i in Res_copy[15]:
        i = str(i).split()
        i = [x for x in i if x != '0.00']
        SS.append(i)
    Res_copy[15] = SS

    #Seperate Sections — missing sections (shorter races) stay at the 0 default
    for i in range(1, 7):
        Dataframe['RESP' + str(i)] = 0
        Dataframe['RESS' + str(i)] = 0
        try:
            Dataframe['RESS' + str(i)] = Res_copy[15].map(lambda x: x[i - 1])
            Dataframe['RESP' + str(i)] = Res_copy[13].map(
                lambda x: float(str(x).split()[i - 1]))
        except:
            pass
    #RESPB : bet amounts; "1,234" style is exact, bare numbers are in 萬 (x10000)
    def Bets(x):
        if ',' in str(x):
            return int(x.replace(',', ''))
        else:
            return int(x) * 10000

    Dataframe['RESPB'] = Result[11].map(Bets)

    #RESWB : win bets (column 10, same format as RESPB)
    Dataframe['RESWB'] = Result[10].map(Bets)

    #RESWD : winning distance (margin behind winner), translated from
    #Chinese margin terms to lengths; fractions like "1-1/4" are parsed
    def Winning_Dist(x):
        if x == '頭馬':
            return 0
        elif x == '鼻':
            return 0.1
        elif x == '短頭':
            return 0.2
        elif x == '頭':
            return 0.3
        elif x == '頸':
            return 0.4
        elif x == '多位':
            return 50
        elif '-' in str(x):
            y = str(x).split('-')
            z = y[1].split('/')
            return int(y[0]) + (int(z[0]) / int(z[1]))
        elif '/' in str(x):
            y = str(x).split('/')
            return int(y[0]) / int(y[1])
        else:
            try:
                return int(x)
            except:
                return 50

    Dataframe['RESWD'] = Result[17].map(Winning_Dist)
    # The winner (margin 0) is re-marked with the negative of the smallest
    # positive margin in the race
    Dataframe['RESWD'] = Dataframe['RESWD'].replace(
        0, -1 * min([n for n in Dataframe['RESWD'].tolist() if n > 0]))

    #RESWL : win flag — 1 for finishing position 1, else 0
    Dataframe['RESWL'] = (Dataframe['RESFP'].astype(int) < 2).astype(int)

    #RESWT : winning time of the race (fastest finishing time)
    Dataframe['RESWT'] = Dataframe['RESFT'].min()

    #SNAME : stable names looked up from SNameDb by trainer number
    Stable_List = []
    #Get stable numbers, filter out any & and characters
    Stable_Num = [
        re.sub("\D", "", Stable[:3])
        for Stable in str(data).split('trainer=')[1:]
    ]
    # Discard one trainer entry per irregular horse — assumes non-starters
    # appear at the end of the trainer list; TODO confirm against page order
    for _, _ in enumerate(Irr_Horse):
        Stable_Num.pop()
    for i in Stable_Num:
        Stable_List.append(
            Extraction_Database(
                """ Select SName from SNameDb where SNum = ? """,
                [i]).values.tolist()[0][0])
    Dataframe['SNAME'] = Stable_List
    """
    Conditions
    """
    #RACLS : race class, translated from Chinese numerals to 1-5
    Dataframe['RACLS'] = Condition[1].split('班')[0][-1]
    # Unrated race (HJRAT fell back to 0) -> treated as class 五 (5)
    if Dataframe['HJRAT'][0] == 0:
        Dataframe['RACLS'] = '五'
    Dataframe['RACLS'].replace(['一', '二', '三', '四', '五', '"'],
                               [1, 2, 3, 4, 5, 0],
                               inplace=True)

    #RARAL : rail configuration; non-rail text (contains 賽) maps to 'NA'
    Dataframe['RARAL'] = Condition[0].split('跑道')[0].split('地')[-1]

    def check_Rail(rail):
        if '賽' in rail:
            return 'NA'
        else:
            return rail

    Dataframe['RARAL'] = Dataframe['RARAL'].map(check_Rail)

    #RADIS : race distance in metres (4 digits before 米)
    Dataframe['RADIS'] = Condition[1].split('米')[0][-4:]

    #RADAT : race date parsed earlier
    Dataframe['RADAT'] = Date

    #RESSP : average speed = distance / finishing time
    Dataframe['RESSP'] = Dataframe['RADIS'].astype(float) / Dataframe['RESFT']

    #RAGOG : going (track condition) from the 場地 header field
    Dataframe['RAGOG'] = Condition[2].split('場地:')[1][0:3]

    #RALOC : location — 沙 (Sha Tin) -> 'ST', 快 (Happy Valley) -> 'HV'
    Dataframe['RALOC'] = Condition[0].split('日')[1][0]
    Dataframe['RALOC'].replace(['沙', '快'], ['ST', 'HV'], inplace=True)

    #RARID : race id = date + two-digit race number (第N場, Chinese numeral)
    Dataframe['RARID'] = Condition[1].split('第')[1].split('場')[0]
    Dataframe['RARID'].replace(['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四']\
             ,['01','02','03','04','05','06','07','08','09','10','11','12','13','14'], inplace = True)
    Dataframe['RARID'] = Date + Dataframe['RARID']

    #RASEA : racing season a race belongs to
    def Season_from_RARID(x):
        if int(str(x)
               [4:6]) < 9:  #Matches before September -> Considered Last Season
            return int(str(x)[0:4]) - 1
        else:
            return int(str(x)[0:4])

    Dataframe['RASEA'] = Dataframe['RARID'].map(Season_from_RARID)

    #RATRA : surface — 草 (turf) -> 'T', 泥 (dirt) -> 'AW'
    Dataframe['RATRA'] = Condition[1].split('米')[1].split('(')[1][0]
    Dataframe['RATRA'].replace(['草', '泥'], ['T', 'AW'], inplace=True)
    """
    Odds
    """
    # try :
    #     Result[0] = Result[0].map(int)
    #     Dataframe.set_index('HNUMI', inplace=True, drop = False)
    #     Result.set_index(0, inplace=True, drop = False)
    #     Dataframe.sort_index(inplace=True)
    #     Result.sort_index(inplace=True)
    #     Dataframe.drop(Irr_Horse)
    #     Dataframe.drop(Irr_Horse, inplace = True)
    #     Result.drop(Irr_Horse, inplace = True)
    # except:
    #     pass
    #Fill avaliable odds data: RESFO = final win odds, ODPLA = place odds
    Dataframe['RESFO'] = Result[9].map(
        lambda x: x.split('\n')[1].strip().split(' ')[1]).fillna(0)
    Dataframe['ODPLA'] = Result[12].fillna(0)

    Dataframe.sort_index(inplace=True)

    return Dataframe
Ejemplo n.º 8
0
def One_Race_Feature(Dataframe):
    """
    Feature Creation for one Race.

    Parameters
    ----------
    Dataframe : matchday dataset from the racecard; must contain at least
        RARID, HNAME and RADAT columns.

    Returns
    -------
    Feature DataFrame — one row per horse, with base features plus their
    transformations, all cast to float; also inserted into FeatureDb.
    """
    """
    Get Feature Names
    """
    #Get Feature_List from FeatureDb
    Feature_List = Feature_Storage.Feature_List  #list(Extraction_Database("""PRAGMA table_info('FeatureDb')""")['name'])
    # Feature_List.remove('RARID')    Feature_List.remove('HNAME')
    Features_Dataframe = Dataframe.loc[:, ['RARID', 'HNAME']]
    """
    Create Features in Parallel
    """
    #Prepare Matchday Dataset: SQL-ready "(…)" list of horse names and the raceday
    HNAME_List = '(' + str(Dataframe['HNAME'].tolist())[1:-1] + ')'
    Raceday = Dataframe.loc[:, 'RADAT'].values[0]

    # results = []
    # for Feature in Feature_List:
    #     results.append(Create_Features([Feature, Dataframe, HNAME_List, Raceday]))

    # Silence noisy warnings from feature computation; restored to default below
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Fan out one process task per feature, then collect in submission order
    results = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for Feature in Feature_List:
            #Run Functions
            """
            All Feature Functions accepts a Matchday Dataframe
            then return a dataframe of a race,
            containing the following columns in the order of :
            [HNAME, Feature Name]
            """
            results.append(
                executor.submit(Create_Features,
                                [Feature, Dataframe, HNAME_List, Raceday]))
    results = [i.result() for i in results]

    warnings.filterwarnings("default", category=RuntimeWarning)
    warnings.filterwarnings("default", category=ConvergenceWarning)

    #Combine all features into one dataframe (successive inner merges on HNAME)
    Features_DF = reduce(lambda x, y: pd.merge(x, y, on='HNAME'), results)

    #Combine all features into one dataframe
    # Features_Dataframe = pd.merge(Features_Dataframe, Features_DF, on = 'HNAME', how='left')
    """
    Feature Transformation
    """
    Transformation_List = Feature_Storage.Transformation_List

    # results = []
    # for Feature in Transformation_List:
    #     results.append(Transform_Features([Feature, Features_DF]))

    # Second parallel pass: transformations computed from the base features
    results = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for Feature in Transformation_List:
            #Run Functions
            """
            All Feature Functions accepts a Base Features_Dataframe
            then return a dataframe of a race,
            containing the following columns in the order of :
            [HNAME, Feature Name]
            """
            results.append(
                executor.submit(Transform_Features, [Feature, Features_DF]))
    results = [i.result() for i in results]

    #Combine all features into one dataframe
    Transformation_DF = reduce(lambda x, y: pd.merge(x, y, on='HNAME'),
                               results)

    #Combine all features into one dataframe
    Features_Dataframe = reduce(
        lambda x, y: pd.merge(x, y, on='HNAME'),
        [Features_Dataframe, Features_DF, Transformation_DF])

    # Diagnostic: inner merges dropped a horse somewhere if row counts differ
    if not len(Features_Dataframe.index) == len(Dataframe.index):
        print(Features_Dataframe)
        # print(Dataframe.loc[:,'RARID'].tolist()[0])

    #Inserting Features_Dataframe to Database
    Load_Dataset_toDatabase('FeatureDb', Features_Dataframe)

    # Diagnostic: report the race id if any feature came back NaN
    if sum(Features_Dataframe.isna().sum()) != 0:
        print(Dataframe.loc[:, 'RARID'].to_list()[0])

    # Cast every feature column (everything after RARID, HNAME) to float
    Features_Dataframe.loc[:,Features_Dataframe.columns[2:]] = \
        pd.DataFrame(Features_Dataframe.loc[:,Features_Dataframe.columns[2:]].values.astype(float))

    return Features_Dataframe