Ejemplo n.º 1
0
def read_experiment_id_raw_result(experiment_id: int):
    """Return the results of one experiment joined with their raw scores.

    Reads every row of ``rws_experiment.result_table`` for ``experiment_id``,
    then the rows of ``rws_experiment.raw_y_score`` whose ``result_id``
    appears in that first result set, and inner-joins both frames on
    ``result_id``, ``experiment_id`` and ``val_set_id``.

    Args:
        experiment_id: Identifier of the experiment to load.

    Returns:
        The merged ``DataFrame``, or ``None`` when any step fails (the error
        is printed instead of being raised).
    """
    conn = connect_rds()
    try:
        sql_result = """select
                     *
                 from
                    rws_experiment.result_table
                 where
                    experiment_id = %s;"""
        # int() normalises numpy integers / numeric strings so the driver
        # can bind the value instead of it being spliced into the SQL text.
        result_df = pd.read_sql(
            sql_result, conn, params=(int(experiment_id),))

        # A list bound to ``= any(%s)`` is valid for zero, one or many ids,
        # whereas an interpolated Python tuple yields ``in ()`` / ``in (1,)``,
        # both of which are SQL syntax errors.
        result_ids = result_df.result_id.tolist()
        sql_raw = """select
                     *
                 from
                    rws_experiment.raw_y_score
                 where
                    result_id = any(%s);"""
        score_df = pd.read_sql(sql_raw, conn, params=(result_ids,))

        return pd.merge(
            result_df,
            score_df,
            on=['result_id', 'experiment_id', 'val_set_id'],
            how='inner')

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def main(argv):
    """Generate a report per model/hyper-parameter set and track the best r_8.

    Loads all rows of ``rws_experiment.result_table`` for the experiment id
    given in ``argv[1]``, builds a ``ParameterResult`` for every distinct
    (model, parameters_id) pair, calls its ``generate_report()`` and prints,
    after each model, the combination with the highest ``r_8`` seen so far.

    Args:
        argv: Command-line style argument list; ``argv[1]`` is the
            experiment id.
    """
    experiment_id = argv[1]

    conn = misc_utils.connect_rds()
    try:
        query = 'select * from rws_experiment.result_table where experiment_id = %s'
        # The id is bound as a parameter rather than formatted into the SQL.
        df_all = pd.read_sql(query, conn, params=(int(experiment_id),))
    finally:
        # Release the connection even when the query fails.
        conn.close()

    models = df_all.model.unique()
    print(models)  # list of models in this experiment

    best_model_hyp = {"model": "", "param": "", "r8": 0}
    best_r8_score = -1
    for model in models:
        # df_all already holds every row of the experiment, so the
        # parameter ids of one model are a filter, not another query.
        param_ids = df_all.loc[df_all.model == model, 'parameters_id'].unique()
        for param in param_ids:
            param_result = ParameterResult(experiment_id, model, param)
            # generate a report for each model + hyper-parameters
            _p_8, r_8 = param_result.generate_report()
            if r_8 > best_r8_score:
                best_r8_score = r_8
                best_model_hyp["model"] = model
                best_model_hyp["param"] = param
                best_model_hyp["r8"] = r_8
        # running best: model + hyper-parameters with the highest r_8 so far
        print(best_model_hyp)
Ejemplo n.º 3
0
    def write_to_db(self):
        """Validate the configuration and insert it as a new experiment row.

        Calls ``self.validate()`` first. When the configuration is not valid
        a message is printed and nothing is written. Otherwise the user is
        prompted on stdin for the experiment remarks (stored in
        ``self.EXP_REMARKS``) and one row is inserted into
        ``rws_experiment.experiment_table``.

        Database errors are printed, not raised.
        """
        self.validate()
        if not self.valid:
            print("this configuration is not VALID!")
            return

        print("Answer name of experiment - don't be lazy and describe well:")
        self.EXP_REMARKS = input()
        experiment_info = self.__create_info_tuple()
        sql = """INSERT INTO 
                         rws_experiment.experiment_table(
                         
                         experiment_datetime,
                         time_granularity,
                         segmentation_table,
                         time_lag_minutes,                       
                         weather_station_mapping_table,
                         
                         sample_train,
                         sample_val,
                         sample_test,
                         train_sample_method_params,                         
                         val_sample_method_params,
                         
                         test_sample_method_params,
                         features_knmi_config,
                         features_hectopunten_config,
                         features_flow_config,
                         features_temporal_config,
                         
                         features_ongevallen_config,
                         validation_config,
                         classifiers,
                         grid_parameters,
                         random_seed,
                         
                         exp_remarks) 
                    VALUES(%s,%s,%s,%s,%s,  %s,%s,%s,%s,%s,  %s,%s,%s,%s,%s,
                           %s,%s,%s,%s,%s, %s);"""

        conn = None
        cur = None
        try:
            conn = misc_utils.connect_rds()
            cur = conn.cursor(cursor_factory=DictCursor)
            cur.execute(sql, experiment_info)
            conn.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        finally:
            # Close cursor and connection on both the success and error path.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()
Ejemplo n.º 4
0
def main(argv):
    """Write an experiment configuration to the database and show the newest id.

    Builds ``write_exp_utils.ExperimentConfig(argv[1], argv[2])``, calls its
    ``write_to_db()`` and then prints the highest ``experiment_id`` found in
    ``rws_experiment.experiment_table`` as a visual check.

    Args:
        argv: Command-line style argument list; ``argv[1]`` and ``argv[2]``
            are forwarded to ``ExperimentConfig``.
    """
    print(argv[1])
    w = write_exp_utils.ExperimentConfig(argv[1], argv[2])
    print("writing {} to database".format(argv[1]))
    w.write_to_db()  # write experiment on database

    # check if the experiment is written correctly
    q = 'select experiment_id from rws_experiment.experiment_table order by experiment_id desc limit 1;'
    conn = misc_utils.connect_rds()
    try:
        print(pd.read_sql(q, conn))
    finally:
        # The connection was previously never closed.
        conn.close()
Ejemplo n.º 5
0
    def _get_all_features(self, feature_config):
        """Return the non-meta column names of a feature table.

        Looks the table up in ``information_schema.columns`` and returns every
        ``column_name`` that is not listed in the config's meta columns.

        Args:
            feature_config: Mapping with a ``'table_name'`` entry of the form
                ``schema.table`` and a ``'meta_columns'`` entry holding the
                column names to exclude.

        Returns:
            List of column names.
        """
        table_name = feature_config['table_name']
        meta_columns = feature_config['meta_columns']

        name_parts = table_name.split('.')
        schema_and_table = (name_parts[0], name_parts[1])
        # Schema and table are bound as parameters, not formatted into the SQL.
        q = '''SELECT * FROM information_schema.columns WHERE table_schema = %s AND table_name = %s;'''

        conn = misc_utils.connect_rds()
        try:
            tbl_info = pd.read_sql(q, conn, params=schema_and_table)
        finally:
            # Close the connection even when the query raises.
            conn.close()

        is_meta = tbl_info.column_name.isin(meta_columns)
        return tbl_info.loc[~is_meta, "column_name"].tolist()
Ejemplo n.º 6
0
def read_sql(sql):
    """Run ``sql`` on a fresh connection and return the result as a DataFrame.

    Args:
        sql: Query text passed unchanged to ``pd.read_sql``.

    Returns:
        The resulting ``DataFrame``, or ``None`` when the query fails (the
        error is printed instead of being raised).
    """
    conn = connect_rds()
    try:
        return pd.read_sql(sql, conn)

    # The names must be spelled ``Exception`` / ``DatabaseError``: the
    # lower-case spellings do not exist and turned every failure into a
    # NameError raised from the except clause itself.
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
Ejemplo n.º 7
0
def update_table(sql):
    """Execute ``sql`` on a fresh connection and commit it.

    Any error raised while connecting, executing or committing is printed
    rather than propagated; the connection is closed in every case.

    Args:
        sql: Statement text passed unchanged to the cursor's ``execute``.
    """
    connection = None
    try:
        connection = misc_utils.connect_rds()
        cursor = connection.cursor(cursor_factory=DictCursor)
        cursor.execute(sql)
        connection.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as err:
        print(err)
    finally:
        if connection is not None:
            connection.close()
Ejemplo n.º 8
0
def read_raw_y_db(result_id: int):
    """Return the ``rws_experiment.raw_y_score`` rows of one result.

    Args:
        result_id: Identifier of the result whose raw scores are read.

    Returns:
        A ``DataFrame`` with the matching rows, or ``None`` when the query
        fails (the error is printed instead of being raised).
    """
    conn = connect_rds()
    try:
        sql = """select
                   *
                from
                   rws_experiment.raw_y_score
                where
                   result_id = %s;"""
        # int() accepts numpy integers too and lets the driver bind the
        # value instead of it being formatted into the SQL text.
        return pd.read_sql(sql, con=conn, params=(int(result_id),))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
Ejemplo n.º 9
0
def read_empty_results():
    """Fetch the ``result_id`` of every result row whose ``p_8_dayshift`` is null.

    Returns:
        A ``DataFrame`` with a ``result_id`` column, or ``None`` when the
        query fails (the error is printed instead of being raised).
    """
    connection = connect_rds()
    try:
        query = """select
                   result_id
                from
                   rws_experiment.result_table
                where
                   p_8_dayshift is null;"""
        return pd.read_sql(query, con=connection)

    except (Exception, psycopg2.DatabaseError) as err:
        print(err)
    finally:
        if connection is not None:
            connection.close()
Ejemplo n.º 10
0
def read_results_for_experiment(experiment_id: int):
    """Return the result ids recorded for one experiment.

    Args:
        experiment_id: Identifier of the experiment.

    Returns:
        The ``result_id`` column of ``rws_experiment.result_table`` for that
        experiment as an array (``DataFrame[...].values``), or ``None`` when
        the query fails (the error is printed instead of being raised).
    """
    conn = connect_rds()
    try:
        sql = """select
                   result_id
                from
                   rws_experiment.result_table
                where
                   experiment_id = %s;"""
        # Bind the id as a parameter instead of formatting it into the SQL.
        df = pd.read_sql(sql, con=conn, params=(int(experiment_id),))
        return df['result_id'].values

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
Ejemplo n.º 11
0
def read_experiment(experiment_id: int):
    """Return classifiers, grid parameters and random seed of an experiment.

    Args:
        experiment_id: Identifier of the experiment in
            ``rws_experiment.experiment_table``.

    Returns:
        The first row produced by the query, as returned by the cursor's
        ``fetchone()``, or ``None`` when the query fails (the error is
        printed instead of being raised).
    """
    conn = connect_rds()
    cur = None
    try:
        cur = conn.cursor(cursor_factory=DictCursor)
        sql = """select
                      classifiers, grid_parameters, random_seed 
                 from 
                    rws_experiment.experiment_table
                 where
                    experiment_id = %s;"""
        # Bind the id as a parameter instead of formatting it into the SQL.
        cur.execute(sql, (int(experiment_id),))
        return cur.fetchone()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        # The cursor was previously never closed explicitly.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
Ejemplo n.º 12
0
def read_experiment_result_db(experiment_id: int, model: str,
                              parameters_id: str):
    """Return the result rows of one model/parameter set of an experiment.

    Args:
        experiment_id: Identifier of the experiment.
        model: Value matched against the ``model`` column.
        parameters_id: Value matched against the ``parameters_id`` column.

    Returns:
        A ``DataFrame`` with the matching rows of
        ``rws_experiment.result_table``, or ``None`` when the query fails
        (the error is printed instead of being raised).
    """
    conn = connect_rds()
    try:
        sql = """select
                   *
                from
                   rws_experiment.result_table
                where
                   experiment_id = %s
                   and model = %s
                   and parameters_id = %s;"""
        # Bound parameters keep values containing quotes from breaking (or
        # altering) the statement; str() mirrors the text the f-string
        # interpolation used to produce between the quotes.
        query_args = (int(experiment_id), str(model), str(parameters_id))
        return pd.read_sql(sql, con=conn, params=query_args)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)

    finally:
        if conn is not None:
            conn.close()
}

# Rain-intensity feature: 'hourly_rain' paired with the divisor column
# 'precip_duration'.
feature_dict_rain_intensity = {
    'columns': ['hourly_rain'],
    'divisors': ['precip_duration']
}

# Discretisation settings; currently not wired into feature_config (see the
# note inside feature_config below).
feature_dict_discretize = {
    'columns': [
        'temp', 'avg_wind_speed_hr', 'air_pressure', 'humidity',
        'sun_duration', 'precip_duration', 'observability', 'cloud_cover',
        'hourly_rain'
    ],
    'nums': [2, 3, 4, 5]
}

# Maps each feature-grid name to its settings dict. The
# 'feature_weather_last_event_grid' key used to be listed twice with the same
# value; the redundant second entry has been dropped.
feature_config = {
    'feature_weather_x_hr_ago_grid': feature_dict_ago,
    'feature_weather_x_hr_agg_grid': feature_dict_agg,
    'feature_weather_last_event_grid': feature_dict_last,
    'feature_weather_rain_intensity_grid': feature_dict_rain_intensity,
    # discretized removed because code is buggy but also because defining discrete blocks at this
    # point in pipeline leads to information leakage
    #'feature_weather_discretize_grid':feature_dict_discretize,
}

conn = connect_rds()
try:
    weather_driver('rws_clean.knmi', feature_config, conn, 'features_knmi_agg',
                   False)
finally:
    # Close the connection even when the driver raises.
    conn.close()
Ejemplo n.º 14
0
    def write_to_db(self):
        """Validate the result and store it together with its raw scores.

        Calls ``self.validate()`` first. When the configuration is not valid
        a message is printed and nothing is written. Otherwise one row is
        inserted into ``rws_experiment.result_table``; the ``result_id``
        returned by that insert is then used to insert the matching row into
        ``rws_experiment.raw_y_score``. Both inserts are committed together.

        Database errors are printed, not raised.
        """
        self.validate()
        if not self.valid:
            print("this configuration is not VALID!")
            return

        result_info = self.__create_info_tuple()

        sql_result = """INSERT INTO

                         rws_experiment.result_table(

                             experiment_id,

                             val_set_id,

                             result_datetime,

                             model,

                             parameters_id,

                             parameters,
                             
                             split_info,
                             
                             train_size,
                             
                             val_size,

                             actual_features_size,
                             
                             accident_pct_train_pre_sample,
                             
                             accident_pct_val,
                             
                             p_8_dayshift,
                             
                             p_1_nightshift,
                             
                             r_8_dayshift,
                             
                             r_1_nightshift,

                             auc_roc,

                             pickle_path,

                             remarks )

                     VALUES(%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s)
                     RETURNING result_id;"""

        sql_raw_score = """INSERT INTO

                         rws_experiment.raw_y_score(
                             result_id,

                             experiment_id,

                             val_set_id,
   
                             y_true,

                             y_scores,

                            space_time,

                            roc,
 
                            precision_recall_curve)

                     VALUES(%s,%s,%s,%s,%s, %s,%s,%s);"""

        conn = None
        cur = None
        try:
            conn = misc_utils.connect_rds()
            cur = conn.cursor(cursor_factory=DictCursor)

            # write to result_table and pick up the generated id
            cur.execute(sql_result, result_info)
            result_id = cur.fetchone()[0]

            # write to raw_y_score, linked through that id
            result_y_raw = (
                result_id,
                self.EXPERIMENT_ID,
                self.VAL_SET_ID,
                self.Y_TRUE,
                self.Y_SCORES,
                self.SPACE_TIME,
                self.ROC,
                self.PRECISION_RECALL_CURVE,
            )
            cur.execute(sql_raw_score, result_y_raw)

            # a single commit covers both inserts
            conn.commit()
        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
        finally:
            # Close cursor and connection on both the success and error path.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()
Ejemplo n.º 15
0
    def validate(self):
        '''
        Run the configuration checks and clear ``self.valid`` when one fails.

        Checks performed, in order:
          * every feature in FEATURES_TOTAL is marked categorical or numerical;
          * TIME_GRANULARITY is an int between 1 and 60;
          * every configured feature column exists in its feature table;
          * the categorical + numerical lists and FEATURES_TOTAL agree;
          * the segmentation and weather-station mapping tables exist;
          * flow / hectopunten / ongevallen table names contain their
            configured space granularity.

        Any failing check sets ``self.valid = False`` and prints a message.
        If validation itself cannot complete (for example a database error)
        the error is printed and ``self.valid`` is also set to False, so a
        configuration is never treated as valid after an incomplete check.

        NOTE(review): nothing in this method sets ``self.valid`` to True;
        presumably it is initialised elsewhere in the class - confirm.
        '''
        conn = None
        try:
            conn = misc_utils.connect_rds()
            cur = conn.cursor(cursor_factory=DictCursor)

            #TODO! Check if use_all_features is False then custom_features_list is provided.

            ## Check if there are features without num/cat marking
            unmarked_features = list(
                set(self.FEATURES_TOTAL)
                - set(self.CATEGORICAL_FEATURES_LIST)
                - set(self.NUMERICAL_FEATURES_LIST))
            if unmarked_features:
                self.valid = False
                print("ERROR: These features don't have cat/num \n", ','.join(unmarked_features))

            ## Validating if TIME_GRANULARITY is in minutes and an integer.
            ## The exact type test (rather than isinstance) keeps rejecting bool.
            tg = self.TIME_GRANULARITY
            if type(tg) is not int or tg > 60 or tg <= 0:
                print('\n ERROR: TIME GRANULARITY HAS TO BE AN INT BETWEEN 1 AND 60 \n')
                self.valid = False

            print('Validating if columns exists for features mentioned in config')
            feature_sources = [
                (self.FEATURES_KNMI_CONFIG['table_name'], self.FEATURES_KNMI_LIST),
                (self.FEATURES_FLOW_CONFIG['table_name'], self.FEATURES_FLOW_LIST),
                (self.FEATURES_HECTOPUNTEN_CONFIG['table_name'], self.FEATURES_HECTOPUNTEN_LIST),
                (self.FEATURES_TEMPORAL_CONFIG['table_name'], self.FEATURES_TEMPORAL_LIST),
                (self.FEATURES_ONGEVALLEN_CONFIG['table_name'], self.FEATURES_ONGEVALLEN_LIST),
            ]
            # Schema and table are bound as parameters, not formatted into the SQL.
            columns_sql = '''SELECT * FROM information_schema.columns WHERE table_schema = %s AND table_name = %s;'''
            for table, feature_list in feature_sources:
                print('Validating', table, 'to check if columns mentioned exist')
                table_parts = table.split('.')
                tbl_info = pd.read_sql(
                    columns_sql, conn, params=(table_parts[0], table_parts[1]))
                missing_columns = list(set(feature_list) - set(tbl_info.column_name))
                if missing_columns:
                    print('\n!!ERROR!!\n', ','.join(missing_columns), '\n', 'are missing in: ', table, '\n')
                    self.valid = False

            #list of tables
            print("VALIDATING: If these tables exist")
            tables = [
                self.SEGMENTATION_TABLE,
                self.WEATHER_STATION_MAPPING_TABLE,
            ]
            #add more table when they are ready e.g. self.FEATURES_TEMPORAL_TABLE

            # Checking if categorical numerical features mentioned equal features described otherwise.
            total_features_cat_num = self.CATEGORICAL_FEATURES_LIST[:] + self.NUMERICAL_FEATURES_LIST[:]

            difference = list(set(self.FEATURES_TOTAL) - set(total_features_cat_num))
            if difference:
                print('These columns are in total_features but not defined in cat-num lists:', ','.join(difference))
                self.valid = False

            difference = list(set(total_features_cat_num) - set(self.FEATURES_TOTAL))
            if difference:
                print('These columns are defined in cat-num lists but not in total features:', ','.join(difference))
                self.valid = False

            ####VALIDATE TABLES
            exists_sql = """SELECT EXISTS (
                            SELECT 1
                               FROM   information_schema.tables
                               WHERE  table_schema = %s
                               AND    table_name = %s
                         );"""
            validate_table = []
            for t in tables:
                schema, name = t.split(".")
                print('Validating if ', schema, name, 'exists')
                cur.execute(exists_sql, (schema, name))
                table_exists = cur.fetchone()[0]
                if not table_exists:
                    # Name the offending table; it used to fail silently.
                    print('ERROR: table', t, 'does not exist')
                validate_table.append(table_exists)

            if len(validate_table) >= 1:
                if False in validate_table:
                    self.valid = False
                else:
                    print("All these tables exist!")
            cur.close()

            #Check if feature tables mentioned are consistent with the granularities mentioned.
            #note: this only works with space granularity since naming convention for time granularity is inconsistent.
            #note: KNMI table name does not contain any info about granularity
            #note: temporal table has no space granularity information
            for config in [self.FEATURES_FLOW_CONFIG, self.FEATURES_HECTOPUNTEN_CONFIG, self.FEATURES_ONGEVALLEN_CONFIG]:
                if num2words[config['space_granularity_km']] + '_km' in config['table_name']:
                    print(config['table_name'], "-> this table's name is consistent with the granularity mentioned.")
                else:
                    print(config['table_name'], '-> the table name is not consistent with the granularity mentioned, please check again!')
                    self.valid = False

        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
            # Validation did not run to completion, so the configuration
            # must not be considered valid.
            self.valid = False
        finally:
            if conn is not None:
                conn.close()