def read_experiment_id_raw_result(experiment_id: int):
    """Fetch all results of one experiment merged with their raw y-scores.

    Joins rws_experiment.result_table with rws_experiment.raw_y_score on
    (result_id, experiment_id, val_set_id). Returns the merged DataFrame,
    or None when the experiment has no results or a database error occurs
    (the error is printed). The connection is closed in all cases.
    """
    conn = connect_rds()
    try:
        # Parameterized query instead of f-string interpolation.
        sql_result = """select * from rws_experiment.result_table
                        where experiment_id=%s;"""
        result_df = pd.read_sql(sql_result, conn, params=(experiment_id,))
        result_ids = tuple(result_df.result_id.tolist())
        if not result_ids:
            # BUG FIX: "in ()" is invalid SQL; the original crashed here and
            # fell into the except branch. Return None explicitly instead.
            return None
        # psycopg2 adapts a tuple parameter to a parenthesized value list,
        # which is exactly what "in %s" needs.
        sql_raw = """select * from rws_experiment.raw_y_score
                     where result_id in %s;"""
        score_df = pd.read_sql(sql_raw, conn, params=(result_ids,))
        result_score_df = pd.merge(
            result_df, score_df,
            on=['result_id', 'experiment_id', 'val_set_id'],
            how='inner')
        return result_score_df
    except (Exception, psycopg2.DatabaseError) as error:
        # BUG FIX: original caught `exception` / `psycopg2.databaseerror`
        # (lowercase) — undefined names that would raise NameError instead
        # of handling the database error.
        print(error)
    finally:
        if conn is not None:
            conn.close()
def main(argv):
    """Scan every (model, hyperparameter) pair of one experiment and report the best.

    argv[1] is the experiment_id. A ParameterResult report is generated for
    each model/parameters_id combination found in result_table; the
    combination with the highest r_8 score is printed at the end.
    """
    conn = misc_utils.connect_rds()
    experiment_id = argv[1]
    query = f'''select * from rws_experiment.result_table where experiment_id={experiment_id}'''
    df_all = pd.read_sql(query, conn)
    models = df_all.model.unique()
    print(models)  # list of models in this experiment
    best_model_hyp = {"model": "", "param": "", "r8": 0}
    best_r8_score = -1
    for model in models:
        q = f'''select * from rws_experiment.result_table where experiment_id={experiment_id} and model=\'{model}\' '''
        model_df = pd.read_sql(q, conn)
        for param in model_df.parameters_id.unique():
            # Generate a report for each model + hyperparameter set.
            param_result = ParameterResult(experiment_id, model, param)
            p_8, r_8 = param_result.generate_report()
            if r_8 > best_r8_score:
                best_r8_score = r_8
                best_model_hyp["model"] = model
                best_model_hyp["param"] = param
                best_model_hyp["r8"] = r_8
    # Print the model + hyperparameters with the highest r_8 score.
    print(best_model_hyp)
    conn.close()
def write_to_db(self):
    """Validate this experiment configuration and, if valid, insert it into
    rws_experiment.experiment_table.

    Prompts interactively for a free-text description (stored in
    self.EXP_REMARKS) before writing. Database errors are printed rather
    than raised; the connection is always closed.
    """
    self.validate()
    # Idiom fix: `if self.valid == True:` replaced by a guard clause on
    # truthiness; the invalid case exits early instead of a trailing else.
    if not self.valid:
        # NOTE(review): consider raising an exception here so callers can react.
        print("this configuration is not VALID!")
        return
    print("Answer name of experiment - don't be lazy and describe well:")
    self.EXP_REMARKS = input()
    experiment_info = self.__create_info_tuple()
    sql = """INSERT INTO rws_experiment.experiment_table(
                experiment_datetime, time_granularity, segmentation_table,
                time_lag_minutes, weather_station_mapping_table,
                sample_train, sample_val, sample_test,
                train_sample_method_params, val_sample_method_params,
                test_sample_method_params, features_knmi_config,
                features_hectopunten_config, features_flow_config,
                features_temporal_config, features_ongevallen_config,
                validation_config, classifiers, grid_parameters,
                random_seed, exp_remarks)
             VALUES(%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s);"""
    # Connection to PostgreSQL
    conn = None
    try:
        conn = misc_utils.connect_rds()
        cur = conn.cursor(cursor_factory=DictCursor)
        cur.execute(sql, experiment_info)  # parameterized insert
        conn.commit()
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()  # close connection in all cases
def main(argv):
    """Write the experiment configuration named on the command line to the DB.

    argv[1] is the configuration identifier, argv[2] a second argument
    passed through to ExperimentConfig (presumably a config path/variant —
    confirm against ExperimentConfig). After writing, the newest
    experiment_id is read back and printed as a sanity check.
    """
    config_name, config_extra = argv[1], argv[2]
    print(config_name)
    experiment = write_exp_utils.ExperimentConfig(config_name, config_extra)
    print("writing {} to database".format(config_name))
    experiment.write_to_db()  # write experiment on database
    # Check if the experiment is written correctly: fetch the latest id.
    check_query = 'select experiment_id from rws_experiment.experiment_table order by experiment_id desc limit 1;'
    conn = misc_utils.connect_rds()
    print(pd.read_sql(check_query, conn))
def _get_all_features(self, feature_config):
    """Return every column name of feature_config['table_name'] except the meta columns.

    feature_config must provide 'table_name' ('schema.table') and
    'meta_columns', a list of columns to exclude from the returned list.
    """
    table_name = feature_config['table_name']
    meta_columns = feature_config['meta_columns']
    schema, name = table_name.split('.')
    # Parameterized query instead of str.format interpolation into SQL.
    q = '''SELECT * FROM information_schema.columns
           WHERE table_schema = %s AND table_name = %s;'''
    conn = misc_utils.connect_rds()
    try:
        tbl_info = pd.read_sql(q, conn, params=(schema, name))
    finally:
        # BUG FIX: original leaked the connection when read_sql raised.
        conn.close()
    return tbl_info.loc[~tbl_info.column_name.isin(meta_columns),
                        "column_name"].tolist()
def read_sql(sql):
    """Run *sql* against the RDS database and return the result as a DataFrame.

    Returns None when the query fails (the error is printed). The
    connection is closed in all cases.
    """
    conn = connect_rds()
    try:
        return pd.read_sql(sql, conn)
    except (Exception, psycopg2.DatabaseError) as error:
        # BUG FIX: original caught `exception` / `psycopg2.databaseerror`
        # (lowercase) — undefined names that would raise NameError instead
        # of handling the database error.
        print(error)
    finally:
        if conn is not None:
            conn.close()
def update_table(sql):
    """Execute a DML/DDL statement against the RDS database and commit it.

    Database errors are printed rather than re-raised; the connection is
    closed in all cases.
    """
    conn = None
    try:
        conn = misc_utils.connect_rds()
        cursor = conn.cursor(cursor_factory=DictCursor)
        cursor.execute(sql)
        conn.commit()
        cursor.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def read_raw_y_db(result_id: int):
    """Return all raw_y_score rows for one result_id as a DataFrame.

    Returns None on database error (the error is printed); the connection
    is closed in all cases.
    """
    conn = connect_rds()
    try:
        # Parameterized instead of f-string interpolation of caller input.
        sql = """select * from rws_experiment.raw_y_score where result_id=%s;"""
        return pd.read_sql(sql, con=conn, params=(result_id,))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def read_empty_results():
    """Return the result_ids whose p_8_dayshift is still NULL in result_table.

    Returns a one-column DataFrame, or None if the query fails (the error
    is printed). The connection is closed in all cases.
    """
    conn = connect_rds()
    try:
        query = """select result_id from rws_experiment.result_table where p_8_dayshift is null;"""
        return pd.read_sql(query, con=conn)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def read_results_for_experiment(experiment_id: int):
    """Return the result_ids belonging to one experiment as a numpy array.

    Returns None on database error (the error is printed); the connection
    is closed in all cases.
    """
    conn = connect_rds()
    try:
        # Parameterized instead of f-string interpolation of caller input.
        sql = """select result_id from rws_experiment.result_table where experiment_id=%s;"""
        df = pd.read_sql(sql, con=conn, params=(experiment_id,))
        return df['result_id'].values
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def read_experiment(experiment_id: int):
    """Fetch (classifiers, grid_parameters, random_seed) for one experiment.

    Returns the row fetched via DictCursor, or None when the experiment is
    missing or a database error occurs (the error is printed). The
    connection is closed in all cases.
    """
    conn = connect_rds()
    try:
        cur = conn.cursor(cursor_factory=DictCursor)
        # Parameterized instead of f-string interpolation of caller input.
        sql = """select classifiers, grid_parameters, random_seed
                 from rws_experiment.experiment_table where experiment_id=%s;"""
        cur.execute(sql, (experiment_id,))
        out = cur.fetchone()
        cur.close()  # FIX: original never closed the cursor explicitly
        return out
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
def read_experiment_result_db(experiment_id: int, model: str, parameters_id: str):
    """Fetch result_table rows for one (experiment, model, parameters_id) triple.

    Returns a DataFrame, or None on database error (the error is printed).
    The connection is closed in all cases.
    """
    conn = connect_rds()
    try:
        # SECURITY FIX: model / parameters_id are strings and were
        # interpolated into the SQL text (quoting bugs, SQL injection);
        # pass them as query parameters instead.
        sql = """select * from rws_experiment.result_table
                 where experiment_id=%s and model=%s and parameters_id=%s;"""
        return pd.read_sql(sql, con=conn,
                           params=(experiment_id, model, parameters_id))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()
}  # end of the preceding feature dict (its opening is earlier in the file)
# Rain intensity: hourly rain normalized by how long it actually rained.
feature_dict_rain_intensity = {
    'columns': ['hourly_rain'],
    'divisors': ['precip_duration']
}
# Discretization config: bucket each weather variable into 2–5 bins.
# (Currently unused — see the note inside feature_config below.)
feature_dict_discretize = {
    'columns': [
        'temp', 'avg_wind_speed_hr', 'air_pressure', 'humidity',
        'sun_duration', 'precip_duration', 'observability', 'cloud_cover',
        'hourly_rain'
    ],
    'nums': [2, 3, 4, 5]
}
# Maps each output feature table to the config dict driving its generation.
feature_config = {
    'feature_weather_x_hr_ago_grid': feature_dict_ago,
    'feature_weather_x_hr_agg_grid': feature_dict_agg,
    'feature_weather_last_event_grid': feature_dict_last,
    'feature_weather_rain_intensity_grid': feature_dict_rain_intensity,
    # discretized removed because code is buggy but also because defining discrete blocks at this
    # point in pipeline leads to information leakage
    #'feature_weather_discretize_grid':feature_dict_discretize,
    # NOTE(review): duplicate key — 'feature_weather_last_event_grid' already
    # appears above; this entry silently overwrites it (same value, so harmless,
    # but it should be removed).
    'feature_weather_last_event_grid': feature_dict_last
}
conn = connect_rds()
# Build the aggregated KNMI weather feature table from the configs above.
weather_driver('rws_clean.knmi', feature_config, conn, 'features_knmi_agg', False)
conn.close()
def write_to_db(self):
    """Validate this result and, if valid, persist it to the database.

    Inserts one row into rws_experiment.result_table (RETURNING result_id),
    then stores the raw y-scores under that result_id in
    rws_experiment.raw_y_score. Both inserts share a single transaction and
    commit. Database errors are printed rather than raised; the connection
    is always closed.
    """
    self.validate()
    # Idiom fix: `if self.valid == True:` replaced by a guard clause on
    # truthiness; the invalid case exits early instead of a trailing else.
    if not self.valid:
        # NOTE(review): consider raising an exception here so callers can react.
        print("this configuration is not VALID!")
        return
    result_info = self.__create_info_tuple()
    sql_result = """INSERT INTO rws_experiment.result_table(
            experiment_id, val_set_id, result_datetime, model, parameters_id,
            parameters, split_info, train_size, val_size, actual_features_size,
            accident_pct_train_pre_sample, accident_pct_val, p_8_dayshift,
            p_1_nightshift, r_8_dayshift, r_1_nightshift, auc_roc,
            pickle_path, remarks
            ) VALUES(%s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s,%s, %s,%s,%s,%s)
            RETURNING result_id;"""
    sql_raw_score = """INSERT INTO rws_experiment.raw_y_score(
            result_id, experiment_id, val_set_id, y_true, y_scores,
            space_time, roc, precision_recall_curve)
            VALUES(%s,%s,%s,%s,%s, %s,%s,%s);"""
    # Connection to PostgreSQL
    conn = None
    try:
        conn = misc_utils.connect_rds()
        cur = conn.cursor(cursor_factory=DictCursor)
        cur.execute(sql_result, result_info)  # write to result_table
        result_id = cur.fetchone()[0]  # id assigned by the RETURNING clause
        result_y_raw = (result_id, self.EXPERIMENT_ID, self.VAL_SET_ID,
                        self.Y_TRUE, self.Y_SCORES, self.SPACE_TIME,
                        self.ROC, self.PRECISION_RECALL_CURVE,)
        cur.execute(sql_raw_score, result_y_raw)  # write to raw_y_score table
        conn.commit()  # one commit covers both inserts
        cur.close()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()  # close connection in all cases
def validate(self):
    '''
    Run all configuration checks; any failure sets self.valid = False and
    prints a diagnostic (nothing is raised).

    Checks performed:
      * every feature in FEATURES_TOTAL is marked categorical or numerical,
      * TIME_GRANULARITY is an int between 1 and 60 (minutes),
      * configured feature columns exist in their database tables,
      * the referenced tables exist in the database,
      * cat/num lists and FEATURES_TOTAL contain exactly the same features,
      * feature table names match the configured space granularity.

    NOTE(review): this method only ever sets self.valid to False — it
    presumably is initialized to True elsewhere; confirm in __init__.
    '''
    # Connection to PostGreSQL
    conn = None
    try:
        conn = misc_utils.connect_rds()
        cur = conn.cursor(cursor_factory=DictCursor)
        #TODO! Check if use_all_feautures if is False then custom_features_list is provided.
        ## Check if there are features without num/cat marking
        unmarked_features = list(set(self.FEATURES_TOTAL) - set(self.CATEGORICAL_FEATURES_LIST) - set(self.NUMERICAL_FEATURES_LIST))
        if len(unmarked_features)>0:
            self.valid = False
            print("ERROR: These features don't have cat/num \n",','.join(unmarked_features))
        ## Validating if TIME_GRANULARITY IS IN MINUTES AND INTEGER
        tg = self.TIME_GRANULARITY
        if type(tg)!=int or tg>60 or tg<=0:
            print('\n ERROR: TIME GRANULARITY HAS TO BE AN INT BETWEEN 1 AND 60 \n')
            self.valid = False
        print('Validating if columns exists for features mentioned in config')
        # Pair each feature table with the feature list configured for it,
        # then verify every configured column actually exists in the table.
        for table, feature_list in zip(
                [self.FEATURES_KNMI_CONFIG['table_name'],
                 self.FEATURES_FLOW_CONFIG['table_name'],
                 self.FEATURES_HECTOPUNTEN_CONFIG['table_name'],
                 self.FEATURES_TEMPORAL_CONFIG['table_name'],
                 self.FEATURES_ONGEVALLEN_CONFIG['table_name']
                 ],
                [self.FEATURES_KNMI_LIST,
                 self.FEATURES_FLOW_LIST,
                 self.FEATURES_HECTOPUNTEN_LIST,
                 self.FEATURES_TEMPORAL_LIST,
                 self.FEATURES_ONGEVALLEN_LIST,
                 ]):
            print('Validating', table, 'to check if columns mentioned exist')
            q = '''SELECT * FROM information_schema.columns WHERE table_schema = '{0}' AND table_name='{1}'; '''.format(table.split('.')[0], table.split('.')[1])
            tbl_info = pd.read_sql(q, conn)
            missing_columns = list(set(feature_list)-set(tbl_info.column_name))
            if len(missing_columns)>0:
                print('\n!!ERROR!!\n',','.join(missing_columns), '\n', 'are missing in: ', table, '\n')
                self.valid = False
        # list of tables whose existence will be checked below
        print("VALIDATING: If these tables exist")
        tables = [self.SEGMENTATION_TABLE, self.WEATHER_STATION_MAPPING_TABLE, ] #add more table when they are ready e.g. self.FEATURES_TEMPORAL_TABLE
        # Checking if categorical + numerical features mentioned equal the
        # features described in FEATURES_TOTAL (both directions).
        total_features_cat_num = self.CATEGORICAL_FEATURES_LIST[:] + self.NUMERICAL_FEATURES_LIST[:]
        # print(total_features_cat_num)
        difference = list(set(self.FEATURES_TOTAL) - set(total_features_cat_num))
        if len(difference)>0:
            print('These columns are in total_features but not defined in cat-num lists:', ','.join(difference))
            self.valid = False
        difference = list( set(total_features_cat_num)- set(self.FEATURES_TOTAL))
        if len(difference)>0:
            print('These columns are defined in cat-num lists but not in total features:', ','.join(difference))
            self.valid = False
        #### VALIDATE TABLES: ask information_schema whether each one exists
        validate_table = []
        for t in tables:
            schema, name = t.split(".")
            print('Validating if ',schema,name, 'exists')
            sql = f"""SELECT EXISTS ( SELECT 1 FROM information_schema.tables WHERE table_schema = '{schema}' AND table_name = '{name}' );"""
            cur.execute(sql)
            conn.commit()
            row = cur.fetchone()
            validate_table.append(row[0])  # True/False from the EXISTS query
        if len(validate_table)>=1:
            if False in validate_table:
                self.valid=False
            else:
                print("All these tables exist!")
        cur.close()
        # Check if feature tables mentioned are consistent with the granularities mentioned.
        # note: this only works with space granularity since naming convention for time granularity is inconsistent.
        # note: KNMI table name does not contain any info about granularity
        # note: temporal table has no space granularity information
        # NOTE(review): num2words is indexed like a mapping (int -> word) to
        # build e.g. 'one_km' — confirm the import provides dict-style access.
        for config in [self.FEATURES_FLOW_CONFIG,self.FEATURES_HECTOPUNTEN_CONFIG, self.FEATURES_ONGEVALLEN_CONFIG]:
            if num2words[config['space_granularity_km']]+'_km' in config['table_name']:
                print(config['table_name'],'-> this table\'s name is consistent with the granularity mentioned.'
                      )
            else:
                print(config['table_name'],'-> the table name is not consistent with the granularity mentioned, please check again!')
                self.valid=False
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()