# NOTE: these scripts assume the repo's usual imports and constants are in
# scope: numpy as np, pandas as pd, os, random, matplotlib.pyplot as plt,
# sklearn (svm, StandardScaler, MinMaxScaler, normalize, shuffle, roc_curve,
# auc, PCA), the project `utility` module, the autoencoder classes, and the
# settings constants (PD_*_DATA_DIRECTORY, FRAC_TEST, FRAC_VALID,
# FEATURE_SET_NUMBER).

def main(
        selected_pd="JetHT",
        cutoff_eventlumi=False,
        is_dropna=True,
        is_fillna_zero=True,
        data_preprocessing_mode='minmaxscalar',
        DATA_SPLIT_TRAIN=[1.0 for i in range(3)],
):
    # setting
    model_name = "OneClassSVM_{}_f{}".format(selected_pd, FEATURE_SET_NUMBER)
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    df_bad = utility.read_data(selected_pd=selected_pd,
                               pd_data_directory=PD_BAD_DATA_DIRECTORY,
                               cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)

    x = df_good[features]
    x_train_full, x_valid, x_test = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    # Good test LS are labeled 0, human-labeled bad LS are labeled 1.
    y_test = np.concatenate((np.full(x_test.shape[0], 0),
                             np.full(df_bad[features].shape[0], 1)))
    x_test = np.concatenate([x_test, df_bad[features].to_numpy()])

    model_list = [
        svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
        for i in range(len(DATA_SPLIT_TRAIN))
    ]
    for dataset_fraction, model in zip(DATA_SPLIT_TRAIN, model_list):
        print("Model: {}, Chunk of Training Dataset fraction: {}".format(
            model_name, dataset_fraction))
        x_train = x_train_full[:int(dataset_fraction * len(x_train_full))]

        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0, 1))
        if data_preprocessing_mode == 'normalize':
            x_train_tf = normalize(x_train, norm='l1')
            x_valid_tf = normalize(x_valid, norm='l1')
            x_test_tf = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train_tf = transformer.transform(x_train)
            x_valid_tf = transformer.transform(x_valid)
            x_test_tf = transformer.transform(x_test)

        model.fit(x_train_tf)

        os.makedirs("./report/reco/eval/", exist_ok=True)
        file_eval = open('report/reco/eval/{} {}.txt'.format(
            model_name, dataset_fraction), 'w')
        file_eval.write("fpr tpr threshold\n")
        # decision_function is larger for inliers, so negate it to get an
        # anomaly score before building the ROC curve.
        fprs, tprs, thresholds = roc_curve(
            y_test, -model.decision_function(x_test_tf))
        for fpr, tpr, threshold in zip(fprs, tprs, thresholds):
            file_eval.write("{} {} {}\n".format(fpr, tpr, threshold))
        file_eval.close()
        print("AUC {}".format(auc(fprs, tprs)))
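# Added sketch (illustrative, not from the original source): a self-contained
# toy check of the score-sign convention used above -- OneClassSVM scores
# inliers high, so negating decision_function makes "higher = more anomalous".
def _toy_ocsvm_auc():
    import numpy as np
    from sklearn import svm
    from sklearn.metrics import roc_curve, auc

    rng = np.random.RandomState(0)
    x_good = rng.normal(0.0, 1.0, size=(200, 2))  # nominal "good" points
    x_bad = rng.normal(4.0, 1.0, size=(40, 2))    # shifted "bad" points
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1).fit(x_good)
    y = np.concatenate([np.zeros(len(x_good)), np.ones(len(x_bad))])
    score = -clf.decision_function(np.vstack([x_good, x_bad]))
    fpr, tpr, _ = roc_curve(y, score)
    return auc(fpr, tpr)  # expected to be close to 1.0 for such separated clusters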
def main(
        selected_pd="JetHT",
        include_bad_failure=False,
        cutoff_eventlumi=False,
        is_dropna=True,
        is_fillna_zero=True,
        BS=2**15,
        EPOCHS=1200,
        data_preprocessing_mode='minmaxscalar',
        DATA_SPLIT_TRAIN=[1.0 for i in range(10)],
        gpu_memory_growth=True,
):
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)

    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])

    file_auc = open('report/reco/eval/roc_auc_{}.txt'.format(selected_pd), 'w')
    file_auc.write("model_name data_fraction roc_auc\n")
    for model_name, Autoencoder in zip(
            ["SparseContractive", "SparseVariational",
             "ContractiveVariational", "Standard"],
            # ["Vanilla", "Sparse", "Contractive", "Variational"],
            [SparseContractiveAutoencoder, SparseVariationalAutoencoder,
             ContractiveVariationalAutoencoder, StandardAutoencoder],
            # [VanillaAutoencoder, SparseAutoencoder, ContractiveAutoencoder,
            #  VariationalAutoencoder],
    ):
        model_list = [
            Autoencoder(
                input_dim=[len(features)],
                summary_dir="model/reco/summary",
                model_name="{}_model_{}_f{}_{}".format(
                    model_name, selected_pd, FEATURE_SET_NUMBER, i),
                batch_size=BS,
                gpu_memory_growth=gpu_memory_growth,
            ) for i in range(1, len(DATA_SPLIT_TRAIN) + 1)
        ]
        for dataset_fraction, autoencoder in zip(DATA_SPLIT_TRAIN, model_list):
            print("Model: {}, Chunk of Training Dataset fraction: {}".format(
                autoencoder.model_name, dataset_fraction))
            file_log = open('report/reco/logs/{}.txt'.format(
                autoencoder.model_name), 'w')
            file_log.write("EP loss_train loss_valid\n")
            x_train = x_train_full[:int(dataset_fraction * len(x_train_full))]
            print("Data # training: {}, # validation: {}, "
                  "# testing good {}, # testing bad {}".format(
                      x_train.shape[0], x_valid.shape[0],
                      x_test_good.shape[0], df_bad[features].shape[0]))

            # Data Preprocessing
            if data_preprocessing_mode == 'standardize':
                transformer = StandardScaler()
            elif data_preprocessing_mode == 'minmaxscalar':
                transformer = MinMaxScaler(feature_range=(0, 1))
            if data_preprocessing_mode == 'normalize':
                # Bug fix: the original assigned plain x_train/x_valid/x_test
                # here, leaving the *_tf names used below undefined.
                x_train_tf = normalize(x_train, norm='l1')
                x_valid_tf = normalize(x_valid, norm='l1')
                x_test_tf = normalize(x_test, norm='l1')
            else:
                transformer.fit(x_train)
                x_train_tf = transformer.transform(x_train)
                x_valid_tf = transformer.transform(x_valid)
                x_test_tf = transformer.transform(x_test)

            autoencoder.init_variables()
            for EP in range(EPOCHS):
                x_train_shuf = shuffle(x_train_tf)
                # NOTE: int(len/BS) drops the final partial batch each epoch.
                for iteration_i in range(int(len(x_train_shuf) / BS)):
                    x_batch = x_train_shuf[BS * iteration_i:BS * (iteration_i + 1)]
                    autoencoder.train(x_batch)
                autoencoder.log_summary(x_train_tf, EP)
                file_log.write("{} {} {}\n".format(
                    EP + 1,
                    autoencoder.get_loss(x_train_tf)["loss_total"],
                    autoencoder.get_loss(x_valid_tf)["loss_total"]))
            file_log.close()

            os.makedirs("./report/reco/eval/", exist_ok=True)
            file_eval = open('report/reco/eval/{} {}.txt'.format(
                autoencoder.model_name, dataset_fraction), 'w')
            file_eval.write("fpr tpr threshold\n")
            # Error tracking: count NaN/inf in the per-LS reconstruction error
            # before it is used as the ROC score.
            sd_test = autoencoder.get_sd(x_test_tf, scalar=True)
            print("Error tracking for model: {}, # NaN in SD: {}, # inf in SD: {}".format(
                model_name, int(np.isnan(sd_test).sum()), int(np.isinf(sd_test).sum())))
            fprs, tprs, thresholds = roc_curve(y_test, sd_test)
            for fpr, tpr, threshold in zip(fprs, tprs, thresholds):
                file_eval.write("{} {} {}\n".format(fpr, tpr, threshold))
            file_eval.close()
            print("AUC {}".format(auc(fprs, tprs)))
            file_auc.write("{} {} {}\n".format(
                model_name, dataset_fraction, auc(fprs, tprs)))
            autoencoder.save()
    file_auc.close()  # added: the handle was never closed in the original
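# Usage sketch (added; hypothetical invocation, assuming this file is run as
# the autoencoder training script and the report/model directories exist):
# if __name__ == "__main__":
#     main(selected_pd="JetHT", EPOCHS=1200, DATA_SPLIT_TRAIN=[1.0] * 10,
#          data_preprocessing_mode="minmaxscalar")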
def error_features(
        selected_pd="JetHT",
        Autoencoder=VanillaAutoencoder,
        model_name="Vanilla",
        number_model=1,
        include_bad_failure=False,
        cutoff_eventlumi=False,
        is_dropna=True,
        is_fillna_zero=True,
        BS=2**15,
        data_preprocessing_mode='minmaxscalar',
        gpu_memory_growth=True,
        dir_log='report/reco',
):
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)

    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])
    x_train = x_train_full

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_test_good_tf = normalize(x_test_good, norm='l1')
        x_test_bad_tf = normalize(df_bad[features].to_numpy(), norm='l1')
    else:
        transformer.fit(x_train)
        x_test_good_tf = transformer.transform(x_test_good)
        x_test_bad_tf = transformer.transform(df_bad[features].to_numpy())

    autoencoder = Autoencoder(
        input_dim=[len(features)],
        model_name="{}_model_{}_f{}_{}".format(
            model_name, selected_pd, FEATURE_SET_NUMBER, number_model),
        batch_size=BS,
    )
    autoencoder.restore()
    # Per-feature reconstruction error, averaged/summed over the test LS.
    vec_avg_sd_good = np.mean(autoencoder.get_sd(x_test_good_tf), axis=0)
    vec_avg_sd_bad = np.mean(autoencoder.get_sd(x_test_bad_tf), axis=0)
    vec_sum_sd_good = np.sum(autoencoder.get_sd(x_test_good_tf), axis=0)
    vec_sum_sd_bad = np.sum(autoencoder.get_sd(x_test_bad_tf), axis=0)

    # visualize
    x = range(1, len(features) + 1)
    fig, axs = plt.subplots(2, 1, constrained_layout=True)
    axs[0].plot(x, vec_avg_sd_good)
    axs[0].set_title('Good LS')
    axs[0].set_xlabel("Feature Number")
    axs[0].set_ylabel(r"|x - $\~{x}|^2$")
    axs[1].plot(x, vec_avg_sd_bad)
    axs[1].set_title('Bad LS')
    axs[1].set_xlabel("Feature Number")
    axs[1].set_ylabel(r"|x - $\~{x}|^2$")
    fig.suptitle(
        "Average reconstruction error over testing sample ({}, {})".format(
            selected_pd, model_name))
    plt.savefig('avg_sd_{}_{}_f{}_{}.png'.format(
        model_name, selected_pd, FEATURE_SET_NUMBER, number_model))

    fig, axs = plt.subplots(2, 1, constrained_layout=True)
    axs[0].plot(x, vec_sum_sd_good)
    axs[0].set_title('Good LS')
    axs[0].set_xlabel("Feature Number")
    axs[0].set_ylabel(r"|x - $\~{x}|^2$")
    axs[1].plot(x, vec_sum_sd_bad)
    axs[1].set_title('Bad LS')
    axs[1].set_xlabel("Feature Number")
    axs[1].set_ylabel(r"|x - $\~{x}|^2$")
    fig.suptitle(
        "Sum reconstruction error over testing sample ({}, {})".format(
            selected_pd, model_name))
    plt.savefig('sum_sd_{}_{}_f{}_{}.png'.format(
        model_name, selected_pd, FEATURE_SET_NUMBER, number_model))

    # Feature-name slices of interest (indices specific to this feature set).
    print(
        features[48:58], '\n',
        features[78:85], '\n',
        features[85:95], '\n',
        features[99:108], '\n',
    )
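# Added helper sketch (hypothetical, not in the original source): label the
# index ranges printed above with their feature names for easier reading.
def named_feature_slices(features,
                         slices=((48, 58), (78, 85), (85, 95), (99, 108))):
    """Map each (start, end) index range to the corresponding feature names."""
    return {"{}:{}".format(lo, hi): list(features[lo:hi]) for lo, hi in slices}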
def plot_bad_good_separate_case(
        selected_pds=["JetHT", "ZeroBias"],
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
):
    # styling
    COLORS_SEPARATE = ('green', 'red', 'purple', 'orange')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_Human', 'Bad_FailureScenario', 'Bad_DCS')
    MARKERS = ('o', '^', '^', '^')
    for selected_pd in selected_pds:
        print("\n\n Processing {} \n\n".format(selected_pd))
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd,
                                    pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad_human = utility.read_data(selected_pd=selected_pd,
                                         pd_data_directory=PD_BAD_DATA_DIRECTORY)
        df_bad_failure = utility.read_data(selected_pd=selected_pd,
                                           pd_data_directory=PD_FAILURE_DATA_DIRECTORY)
        df_bad_dcs = utility.read_data(selected_pd=selected_pd,
                                       pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
        if is_dropna:
            df_good = df_good.dropna()
            df_bad_human = df_bad_human.dropna()
            df_bad_failure = df_bad_failure.dropna()
            df_bad_dcs = df_bad_dcs.dropna()
        elif is_fillna_zero:
            df_good = df_good.fillna(0)
            df_bad_human = df_bad_human.fillna(0)
            df_bad_failure = df_bad_failure.fillna(0)
            df_bad_dcs = df_bad_dcs.fillna(0)

        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(
            x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        # NOTE: y_test omits the failure-scenario rows even though x_test
        # includes them; neither is actually used by the plots below.
        y_test = np.concatenate((
            np.full(x_test_good.shape[0], 0),
            np.full(df_bad_human[features].shape[0], 1),
            np.full(df_bad_dcs[features].shape[0], 1),
        ))
        x_test = np.concatenate([
            x_test_good,
            df_bad_human[features].to_numpy(),
            df_bad_failure[features].to_numpy(),
            df_bad_dcs[features].to_numpy(),
        ])
        x_train = x_train_full
        print("Data # training: {}, # validation: {}, # testing good {}, "
              "# testing bad_human {}, # testing bad_failure {}, # testing bad DCS {}".format(
                  x_train.shape[0], x_valid.shape[0], x_test_good.shape[0],
                  df_bad_human.shape[0], df_bad_failure.shape[0], df_bad_dcs.shape[0]))

        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0, 1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)

        # Visualization section
        # NOTE: this section requires a fitted `transformer`, so it assumes
        # 'standardize' or 'minmaxscalar' mode, not 'normalize'.
        pca = PCA(n_components=2)
        # pca.fit(transformer.transform(df_good[features].to_numpy()))
        pca.fit(np.concatenate([
            transformer.transform(df_good[features].to_numpy()),
            transformer.transform(df_bad_human[features].to_numpy()),
            transformer.transform(df_bad_dcs[features].to_numpy()),
        ]))
        # visualize human labels
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad_human = pca.transform(transformer.transform(df_bad_human[features].to_numpy()))
        x_labeled_bad_failure = pca.transform(transformer.transform(df_bad_failure[features].to_numpy()))
        x_labeled_bad_dcs = pca.transform(transformer.transform(df_bad_dcs[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x_group, group_label, marker in zip(
                COLORS_SEPARATE,
                [x_labeled_good, x_labeled_bad_human,
                 x_labeled_bad_failure, x_labeled_bad_dcs],
                HUMAN_LABELS_SEPARATE, MARKERS):
            ax.scatter(x_group[:, 0], x_group[:, 1], alpha=0.2,
                       c=color, marker=marker, label=group_label)
        ax.legend()
        plt.title('Labeled 2018 data ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label_separate.png'.format(selected_pd), bbox_inches='tight')
        plt.ylim((-3, 3))
        plt.xlim((-3, 3))
        plt.savefig('{}_label_separate_short_range.png'.format(selected_pd),
                    bbox_inches='tight')
def compute_ms_dist(
        selected_pd="JetHT",
        Autoencoder=VanillaAutoencoder,
        model_name="Vanilla",
        number_model=1,
        include_bad_failure=False,
        cutoff_eventlumi=False,
        is_dropna=True,
        is_fillna_zero=True,
        BS=2**15,
        data_preprocessing_mode='minmaxscalar',
        gpu_memory_growth=True,
        dir_log='report/reco',
):
    features = utility.get_full_features(selected_pd)
    # Fix: apply cutoff_eventlumi to the good sample as well, matching the
    # other functions (the original omitted it here).
    df_good = utility.read_data(selected_pd=selected_pd,
                                pd_data_directory=PD_GOOD_DATA_DIRECTORY,
                                cutoff_eventlumi=cutoff_eventlumi)
    if include_bad_failure:
        df_bad_human = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_BAD_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad_failure = utility.read_data(
            selected_pd=selected_pd,
            pd_data_directory=PD_FAILURE_DATA_DIRECTORY,
            cutoff_eventlumi=cutoff_eventlumi)
        df_bad = pd.concat([df_bad_human, df_bad_failure], ignore_index=True)
    else:
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY,
                                   cutoff_eventlumi=cutoff_eventlumi)
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
    if is_fillna_zero:
        df_good = df_good.fillna(0.0)
        df_bad = df_bad.fillna(0.0)

    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate([
        np.full(x_test_good.shape[0], 0.0),
        np.full(df_bad[features].shape[0], 1.0)
    ])
    x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])
    x_train = x_train_full

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_test_good_tf = normalize(x_test_good, norm='l1')
        x_test_bad_tf = normalize(df_bad[features].to_numpy(), norm='l1')
    else:
        transformer.fit(x_train)
        x_test_good_tf = transformer.transform(x_test_good)
        x_test_bad_tf = transformer.transform(df_bad[features].to_numpy())

    # Run/lumi bookkeeping; assumes utility.split_dataset takes the test split
    # from the tail of the good sample, so the last FRAC_TEST rows of df_good
    # line up with x_test_good.
    run_good = df_good['runId'].iloc[-int(FRAC_TEST * len(x)):].to_numpy()
    lumi_good = df_good['lumiId'].iloc[-int(FRAC_TEST * len(x)):].to_numpy()
    run_bad, lumi_bad = df_bad['runId'].to_numpy(), df_bad['lumiId'].to_numpy()

    autoencoder = Autoencoder(
        input_dim=[len(features)],
        summary_dir="model/reco/summary",
        model_name="{}_model_{}_f{}_{}".format(
            model_name, selected_pd, FEATURE_SET_NUMBER, number_model),
        batch_size=BS,
    )
    autoencoder.restore()
    with open(os.path.join(dir_log, 'good_totalSE_{}_{}_f{}_{}.txt'.format(
            model_name, selected_pd, FEATURE_SET_NUMBER, number_model)), 'w') as f:
        f.write('total_se run lumi\n')
        for good_totalsd, run, lumi in zip(
                autoencoder.get_sd(x_test_good_tf, scalar=True),
                run_good, lumi_good):
            f.write('{} {} {}\n'.format(good_totalsd, run, lumi))
    with open(os.path.join(dir_log, 'bad_totalSE_{}_{}_f{}_{}.txt'.format(
            model_name, selected_pd, FEATURE_SET_NUMBER, number_model)), 'w') as f:
        f.write('total_se run lumi\n')
        for bad_totalsd, run, lumi in zip(
                autoencoder.get_sd(x_test_bad_tf, scalar=True),
                run_bad, lumi_bad):
            f.write('{} {} {}\n'.format(bad_totalsd, run, lumi))
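# Added sketch (hypothetical reader, not in the original source): the
# *_totalSE_*.txt files written above are space-separated with a one-line
# header, so they can be loaded back for plotting like this.
def load_total_se(path):
    import pandas as pd
    # columns: total_se (per-LS total squared error), run, lumi
    return pd.read_csv(path, sep=' ')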
def plot_subsystem3d(
        selected_pd="JetHT",
        interested_statuses={
            'hcal_hcal': 'hcal-hcal',
            'ecal_ecal': 'ecal-ecal',
            'tracker_track': 'tracker-track',
            'muon_muon': 'muon-muon'
        },
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
):
    # styling
    COLORS_SEPARATE = ('green', 'orange', 'red', 'purple', 'c')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_HCAL', 'Bad_ECAL', 'Bad_TRACKER', 'Bad_MUON')
    MARKERS = ('o', '^', '^', '^', '^')
    print("\n\n Processing {} \n\n".format(selected_pd))
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=PD_LABELED_SUBSYSTEM_GOOD_DATA_DIRECTORY)
    df_bad = utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=PD_LABELED_SUBSYSTEM_BAD_DATA_DIRECTORY)
    df_bad_hcal = df_bad.query('hcal_hcal == 0')
    df_bad_ecal = df_bad.query('ecal_ecal == 0')
    df_bad_tracker = df_bad.query('tracker_track == 0')
    df_bad_muon = df_bad.query('muon_muon == 0')
    df_bad_human = utility.read_data(selected_pd=selected_pd,
                                     pd_data_directory=PD_BAD_DATA_DIRECTORY)
    df_bad_dcs = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
    print("Before dropna; # Good:{}, # Bad:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad.shape[0], df_bad_hcal.shape[0],
        df_bad_ecal.shape[0], df_bad_tracker.shape[0], df_bad_muon.shape[0]))
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
        df_bad_hcal = df_bad_hcal.dropna()
        df_bad_ecal = df_bad_ecal.dropna()
        df_bad_tracker = df_bad_tracker.dropna()
        df_bad_muon = df_bad_muon.dropna()
        df_bad_human = df_bad_human.dropna()
        df_bad_dcs = df_bad_dcs.dropna()
    elif is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)
        df_bad_hcal = df_bad_hcal.fillna(0)
        df_bad_ecal = df_bad_ecal.fillna(0)
        df_bad_tracker = df_bad_tracker.fillna(0)
        df_bad_muon = df_bad_muon.fillna(0)
        df_bad_human = df_bad_human.fillna(0)
        df_bad_dcs = df_bad_dcs.fillna(0)

    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate((
        np.full(x_test_good.shape[0], 0),
        np.full(df_bad_hcal[features].shape[0], 1),
        np.full(df_bad_ecal[features].shape[0], 1),
        np.full(df_bad_tracker[features].shape[0], 1),
        np.full(df_bad_muon[features].shape[0], 1),
    ))
    x_test = np.concatenate([
        x_test_good,
        df_bad_hcal[features].to_numpy(),
        df_bad_ecal[features].to_numpy(),
        df_bad_tracker[features].to_numpy(),
        df_bad_muon[features].to_numpy(),
    ])
    # NOTE: vestigial from the training script; nothing below writes to it.
    file_auc = open('report/reco/eval/roc_auc.txt', 'w')
    file_auc.write("model_name data_fraction roc_auc\n")
    file_auc.close()
    x_train = x_train_full
    print("After dropna; # Good:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0],
        df_bad_tracker.shape[0], df_bad_muon.shape[0]))

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_train = normalize(x_train, norm='l1')
        x_valid = normalize(x_valid, norm='l1')
        x_test = normalize(x_test, norm='l1')
    else:
        transformer.fit(x_train)
        x_train = transformer.transform(x_train)
        x_valid = transformer.transform(x_valid)
        x_test = transformer.transform(x_test)

    # Visualization section (requires a fitted transformer, i.e. not
    # 'normalize' mode)
    pca = PCA(n_components=3)
    # pca.fit(transformer.transform(df_good[features].to_numpy()))
    pca.fit(np.concatenate([
        transformer.transform(df_good[features].to_numpy()),
        transformer.transform(df_bad_human[features].to_numpy()),
        transformer.transform(df_bad_dcs[features].to_numpy()),
    ]))
    # visualize human labels
    x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
    x_labeled_bad_hcal = pca.transform(transformer.transform(df_bad_hcal[features].to_numpy()))
    x_labeled_bad_ecal = pca.transform(transformer.transform(df_bad_ecal[features].to_numpy()))
    x_labeled_bad_tracker = pca.transform(transformer.transform(df_bad_tracker[features].to_numpy()))
    x_labeled_bad_muon = pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))
    # fig, ax = plt.subplots()
    fig = plt.figure()
    # Requires mpl_toolkits.mplot3d (Axes3D registration on older matplotlib).
    ax = fig.add_subplot(111, projection='3d')
    for color, x_group, group_label, marker in zip(
            COLORS_SEPARATE,
            [x_labeled_good, x_labeled_bad_hcal, x_labeled_bad_ecal,
             x_labeled_bad_tracker, x_labeled_bad_muon],
            HUMAN_LABELS_SEPARATE, MARKERS):
        ax.scatter(x_group[:, 0], x_group[:, 1], x_group[:, 2], alpha=0.2,
                   c=color, marker=marker, label=group_label)
    ax.legend()
    plt.title('Labeled 2018 data ({})'.format(selected_pd))
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.savefig('{}_subsystem_label.png'.format(selected_pd), bbox_inches='tight')
    # plt.ylim((-3, 3))
    # plt.xlim((-3, 3))
    # plt.savefig('{}_subsystem_label_short_range.png'.format(selected_pd), bbox_inches='tight')
    # Sweep camera angles; view_init takes (elevation, azimuth) in degrees.
    for elev in [0, 45, 90, 135, 180]:
        for azim in [0, 45, 90, 135, 180]:
            ax.view_init(elev, azim)
            plt.savefig('{}_subsystem_label_short_range({}{}).png'.format(
                selected_pd, elev, azim))
def plot_human_label(
        selected_pds=["ZeroBias", "JetHT", "EGamma", "SingleMuon"],
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
):
    # styling
    COLORS = ('green', 'blue')
    GROUP_LABELS = ('A', 'B')
    HUMAN_LABELS = ('Good', 'Bad')
    for selected_pd in selected_pds:
        print("\n\n Processing {} \n\n".format(selected_pd))
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd,
                                    pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY)
        if is_dropna:
            df_good = df_good.dropna()
            df_bad = df_bad.dropna()
        elif is_fillna_zero:
            df_good = df_good.fillna(0)
            df_bad = df_bad.fillna(0)

        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(
            x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        y_test = np.concatenate((np.full(x_test_good.shape[0], 0),
                                 np.full(df_bad[features].shape[0], 1)))
        x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])
        x_train = x_train_full
        print("Data # training: {}, # validation: {}, # testing good {}, # testing bad {}".format(
            x_train.shape[0], x_valid.shape[0], x_test_good.shape[0], df_bad.shape[0]))

        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0, 1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)

        # Visualization section (requires a fitted transformer, i.e. not
        # 'normalize' mode)
        pca = PCA(n_components=2)
        pca.fit(np.concatenate([
            transformer.transform(df_good[features].to_numpy()),
            transformer.transform(df_bad[features].to_numpy())
        ]))
        # visualize human labels
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad = pca.transform(transformer.transform(df_bad[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x_group, group_label in zip(
                COLORS, [x_labeled_good, x_labeled_bad], HUMAN_LABELS):
            ax.scatter(x_group[:, 0], x_group[:, 1], alpha=0.8,
                       c=color, label=group_label)
        ax.legend()
        plt.title('Labeled by Human ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label.png'.format(selected_pd), bbox_inches='tight')
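# Added note (sketch, not from the original source): the same fitted
# transformer + pca pair can project any further group onto these axes, e.g.
#   x_new_2d = pca.transform(transformer.transform(df_new[features].to_numpy()))
# where df_new is a hypothetical DataFrame with the same feature columns.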
def plot_subsystem(
        selected_pd="JetHT",
        interested_statuses={
            'hcal_hcal': 'hcal-hcal',
            'ecal_ecal': 'ecal-ecal',
            'tracker_track': 'tracker-track',
            'muon_muon': 'muon-muon'
        },
        data_preprocessing_mode='minmaxscalar',
        is_dropna=True,
        is_fillna_zero=True,
):
    # styling
    COLORS_SEPARATE = ('green', 'orange', 'red', 'purple', 'c')
    HUMAN_LABELS_SEPARATE = ('Good', 'Bad_HCAL', 'Bad_ECAL', 'Bad_TRACKER', 'Bad_MUON')
    MARKERS = ('o', '^', '^', '^', '^')
    print("\n\n Processing {} \n\n".format(selected_pd))
    features = utility.get_full_features(selected_pd)
    df_good = utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=PD_LABELED_SUBSYSTEM_GOOD_DATA_DIRECTORY)
    df_bad = utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=PD_LABELED_SUBSYSTEM_BAD_DATA_DIRECTORY)
    df_bad_hcal = df_bad.query('hcal_hcal == 0')
    df_bad_ecal = df_bad.query('ecal_ecal == 0')
    df_bad_tracker = df_bad.query('tracker_track == 0')
    df_bad_muon = df_bad.query('muon_muon == 0')
    df_bad_human = utility.read_data(selected_pd=selected_pd,
                                     pd_data_directory=PD_BAD_DATA_DIRECTORY)
    df_bad_dcs = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_DCS_BAD_DATA_DIRECTORY)
    print("Before dropna; # Good:{}, # Bad:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad.shape[0], df_bad_hcal.shape[0],
        df_bad_ecal.shape[0], df_bad_tracker.shape[0], df_bad_muon.shape[0]))
    if is_dropna:
        df_good = df_good.dropna()
        df_bad = df_bad.dropna()
        df_bad_hcal = df_bad_hcal.dropna()
        df_bad_ecal = df_bad_ecal.dropna()
        df_bad_tracker = df_bad_tracker.dropna()
        df_bad_muon = df_bad_muon.dropna()
        df_bad_human = df_bad_human.dropna()
        df_bad_dcs = df_bad_dcs.dropna()
    elif is_fillna_zero:
        df_good = df_good.fillna(0)
        df_bad = df_bad.fillna(0)
        df_bad_hcal = df_bad_hcal.fillna(0)
        df_bad_ecal = df_bad_ecal.fillna(0)
        df_bad_tracker = df_bad_tracker.fillna(0)
        df_bad_muon = df_bad_muon.fillna(0)
        df_bad_human = df_bad_human.fillna(0)
        df_bad_dcs = df_bad_dcs.fillna(0)

    x = df_good[features]
    x_train_full, x_valid, x_test_good = utility.split_dataset(
        x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
    y_test = np.concatenate((
        np.full(x_test_good.shape[0], 0),
        np.full(df_bad_hcal[features].shape[0], 1),
        np.full(df_bad_ecal[features].shape[0], 1),
        np.full(df_bad_tracker[features].shape[0], 1),
        np.full(df_bad_muon[features].shape[0], 1),
    ))
    x_test = np.concatenate([
        x_test_good,
        df_bad_hcal[features].to_numpy(),
        df_bad_ecal[features].to_numpy(),
        df_bad_tracker[features].to_numpy(),
        df_bad_muon[features].to_numpy(),
    ])
    # NOTE: vestigial from the training script; nothing below writes to it.
    file_auc = open('report/reco/eval/roc_auc.txt', 'w')
    file_auc.write("model_name data_fraction roc_auc\n")
    file_auc.close()
    x_train = x_train_full
    print("After dropna; # Good:{}, # HCAL:{}, # ECAL:{}, # TRACKER:{}, # MUON:{}".format(
        df_good.shape[0], df_bad_hcal.shape[0], df_bad_ecal.shape[0],
        df_bad_tracker.shape[0], df_bad_muon.shape[0]))

    # Data Preprocessing
    if data_preprocessing_mode == 'standardize':
        transformer = StandardScaler()
    elif data_preprocessing_mode == 'minmaxscalar':
        transformer = MinMaxScaler(feature_range=(0, 1))
    if data_preprocessing_mode == 'normalize':
        x_train = normalize(x_train, norm='l1')
        x_valid = normalize(x_valid, norm='l1')
        x_test = normalize(x_test, norm='l1')
    else:
        transformer.fit(x_train)
        x_train = transformer.transform(x_train)
        x_valid = transformer.transform(x_valid)
        x_test = transformer.transform(x_test)

    # Visualization section (requires a fitted transformer, i.e. not
    # 'normalize' mode)
    pca = PCA(n_components=2)
    # pca.fit(transformer.transform(df_good[features].to_numpy()))
    pca.fit(np.concatenate([
        transformer.transform(df_good[features].to_numpy()),
        transformer.transform(df_bad_human[features].to_numpy()),
        transformer.transform(df_bad_dcs[features].to_numpy()),
    ]))
    ###
    print(pca.explained_variance_ratio_)
    ## For check inlier and outlier
    # filter_above_muon_malfunc = list(map(lambda x: True if x > 1.0 else False,
    #     pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))[:, 1]))
    # filter_below_muon_malfunc = list(map(lambda x: True if x < 1.0 else False,
    #     pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))[:, 1]))
    # print("Shape df_bad_muon before cut", df_bad_muon.shape)
    # print("Shape df_bad_muon outlier", df_bad_muon[filter_above_muon_malfunc].shape)
    # print("Shape df_bad_muon inlier", df_bad_muon[filter_below_muon_malfunc].shape)
    # print("Sample muon outlier \n", df_bad_muon[filter_above_muon_malfunc].sample(n=10)[['runId', 'lumiId']])
    # print("Sample muon inlier \n", df_bad_muon[filter_below_muon_malfunc].sample(n=10)[['runId', 'lumiId']])
    ## Component in eigen vector
    # N_FIRST_COMPONENT = 20
    # abs_st_components = list(map(
    #     lambda component, feature: {'feature': feature, 'component': component},
    #     abs(pca.components_[0]), features))
    # sorted_abs_st_components = sorted(abs_st_components, key=lambda i: i['component'], reverse=True)
    # df_pc1 = pd.DataFrame(sorted_abs_st_components)
    # df_pc1['axis'] = 1
    # abs_nd_components = list(map(
    #     lambda component, feature: {'feature': feature, 'component': component},
    #     abs(pca.components_[1]), features))
    # sorted_abs_nd_components = sorted(abs_nd_components, key=lambda i: i['component'], reverse=True)
    # df_pc2 = pd.DataFrame(sorted_abs_nd_components)
    # df_pc2['axis'] = 2
    # df_pc = pd.concat([df_pc1, df_pc2], ignore_index=True)
    # df_pc.to_csv("pc_{}.csv".format(selected_pd))
    ###
    # visualize human labels
    x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
    x_labeled_bad_hcal = pca.transform(transformer.transform(df_bad_hcal[features].to_numpy()))
    x_labeled_bad_ecal = pca.transform(transformer.transform(df_bad_ecal[features].to_numpy()))
    x_labeled_bad_tracker = pca.transform(transformer.transform(df_bad_tracker[features].to_numpy()))
    x_labeled_bad_muon = pca.transform(transformer.transform(df_bad_muon[features].to_numpy()))
    fig, ax = plt.subplots()
    for color, x_group, group_label, marker in zip(
            COLORS_SEPARATE,
            [x_labeled_good, x_labeled_bad_hcal, x_labeled_bad_ecal,
             x_labeled_bad_tracker, x_labeled_bad_muon],
            HUMAN_LABELS_SEPARATE, MARKERS):
        ax.scatter(x_group[:, 0], x_group[:, 1], alpha=0.2,
                   c=color, marker=marker, label=group_label)
    ax.legend()
    plt.title('Labeled 2018 data ({})'.format(selected_pd))
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.savefig('{}_subsystem_label.png'.format(selected_pd), bbox_inches='tight')
    plt.ylim((-3, 3))
    plt.xlim((-3, 3))
    plt.savefig('{}_subsystem_label_short_range.png'.format(selected_pd),
                bbox_inches='tight')
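# Added sketch (illustrative, compact alternative to the commented-out
# loading-ranking block above): rank features by the magnitude of their
# loading on a given principal component.
def top_loadings(pca, features, axis=0, n=20):
    """Return the n feature names with the largest |loading| on `axis`."""
    import numpy as np
    order = np.argsort(-np.abs(pca.components_[axis]))
    return [features[i] for i in order[:n]]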
def main():
    # setting
    model_name = "OneClass_SVM"
    selected_pds = ["ZeroBias", "JetHT", "EGamma", "SingleMuon"]
    data_preprocessing_mode = 'minmaxscalar'
    BS = 2**15
    EPOCHS = 1200
    is_fillna_zero = True
    # NOTE: assumes module-level styling tuples COLORS and GROUP_LABELS.
    for selected_pd in selected_pds:
        features = utility.get_full_features(selected_pd)
        df_good = utility.read_data(selected_pd=selected_pd,
                                    pd_data_directory=PD_GOOD_DATA_DIRECTORY)
        df_bad = utility.read_data(selected_pd=selected_pd,
                                   pd_data_directory=PD_BAD_DATA_DIRECTORY)
        if is_fillna_zero:
            # Report which rows contain NaN before zero-filling (fix: the
            # original indexed with the full boolean frame, which lists every
            # row regardless of NaN content).
            df_good_nan = pd.isnull(df_good)
            print(df_good[df_good_nan.any(axis=1)].index.tolist())
            df_good = df_good.fillna(0)
            df_bad = df_bad.fillna(0)

        x = df_good[features]
        x_train_full, x_valid, x_test_good = utility.split_dataset(
            x, frac_test=FRAC_TEST, frac_valid=FRAC_VALID)
        y_test = np.concatenate((np.full(x_test_good.shape[0], 0),
                                 np.full(df_bad[features].shape[0], 1)))
        x_test = np.concatenate([x_test_good, df_bad[features].to_numpy()])
        # NOTE: vestigial from the training script; nothing below writes to it.
        file_auc = open('report/reco/eval/roc_auc.txt', 'w')
        file_auc.write("model_name data_fraction roc_auc\n")
        file_auc.close()
        x_train = x_train_full
        print("Data # training: {}, # validation: {}, # testing good {}, # testing bad {}".format(
            x_train.shape[0], x_valid.shape[0], x_test_good.shape[0],
            df_bad[features].shape[0]))

        # Data Preprocessing
        if data_preprocessing_mode == 'standardize':
            transformer = StandardScaler()
        elif data_preprocessing_mode == 'minmaxscalar':
            transformer = MinMaxScaler(feature_range=(0, 1))
        if data_preprocessing_mode == 'normalize':
            x_train = normalize(x_train, norm='l1')
            x_valid = normalize(x_valid, norm='l1')
            x_test = normalize(x_test, norm='l1')
        else:
            transformer.fit(x_train)
            x_train = transformer.transform(x_train)
            x_valid = transformer.transform(x_valid)
            x_test = transformer.transform(x_test)

        # Visualization section
        pca = PCA(n_components=2)
        pca.fit(np.concatenate([
            transformer.transform(df_good[features].to_numpy()),
            transformer.transform(df_bad[features].to_numpy())
        ]))
        # visualize human labels
        # Bug fix: the PCA was fit on scaled data, so inputs must be scaled
        # before projection (the original projected raw features).
        x_labeled_good = pca.transform(transformer.transform(df_good[features].to_numpy()))
        x_labeled_bad = pca.transform(transformer.transform(df_bad[features].to_numpy()))
        fig, ax = plt.subplots()
        for color, x_group, group_label in zip(
                COLORS, [x_labeled_good, x_labeled_bad], GROUP_LABELS):
            ax.scatter(x_group[:, 0], x_group[:, 1], alpha=0.8,
                       c=color, label=group_label)
        ax.legend()
        plt.title('Labeled by Human ({})'.format(selected_pd))
        plt.xlabel("Principal component 1")
        plt.ylabel("Principal component 2")
        plt.savefig('{}_label.png'.format(selected_pd), bbox_inches='tight')

        # Random pair-of-features visual
        for rand_i in range(2):
            print("rand number {}".format(rand_i))
            rand_two_features = random.sample(features, 2)
            x_labeled_good = df_good[rand_two_features].to_numpy()
            x_labeled_bad = df_bad[rand_two_features].to_numpy()
            fig, ax = plt.subplots()
            for color, x_group, group_label in zip(
                    COLORS, [x_labeled_good, x_labeled_bad], GROUP_LABELS):
                ax.scatter(x_group[:, 0], x_group[:, 1], alpha=0.8,
                           c=color, label=group_label)
            ax.legend()
            plt.title('Labeled by Human ({})'.format(selected_pd))
            plt.xlabel("{}".format(rand_two_features[0]))
            plt.ylabel("{}".format(rand_two_features[1]))
            plt.savefig('{}_label_rand_{}.png'.format(selected_pd, rand_i),
                        bbox_inches='tight')
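# Added sketch (hypothetical helper): the NaN printout above boils down to
# listing rows that contain any NaN before fillna(0); the same check as a
# reusable function.
def rows_with_nan(df):
    """Indices of rows in df that contain at least one NaN."""
    return df[df.isnull().any(axis=1)].index.tolist()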
def main(
        selected_pd="JetHT",
        recon_name="PromptReco",
        interested_statuses={
            'hcal_hcal': 'hcal-hcal',
            'ecal_ecal': 'ecal-ecal',
            'tracker_track': 'tracker-track',
            'muon_muon': 'muon-muon'
        },
):
    print("\n\n Extract {} dataset \n\n".format(selected_pd))
    features = new_prompt_reco_utility.get_full_features(selected_pd)
    df_good = new_prompt_reco_utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=new_prompt_reco_setting.PD_GOOD_DATA_DIRECTORY)
    df_bad = new_prompt_reco_utility.read_data(
        selected_pd=selected_pd,
        pd_data_directory=new_prompt_reco_setting.PD_BAD_DATA_DIRECTORY)
    # Work on a copy so the good source frame is not mutated in place.
    df_write_good = df_good.copy()
    for sub_detector, sub_detector_str in interested_statuses.items():
        df_write_good[sub_detector] = 1

    sub_detector_statuses = []
    for df in [df_bad, ]:
        for row_i in range(df.shape[0]):
            run_id, lumi_id = int(df['runId'][row_i]), int(df['lumiId'][row_i])
            if not row_i % 1000:
                print("process %.2f%% (%s/%s), run# %s, lumi# %s" % (
                    100.0 * row_i / df.shape[0], row_i, df.shape[0], run_id, lumi_id))
            prompt_reco_dataset = get_dataset_name(recon_name=recon_name, run_id=run_id)
            detector_status_ranges = get_lumisection_ranges(run_id, prompt_reco_dataset)
            range_lumis = [{'start': int(x['start']), 'end': int(x['end'])}
                           for x in detector_status_ranges]
            # Bug fix: out-of-range means beyond the last range OR before the
            # first one (the original used `and`, which can never be true).
            if lumi_id > range_lumis[-1]['end'] or lumi_id < range_lumis[0]['start']:
                raise LSNotInRangeError
            for index_range in range(len(range_lumis)):
                if (lumi_id >= range_lumis[index_range]['start']
                        and lumi_id <= range_lumis[index_range]['end']):
                    detector_status_range = detector_status_ranges[index_range]
                    ##
                    # print(detector_status_range.keys())
                    # input()
                    ##
                    sub_detector_status = [
                        1 if detector_status_range[sub_detector_str]['status'] == 'GOOD' else 0
                        for sub_detector, sub_detector_str in interested_statuses.items()
                    ]
                    sub_detector_statuses.append(sub_detector_status)
                    # hb_status = 1 if detector_status_range['hcal-hb']['status'] == 'GOOD' else 0
                    # he_status = 1 if detector_status_range['hcal-he']['status'] == 'GOOD' else 0
                    # hf_status = 1 if detector_status_range['hcal-hf']['status'] == 'GOOD' else 0
                    # h_status = 1 if detector_status_range['hcal-hcal']['status'] == 'GOOD' else 0
                    # h_all_status = hb_status * he_status * hf_status * h_status
                    # e_status = 1 if detector_status_range['ecal-ecal']['status'] == 'GOOD' else 0
                    # sub_detector_statuses.append([hb_status, he_status, hf_status, h_status, h_all_status, e_status])
                    if 0 in sub_detector_status:
                        print("Found bad sub-detector in run {} LS {}!!".format(
                            run_id, lumi_id), sub_detector_status)
    df_label = pd.DataFrame(
        sub_detector_statuses,
        columns=[sub_detector for sub_detector, sub_detector_str
                 in interested_statuses.items()])
    # NOTE: axis=1 concat aligns on the index, so df_bad is assumed to carry a
    # clean 0..n-1 RangeIndex here.
    df_write_bad = pd.concat([df_bad, df_label], axis=1)
    df_write_good.to_csv(os.path.join(implest_rr_setting.RR_DATA_DIRECTORY,
                                      'good', "{}.csv".format(selected_pd)))
    df_write_bad.to_csv(os.path.join(implest_rr_setting.RR_DATA_DIRECTORY,
                                     'bad', "{}.csv".format(selected_pd)))
    # 1) get data_name from RR by runID
    # 2) get prompt_reco from those
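# Added sketch (hypothetical helper, not in the original source): the range
# scan above is linear per lumisection; if range_lumis is sorted by 'start',
# the containing range can be found with bisect instead.
def find_range_index(range_lumis, lumi_id):
    """Return the index of the range containing lumi_id, or None."""
    import bisect
    starts = [r['start'] for r in range_lumis]
    i = bisect.bisect_right(starts, lumi_id) - 1
    if i >= 0 and range_lumis[i]['start'] <= lumi_id <= range_lumis[i]['end']:
        return i
    return None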