def main():
    path = '../data/accidents'
    data = pd.read_csv(f'{path}/accident_data_clean_balanced.csv', header=0)

    # Feature columns
    cat_cols = ['roadway_type', 'intersection', 'light_condition',
                'atmospheric_conditions', 'manner_of_collision', 'body_type',
                'vehicle_conditions', 'part_of_day']
    binary_cols = ['land_use_urban', 'national_highway_system',
                   'previous_dwi_convictions', 'previous_speeding_convictions',
                   'speeding_related', 'driver_vision_obscured', 'is_weekend',
                   'multiple_vehicles', 'nonmotorist_involved',
                   'multiple_motorists', 'drunk_driver_involved']
    numeric_cols = ['vehicle_year', 'speed_limit']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['multiple_fatalities']
    features = data[cat_cols + binary_cols + numeric_cols]
    # features = pd.get_dummies(features, columns=cat_cols, drop_first=True)
    # features.rename(columns={'manner_of_collision_Not Collision with Motor Vehicle in Transport (Not Necessarily in Transport for\n2005-2009)': 'manner_of_collision_Not Collision with Motor Vehicle in Transport'},
    #                 inplace=True)
    feature_names = features.columns

    # Ordinal-encode the categoricals, then standardize all features
    oe = OrdinalEncoder()
    features = oe.fit_transform(features)
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 random_state=2020), 'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5, scoring='f1',
                                                     max_iter=1000,
                                                     random_state=2020), 'lr')
    }

    # Fit each model, then report metrics, ROC curve, and feature importances
    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]

        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names, name, suffix)
        utils.permutation_importances(model, X_train, y_train, feature_names, name, suffix, dataset='train')
        print('#' * 50)
def test_(self, data_generator, model, repurposing_mode=False, test=False, verbose=True):
    y_pred = []
    y_label = []
    model.eval()
    for i, (v_d, label) in enumerate(data_generator):
        if self.drug_encoding == "MPNN" or self.drug_encoding == 'Transformer':
            # leave the featurized batch as-is for these encoders
            v_d = v_d
        else:
            v_d = v_d.float().to(self.device)
        score = self.model(v_d)
        if self.binary:
            m = torch.nn.Sigmoid()
            logits = torch.squeeze(m(score)).detach().cpu().numpy()
        else:
            logits = torch.squeeze(score).detach().cpu().numpy()
        label_ids = label.to('cpu').numpy()
        y_label = y_label + label_ids.flatten().tolist()
        y_pred = y_pred + logits.flatten().tolist()
    # binarize accumulated scores at 0.5 for the F1 computation
    outputs = np.asarray([1 if i else 0 for i in (np.asarray(y_pred) >= 0.5)])
    model.train()
    if self.binary:
        if repurposing_mode:
            return y_pred
        ## ROC-AUC curve
        if test:
            if verbose:
                roc_auc_file = os.path.join(self.result_folder, "roc-auc.jpg")
                plt.figure(0)
                roc_curve(y_pred, y_label, roc_auc_file, self.drug_encoding)
                plt.figure(1)
                pr_auc_file = os.path.join(self.result_folder, "pr-auc.jpg")
                prauc_curve(y_pred, y_label, pr_auc_file, self.drug_encoding)
        return roc_auc_score(y_label, y_pred), \
            average_precision_score(y_label, y_pred), \
            f1_score(y_label, outputs), y_pred
    else:
        if repurposing_mode:
            return y_pred
        return mean_squared_error(y_label, y_pred), \
            pearsonr(y_label, y_pred)[0], \
            pearsonr(y_label, y_pred)[1], \
            concordance_index(y_label, y_pred), y_pred
def roc_experiment(motif, trials=10**5):
    pw_model = pairwise_model_from_motif(motif)
    li_model = linear_model_from_motif(motif)
    L = len(motif[0])
    negatives = [random_site(L) for i in trange(trials)]
    pw_pos = [pw_prob_site(site, pw_model) for site in motif]
    pw_neg = [pw_prob_site(site, pw_model) for site in tqdm(negatives)]
    li_pos = [linear_prob_site(site, li_model) for site in motif]
    li_neg = [linear_prob_site(site, li_model) for site in tqdm(negatives)]
    _, _, _, pw_auc = roc_curve(pw_pos, pw_neg)
    _, _, _, li_auc = roc_curve(li_pos, li_neg, color='g')
    return li_auc, pw_auc
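
# Minimal usage sketch for roc_experiment, assuming pairwise_model_from_motif,
# linear_model_from_motif, random_site, the probability helpers, and the custom
# roc_curve above are defined in this module. The motif below is purely
# illustrative (a few equal-length sites), not data from the original project.
if __name__ == '__main__':
    example_motif = ["ACGTACGT", "ACGTACGA", "ACGAACGT"]  # hypothetical binding sites
    li_auc, pw_auc = roc_experiment(example_motif, trials=10**4)
    print(f"linear model AUC: {li_auc:.3f}, pairwise model AUC: {pw_auc:.3f}")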
def main():
    path = '../data/accidents'
    data = pd.read_csv(f'{path}/accident_data_clean_balanced.csv', header=0)

    cat_cols = [
        'month', 'roadway_type', 'intersection', 'light_condition',
        'atmospheric_conditions', 'manner_of_collision', 'body_type',
        'vehicle_conditions', 'part_of_day'
    ]
    binary_cols = [
        'land_use_urban', 'national_highway_system', 'previous_dwi_convictions',
        'previous_speeding_convictions', 'speeding_related',
        'driver_vision_obscured', 'is_weekend', 'multiple_vehicles',
        'nonmotorist_involved', 'multiple_motorists', 'drunk_driver_involved'
    ]
    numeric_cols = ['vehicle_year', 'speed_limit']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['multiple_fatalities']
    features = data[cat_cols + binary_cols + numeric_cols]
    feature_names = features.columns

    # oe = OrdinalEncoder()
    # features = oe.fit_transform(features)
    features = pd.get_dummies(features, columns=cat_cols)
    # scaler = StandardScaler()
    # features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    model = GridSearchCV(estimator=KNeighborsClassifier(),
                         param_grid={'n_neighbors': range(1, 20, 2)},
                         cv=5, scoring='f1')
    model.fit(X_train, y_train)
    print(model.best_params_)
    print()

    y_pred = model.predict(X_test)
    y_probs = model.predict_proba(X_test)[:, 1]

    utils.print_metrics(y_test, y_pred)
    utils.roc_curve(y_test, y_probs, 'KNN', 'knn')
def test(self, test_loader, epoch=0):
    X, y = next(iter(test_loader))
    B, D, C, W, H = X.shape
    # X = X.view(B, C * D, W, H)
    self.unet.eval()
    self.facenet.eval()
    self.discrim.eval()

    with torch.no_grad():
        y_ = self.unet(X.to(device))
        mse = self.mse_loss_function(y_, y.to(device))
        loss_G = self.loss_GAN_generator(btch_X=X.to(device))
        loss_D = self.loss_GAN_discrimator(btch_X=X.to(device), btch_y=y.to(device))
        loss_facenet, _, n_bad = self.loss_facenet(X.to(device), y.to(device))

    plt.title(f"epoch {epoch} mse={mse.item():.4} facenet={loss_facenet.item():.4} bad={n_bad / B ** 2}")
    i = np.random.randint(0, B)
    a = np.hstack((y[i].transpose(0, 1).transpose(1, 2),
                   y_[i].transpose(0, 1).transpose(1, 2).to(cpu)))
    b = np.hstack((X[i][0].transpose(0, 1).transpose(1, 2),
                   X[i][-1].transpose(0, 1).transpose(1, 2)))
    plt.imshow(np.vstack((a, b)))
    plt.axis('off')
    plt.show()

    self.writer.add_scalar("test bad_percent", n_bad / B ** 2, global_step=epoch)
    self.writer.add_scalar("test loss", mse.item(), global_step=epoch)
    # self.writer.add_scalars("test GAN", {"discrim": loss_D.item(),
    #                                      "gen": loss_G.item()}, global_step=epoch)

    with torch.no_grad():
        n_for_show = 10
        y_show_ = y_.to(device)
        y_show = y.to(device)
        embeddings_anc, _ = self.facenet(y_show_)
        embeddings_pos, _ = self.facenet(y_show)
        embeds = torch.cat((embeddings_anc[:n_for_show], embeddings_pos[:n_for_show]))
        imgs = torch.cat((y_show_[:n_for_show], y_show[:n_for_show]))
        names = list(range(n_for_show)) * 2
        # print(embeds.shape, imgs.shape, len(names))
        # self.writer.add_embedding(mat=embeds, metadata=names, label_img=imgs, tag="embeddings", global_step=epoch)

    trshs, fprs, tprs = roc_curve(embeddings_anc.detach().to(cpu), embeddings_pos.detach().to(cpu))
    rnk1 = rank1(embeddings_anc.detach().to(cpu), embeddings_pos.detach().to(cpu))
    plt.step(fprs, tprs)
    # plt.xlim((1e-4, 1))
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xticks(np.arange(min(fprs), max(fprs), 10))
    plt.xscale('log')
    plt.title(f"ROC auc={auc(fprs, tprs)} rnk1={rnk1}")
    self.writer.add_figure("ROC test", plt.gcf(), global_step=epoch)
    self.writer.add_scalar("auc", auc(fprs, tprs), global_step=epoch)
    self.writer.add_scalar("rank1", rnk1, global_step=epoch)
    print(f"\n###### {epoch} TEST mse={mse.item():.4} GAN(G/D)={loss_G.item():.4}/{loss_D.item():.4} "
          f"facenet={loss_facenet.item():.4} bad={n_bad / B ** 2:.4} auc={auc(fprs, tprs)} rank1={rnk1} #######")
def main():
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = ['person_type', 'trafficway_type', 'manner_of_collision',
                'body_type', 'seating_position', 'ejection',
                'safety_equipment_use']
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]
    feature_names = features.columns

    oe = OrdinalEncoder()
    features = oe.fit_transform(features)
    # features = pd.get_dummies(features, columns=cat_cols)
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    model = GridSearchCV(estimator=KNeighborsClassifier(),
                         param_grid={'n_neighbors': range(1, 20, 2)},
                         cv=5, scoring='f1')
    model.fit(X_train, y_train)
    print(model.best_params_)
    print()

    y_pred = model.predict(X_test)
    y_probs = model.predict_proba(X_test)[:, 1]

    utils.print_metrics(y_test, y_pred)
    utils.roc_curve(y_test, y_probs, 'KNN', 'knn')
def test(self, test_loader, epoch):
    self.enc.eval()
    self.dec.eval()
    self.clf.eval()

    X, y = next(iter(test_loader))
    B, D, C, W, H = X.shape
    n = len(y)

    with torch.no_grad():
        loss_mse = self.get_mse_loss(X.to(device), y.to(device))
        loss_clf = self.get_clf_loss(X.to(device), y.to(device))
        self.writer.add_scalar("test loss_mse", loss_mse.item(), global_step=epoch)
        self.writer.add_scalar("test loss_clf", loss_clf.item(), global_step=epoch)

        embeddings_anc = self.enc(X.view(B * D, C, W, H).to(device))
        embeddings_pos = self.enc(y.to(device))

    trshs, fprs, tprs = roc_curve(embeddings_anc.detach(), embeddings_pos.detach(), self.clf)
    rnk1 = rank1(embeddings_anc.detach(), embeddings_pos.detach(), self.clf)
    plt.step(fprs, tprs)
    plt.yticks(np.arange(0, 1, 0.05))
    plt.xticks(np.arange(min(fprs), max(fprs), 10))
    plt.xscale('log')
    plt.title(f"ROC auc={auc(fprs, tprs)} rnk1={rnk1}")
    self.writer.add_figure("ROC test", plt.gcf(), global_step=epoch)
    self.writer.add_scalar("auc", auc(fprs, tprs), global_step=epoch)
    self.writer.add_scalar("rank1", rnk1, global_step=epoch)
    print(f"\n###### {epoch} TEST loss_mse {loss_mse.item():.5} loss_clf {loss_clf.item():.5} "
          f"auc={auc(fprs, tprs)} rank1 = {rnk1} #######")

    x = X.view(B * D, C, W, H)[0:1]
    emb = self.enc(x.to(device))
    front = self.dec(emb).detach().cpu()
    self.writer.add_image("cfr", np.hstack((x[0], y[0], front[0])), global_step=epoch)
    torch.cuda.empty_cache()
def main():
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = [
        'person_type', 'trafficway_type', 'manner_of_collision', 'body_type',
        'seating_position', 'ejection', 'safety_equipment_use'
    ]
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]
    # features = pd.get_dummies(features, columns=cat_cols)
    # features.rename(columns={'manner_of_collision_Not Collision with Motor Vehicle in Transport (Not Necessarily in Transport for\n2005-2009)': 'manner_of_collision_Not Collision with Motor Vehicle in Transport'},
    #                 inplace=True)
    feature_names = features.columns

    oe = OrdinalEncoder()
    features = oe.fit_transform(features)
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=2020)

    print('Class Balance')
    print(y_test.value_counts())
    print()

    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 class_weight='balanced',
                                                 random_state=2020), 'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5, scoring='f1',
                                                     class_weight='balanced',
                                                     max_iter=500,
                                                     random_state=2020), 'lr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]

        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names, name, suffix)
        # utils.permutation_importances(model, X_train, y_train, feature_names, name, suffix + '_ohe', dataset='train')
        print('#' * 50)