def calc(self, model, featureslice, experiment):
    if 'df_input' not in dir(featureslice.featureset) \
            or featureslice.featureset.df_input is None:
        raise AttributeError(
            f"No input dataframe found for the experiment's featureset. "
            f"Set it with lb['{experiment.identifier}'].set_df(df)")

    def score_func(X, y):
        return experiment.metric(y, model.predict(X))

    X = featureslice(featureslice.idx_test[:self.n_rows]).values
    y = featureslice.featureset.target.values[featureslice.idx_test][:self.n_rows]
    base_score, score_decreases = get_score_importances(
        score_func, X, y, n_iter=self.n_iter, random_state=self.random_state)
    feature_importances = np.mean(score_decreases, axis=0)
    return {name: imp
            for name, imp in zip(featureslice.columns, feature_importances)}
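# For reference, every snippet in this collection reduces to the same core
# pattern. A minimal, self-contained sketch -- the classifier and synthetic
# data here are illustrative additions, not taken from any snippet above:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from eli5.permutation_importance import get_score_importances

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

def score(X, y):
    # Any callable (X, y) -> float works; higher must mean better.
    return clf.score(X, y)

base_score, score_decreases = get_score_importances(
    score, X_test, y_test, n_iter=5, random_state=0)
# score_decreases has shape (n_iter, n_features); average over iterations.
feature_importances = np.mean(score_decreases, axis=0)
print(base_score, feature_importances)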
def __init__(self, data, pr, distanceAnalysis=False, exceptedColumns=None):
    '''
    :param data: pandas dataframe with datasets, where each row represents a dataset
    :param pr: Predictor of the ML system; its resultColumn attribute names the
        column in data that contains the actual results
    :param distanceAnalysis: if set to True, distances are used as the measure
        of result correctness

    Plots and saves a feature importance plot using ELI5 and accuracy.
    '''
    resultColumnName = pr.resultColumn
    self.pr = pr
    self.distanceAnalysis = distanceAnalysis
    data = self.pr.encode(data, exceptedColumns=exceptedColumns)
    X = data
    y = data[resultColumnName]  # target column, e.g. price range
    if distanceAnalysis:
        self.pr.returnDistanceOfClass = True
    else:
        X = X.drop([resultColumnName], axis=1)  # independent columns
    base_score, score_decreases = get_score_importances(self.score, np.array(X), y)
    feature_importances = np.mean(score_decreases, axis=0)
    feature_importance_dict = {}
    for i, feature_name in enumerate(X.columns):
        feature_importance_dict[feature_name] = feature_importances[i]
    # Print the four most important features.
    print(dict(sorted(feature_importance_dict.items(),
                      key=lambda x: x[1], reverse=True)[:4]))
    self.f_importances(feature_importance_dict, resultColumnName)
def _get_score_importances(self, score_func, X, y):
    return get_score_importances(score_func, X, y, n_iter=self.n_iter,
                                 random_state=self.rng_, n_jobs=self.n_jobs)
def test_get_feature_importances(boston_train):
    X, y, feat_names = boston_train
    svr = SVR(C=20, gamma='auto').fit(X, y)
    score, importances = get_score_importances(svr.score, X, y)
    assert score > 0.7
    importances = dict(zip(feat_names, np.mean(importances, axis=0)))
    print(score)
    print(importances)
    assert importances['AGE'] > importances['NOX']
    assert importances['B'] > importances['CHAS']
def my_feature_importance(my_pipeline, accuracy_scorer, X, y):
    try:
        return _get_feature_importances(my_pipeline.named_steps['clf'])
    except Exception:
        # Fall back to permutation importance when the classifier does not
        # expose built-in feature importances.
        def score(X, y):
            return accuracy_scorer(my_pipeline, X, y)

        base_score, score_decreases = get_score_importances(score, X, y, n_iter=5)
        feature_importances = np.mean(score_decreases, axis=0)
        return feature_importances
def VIP():
    # ... load data, define score function
    dn = np.array([
        "transbig", "unt", "upp", "mainz", "nki", "GSE6532", "GEO",
        "TCGA753", "TCGA500", "UK", "HEL", "TCGA1093"
    ])
    for i in range(len(dn)):
        ddata = pd.read_csv("data/" + dn[i] + ".csv")
        ddata = ddata.to_numpy("float32")
        n, p = ddata.shape
        X_in = ddata[:, :-2]
        Y_in = ddata[:, (p - 2):p]
        base_score, score_decreases = get_score_importances(score, X_in, Y_in)
        if i == 0:
            feature_importances = np.mean(score_decreases, axis=0)
        else:
            feature_importances = np.vstack(
                (feature_importances, np.mean(score_decreases, axis=0)))
    np.savetxt("vip.csv", feature_importances, delimiter=",")
    print("OK")
def run_ELI5(model, X_train, X_test, X_val, y_train, y_test, y_val):
    # np.float was removed from NumPy; plain float is equivalent here.
    X_train2 = np.array(X_train).astype(float)
    X_test2 = np.array(X_test).astype(float)
    X_val2 = np.array(X_val).astype(float)
    y_train2 = np.array(y_train).astype(float)
    y_test2 = np.array(y_test).astype(float)
    y_val2 = np.array(y_val).astype(float)

    val_score = ROC_PR.ROC_Score(model, X_val2, y_val2)
    score_test = ROC_PR.ROC_Score(model, X_test2, y_test2)
    # score_for_each_drug = ROC_PR.ROC(model, X_test2, y_test2, ("LRCN" + "BO_delete"), True)
    spec_recall, prec_recall = ROC_PR.PR(model, X_test2, y_test2)
    print('area under ROC curve for val:', val_score)
    print('area under ROC curve for test:', score_test)
    print("recall at 95 spec: ", spec_recall)
    print("precision recall: ", prec_recall)

    def score(X, y):
        return ROC_PR.ROC_Score(model, X, y)

    from eli5.permutation_importance import get_score_importances

    # Shuffle one column at a time to get a per-feature importance.
    feature_score = []
    for i in range(len(X_test2[0])):
        base_score, score_decreases = get_score_importances(
            score, X_test2, y_test2, n_iter=1, columns_to_shuffle=[i])
        feature_importances = np.mean(score_decreases, axis=0)
        feature_score.append(feature_importances[0])
        print(i)
    print(feature_score)
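# The per-column loop above is conceptually equivalent to a single call with
# the default columns_to_shuffle=None, which shuffles every column in turn
# and returns one score decrease per feature (the random draws will differ).
# A sketch reusing the snippet's score, X_test2 and y_test2 names:
base_score, score_decreases = get_score_importances(
    score, X_test2, y_test2, n_iter=1)
feature_score = list(np.mean(score_decreases, axis=0))
print(feature_score)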
# (excerpt begins inside an if-branch; the opening condition is not part of
# this snippet)
    # First of all, cut off the last observation from the reference dict
    reference_dict = model.data_preprocessor.reference_dict
    for key in reference_dict.keys():
        reference_dict[key] = reference_dict[key][:-1]

    # Second, prepare the train set appropriately
    max_dbn = train['date_block_num'].max()
    score_set = train.query('date_block_num == @max_dbn')
    score_y = y_train.loc[score_set.index]
else:
    score_set, score_y = test, y_test

cols = train.columns.tolist()

# This will be used to keep the preprocessing right. I will then use some
# dirty tricks to permute the embedding columns.
constant_data = score_set[['item_id', 'item_category_id', 'shop_id']]
score_func = get_score_function(model, cols, constant_data, evaluate_embeddings)

base_score, score_decreases = get_score_importances(
    score_func, score_set.to_numpy(), score_y.to_numpy(),
    random_state=234234, n_iter=1)
feature_importances = np.mean(score_decreases, axis=0)

# We sort ascending because a negative score decrease means the score (RMSE)
# increased when the feature was permuted, which is what marks an important
# feature.
result = sorted(zip(feature_importances, cols), key=lambda x: x[0])
for result_row in result:
    print(result_row)
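# The sign convention the comment above relies on: eli5 reports
# base_score - permuted_score, so when the "score" is an error metric such as
# RMSE, permuting an important feature yields a negative decrease.
# Illustration only; `model` here is a hypothetical regressor:
def rmse_score(X, y):
    # Lower RMSE is better, inverting eli5's higher-is-better assumption.
    return np.sqrt(np.mean((y - model.predict(X)) ** 2))

# If the base RMSE is 1.0 and permuting a feature raises it to 1.4, the
# reported decrease is 1.0 - 1.4 = -0.4: more negative, more important.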
X = np.hstack((X, continents))

# Load the bias-corrected random forest algorithm
rf = joblib.load('/disk/scratch/local.2/dmilodow/pantrop_AGB_LUH/saved_algorithms/rfbc_mean.pkl')
rf1 = rf['rf1']
rf2 = rf['rf2']

# Permutation importance
# - define the score used to underpin importance values
def r2_score(X, y):
    y_rfbc = useful.rfbc_predict(rf1, rf2, X)
    _, _, r, _, _ = stats.linregress(y, y_rfbc)
    return r**2

n_iter = 5
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=23)
base_score, score_drops = get_score_importances(r2_score, X_test, y_test,
                                                n_iter=n_iter)

labels = []
for ii in range(1, X.shape[1]):
    labels.append('PC%s' % str(ii).zfill(2))
labels.append('Region')

var_labels = labels * n_iter
var_imp = np.zeros(n_iter * len(labels))
for ii, drops_iter in enumerate(score_drops):
    var_imp[ii * len(labels):(ii + 1) * len(labels)] = drops_iter

imp_df = pd.DataFrame(data={'variable': var_labels,
                            'permutation_importance': var_imp})
fig, axis = plt.subplots(nrows=1, ncols=1, figsize=[5, 8], sharex=True)
sns.barplot(x='permutation_importance', y='variable', ci='sd', data=imp_df,
            ax=axis, color='0.5')
axis.set_ylabel('Principal component')
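# Note that squaring the Pearson r from linregress, as above, is not the same
# quantity as sklearn's coefficient of determination. A quick comparison on
# illustrative arrays (not from the snippet's data):
import numpy as np
from scipy import stats
from sklearn.metrics import r2_score as sk_r2

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = y_true * 2.0              # perfectly correlated but biased
r = stats.linregress(y_true, y_pred).rvalue
print(r ** 2)                      # 1.0 -- correlation ignores the bias
print(sk_r2(y_true, y_pred))       # -5.0 -- penalizes the systematic error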
def main():
    print('Total memory allocated: ' + str(torch.cuda.memory_allocated()))
    n_samples = np.random.randint(100, 100000)
    # n_samples = 100000
    print('Number of Samples in DS: ' + str(n_samples))
    n_feats = np.random.choice([10, 20, 50, 100, 200, 500], 1).item()
    # n_feats = 500
    n_clusters = np.random.randint(2, 14)
    sep = 5 * np.random.random_sample()
    hyper = np.random.choice([True, False], 1).item()
    X, y = make_classification(n_samples, n_feats, n_feats // 2, 0, 0, 2,
                               n_clusters, None, 0, sep, True, 0, 1, hyper)
    X, x_test, y, y_test = train_test_split(X, y, test_size=0.2)
    btchsz = [
        len(X), len(X), len(X), len(X), len(X), len(X), len(X), len(X),
        len(X), len(X), 25000, 20000, 10000, 5000
    ]
    params = [
        5, 10, 25, 50, 100, 500, 1000, 2000, 5000, 10000, 25000, 30000,
        35000, 40000
    ]
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    x_test = scaler.transform(x_test)
    trainset = data_loader(X, y)
    testset = data_loader(x_test, y_test)
    if torch.cuda.is_available():
        print('Using device:',
              torch.cuda.get_device_name(torch.cuda.current_device()))
    no_epochs = 5
    accs = []
    infl = []
    permute = []
    for i in range(len(params)):
        start_time = time.time()
        torch.cuda.empty_cache()
        model = EVINet.EVINet(n_feats, params[i], batch_size=btchsz[i])
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        trainloader = DataLoader(trainset, batch_size=btchsz[i], shuffle=False)
        testloader = DataLoader(testset, batch_size=btchsz[i], shuffle=False)
        scaler = torch.cuda.amp.GradScaler()
        for epoch in range(no_epochs):
            total_train_loss = 0
            for batchidx, (train_data, train_targets) in enumerate(trainloader):
                model.train()
                targets_hot = torch.nn.functional.one_hot(train_targets, 2)
                optimizer.zero_grad()
                with torch.cuda.amp.autocast(enabled=False):
                    pred, sig = model(train_data)
                    loss = model.batch_loss(pred, sig, targets_hot.to('cuda:1'))
                total_train_loss += loss.item()
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            if epoch != 0:
                print('Epoch: ' + str(epoch) + '/' + str(no_epochs) +
                      ', Train Loss: ' + str(total_train_loss))
        print("Total Train Time: " + str(time.time() - start_time))

        # validation
        model.eval()
        test_acc = model.score(x_test, y_test)
        accs.append(test_acc)
        print('Test Accuracy: ' + str(test_acc))

        inform_feats = set(range(n_feats // 2))
        model.zero_grad()
        del train_data, train_targets, loss, optimizer
        torch.cuda.empty_cache()
        eqn_5_smooth = evi_influence_batch.influence(
            X, y, x_test, y_test, model, model.fullyCon2.mean_fc.weight,
            btchsz=btchsz[i])
        eqn_5_smooth = np.mean(normalize(np.vstack(eqn_5_smooth)), axis=0)
        loss_acc = len(
            inform_feats.intersection(
                set(np.argsort(abs(eqn_5_smooth))[::-1][:n_feats // 2]))
        ) / (n_feats // 2)
        infl.append(loss_acc)

        start_time = time.time()
        base_score, score_decreases = get_score_importances(
            model.score, x_test, y_test)
        perm_importances = np.mean(score_decreases, axis=0)
        print("Total Permutation Time: " + str(time.time() - start_time))
        perm_acc = len(
            inform_feats.intersection(
                set(np.argsort(abs(perm_importances))[::-1][:n_feats // 2]))
        ) / (n_feats // 2)
        permute.append(perm_acc)
        print('Inner Loop ' + str(i + 1) + '/' + str(len(params)) + ' Finished')
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return np.asarray(accs), np.asarray(infl), np.asarray(permute)
def feature_list_generation(train_data_path, test_data_path):
    input_size = 1412  # original feature size
    # hidden_size = int(input_size / 3)
    hidden_size = 300
    output_size = 2
    num_epochs = 50
    # lr = 0.00001
    lr = 0.0001
    batch_size = 32

    # Load train dataset
    with open('train_list.pkl', 'rb') as f_train:
        train_list = pickle.load(f_train)
    with open('training_label.pickle', 'rb') as f_label_train:
        train_labels = pickle.load(f_label_train)
    train_list = train_list[0:200000]
    print(len(train_list))
    feature_train, target_train = load_data(train_list, train_labels,
                                            train_data_path)

    # Load test dataset
    with open('test_list.pkl', 'rb') as f_test:
        test_list = pickle.load(f_test)
    with open('test_label.pickle', 'rb') as f_label_test:
        test_labels = pickle.load(f_label_test)
    # test_list = test_list[0:10000]
    print(len(test_list))
    feature_test, target_test = load_data(test_list, test_labels,
                                          test_data_path)

    # Wrap the PyTorch model with skorch so it behaves like an sklearn estimator.
    net = NeuralNetClassifier(
        mlp_model(input_size, hidden_size, output_size),
        max_epochs=num_epochs,
        lr=lr,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        batch_size=batch_size,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.SGD,
        optimizer__momentum=0.9,
        optimizer__weight_decay=0.00001)
    model = net.fit(feature_train, target_train)
    print(model.score(feature_test, target_test))

    # Define a score function using accuracy.
    # get_score_importances takes only numpy arrays as inputs.
    def score(feature_test, target_test):
        y_pred = net.predict(feature_test)
        return accuracy_score(target_test, y_pred)

    base_score, score_decreases = get_score_importances(score, feature_test,
                                                        target_test, n_iter=10)
    feature_importances = np.mean(score_decreases, axis=0)
    feature_importance_dict = {}
    for i in range(input_size):
        feature_importance_dict[str(i)] = feature_importances[i]
    permu_features = dict(
        sorted(feature_importance_dict.items(), key=lambda x: x[1],
               reverse=True))
    # print(permu_features)
    with open('permu_feature_importance.json', 'w') as fp:
        json.dump(permu_features, fp)
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score  # needed by the score function below

GLM = LassoCV()
GLM.fit(data_train, df_train[target])
GLM.score(data_test, df_test[target])

from eli5.permutation_importance import get_score_importances

# ... load data, define score function
def score(X, y):
    y_pred = GLM.predict(X)
    return r2_score(y, y_pred)

base_score, score_decreases = get_score_importances(score, data_test,
                                                    df_test[target])
GLM_feature_importances = abs(np.mean(score_decreases, axis=0))

importance_plot(feats, GLM_feature_importances)

# from group_lasso import GroupLasso
# from eli5.sklearn import PermutationImportance
#
# GL = GroupLasso()
# GL.fit(data_train, df_train[target])
def feature_selection(self, dt: pd.DataFrame, params: dict = None,
                      drop_list: list = None, perm: bool = True,
                      use_ext: bool = False) -> pd.DataFrame:
    """
    Performs feature selection on the given data frame.

    Refreshes the object's features and categorical features, then runs
    cross-validation, calculates importances, and runs cross-validation
    again on the top selected features. Finally plots the importances.

    Args:
        dt (DataFrame): data frame to run feature selection on
        params (dict): LGBMClassifier parameters dictionary
        drop_list (list): features to drop explicitly before selection
        perm (bool): use permutation importance flag
        use_ext (bool): use external (DMX) features flag
    Returns:
        res (DataFrame): feature selection results
    """
    if drop_list is None:
        drop_list = []
    if params is None:
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 16,
            'is_unbalance': True,
            # 'max_depth': 4,
            'learning_rate': 0.05,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': 1,
            'random_state': 321
        }
    if use_ext:
        for table in self.ext_features.keys():
            drop_list += list(set(self.full_ext_features[table]) -
                              set(self.ext_features[table]))
    features = dt.columns[~dt.columns.isin(list(self.meta_vars) +
                                           ['label'])].tolist()
    features = list(set(features) - set(drop_list))
    cat_features = [feature for feature in features
                    if feature in self.all_cats]
    num_boost_round = 100
    lgb_train = lgb.Dataset(dt[dt['label'] >= 0][features].values,
                            label=dt[dt['label'] >= 0]['label'].values,
                            feature_name=features,
                            categorical_feature=cat_features,
                            free_raw_data=False)
    bst = lgb.train(params, lgb_train, num_boost_round=num_boost_round)

    def score(x, y):
        return roc_auc_score(y, bst.predict(x))

    cv = lgb.cv(params, lgb_train, num_boost_round=num_boost_round)
    log('Score before feature selection:')
    log('Max CV ROC AUC score: {}'.format(max(cv['auc-mean'])))
    log('Min CV ROC AUC score: {}'.format(min(cv['auc-mean'])))
    log('Average CV ROC AUC score: {}\n'.format(sum(cv['auc-mean']) /
                                                len(cv['auc-mean'])))
    pred = bst.predict(dt[features].values)
    sns.distplot(pred)
    plt.show()
    if perm:
        _, score_decreases = get_score_importances(score,
                                                   dt[features].values,
                                                   dt['label'].values)
        feature_importances = np.mean(score_decreases, axis=0)
    else:
        feature_importances = bst.feature_importance(importance_type='gain')
    res = pd.DataFrame({'name': features, 'fi': feature_importances})
    self.top_features = res.sort_values(by='fi', ascending=False) \
        .head(self.top_amount).name.values.tolist()
    self.top_cats = [feature for feature in self.top_features
                     if feature in self.all_cats]
    lgb_train = lgb.Dataset(dt[dt['label'] >= 0][self.top_features].values,
                            label=dt[dt['label'] >= 0]['label'].values,
                            feature_name=self.top_features,
                            categorical_feature=self.top_cats,
                            free_raw_data=False)
    cv = lgb.cv(params, lgb_train, num_boost_round=num_boost_round)
    log('Score on top {} selected features:'.format(self.top_amount))
    log('Max CV ROC AUC score: {}'.format(max(cv['auc-mean'])))
    log('Min CV ROC AUC score: {}'.format(min(cv['auc-mean'])))
    log('Average CV ROC AUC score: {}\n'.format(sum(cv['auc-mean']) /
                                                len(cv['auc-mean'])))
    res = res.sort_values(by='fi', ascending=False).head(self.top_amount)
    sns.barplot(res.fi, res.name)
    plt.show()
    return res
def kfold_model_perm(self, X, y, model, params, cols, indices, fit_function,
                     predict_function, score_function, folds=5, verbose=0):
    kfold = KFold(n_splits=folds, shuffle=True)
    best_model_output = pd.DataFrame()
    fold_num = 0
    perm_df = pd.DataFrame()
    for trn, test in kfold.split(X):
        if verbose >= 2:
            print('Working on fold', fold_num)
        X_trn = X[trn, :]
        X_test = X[test, :]
        y_trn = y[trn]
        y_test = y[test]
        model = fit_function(model=model, X=X_trn, y=y_trn)
        preds = predict_function(model=model, X=X_test)
        # Score the fold against its own held-out targets, not the full y.
        score = score_function(y=y_test, preds=preds)
        df_dict = {
            'pred': preds,
            'actual': y_test,
            'test_ind': test,
            'fold_num': pd.Series([fold_num for i in range(y_test.shape[0])]),
            score_name: pd.Series([score for i in range(y_test.shape[0])])
        }

        # The score function must use the X it is given; otherwise the
        # permutations have no effect and every importance comes out zero.
        def score_fn(X, y):
            preds = predict_function(model=model, X=X)
            return score_function(y=y, preds=preds)

        base_score, score_decreases = get_score_importances(score_fn, X_test,
                                                            y_test)
        feature_importances = np.mean(score_decreases, axis=0)
        best_model_output_temp = pd.DataFrame(df_dict)
        best_model_output = pd.concat(
            [best_model_output, best_model_output_temp])
        perm_df_temp = pd.DataFrame({
            'importance': feature_importances,
            'feature': cols,
            'index': indices,
            'fold': [fold_num for i in range(len(cols))]
        })
        perm_df_temp = perm_df_temp.sort_values('importance', ascending=False)
        perm_df = pd.concat([perm_df, perm_df_temp])
        fold_num += 1
    return best_model_output, perm_df
gbm = lgb.train(param, d_tr, num_boost_round=1000, valid_sets=[d_tr, d_val],
                evals_result=eval_result, verbose_eval=10,
                early_stopping_rounds=50)
ax = lgb.plot_metric(eval_result, metric='l2')
# plt.show()

def score(X, y):
    y_pred = gbm.predict(X)
    return np.sqrt(mean_squared_error(y, y_pred))

import eli5
from eli5.permutation_importance import get_score_importances

base_score, score_decreases = get_score_importances(score, trs.to_numpy(),
                                                    salepr.to_numpy())
feature_importances = np.mean(score_decreases, axis=0)
fe_dic = {}
for i, fea_n in enumerate(trs.columns):
    fe_dic[fea_n] = feature_importances[i]
print(sorted(fe_dic.items(), key=lambda x: x[1]))

'''
# Feature importance with GBM
m = gbm.feature_name()
n = gbm.feature_importance()
a = zip(n, m)
print(m, sorted(a, reverse=True))
'''
def fit(self):
    base_score, score_decreases = get_score_importances(
        self.metric_dict[self.metric], self.feature, self.target)
    self.weight_ = np.mean(score_decreases, axis=0)
    return self
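# A hedged sketch of the context this fit()-style wrapper assumes. The class
# name, constructor, and the accuracy-based metric entry are hypothetical,
# inferred from the attributes the method reads:
import numpy as np
from eli5.permutation_importance import get_score_importances

class PermutationWeights:  # hypothetical name for the class owning fit()
    def __init__(self, feature, target, metric, metric_dict):
        self.feature = feature          # evaluation features
        self.target = target            # evaluation targets
        self.metric = metric            # key into metric_dict
        self.metric_dict = metric_dict  # metric name -> score(X, y) callable

    def fit(self):
        base_score, score_decreases = get_score_importances(
            self.metric_dict[self.metric], self.feature, self.target)
        self.weight_ = np.mean(score_decreases, axis=0)
        return self

# Usage: weight_ holds the mean per-feature score decreases, e.g.
# PermutationWeights(X_test, y_test, 'acc',
#                    {'acc': lambda X, y: clf.score(X, y)}).fit().weight_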
def train_nn(csv):
    df = pd.read_csv(csv)
    ind_train = df[df.year.isin(range(1980, 2000))].index  # 1980 to 1999
    ind_test = df[df.year.isin(range(2000, 2020))].index   # 2000 to 2019
    df_train = df.loc[ind_train, :].copy().reset_index(drop=True)
    df_test = df.loc[ind_test, :].copy().reset_index(drop=True)
    feats_not_to_use = [
        "permno", "year", "month", "next_ret", "pe_op_dil", "DATE", "COMNAM",
        "TICKER", "SICCD", "SECTOR"
    ]
    feats = [feat for feat in df.columns if feat not in feats_not_to_use]
    target = 'next_ret'

    """
    Data Normalization
    """
    def normalize(series):
        return (series - series.mean(axis=0)) / series.std(axis=0)

    mean = df_train[feats].mean(axis=0)
    df_train[feats] = df_train[feats].fillna(mean)
    data_train = df_train[feats].apply(normalize).values
    mean = df_test[feats].mean(axis=0)
    df_test[feats] = df_test[feats].fillna(mean)
    data_test = df_test[feats].apply(normalize).values

    """
    Create TensorFlow Train and Test Datasets
    """
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (data_train, df_train[target].values))
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (data_test, df_test[target].values))

    """
    Constructing the Model
    """
    nfeats = len(feats)
    # Geometric pyramid rule (Masters 1993)
    nhid = [32, 16, 8, 4, 2]

    def build_models():
        models = []
        # Note: these layer instances are shared across the models built below.
        layers_stack = [
            layers.Dense(nhid[i], activation="tanh")
            for i, _ in enumerate(nhid)
        ]
        for i in range(1, 6):
            layers_arr = [
                layers.Dense(nhid[0], activation='tanh', input_shape=[nfeats])
            ]
            for j in range(1, i):
                layers_arr.append(layers_stack[j])
            layers_arr.append(layers.Dense(1))
            model = keras.Sequential(layers_arr)
            optimizer = tf.keras.optimizers.SGD(0.005)
            model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])
            models.append(model)
        return models

    NN1, NN2, NN3, NN4, NN5 = build_models()

    # Initialize model weights to random values
    NN1_weights = NN1.weights
    NN2_weights = NN2.weights
    NN3_weights = NN3.weights
    NN4_weights = NN4.weights
    NN5_weights = NN5.weights
    np.random.seed(12345)
    weights_arr = []
    for i in range(1, 6):
        w = [np.random.uniform(-0.01, 0.01, size=(nfeats, nhid[0]))]
        for j in range(0, i):
            w.append(np.random.uniform(-0.01, 0.01, size=nhid[j]))
            if j == i - 1:
                w.append(np.random.uniform(-0.01, 0.01, size=(nhid[j], 1)))
            else:
                w.append(
                    np.random.uniform(-0.01, 0.01,
                                      size=(nhid[j], nhid[j + 1])))
        w.append(np.random.uniform(-0.01, 0.01, size=1))
        weights_arr.append(w)
    NN1.set_weights(weights_arr[0])
    NN2.set_weights(weights_arr[1])
    NN3.set_weights(weights_arr[2])
    NN4.set_weights(weights_arr[3])
    NN5.set_weights(weights_arr[4])

    """
    Inspecting the Model
    """
    NN1.summary()
    NN2.summary()
    NN3.summary()
    NN4.summary()
    NN5.summary()

    """
    Training the Model
    """
    NN1.fit(train_dataset.batch(1), epochs=1)
    NN2.fit(train_dataset.batch(1), epochs=1)
    NN3.fit(train_dataset.batch(1), epochs=1)
    NN4.fit(train_dataset.batch(1), epochs=1)
    NN5.fit(train_dataset.batch(1), epochs=1)

    # Trained model weights
    NN1_weights = NN1.weights
    NN2_weights = NN2.weights
    NN3_weights = NN3.weights
    NN4_weights = NN4.weights
    NN5_weights = NN5.weights

    # """
    # Make Predictions
    # """
    # Larger batch size (100) for faster predictions
    NN1_test_predictions = NN1.predict(test_dataset.batch(100)).flatten()
    NN2_test_predictions = NN2.predict(test_dataset.batch(100)).flatten()
    NN3_test_predictions = NN3.predict(test_dataset.batch(100)).flatten()
    NN4_test_predictions = NN4.predict(test_dataset.batch(100)).flatten()
    NN5_test_predictions = NN5.predict(test_dataset.batch(100)).flatten()

    # """
    # Model Evaluation
    # """
    def R2(y, y_hat):
        return 1 - np.sum((y - y_hat)**2) / np.sum(y**2)
    NN1_R2_Val = R2(df_test[target].values, NN1_test_predictions)
    NN2_R2_Val = R2(df_test[target].values, NN2_test_predictions)
    NN3_R2_Val = R2(df_test[target].values, NN3_test_predictions)
    NN4_R2_Val = R2(df_test[target].values, NN4_test_predictions)
    NN5_R2_Val = R2(df_test[target].values, NN5_test_predictions)
    all_R2_Val = [NN1_R2_Val, NN2_R2_Val, NN3_R2_Val, NN4_R2_Val, NN5_R2_Val]

    def NN1_score(X, y):
        return R2(y, NN1.predict(X))

    def NN2_score(X, y):
        return R2(y, NN2.predict(X))

    def NN3_score(X, y):
        return R2(y, NN3.predict(X))

    def NN4_score(X, y):
        return R2(y, NN4.predict(X))

    def NN5_score(X, y):
        return R2(y, NN5.predict(X))

    _, NN1_score_decreases = get_score_importances(NN1_score, data_test,
                                                   df_test[target].values)
    _, NN2_score_decreases = get_score_importances(NN2_score, data_test,
                                                   df_test[target].values)
    _, NN3_score_decreases = get_score_importances(NN3_score, data_test,
                                                   df_test[target].values)
    _, NN4_score_decreases = get_score_importances(NN4_score, data_test,
                                                   df_test[target].values)
    _, NN5_score_decreases = get_score_importances(NN5_score, data_test,
                                                   df_test[target].values)

    NN1_feat_imps = np.mean(NN1_score_decreases, axis=0)
    NN2_feat_imps = np.mean(NN2_score_decreases, axis=0)
    NN3_feat_imps = np.mean(NN3_score_decreases, axis=0)
    NN4_feat_imps = np.mean(NN4_score_decreases, axis=0)
    NN5_feat_imps = np.mean(NN5_score_decreases, axis=0)

    all_importances = []
    for feat_imps in [NN1_feat_imps, NN2_feat_imps, NN3_feat_imps,
                      NN4_feat_imps, NN5_feat_imps]:
        importances = {}
        for index, feat_imp in enumerate(feat_imps):
            importances[feats[index]] = feat_imp
        all_importances.append(importances)
    return all_importances, all_R2_Val
def main():
    n_samples = np.random.randint(100, 100000)
    # n_samples = 100000
    # n_feats = 500
    print('Number of Samples in DS: ' + str(n_samples))
    n_feats = np.random.choice([10, 20, 50, 100, 200, 500], 1).item()
    n_clusters = np.random.randint(2, 14)
    sep = 5 * np.random.random_sample()
    hyper = np.random.choice([True, False], 1).item()
    X, y = make_classification(n_samples, n_feats, n_feats // 2, 0, 0, 2,
                               n_clusters, None, 0, sep, True, 0, 1, hyper)
    X, x_test, y, y_test = train_test_split(X, y, test_size=0.2)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    x_test = scaler.transform(x_test)

    device = 'cuda:0'
    if torch.cuda.is_available():
        print('Using device:',
              torch.cuda.get_device_name(torch.cuda.current_device()))

    no_epochs = 100
    params = [5, 10, 25, 50, 100, 500, 1000, 2000, 5000, 10000, 25000]
    # One batch size per parameter setting; the original list was one entry
    # shorter than params, which raised an IndexError on the last iteration.
    btchsz = [len(X)] * len(params)
    trainset = data_loader(X, y)
    testset = data_loader(x_test, y_test)

    accs = []
    infl = []
    permute = []
    for i in range(len(params)):
        start_time = time.time()
        torch.cuda.empty_cache()
        model = Vanilla(n_feats, params[i], batch_size=btchsz[i])  # .half()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        if device:
            # model.to(device)
            print('Moved to GPU')
        for epoch in range(no_epochs):
            total_train_loss = 0
            model.train()
            optimizer.zero_grad()
            pred = model(torch.from_numpy(X).float().to('cuda:0'))
            loss = criterion(pred, torch.from_numpy(y).long().to('cuda:0'))
            total_train_loss += loss.item()
            loss.backward()  # was missing: without it optimizer.step() is a no-op
            optimizer.step()
            if epoch != 0 and (epoch % 25 == 0):
                print('Epoch: ' + str(epoch + 1) + '/' + str(no_epochs) +
                      ', Train Loss: ' + str(total_train_loss))
        print("Total Train Time: " + str(time.time() - start_time))

        # validation
        model.eval()
        image_test = torch.from_numpy(x_test).float().to(device)
        label_test = torch.from_numpy(y_test).long().to(device)
        pred_test = model(image_test)
        test_acc = model.score(x_test, y_test)
        accs.append(test_acc)

        inform_feats = set(range(n_feats // 2))
        eqn_5_smooth = smoothInfluence.influence(
            torch.from_numpy(X).float().to('cuda:0'),
            torch.from_numpy(y).long().to('cuda:0'), image_test, model,
            model.linear_2.weight)
        eqn_5_smooth = np.mean(normalize(np.vstack(eqn_5_smooth)), axis=0)
        loss_acc = len(
            inform_feats.intersection(
                set(np.argsort(abs(eqn_5_smooth))[::-1][:n_feats // 2]))
        ) / (n_feats // 2)
        print(loss_acc)
        infl.append(loss_acc)

        base_score, score_decreases = get_score_importances(
            model.score, x_test, y_test)
        perm_importances = np.mean(score_decreases, axis=0)
        perm_acc = len(
            inform_feats.intersection(
                set(np.argsort(abs(perm_importances))[::-1][:n_feats // 2]))
        ) / (n_feats // 2)
        permute.append(perm_acc)
        print('Inner Loop ' + str(i + 1) + '/' + str(len(params)) + ' Finished')
    return np.asarray(accs), np.asarray(infl), np.asarray(permute)
rfbc4 = {}
rfbc5 = {}
rfbc1['rf1'], rfbc1['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 0],
                                          y[cal_blocks != 0])
rfbc2['rf1'], rfbc2['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 1],
                                          y[cal_blocks != 1])
rfbc3['rf1'], rfbc3['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 2],
                                          y[cal_blocks != 2])
rfbc4['rf1'], rfbc4['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 3],
                                          y[cal_blocks != 3])
rfbc5['rf1'], rfbc5['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 4],
                                          y[cal_blocks != 4])

n_iter = 5
base_score, score_drops = get_score_importances(r2_score, X, y, n_iter=n_iter)

# Additional importance estimates that holistically consider the impact of
# permuting all the layers from a given sensor
texture_labs = [
    'value', 'contrast', 'correlation', 'dissimilarity', 'entropy',
    'homogeneity', 'mean', 'second_moment', 'variance'
]
texture_labs_alt = [
    'enlee', 'cont', 'corr', 'diss', 'ent', 'hom', 'mean', 's_m_', 'var'
]
texture_labs_display = [
    'value', 'contrast', 'correlation', 'dissimilarity', 'entropy',
    'homogeneity', 'mean', 'second moment', 'variance'
]
# (excerpt begins inside the per-epoch training loop)
    # train_summ = sess.run(performance_summaries,
    #                       feed_dict={tf_loss_ph: outs[1],
    #                                  tf_accuracy_ph: outs[2]})
    # train_writer.add_summary(train_summ, epoch)
    # val_summ = sess.run(performance_summaries,
    #                     feed_dict={tf_loss_ph: cost, tf_accuracy_ph: acc})
    # val_writer.add_summary(val_summ, epoch)

    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(
            cost_val[-(FLAGS.early_stopping + 1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

print("Running train feature importance ...")
base_score, score_decreases = get_score_importances(score_train,
                                                    raw_features.toarray(),
                                                    y_train)
mean_feat_imp = np.mean(score_decreases, axis=0)
std_feat_imp = np.std(score_decreases, axis=0)
feat_imp_stats = pd.DataFrame(columns=data_cols,
                              data=[mean_feat_imp, std_feat_imp])
feat_imp_stats.to_csv(log_dir + '/train/feat_imp.csv', index=False)

print("Running validation feature importance ...")
base_score, score_decreases = get_score_importances(score_val,
                                                    raw_features.toarray(),
                                                    y_val)
mean_feat_imp = np.mean(score_decreases, axis=0)
std_feat_imp = np.std(score_decreases, axis=0)
feat_imp_stats = pd.DataFrame(columns=data_cols,
                              data=[mean_feat_imp, std_feat_imp])
print("*** optimal hyperparameters ***") for k, v in sorted(parameters.iteritems()): print(str(k) + " = " + str(v)) if "model" in cfg: resvm_train(cfg) elif "eli" in cfg: binary_labels = read_binary_labels(cfg["data"], " ", cfg["pos"]) # all data true_labels = [binary_labels[x] for x in range(len(binary_labels))] def score(X, y): # or cfg['data'] cfg, binary_labels dump_svmlight_file(X, y, cfg['data']) labels, decision_values = resvm_predict(cfg) true_labels = [binary_labels[x] for x in range(len(binary_labels))] return scorefun(true_labels, [ x > 0.5 for x in decision_values ]) # [x > 0.5 for x in decision_values] <--- gets you all predict as 1 base_score, score_decreases = get_score_importances( score, cfg['X'], cfg['y'] ) # get_score_importances(score, cfg, [x > 0.5 for x in decision_values]) <---- adapt to the scorefun function already define in the resvm model feature_importances = np.mean(score_decreases, axis=0) importance_df = pd.DataFrame({ 'feature': cfg['df'].drop(columns="tagged").columns, 'importance': feature_importances }) importance_df.to_csv('feature_importances.csv')