Example #1
        def calc(self, model, featureslice, experiment):
            if getattr(featureslice.featureset, 'df_input', None) is None:
                raise AttributeError(
                    f"No input dataframe for featureset of the experiment found. "
                    f"Set it with lb['{experiment.identifier}'].set_df(df)")

            def score_func(X, y):
                return experiment.metric(y, model.predict(X))

            X = featureslice(featureslice.idx_test[:self.n_rows]).values
            y = featureslice.featureset.target.values[
                featureslice.idx_test][:self.n_rows]
            base_score, score_decreases = get_score_importances(
                score_func,
                X,
                y,
                n_iter=self.n_iter,
                random_state=self.random_state)
            feature_importances = np.mean(score_decreases, axis=0)
            return {
                name: imp
                for name, imp in zip(featureslice.columns, feature_importances)
            }
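Note: the method above wraps the core eli5 call inside a leaderboard plugin. A minimal, self-contained sketch of the same get_score_importances pattern (the synthetic data and Ridge model here are assumptions for illustration only):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from eli5.permutation_importance import get_score_importances

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
model = Ridge().fit(X, y)

def score_func(X, y):
    return r2_score(y, model.predict(X))

base_score, score_decreases = get_score_importances(score_func, X, y, n_iter=3, random_state=0)
importances = np.mean(score_decreases, axis=0)  # one mean score decrease per column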
Example #2
    def __init__(self, data, pr, distanceAnalysis=False, exceptedColumns=None):
        '''
        Plots and saves a feature importance plot using ELI5 permutation importance and accuracy.

        :param data: pandas dataframe where each row represents a sample
        :param pr: predictor of the ML system; pr.resultColumn names the column that contains the actual results
        :param distanceAnalysis: if set to True, distances are used as the measure of result correctness
        :param exceptedColumns: columns excluded from encoding
        '''
        resultColumnName = pr.resultColumn
        self.pr = pr
        self.distanceAnalysis = distanceAnalysis
        data = self.pr.encode(data, exceptedColumns=exceptedColumns)
        X = data
        y = data[resultColumnName]  # target column, e.g. price range
        if distanceAnalysis:
            self.pr.returnDistanceOfClass = True
        else:
            X = X.drop([resultColumnName], axis=1)  # independent columns

        base_score, score_decreases = get_score_importances(
            self.score, np.array(X), y)
        feature_importances = np.mean(score_decreases, axis=0)

        feature_importance_dict = {}
        for i, feature_name in enumerate(X.columns):
            feature_importance_dict[feature_name] = feature_importances[i]
        print(
            dict(
                sorted(feature_importance_dict.items(),
                       key=lambda x: x[1],
                       reverse=True)[:4]))
        self.f_importances(feature_importance_dict, resultColumnName)
Example #3
    def _get_score_importances(self, score_func, X, y):
        return get_score_importances(score_func,
                                     X,
                                     y,
                                     n_iter=self.n_iter,
                                     random_state=self.rng_,
                                     n_jobs=self.n_jobs)
Example #4
def test_get_feature_importances(boston_train):
    X, y, feat_names = boston_train
    svr = SVR(C=20, gamma='auto').fit(X, y)
    score, importances = get_score_importances(svr.score, X, y)
    assert score > 0.7
    importances = dict(zip(feat_names, np.mean(importances, axis=0)))
    print(score)
    print(importances)
    assert importances['AGE'] > importances['NOX']
    assert importances['B'] > importances['CHAS']
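Note: the boston_train fixture above depends on the Boston housing data, which was removed in scikit-learn 1.2. A hypothetical replacement fixture using the California housing data (the feature names differ, so the assertions above would need new feature pairs):

from sklearn.datasets import fetch_california_housing

def california_train(n=500):
    # same (X, y, feature_names) shape as the boston_train fixture
    data = fetch_california_housing()
    return data.data[:n], data.target[:n], list(data.feature_names)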
Example #5
def my_feature_importance(my_pipeline, accuracy_scorer, X, y):

    try:
        return _get_feature_importances(my_pipeline.named_steps['clf'])
    except Exception:  # bare except narrowed; fall back to permutation importance

        def score(X, y):
            return accuracy_scorer(my_pipeline, X, y)

        base_score, score_decreases = get_score_importances(score,
                                                            X,
                                                            y,
                                                            n_iter=5)
        feature_importances = np.mean(score_decreases, axis=0)

        return feature_importances
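Note: eli5 also ships a scikit-learn-style wrapper that runs the same permutation loop; a minimal sketch with an assumed toy classifier:

from sklearn.datasets import make_classification
from sklearn.svm import SVC
from eli5.sklearn import PermutationImportance

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
clf = SVC(gamma='auto').fit(X, y)
perm = PermutationImportance(clf, n_iter=5, random_state=0).fit(X, y)
print(perm.feature_importances_)  # mean score decrease per feature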
Example #6
def VIP():
    # ... load data, define score function
    dn = np.array([
        "transbig", "unt", "upp", "mainz", "nki", "GSE6532", "GEO", "TCGA753",
        "TCGA500", "UK", "HEL", "TCGA1093"
    ])
    for i in range(12):
        ddata = pd.read_csv("data/" + dn[i] + ".csv")
        ddata = ddata.to_numpy("float32")
        n, p = ddata.shape
        X_in = ddata[:, :-2]
        Y_in = ddata[:, (p - 2):p]

        base_score, score_decreases = get_score_importances(score, X_in, Y_in)

        if i == 0:
            feature_importances = np.mean(score_decreases, axis=0)
        else:
            feature_importances = np.vstack(
                (feature_importances, np.mean(score_decreases, axis=0)))

    np.savetxt("vip.csv", feature_importances, delimiter=",")
    print("OK")
Example #7
def run_ELI5(model, X_train, X_test, X_val, y_train, y_test, y_val):
    X_train2 = np.array(X_train).astype(float)  # np.float was removed in NumPy 1.24
    X_test2 = np.array(X_test).astype(float)
    X_val2 = np.array(X_val).astype(float)

    y_train2 = np.array(y_train).astype(float)
    y_test2 = np.array(y_test).astype(float)
    y_val2 = np.array(y_val).astype(float)

    score = ROC_PR.ROC_Score(model, X_val2, y_val2)
    score_test = ROC_PR.ROC_Score(model, X_test2, y_test2)
    # score_for_each_drug = ROC_PR.ROC(model, X_test2, y_test2, ("LRCN" + "BO_delete"), True)
    spec_recall, prec_recall = ROC_PR.PR(model, X_test2, y_test2)

    print('area under ROC curve for val:', score)
    print('area under ROC curve for test:', score_test)
    print("recall at 95 spec: ", spec_recall)
    print("precision recall: ", prec_recall)

    def score_func(X_test, y_test):  # renamed: `score` above already holds the validation AUC
        return ROC_PR.ROC_Score(model, X_test, y_test)

    from eli5.permutation_importance import get_score_importances

    feature_score = []

    for i in range(len(X_test2[0])):
        lst = [i]  # permute only column i on this pass
        base_score, score_decreases = get_score_importances(
            score_func, X_test2, y_test2, n_iter=1, columns_to_shuffle=lst)
        feature_importances = np.mean(score_decreases, axis=0)
        feature_score.append(feature_importances[0])
        print(i)

    print(feature_score)
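Note: the per-column loop above can be collapsed. With the default columns_to_shuffle, one call permutes every column once per iteration; a self-contained sketch with an assumed toy model:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from eli5.permutation_importance import get_score_importances

X, y = make_classification(n_samples=300, n_features=6, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X, y)

def score(X, y):
    return clf.score(X, y)

base_score, score_decreases = get_score_importances(score, X, y, n_iter=1, random_state=0)
feature_score = list(np.mean(score_decreases, axis=0))  # one pass, all columns
print(feature_score)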
Example #8
        # First of all, cut off the last observation from the reference dict
        reference_dict = model.data_preprocessor.reference_dict
        for key in reference_dict.keys():
            reference_dict[key] = reference_dict[key][:-1]
        # Second, prepare train set appropriately
        max_dbn = train['date_block_num'].max()
        score_set = train.query('date_block_num == @max_dbn')
        score_y = y_train.loc[score_set.index]
    else:
        score_set, score_y = test, y_test

    cols = train.columns.tolist()

    # This will be used to keep the preprocessing right. I will then use some dirty tricks to permute embedding columns.
    constant_data = score_set[['item_id', 'item_category_id', 'shop_id']]

    score_func = get_score_function(model, cols, constant_data, evaluate_embeddings)

    base_score, score_decreases = get_score_importances(
           score_func, score_set.to_numpy(), score_y.to_numpy(), random_state=234234, n_iter=1
    )

    feature_importances = np.mean(score_decreases, axis=0)

    # Sort ascending: a negative score decrease means the error grew when the feature
    # was permuted, and for RMSE a larger error marks an important feature.
    result = sorted(zip(feature_importances, cols), key=lambda x: x[0])

    for result_row in result:
        print(result_row)
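Note on the sign convention above: eli5 reports base_score minus the permuted score, so with an error metric such as RMSE an important feature yields a negative "decrease". Negating the error restores the usual higher-is-more-important reading; a small self-contained sketch:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from eli5.permutation_importance import get_score_importances

X, y = make_regression(n_samples=200, n_features=4, random_state=1)
model = LinearRegression().fit(X, y)

def neg_rmse(X, y):
    # negated error: higher is better, so important features get positive decreases
    return -np.sqrt(mean_squared_error(y, model.predict(X)))

_, decreases = get_score_importances(neg_rmse, X, y, random_state=1)
print(np.mean(decreases, axis=0))  # now sortable descending as usual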
Example #9
X = np.hstack((X, continents))

# load random forest algorithm
rf = joblib.load('/disk/scratch/local.2/dmilodow/pantrop_AGB_LUH/saved_algorithms/rfbc_mean.pkl')
rf1=rf['rf1']
rf2=rf['rf2']

# Permutation Importance
# - define the score used to underpin importance values
def r2_score(X, y):
    y_rfbc = useful.rfbc_predict(rf1, rf2, X)
    _, _, r, _, _ = stats.linregress(y, y_rfbc)
    return r**2
n_iter = 5
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=23)
base_score, score_drops = get_score_importances(r2_score, X_test, y_test, n_iter=n_iter)
labels = []
for ii in range(1,X.shape[1]):
    labels.append('PC%s' % str(ii).zfill(2))
labels.append('Region')

var_labels = labels*n_iter
var_imp = np.zeros(n_iter*len(labels))
for ii, drops_iter in enumerate(score_drops):
    var_imp[ii * len(labels):(ii + 1) * len(labels)] = drops_iter
imp_df = pd.DataFrame(data = {'variable': var_labels,
                              'permutation_importance': var_imp})

fig, axis = plt.subplots(nrows=1, ncols=1, figsize=[5, 8], sharex=True)
sns.barplot(x='permutation_importance', y='variable', ci='sd', data=imp_df, ax=axis, color='0.5')
axis.set_ylabel('Principal component')
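Note: the manual var_labels/var_imp bookkeeping above can also be written as a melt; a sketch on stand-in data:

import numpy as np
import pandas as pd

n_iter, labels = 5, ['PC01', 'PC02', 'Region']  # stand-ins for the example's labels
score_drops = np.random.rand(n_iter, len(labels))  # stand-in for get_score_importances output

imp_df = (pd.DataFrame(score_drops, columns=labels)
            .melt(var_name='variable', value_name='permutation_importance'))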
Example #10
def main():
    print('Total memory allocated: ' + str(torch.cuda.memory_allocated()))
    n_samples = np.random.randint(100, 100000)
    # n_samples = 100000
    print('Number of Samples in DS: ' + str(n_samples))
    n_feats = np.random.choice([10, 20, 50, 100, 200, 500], 1).item()
    # n_feats = 500
    n_clusters = np.random.randint(2, 14)
    sep = 5 * np.random.random_sample()
    hyper = np.random.choice([True, False], 1).item()

    # these arguments are keyword-only in current scikit-learn
    X, y = make_classification(n_samples, n_feats, n_informative=n_feats // 2,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=n_clusters, weights=None,
                               flip_y=0, class_sep=sep, hypercube=True,
                               shift=0, scale=1, shuffle=hyper)
    X, x_test, y, y_test = train_test_split(X, y, test_size=0.2)

    btchsz = [len(X)] * 10 + [25000, 20000, 10000, 5000]
    params = [
        5, 10, 25, 50, 100, 500, 1000, 2000, 5000, 10000, 25000, 30000, 35000,
        40000
    ]

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    x_test = scaler.transform(x_test)

    trainset = data_loader(X, y)
    testset = data_loader(x_test, y_test)

    if torch.cuda.is_available():
        print('Using device:',
              torch.cuda.get_device_name(torch.cuda.current_device()))

    no_epochs = 5

    accs = []
    infl = []
    permute = []

    for i in range(len(params)):
        start_time = time.time()
        torch.cuda.empty_cache()
        model = EVINet.EVINet(n_feats, params[i], batch_size=btchsz[i])
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        trainloader = DataLoader(trainset,
                                 batch_size=btchsz[i],
                                 shuffle=False)

        testloader = DataLoader(testset,
                                batch_size=btchsz[i],
                                shuffle=False)

        scaler = torch.cuda.amp.GradScaler()

        for epoch in range(no_epochs):
            total_train_loss = 0
            for batchidx, (train_data,
                           train_targets) in enumerate(trainloader):

                model.train()

                targets_hot = torch.nn.functional.one_hot(train_targets, 2)

                optimizer.zero_grad()

                with torch.cuda.amp.autocast(enabled=False):
                    pred, sig = model(train_data)

                loss = model.batch_loss(pred, sig, targets_hot.to('cuda:1'))
                total_train_loss += loss.item()

                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            if epoch != 0:  # `epoch % 1 == 0` is always true, so log every epoch after the first
                print('Epoch: ' + str(epoch) + '/' + str(no_epochs) +
                      ', Train Loss: ' + str(total_train_loss))
        print("Total Train Time: " + str(time.time() - start_time))
        # validation
        model.eval()
        test_acc = model.score(x_test, y_test)
        accs.append(test_acc)
        print('Test Accuracy: ' + str(test_acc))

        inform_feats = set(range(n_feats // 2))

        model.zero_grad()
        del train_data, train_targets, loss, optimizer
        torch.cuda.empty_cache()
        eqn_5_smooth = evi_influence_batch.influence(
            X,
            y,
            x_test,
            y_test,
            model,
            model.fullyCon2.mean_fc.weight,
            btchsz=btchsz[i])
        eqn_5_smooth = np.mean(normalize(np.vstack(eqn_5_smooth)), axis=0)
        loss_acc = len(
            inform_feats.intersection(
                set(np.argsort(
                    abs(eqn_5_smooth))[::-1][:n_feats // 2]))) / (n_feats // 2)
        infl.append(loss_acc)

        start_time = time.time()
        base_score, score_decreases = get_score_importances(
            model.score, x_test, y_test)
        perm_importances = np.mean(score_decreases, axis=0)
        print("Total Permutation Time: " + str(time.time() - start_time))
        perm_acc = len(
            inform_feats.intersection(
                set(np.argsort(
                    abs(perm_importances))[::-1][:n_feats //
                                                 2]))) / (n_feats // 2)
        permute.append(perm_acc)

        print('Inner Loop ' + str(i + 1) + '/' + str(len(params)) +
              ' Finished')
        del model
        gc.collect()
        torch.cuda.empty_cache()

    return np.asarray(accs), np.asarray(infl), np.asarray(permute)
Example #11
def feature_list_generation(train_data_path, test_data_path):

    input_size = 1412  # original feature size
    # hidden_size = int(input_size/3)
    hidden_size = 300
    output_size = 2
    num_epochs = 50
    # lr = 0.00001
    lr = 0.0001
    batch_size = 32

    ### Load train dataset
    with open('train_list.pkl', 'rb') as f_train:
        train_list = pickle.load(f_train)
    with open('training_label.pickle', 'rb') as f_label_train:
        train_labels = pickle.load(f_label_train)

    train_list = train_list[0:200000]
    print(len(train_list))
    feature_train, target_train = load_data(train_list, train_labels,
                                            train_data_path)
    ## Load test dataset
    with open('test_list.pkl', 'rb') as f_test:
        test_list = pickle.load(f_test)
    with open('test_label.pickle', 'rb') as f_label_test:
        test_labels = pickle.load(f_label_test)

    # test_list = test_list[0:10000]
    print(len(test_list))
    feature_test, target_test = load_data(test_list, test_labels,
                                          test_data_path)
    # model with skorch
    # convert model based on pytorch to sklearn
    net = NeuralNetClassifier(
        mlp_model(input_size, hidden_size, output_size),
        max_epochs=num_epochs,
        lr=lr,
        # Shuffle training data on each epoch
        iterator_train__shuffle=True,
        batch_size=batch_size,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.SGD,
        optimizer__momentum=0.9,
        optimizer__weight_decay=0.00001)
    model = net.fit(feature_train, target_train)
    print(model.score(feature_test, target_test))

    # define a score function. using accuracy
    def score(feature_test, target_test):
        y_pred = net.predict(feature_test)
        return accuracy_score(target_test, y_pred)

    # This function takes only numpy arrays as inputs
    # base_score = score_func(feature_train, target_train)
    base_score, score_decreases = get_score_importances(score,
                                                        feature_test,
                                                        target_test,
                                                        n_iter=10)
    feature_importances = np.mean(score_decreases, axis=0)
    feature_importance_dict = {}
    for i in range(input_size):  # one entry per original feature
        feature_importance_dict[str(i)] = feature_importances[i]
    permu_features = dict(
        sorted(feature_importance_dict.items(),
               key=lambda x: x[1],
               reverse=True))
    # print(permu_features)

    with open('permu_feature_importance.json', 'w') as fp:
        json.dump(permu_features, fp)
Example #12

from sklearn.linear_model import LassoCV

GLM = LassoCV()
GLM.fit(data_train, df_train[target])
GLM.score(data_test, df_test[target])

from eli5.permutation_importance import get_score_importances
from sklearn.metrics import r2_score  # needed by the score function below

# ... load data, define score function
def score(X, y):
    y_pred = GLM.predict(X)
    return r2_score(y, y_pred)

base_score, score_decreases = get_score_importances(score, data_test, df_test[target])
GLM_feature_importances = abs(np.mean(score_decreases, axis=0))


importance_plot(feats, GLM_feature_importances)


# from group_lasso import GroupLasso
# from sklearn.metrics import r2_score
# from eli5.sklearn import PermutationImportance
# 
# GL = GroupLasso()
# GL.fit(data_train, df_train[target])
Example #13
    def feature_selection(self, dt: pd.DataFrame,
                          params: dict = None,
                          drop_list: list = None,
                          perm: bool = True,
                          use_ext: bool = False) -> pd.DataFrame:
        """
        Performs feature selection for given data frame.

        Refresh object`s features and categorical features, than perform cross-validation, calculate importances and
        do cross-validation again for top selected features. Then plot importances.

        Args:
            dt (DataFrame): data frame to feature selection on
            params (dict): LGBMClassifier parameters dictionary
            drop_list (list): list to explicitly drop feature before selection
            perm (bool): use permutation importance flag
            use_ext (bool): use external (DMX) features flag

        Returns:
            res (DataFrame): feature selection results
        """
        if drop_list is None:
            drop_list = []

        if params is None:
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'num_leaves': 16,
                'is_unbalance': True,
                # 'max_depth': 4,
                'learning_rate': 0.05,
                'feature_fraction': 0.7,
                'bagging_fraction': 0.8,
                'bagging_freq': 5,
                'verbose': 1,
                'random_state': 321
            }

        if use_ext:
            for table in self.ext_features.keys():
                drop_list += list(set(self.full_ext_features[table]) - set(self.ext_features[table]))

        features = dt.columns[~dt.columns.isin(list(self.meta_vars) + ['label'])].tolist()
        features = list(set(features) - set(drop_list))
        cat_features = [feature for feature in features if feature in self.all_cats]

        num_boost_round = 100

        lgb_train = lgb.Dataset(dt[dt['label'] >= 0][features].values,
                                label=dt[dt['label'] >= 0]['label'].values,
                                feature_name=features,
                                categorical_feature=cat_features,
                                free_raw_data=False)

        bst = lgb.train(params, lgb_train, num_boost_round=num_boost_round)

        def score(x, y):
            return roc_auc_score(y, bst.predict(x))

        cv = lgb.cv(params, lgb_train, num_boost_round=num_boost_round)
        log('Score before feature selection:')
        log('Max CV ROC AUC score: {}'.format(max(cv['auc-mean'])))
        log('Min CV ROC AUC score: {}'.format(min(cv['auc-mean'])))
        log('Average CV ROC AUC score: {}\n'.format(sum(cv['auc-mean']) / len(cv['auc-mean'])))

        pred = bst.predict(dt[features].values)
        sns.histplot(pred, kde=True)  # distplot is deprecated in recent seaborn
        plt.show()
        if perm:
            _, score_decreases = get_score_importances(score, dt[features].values, dt['label'].values)
            feature_importances = np.mean(score_decreases, axis=0)
        else:
            feature_importances = bst.feature_importance(importance_type='gain')

        res = pd.DataFrame({'name': features, 'fi': feature_importances})

        self.top_features = res.sort_values(by='fi', ascending=False).head(self.top_amount).name.values.tolist()
        self.top_cats = [feature for feature in self.top_features if feature in self.all_cats]

        lgb_train = lgb.Dataset(dt[dt['label'] >= 0][self.top_features].values,
                                label=dt[dt['label'] >= 0]['label'].values,
                                feature_name=self.top_features,
                                categorical_feature=self.top_cats,
                                free_raw_data=False)

        cv = lgb.cv(params, lgb_train, num_boost_round=num_boost_round)
        log('Score on top {} selected features:'.format(self.top_amount))
        log('Max CV ROC AUC score: {}'.format(max(cv['auc-mean'])))
        log('Min CV ROC AUC score: {}'.format(min(cv['auc-mean'])))
        log('Average CV ROC AUC score: {}\n'.format(sum(cv['auc-mean']) / len(cv['auc-mean'])))

        res = res.sort_values(by='fi', ascending=False).head(self.top_amount)
        sns.barplot(x=res.fi, y=res.name)  # seaborn >= 0.12 requires keyword arguments here
        plt.show()
        return res
Example #14
    def kfold_model_perm(self,
                         X,
                         y,
                         model,
                         params,
                         cols,
                         indices,
                         fit_function,
                         predict_function,
                         score_function,
                         folds=5,
                         verbose=0):

        kfold = KFold(n_splits=folds, shuffle=True)  # shuffle is keyword-only in current scikit-learn
        best_model_output = pd.DataFrame()
        fold_num = 0
        perm_df = pd.DataFrame()
        for trn, test in kfold.split(X):
            if verbose >= 2:
                print('Working on fold', fold_num)
            X_trn = X[trn, :]
            X_test = X[test, :]
            y_trn = y[trn]
            y_test = y[test]
            model = fit_function(model=model, X=X_trn, y=y_trn)
            preds = predict_function(model=model, X=X_test)
            score = score_function(y=y_test, preds=preds)  # score the held-out fold, not the full target
            df_dict = {
                'pred': preds,
                'actual': y_test,
                'test_ind': test,
                'fold_num':
                pd.Series([fold_num for i in range(y_test.shape[0])]),
                'score': pd.Series([score for i in range(y_test.shape[0])])
            }

            def perm_score(X, y):
                # use the permuted X that eli5 passes in (the original ignored it,
                # so every permutation returned the same score)
                preds = predict_function(model=model, X=X)
                return score_function(y=y, preds=preds)

            # evaluate permutation importance on the held-out fold
            base_score, score_decreases = get_score_importances(
                perm_score, X_test, y_test)
            feature_importances = np.mean(score_decreases, axis=0)

            best_model_output_temp = pd.DataFrame(df_dict)
            best_model_output = pd.concat(
                [best_model_output, best_model_output_temp])

            perm_df_temp = pd.DataFrame({
                'importance':
                feature_importances,
                'feature':
                cols,
                'index':
                indices,
                'fold': [fold_num for i in range(len(cols))]
            })
            perm_df_temp = perm_df_temp.sort_values('importance',
                                                    ascending=False)

            perm_df = pd.concat([perm_df, perm_df_temp])

            fold_num += 1

        return best_model_output, perm_df
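Note: perm_df above holds one importance per feature per fold; the natural next step is to aggregate across folds. A self-contained sketch on stand-in data:

import pandas as pd

perm_df = pd.DataFrame({  # stand-in for the method's output
    'feature': ['a', 'b', 'a', 'b'],
    'importance': [0.10, 0.02, 0.08, 0.03],
    'fold': [0, 0, 1, 1],
})
summary = (perm_df.groupby('feature')['importance']
                  .agg(['mean', 'std'])
                  .sort_values('mean', ascending=False))
print(summary)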
Example #15
gbm = lgb.train(param, d_tr, num_boost_round=1000,
                valid_sets=[d_tr, d_val],
                evals_result=eval_result,
                verbose_eval=10, early_stopping_rounds=50)

ax = lgb.plot_metric(eval_result, metric='l2')
#plt.show()

def score(X, y):
    y_pred = gbm.predict(X)
    return np.sqrt(mean_squared_error(y, y_pred))

import eli5
from eli5.permutation_importance import get_score_importances
base_score, score_decreases = get_score_importances(score, trs.to_numpy(), salepr.to_numpy())
feature_importances = np.mean(score_decreases, axis=0)
fe_dic = {}

for i, fea_n in enumerate(trs.columns):
    fe_dic[fea_n] = feature_importances[i]

print(sorted(fe_dic.items(), key=lambda x: x[1]))

'''
# Feature importance with GBM
m = gbm.feature_name()
n = gbm.feature_importance()
a = zip(n, m)
print(m, sorted(a, reverse=True))
'''
Example #16
    def fit(self):
        base_score, score_decreases = get_score_importances(
            self.metric_dict[self.metric], self.feature, self.target)

        self.weight_ = np.mean(score_decreases, axis=0)
        return self
Example #17
def train_nn(csv):
    df = pd.read_csv(csv)

    ind_train = df[df.year.isin(range(1980, 2000))].index  # 1980 to 1999
    ind_test = df[df.year.isin(range(2000, 2020))].index  # 2000 to 2019

    df_train = df.loc[ind_train, :].copy().reset_index(drop=True)
    df_test = df.loc[ind_test, :].copy().reset_index(drop=True)

    feats_not_to_use = [
        "permno", "year", "month", "next_ret", "pe_op_dil", "DATE", "COMNAM",
        "TICKER", "SICCD", "SECTOR"
    ]
    feats = [feat for feat in df.columns if feat not in feats_not_to_use]
    target = 'next_ret'
    """
    Data Normalization
    """
    def normalize(series):
        return (series - series.mean(axis=0)) / series.std(axis=0)

    mean = df_train[feats].mean(axis=0)
    df_train[feats] = df_train[feats].fillna(mean)

    data_train = df_train[feats].apply(normalize).values

    mean = df_test[feats].mean(axis=0)
    df_test[feats] = df_test[feats].fillna(mean)

    data_test = df_test[feats].apply(normalize).values
    """
    Create TensorFlow Train and Test Datasets
    """

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (data_train, df_train[target].values))
    test_dataset = tf.data.Dataset.from_tensor_slices(
        (data_test, df_test[target].values))
    """
    Constructing the Model
    """

    nfeats = len(feats)

    # Geometric pyramid rule (Masters 1993)
    nhid = [32, 16, 8, 4, 2]

    def build_models():
        models = []
        layers_stack = [
            layers.Dense(nhid[i], activation="tanh")
            for i, _ in enumerate(nhid)
        ]

        for i in range(1, 6):
            layers_arr = [
                layers.Dense(nhid[0], activation='tanh', input_shape=[nfeats])
            ]
            for j in range(1, i):
                layers_arr.append(layers_stack[j])
            layers_arr.append(layers.Dense(1))

            model = keras.Sequential(layers_arr)
            optimizer = tf.keras.optimizers.SGD(0.005)
            model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])

            models.append(model)

        return models

    NN1, NN2, NN3, NN4, NN5 = build_models()

    # Initialize model weights to random values
    NN1_weights = NN1.weights
    NN2_weights = NN2.weights
    NN3_weights = NN3.weights
    NN4_weights = NN4.weights
    NN5_weights = NN5.weights

    np.random.seed(12345)

    weights_arr = []

    for i in range(1, 6):

        w = [np.random.uniform(-0.01, 0.01, size=(nfeats, nhid[0]))]

        for j in range(0, i):
            w.append(np.random.uniform(-0.01, 0.01, size=nhid[j]))
            if j == i - 1:
                w.append(np.random.uniform(-0.01, 0.01, size=(nhid[j], 1)))
            else:
                w.append(
                    np.random.uniform(-0.01, 0.01,
                                      size=(nhid[j], nhid[j + 1])))

        w.append(np.random.uniform(-0.01, 0.01, size=1))
        weights_arr.append(w)

    NN1.set_weights(weights_arr[0])
    NN2.set_weights(weights_arr[1])
    NN3.set_weights(weights_arr[2])
    NN4.set_weights(weights_arr[3])
    NN5.set_weights(weights_arr[4])
    """
    Inspecting the Model
    """

    NN1.summary()
    NN2.summary()
    NN3.summary()
    NN4.summary()
    NN5.summary()
    """
    Training the Model
    """

    NN1.fit(train_dataset.batch(1), epochs=1)
    NN2.fit(train_dataset.batch(1), epochs=1)
    NN3.fit(train_dataset.batch(1), epochs=1)
    NN4.fit(train_dataset.batch(1), epochs=1)
    NN5.fit(train_dataset.batch(1), epochs=1)

    # Trained model weights
    NN1_weights = NN1.weights
    NN2_weights = NN2.weights
    NN3_weights = NN3.weights
    NN4_weights = NN4.weights
    NN5_weights = NN5.weights

    # """
    # Make Predictions
    # """

    # Larger batch size (100) for faster predictions
    NN1_test_predictions = NN1.predict(test_dataset.batch(100)).flatten()
    NN2_test_predictions = NN2.predict(test_dataset.batch(100)).flatten()
    NN3_test_predictions = NN3.predict(test_dataset.batch(100)).flatten()
    NN4_test_predictions = NN4.predict(test_dataset.batch(100)).flatten()
    NN5_test_predictions = NN5.predict(test_dataset.batch(100)).flatten()

    # """
    # Model Evaluation
    # """

    def R2(y, y_hat):
        # out-of-sample R^2 against a zero forecast (no demeaning), common in return prediction
        R2 = 1 - np.sum((y - y_hat)**2) / np.sum(y**2)
        return R2

    NN1_R2_Val = R2(df_test[target].values, NN1_test_predictions)
    NN2_R2_Val = R2(df_test[target].values, NN2_test_predictions)
    NN3_R2_Val = R2(df_test[target].values, NN3_test_predictions)
    NN4_R2_Val = R2(df_test[target].values, NN4_test_predictions)
    NN5_R2_Val = R2(df_test[target].values, NN5_test_predictions)

    all_R2_Val = [NN1_R2_Val, NN2_R2_Val, NN3_R2_Val, NN4_R2_Val, NN5_R2_Val]

    def NN1_score(X, y):
        y_pred = NN1.predict(X)
        return R2(y, y_pred)

    def NN2_score(X, y):
        y_pred = NN2.predict(X)
        return R2(y, y_pred)

    def NN3_score(X, y):
        y_pred = NN3.predict(X)
        return R2(y, y_pred)

    def NN4_score(X, y):
        y_pred = NN4.predict(X)
        return R2(y, y_pred)

    def NN5_score(X, y):
        y_pred = NN5.predict(X)
        return R2(y, y_pred)

    _, NN1_score_decreases = get_score_importances(NN1_score, data_test,
                                                   df_test[target].values)
    _, NN2_score_decreases = get_score_importances(NN2_score, data_test,
                                                   df_test[target].values)
    _, NN3_score_decreases = get_score_importances(NN3_score, data_test,
                                                   df_test[target].values)
    _, NN4_score_decreases = get_score_importances(NN4_score, data_test,
                                                   df_test[target].values)
    _, NN5_score_decreases = get_score_importances(NN5_score, data_test,
                                                   df_test[target].values)

    NN1_feat_imps = np.mean(NN1_score_decreases, axis=0)
    NN2_feat_imps = np.mean(NN2_score_decreases, axis=0)
    NN3_feat_imps = np.mean(NN3_score_decreases, axis=0)
    NN4_feat_imps = np.mean(NN4_score_decreases, axis=0)
    NN5_feat_imps = np.mean(NN5_score_decreases, axis=0)

    all_importances = []

    for feat_imps in [
            NN1_feat_imps, NN2_feat_imps, NN3_feat_imps, NN4_feat_imps,
            NN5_feat_imps
    ]:
        importances = {}

        for index, feat_imp in enumerate(feat_imps):
            importances[feats[index]] = feat_imp

        all_importances.append(importances)

    return all_importances, all_R2_Val
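Note: the five NNx_score functions above differ only in the model they close over; a hedged factory sketch removes the duplication (the stub model is an assumption for illustration):

import numpy as np

def R2(y, y_hat):
    return 1 - np.sum((y - y_hat) ** 2) / np.sum(y ** 2)

def make_score(model):
    # one closure per model replaces the five copy-pasted NNx_score defs
    def score(X, y):
        return R2(y, model.predict(X).flatten())
    return score

class _StubModel:  # stand-in for NN1..NN5
    def predict(self, X):
        return np.zeros(len(X))

print(make_score(_StubModel())(np.zeros((3, 2)), np.ones(3)))  # -> 0.0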
Example #18
def main():
    n_samples = np.random.randint(100, 100000)
    # n_samples = 100000
    # n_feats = 500
    print('Number of Samples in DS: ' + str(n_samples))
    n_feats = np.random.choice([10, 20, 50, 100, 200, 500], 1).item()
    n_clusters = np.random.randint(2, 14)
    sep = 5 * np.random.random_sample()
    hyper = np.random.choice([True, False], 1).item()

    # these arguments are keyword-only in current scikit-learn
    X, y = make_classification(n_samples, n_feats, n_informative=n_feats // 2,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=n_clusters, weights=None,
                               flip_y=0, class_sep=sep, hypercube=True,
                               shift=0, scale=1, shuffle=hyper)
    X, x_test, y, y_test = train_test_split(X, y, test_size=0.2)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    x_test = scaler.transform(x_test)
    device = 'cuda:0'
    if torch.cuda.is_available():
        print('Using device:',
              torch.cuda.get_device_name(torch.cuda.current_device()))

    no_epochs = 100
    btchsz = [len(X)] * 11  # one per entry in params below; the original 10-element list would IndexError on the last iteration
    params = [5, 10, 25, 50, 100, 500, 1000, 2000, 5000, 10000, 25000]

    trainset = data_loader(X, y)
    testset = data_loader(x_test, y_test)

    accs = []
    infl = []
    permute = []

    for i in range(len(params)):
        start_time = time.time()
        torch.cuda.empty_cache()
        model = Vanilla(n_feats, params[i],
                        batch_size=btchsz[i])  # .half()
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        if device:
            model.to(device)  # the inputs below go to cuda:0, so the model must live there too
            print('Moved to GPU')
        for epoch in range(no_epochs):
            total_train_loss = 0
            model.train()
            optimizer.zero_grad()
            pred = model(torch.from_numpy(X).float().to('cuda:0'))
            loss = criterion(pred, torch.from_numpy(y).long().to('cuda:0'))
            total_train_loss += loss.item()
            loss.backward()  # backprop was missing; optimizer.step() is a no-op without gradients
            optimizer.step()
            if epoch != 0 and (epoch % 25 == 0):
                print('Epoch: ' + str(epoch + 1) + '/' + str(no_epochs) +
                      ', Train Loss: ' + str(total_train_loss))
        print("Total Train Time: " + str(time.time() - start_time))
        # validation
        model.eval()
        image_test = torch.from_numpy(x_test).float().to(device)
        label_test = torch.from_numpy(y_test).long().to(device)

        pred_test = model(image_test)
        test_acc = model.score(x_test, y_test)
        accs.append(test_acc)

        inform_feats = set(range(n_feats // 2))

        eqn_5_smooth = smoothInfluence.influence(
            torch.from_numpy(X).float().to('cuda:0'),
            torch.from_numpy(y).long().to('cuda:0'), image_test, model,
            model.linear_2.weight)
        eqn_5_smooth = np.mean(normalize(np.vstack(eqn_5_smooth)), axis=0)
        loss_acc = len(
            inform_feats.intersection(
                set(np.argsort(
                    abs(eqn_5_smooth))[::-1][:n_feats // 2]))) / (n_feats // 2)
        print(loss_acc)
        infl.append(loss_acc)

        base_score, score_decreases = get_score_importances(
            model.score, x_test, y_test)
        perm_importances = np.mean(score_decreases, axis=0)

        perm_acc = len(
            inform_feats.intersection(
                set(np.argsort(
                    abs(perm_importances))[::-1][:n_feats //
                                                 2]))) / (n_feats // 2)
        permute.append(perm_acc)

        print('Inner Loop ' + str(i + 1) + '/' + str(len(params)) +
              ' Finished')
    return np.asarray(accs), np.asarray(infl), np.asarray(permute)
Example #19
    rfbc4 = {}
    rfbc5 = {}
    rfbc1['rf1'], rfbc1['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 0],
                                              y[cal_blocks != 0])
    rfbc2['rf1'], rfbc2['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 1],
                                              y[cal_blocks != 1])
    rfbc3['rf1'], rfbc3['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 2],
                                              y[cal_blocks != 2])
    rfbc4['rf1'], rfbc4['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 3],
                                              y[cal_blocks != 3])
    rfbc5['rf1'], rfbc5['rf2'] = rff.rfbc_fit(rf, X[cal_blocks != 4],
                                              y[cal_blocks != 4])

    n_iter = 5
    base_score, score_drops = get_score_importances(r2_score,
                                                    X,
                                                    y,
                                                    n_iter=n_iter)

    # Additional importance estimates that holistically consider the impact of
    # permuting all the layers from a given sensor
    texture_labs = [
        'value', 'contrast', 'correlation', 'dissimilarity', 'entropy',
        'homogeneity', 'mean', 'second_moment', 'variance'
    ]
    texture_labs_alt = [
        'enlee', 'cont', 'corr', 'diss', 'ent', 'hom', 'mean', 's_m_', 'var'
    ]
    texture_labs_display = [
        'value', 'contrast', 'correlation', 'dissimilarity', 'entropy',
        'homogeneity', 'mean', 'second moment', 'variance'
    ]
Example #20
    # train_summ = sess.run(performance_summaries, feed_dict={tf_loss_ph:outs[1], tf_accuracy_ph:outs[2]})
    # train_writer.add_summary(train_summ, epoch)
    # val_summ = sess.run(performance_summaries, feed_dict={tf_loss_ph:cost, tf_accuracy_ph:acc})
    # val_writer.add_summary(val_summ, epoch)

    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(
            cost_val[-(FLAGS.early_stopping + 1):-1]):
        print("Early stopping...")
        break

print("Optimization Finished!")

print("Running train feature importance ...")
base_score, score_decreases = get_score_importances(score_train,
                                                    raw_features.toarray(),
                                                    y_train)
mean_feat_imp = np.mean(score_decreases, axis=0)
std_feat_imp = np.std(score_decreases, axis=0)
feat_imp_stats = pd.DataFrame(columns=data_cols,
                              data=[mean_feat_imp, std_feat_imp],
                              index=['mean', 'std'])  # label which row is the mean and which the std
feat_imp_stats.to_csv(log_dir + '/train/feat_imp.csv')

print("Running validation feature importance ...")
base_score, score_decreases = get_score_importances(score_val,
                                                    raw_features.toarray(),
                                                    y_val)
mean_feat_imp = np.mean(score_decreases, axis=0)
std_feat_imp = np.std(score_decreases, axis=0)
feat_imp_stats = pd.DataFrame(columns=data_cols,
                              data=[mean_feat_imp, std_feat_imp],
                              index=['mean', 'std'])
Example #21
    print("*** optimal hyperparameters ***")
    for k, v in sorted(parameters.items()):  # iteritems() was Python 2-only
        print(str(k) + " = " + str(v))

    if "model" in cfg:
        resvm_train(cfg)

elif "eli" in cfg:
    binary_labels = read_binary_labels(cfg["data"], " ",
                                       cfg["pos"])  # all data
    true_labels = [binary_labels[x] for x in range(len(binary_labels))]

    def score(X, y):  # or cfg['data'] cfg, binary_labels
        dump_svmlight_file(X, y, cfg['data'])
        labels, decision_values = resvm_predict(cfg)
        true_labels = [binary_labels[x] for x in range(len(binary_labels))]
        return scorefun(true_labels,
                        [x > 0.5 for x in decision_values])  # threshold the decision values at 0.5 to get binary predictions

    base_score, score_decreases = get_score_importances(
        score, cfg['X'], cfg['y']
    )  # adapt to the scorefun already defined in the resvm model if needed
    feature_importances = np.mean(score_decreases, axis=0)
    importance_df = pd.DataFrame({
        'feature': cfg['df'].drop(columns="tagged").columns,
        'importance': feature_importances
    })
    importance_df.to_csv('feature_importances.csv')