class TargetEncoderNSplits(BaseTransformer):
    def __init__(self, n_splits, **kwargs):
        self.k_folds = KFold(n_splits=n_splits)
        self.target_means_map = {}

    def _target_means_names(self, columns):
        confidence_rate_names = ['target_mean_{}'.format(column) for column in columns]
        return confidence_rate_names

    def _is_null_names(self, columns):
        is_null_names = ['target_mean_is_nan_{}'.format(column) for column in columns]
        return is_null_names

    def fit(self, categorical_features, target, **kwargs):
        feature_columns, target_column = categorical_features.columns, target.columns[0]

        X_target_means = []
        self.k_folds.get_n_splits(target)
        for train_index, test_index in self.k_folds.split(target):
            X_train, y_train = categorical_features.iloc[train_index], target.iloc[train_index]
            X_test, y_test = categorical_features.iloc[test_index], target.iloc[test_index]

            train = pd.concat([X_train, y_train], axis=1)
            for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
                group_object = train.groupby(column)
                train_target_means = group_object[target_column].mean(). \
                    reset_index().rename(index=str, columns={target_column: target_mean_name})

                X_test = X_test.merge(train_target_means, on=column, how='left')
            X_target_means.append(X_test)
        X_target_means = pd.concat(X_target_means, axis=0).astype(np.float32)

        for column, target_mean_name in zip(feature_columns, self._target_means_names(feature_columns)):
            group_object = X_target_means.groupby(column)
            self.target_means_map[column] = group_object[target_mean_name].mean().reset_index()

        return self

    def transform(self, categorical_features, **kwargs):
        columns = categorical_features.columns

        for column, target_mean_name, is_null_name in zip(columns,
                                                          self._target_means_names(columns),
                                                          self._is_null_names(columns)):
            categorical_features = categorical_features.merge(self.target_means_map[column],
                                                              on=column,
                                                              how='left').astype(np.float32)
            categorical_features[is_null_name] = pd.isnull(categorical_features[target_mean_name]).astype(int)
            categorical_features[target_mean_name].fillna(0, inplace=True)

        return {'numerical_features': categorical_features[self._target_means_names(columns)],
                'categorical_features': categorical_features[self._is_null_names(columns)]}

    def load(self, filepath):
        self.target_means_map = joblib.load(filepath)
        return self

    def save(self, filepath):
        joblib.dump(self.target_means_map, filepath)
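# Hedged usage sketch (not part of the original snippet): assumes the class above
# is in scope together with the imports it relies on (pandas as pd, numpy as np,
# sklearn.model_selection.KFold, joblib, BaseTransformer) and that categorical
# features are already numerically encoded.
import pandas as pd

cats = pd.DataFrame({'city': [0., 1., 0., 2., 1., 0.]})
y = pd.DataFrame({'target': [1, 0, 1, 0, 1, 1]})

encoder = TargetEncoderNSplits(n_splits=3).fit(categorical_features=cats, target=y)
out = encoder.transform(categorical_features=cats)
print(out['numerical_features'].head())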
Example #2
import numpy
from sklearn import svm
from sklearn.model_selection import KFold


def kFolds(dataSet, k=10):
    """
    Evaluate an SVM with k-fold cross-validation.
    :param dataSet: DataFrame whose column 0 is the label and columns 1-15 are features
    :param k: number of folds
    """
    df_mx = dataSet.to_numpy()  # .as_matrix() was removed from pandas; use .to_numpy()
    X = df_mx[:, 1:16]
    Y = df_mx[:, 0:1]

    lm = svm.SVC(gamma=0.001, C=100.)  # Support Vector Machine
    kf = KFold(n_splits=k)  # Define the split - into k folds
    i = 0
    accuracies = numpy.zeros(kf.get_n_splits(X))
    for train_index, test_index in kf.split(X):
        print("{}. TRAIN: {} TEST: {}".format(i+1, train_index, test_index))
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        # train using X_Train
        model = lm.fit(X_train, Y_train.ravel())  # ravel() gives the 1-D label array SVC expects
        # evaluate against X_Test
        predictions = lm.predict(X_test)
        # save accuracy
        accuracies[i] = model.score(X_test, Y_test)
        i = i + 1

    # find mean accuracy over all rounds
    print("Average accuracy of K-Folds (k={}): {}%".format(numpy.mean(accuracies) * 100, k))
Example #3
    def VTiter(self, *parsedArgs, **envars):
        largs, dictargs = self.full_parse(parsedArgs)

        # get arguments
        if 'splits' not in dictargs:
            raise functions.OperatorError(__name__.rsplit('.')[-1], "No splits argument.")
        else:
            self.n_splits = int(dictargs['splits'])

        # print largs
        # print dictargs

        self.data = []

        if 'query' not in dictargs:
            raise functions.OperatorError(__name__.rsplit('.')[-1], "No query argument ")
        query = dictargs['query']

        cur = envars['db'].cursor()
        c = cur.execute(query)

        for r in c:
            if r[0].isdigit():
                self.data.append(r[0])
            else:
                self.data.append(str(r[0]))
        yield [('rid',), ('idofset',)]

        # print "data", self.data
        X = np.array(self.data)
        # print X

        kf = KFold(self.n_splits)
        kf.get_n_splits(X)
        # print"KF", kf

        try:
            j = 0
            for train_index, test_index in kf.split(X):
                # print("TRAIN:", train_index, "TEST:", test_index)
                for k in test_index:
                    yield (self.data[k], j)
                j += 1
        except ValueError:
            yield (-1, "Cannot have number of splits greater than the number of samples")
Example #4
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn import metrics
import numpy as np

df = pd.read_csv("ols_dataset.csv")
# print(df)
target = df.iloc[:, 2].values  # .values converts the column into a NumPy array
data = df.iloc[:, 3:10].values

kfold_object = KFold(n_splits=4)  # each sample lands in the test set exactly once across the 4 folds
kfold_object.get_n_splits(data)

for training_index, test_index in kfold_object.split(data):
    print(training_index)
    print(test_index)
    data_training, data_test = data[training_index], data[test_index]
    target_training, target_test = target[training_index], target[test_index]
    machine = linear_model.LinearRegression()
    machine.fit(data_training, target_training)
    prediction = machine.predict(data_test)
    print(
        metrics.r2_score(target_test, prediction)
    )  # R^2 is the share of variance in the test targets explained by the predictions; higher is better
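# The same 4-fold R^2 evaluation can be written more compactly (a sketch reusing
# `data` and `target` from above):
from sklearn.model_selection import cross_val_score

print(cross_val_score(linear_model.LinearRegression(), data, target,
                      cv=KFold(n_splits=4), scoring='r2'))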

# -----------------------------------------------------
                                                             weights_train))
                training_scores.append(training_score)
        return training_scores, test_scores

    weights = X['age'].pipe(lambda s: s > train_cutoff).map({
        True: .9,
        False: .1
    }).values
    logistic_model = LogisticRegression()
    weighted_training_scores, weighted_test_scores = cross_val_scores_weighted(
        logistic_model, X, y, weights, cv=10)

    unweighted_test_scores = []
    unweighted_train_scores = []
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
    for train_index, test_index in kf.split(X.index):
        X_train, X_audit_holdout = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_audit_holdout)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        unweighted_test_scores.append(score)
        y_pred = clf.predict_proba(X_train)[:, 1]
        score = roc_auc_score(y_train, y_pred)
        unweighted_train_scores.append(score)

    hard_weighted_test_scores = []
    hard_weighted_train_scores = []
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
Example #6
X = X.T
M = len(y)

np.random.seed(42)
p = np.random.permutation(M)
perm_X = X[p]
perm_y = y[p]
num_test_ex = int(np.floor(0.15 * M))
test_data = perm_X[0:num_test_ex]          # use the permuted arrays so the held-out split is random
test_labels = perm_y[0:num_test_ex]
train_validate_data = perm_X[num_test_ex:]
train_validate_labels = perm_y[num_test_ex:]

#Try 6-fold cross validation instead?
kf = KFold(n_splits=6, shuffle=True)
kf.get_n_splits(train_validate_data)
for train_index, validate_index in kf.split(train_validate_data):
    for i, params in enumerate(params_to_try):
        train_data, validate_data = train_validate_data[
            train_index], train_validate_data[validate_index]
        train_labels, validate_labels = train_validate_labels[
            train_index], train_validate_labels[validate_index]
        clf = LogisticRegression(penalty=params['penalty_type'],
                                 C=params['C'],
                                 solver=params['solver'],
                                 max_iter=100,
                                 random_state=42,
                                 multi_class='multinomial')
        clf.fit(train_data, train_labels)
        train_predict = clf.predict(train_data)
        train_accuracy = accuracy_score(train_labels, train_predict)
# ### Build and learn PCA model

# +
from sklearn.decomposition import PCA

pca = PCA(n_components=3)

# -

x = dataset['train']['x'][0]

# +
from sklearn.model_selection import KFold  # import KFold
import sklearn as sk
kf = KFold(n_splits=5)  # Define the split - into 5 folds
kf.get_n_splits(
    x)  # returns the number of splitting iterations in the cross-validator

for train_index, test_index in kf.split(x):
    #print('TRAIN:', train_index, 'TEST:', test_index)
    X_train, X_test = x[train_index], x[test_index]
    principalComponents = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    x_hat = pca.inverse_transform(X_test_pca)
    print("mae loss:")
    print(sk.metrics.mean_absolute_error(X_test, x_hat))
    print("mse loss:")
    print(sk.metrics.mean_squared_error(X_test, x_hat))
# y_train, y_test = y[train_index], y[test_index]
# -

pca.explained_variance_ratio_
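# The per-fold reconstruction check above, wrapped in a small helper
# (a sketch; `x` is assumed to be a 2-D array of samples):
def pca_cv_reconstruction_errors(x, n_components=3, n_splits=5):
    from sklearn.metrics import mean_absolute_error
    errors = []
    for train_index, test_index in KFold(n_splits=n_splits).split(x):
        p = PCA(n_components=n_components).fit(x[train_index])
        x_hat = p.inverse_transform(p.transform(x[test_index]))
        errors.append(mean_absolute_error(x[test_index], x_hat))
    return errors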
Example #8
df2 = pd.read_csv('data/ip2.nmap.online.hosts.masscan.csv.optimised.som_win_map6.csv')

df2_data=df2.drop(columns=['cluster'])
df2_labels=df2['cluster'].values

results=dict([
    (8,  {'accuracy':[],'recall':[],'f1':[],'precision':[]}), 
    (12, {'accuracy':[],'recall':[],'f1':[],'precision':[]}), 
    (16, {'accuracy':[],'recall':[],'f1':[],'precision':[]}), 
    (32, {'accuracy':[],'recall':[],'f1':[],'precision':[]}), 
    (64, {'accuracy':[],'recall':[],'f1':[],'precision':[]}), 
    ])

kf = KFold(n_splits=4,shuffle=True)
kf.get_n_splits(df2)

next(kf.split(df2), None)

for units in results.keys():
    for train_index, test_index in kf.split(df2_data):
        # print('Train')
        # print(train_index)
        # print('Test')
        # print(test_index)
        X_train = df2_data.iloc[train_index]
        X_test =  df2_data.iloc[test_index]
        y_train = to_categorical([df2_labels[i] for i in train_index],num_classes=64)
        y_test =  to_categorical([df2_labels[i] for i in test_index],num_classes=64)
        #
        classifier = Sequential()
Example #9
img_data_original = np.empty([50000, 1024])

for i in range(0, 50000):
    filename = '{0:05d}'.format(i) + '.png'
    img = rgb2gray(io.imread(train_set_dir +
                             filename))  #Reading file, converting to Grayscale
    img_data_original[i, :] = img.flatten()

for i in range(15, 1024, 16):
    pca = PCA(n_components=i, whiten=True)
    pca.fit(img_data_original)
    img_data = pca.transform(img_data_original)

    # Split the data using K-Folds, using 5 different sets
    kf = KFold(n_splits=5)
    kf.get_n_splits(img_data)

    count = 0
    train_score = np.zeros(5)
    val_score = np.zeros(5)
    for train_index, val_index in kf.split(img_data):
        img_data_train, img_data_val = img_data[train_index], img_data[
            val_index]
        img_labels_train, img_labels_val = img_labels[train_index], img_labels[
            val_index]

        regr = LogisticRegression(multi_class='ovr')
        regr.fit(img_data_train, img_labels_train)

        count += 1
        train_score[count - 1] = regr.score(img_data_train, img_labels_train)
Example #10
        y = []
        with open(path) as f_train:
            for line in f_train:
                data_row = line.split(',')
                X.append(data_row[:-1])
                y.append(data_row[-1])
        X = np.asarray(X, dtype=np.float64)  # np.float was removed from NumPy; use np.float64
        y = np.asarray(y, dtype=np.float64)
        return X, y


if __name__ == '__main__':

    pen_based_recognition = DatasetBase(train_path='raw_data/pendigits.tra',
                                        test_path='raw_data/pendigits.tes')

    clf = neighbors.KNeighborsClassifier(n_neighbors=1, p=2)

    kf = KFold(n_splits=10, shuffle=True)
    kf.get_n_splits(pen_based_recognition.X_data, pen_based_recognition.y_data)

    for train_index, test_index in kf.split(pen_based_recognition.X_data):
        print("TRAIN:", train_index, "TEST:", test_index)
        # print(len(train_index))
        # print(len(test_index))
        X_train, X_test = pen_based_recognition.X_data[train_index], pen_based_recognition.X_data[test_index]
        y_train, y_test = pen_based_recognition.y_data[train_index], pen_based_recognition.y_data[test_index]

        clf.fit(X_train, y_train)
        print(clf.score(X_test, y_test))
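    # Equivalent one-call evaluation (a sketch; shuffling is kept by passing the
    # KFold object as cv):
    from sklearn.model_selection import cross_val_score

    print(cross_val_score(clf, pen_based_recognition.X_data,
                          pen_based_recognition.y_data,
                          cv=KFold(n_splits=10, shuffle=True)))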
Example #11
print('nonlinearity: ' + str(nonlinearity))
print('model: ' + str(model))

# tr_fmri_net, tr_adj, tr_labels, val_fmri_net, val_adj, val_labels, test_fmri_net, test_adj, test_labels = \
#     Data_processing.load_data('Data_BNF/'+dataset+'/','Data_BNF/'+dataset+'/labels.csv', augmentation)

# tr_fmri_net, tr_adj, tr_labels, val_fmri_net, val_adj, val_labels, test_fmri_net, test_adj, test_labels = \
#     Data_processing.load_sorted_data('Data_BNF/'+dataset+'/sort_data.pkl')

fmri_signals, labels, graphs = \
    Data_processing.load_ADHD('Data_BNF/'+dataset+'/','Data_BNF/'+dataset+'/labels.csv', augmentation)

# while(True):

kf = KFold(n_splits=5)
kf.get_n_splits(fmri_signals)
labels = np.array(labels)

test_loss_avg = []
test_acc_avg = []
predict_labels = []
actual_labels = []

for train_ind, test_ind in kf.split(fmri_signals):
    train_fmri_rsignals, test_fmri_rsignals = fmri_signals[
        train_ind], fmri_signals[test_ind]
    train_rlabels, test_rlabels = labels[train_ind], labels[test_ind]
    train_rgraphs, test_rgraphs = graphs[train_ind], graphs[test_ind]

    train_fmri_rsignals, val_fmri_rsignals, train_rlabels, val_rlabels, train_rgraphs, val_rgraphs = train_test_split(
        train_fmri_rsignals, train_rlabels, train_rgraphs, test_size=0.2)
Example #12
    print(model.score(test_X, test_y))

    test_predict = model.predict(test_X)

    #avg_feature_importance.append(test_predict.feature_importances_)
    acc, precision, recall, f1, matrix = evaluation(test_y, test_predict)

    print("Fold: %d, Accuracy: %f, Precision: %f, Recall: %f, F1: %f" %
          (fold_count + 1, round(acc, 3), round(precision, 3), round(
              recall, 3), round(f1, 3)))
    avg_acc += acc
    avg_precision += precision
    avg_recall += recall
    avg_f1 += f1
    avg_confusion_matrix.append(matrix)
    fold_count += 1

print(
    "================================================================================="
)
print("Avg Accuracy: %f, Avg Precision: %f, Avg Recall: %f, Avg F1: %f" % (round(avg_acc / kf.get_n_splits(), 3), \
                                                                           round(avg_precision / kf.get_n_splits(), 3), \
                                                                           round(avg_recall / kf.get_n_splits(), 3), \
                                                                           round(avg_f1 / kf.get_n_splits(), 3)))
'''
importance_dict = {}
for col, importance in zip(train_X.columns, np.mean(np.array(avg_feature_importance), axis=0)):
    importance_dict[col] = importance

print(sorted(importance_dict.items(), key=lambda x: -x[1])[:10])
'''
Example #13
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict
from sklearn import svm, tree, neighbors, neural_network, naive_bayes
import sklearn.metrics as met
import matplotlib.pyplot as plt

data = pd.read_csv('DataSets/tae.csv', sep=',', header=None)
#print(data)
train = data.iloc[:, 0:5]  # .ix was removed from pandas; columns 0-4 are the features
target = data.iloc[:, 5]   # column 5 is the class label
#print(x)
#print(target)

kf = KFold(n_splits=10)
print(kf)
print(kf.get_n_splits(train))
for train_index, test_index in kf.split(train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = train.iloc[train_index, :], train.iloc[test_index, :]
    print(X_train)
    y_train, y_test = target[train_index], target[test_index]
    print(y_train)
for i in range(5, 12):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    print('--------------------{}---------------------------------'.format(i))
    print('cross_val_predict')
    predicted = cross_val_predict(
        clf,
        train,
        target,
        cv=10,
Example #14
df['flag'] = flag[0]
df['target'] = tar[0]


features = list(df.columns[0:39])

x = df[features]
y = df["target"]

xnmp=x.to_numpy()
xnmp = np.array(xnmp, dtype=np.float64)
ynmp=y.to_numpy()
ynmp = np.array(ynmp, dtype=np.float64)

kf = KFold(n_splits=3)
kf.get_n_splits(xnmp)
time = 0
for train_index, test_index in kf.split(xnmp):
    time = time + 1
    x_train, x_test = xnmp[train_index], xnmp[test_index]
    y_train, y_test = ynmp[train_index], ynmp[test_index]
    print("-----Time: "+ str(time) + "  Decision Tree-----")
    #decision tree
    classifier = tree.DecisionTreeClassifier(criterion = 'entropy' , max_depth=10, random_state=0)
    classifier.fit(x_train, y_train)
    tree.plot_tree(classifier)
    predict_y = classifier.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predict_y)
    cm = confusion_matrix(y_test, predict_y)
    print("Accuracy: "+ str(accuracy))
    recall = cm[1, 1] / (cm[1, 0] + cm[1, 1])  # recall = TP / (TP + FN); rows of cm are the true labels
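    # The same quantities via the built-in metrics (a sketch; assumes a binary
    # target and that sklearn.metrics is imported as `metrics`, as above):
    print("Recall: " + str(metrics.recall_score(y_test, predict_y)))
    print("Precision: " + str(metrics.precision_score(y_test, predict_y)))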
Example #15
def tachTestTrain(pathFolder):
    print("Create New DataSet")
    listFolderNameMSSV = timFolderName(pathFolder)
    dataX = []
    # dataY = []
    # dataName = []
    for ia in listFolderNameMSSV:
        for ib in listdir(join(pathFolder, ia)):
            if isfile(join(pathFolder, ia, ib)):
                dataX.append(join(pathFolder, ia, ib))
                # dataName.append(ia)
    shuffle(dataX)
    # for ia in dataX:
    #     dataY.append(int(ia.split('/')[4].split('_')[0]))
    kf = KFold(n_splits=10, shuffle=True)
    kf.get_n_splits(dataX)

    tempDictXtrain = {}
    tempDictYtrain = {}
    tempDictXtest = {}
    tempDictYtest = {}
    # tempNtrain = []
    # tempNtest = []
    intDem = 0
    for train_index, test_index in kf.split(dataX):
        y_train = []
        y_test = []

        X_train, X_test = np.array(dataX)[train_index], np.array(
            dataX)[test_index]
        shuffle(X_train)
        shuffle(X_test)

        for ia in X_train:
            y_train.append(int(ia.split('/')[4].split('_')[0]))
        for ia in X_test:
            y_test.append(int(ia.split('/')[4].split('_')[0]))
        # N_train, N_test = np.array(dataName)[train_index], np.array(dataName)[test_index]

        # Save this fold's split
        tempDictXtrain[intDem] = X_train
        tempDictYtrain[intDem] = y_train
        tempDictXtest[intDem] = X_test
        tempDictYtest[intDem] = y_test
        intDem += 1

        # tempNtrain.append(N_train.tolist())
        # tempNtest.append(N_test.tolist())

        # print("TRAIN:", train_index, "TEST:", test_index)
        # print("TRAIN_len:", len(train_index), "TEST_len:", len(test_index))
    dfXtrain = pd.DataFrame(tempDictXtrain)
    dfYtrain = pd.DataFrame(tempDictYtrain)
    dfXtest = pd.DataFrame(tempDictXtest)
    dfYtest = pd.DataFrame(tempDictYtest)
    if not os.path.exists("./tempLuu"):
        os.makedirs("./tempLuu")
    dfXtrain.to_csv('./tempLuu/tempDictXtrain.csv', index=None)
    dfYtrain.to_csv('./tempLuu/tempDictYtrain.csv', index=None)
    dfXtest.to_csv('./tempLuu/tempDictXtest.csv', index=None)
    dfYtest.to_csv('./tempLuu/tempDictYtest.csv', index=None)
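# Hedged usage sketch: the function persists the 10 splits as CSVs under ./tempLuu;
# the data folder below is hypothetical, and pandas is assumed imported as pd.
if __name__ == '__main__':
    tachTestTrain('./dataset')
    fold0_train_paths = pd.read_csv('./tempLuu/tempDictXtrain.csv')['0']
    print(fold0_train_paths.head())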
Example #16
def train_stage(df_path, lgb_path, xgb_path, cb_path):
    print('>> Loading training data...')
    df = pd.read_csv(df_path)

    y_df = np.array(df['Eclipse Duration (m)'])
    df_ids = np.array(df.index)
    df.drop(['Eclipse Duration (m)', 'Lunation Number'], axis=1, inplace=True)

    X = preprocess_data(df)
    print('>> Shape of train data:', X.shape)

    lgb_cv_result = np.zeros(df.shape[0])
    xgb_cv_result = np.zeros(df.shape[0])
    cb_cv_result = np.zeros(df.shape[0])

    skf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
    print(">> K Fold with", skf.get_n_splits(df_ids, y_df), "splits")

    print('>> Model Fitting...')
    for counter, ids in enumerate(skf.split(df_ids, y_df)):
        print('>> Fold', counter + 1)
        X_fit, y_fit = X[ids[0]], y_df[ids[0]]
        X_val, y_val = X[ids[1]], y_df[ids[1]]

        if TRAIN_LGB:
            print('>>> LightGBM...')
            lgb_cv_result[ids[1]] += fit_lgb(X_fit,
                                             y_fit,
                                             X_val,
                                             y_val,
                                             counter,
                                             lgb_path,
                                             name='lgb')

        if TRAIN_XGB:
            print('>>> XGBoost...')
            xgb_cv_result[ids[1]] += fit_xgb(X_fit,
                                             y_fit,
                                             X_val,
                                             y_val,
                                             counter,
                                             xgb_path,
                                             name='xgb')

        if TRAIN_CB:
            print('>>> CatBoost...')
            cb_cv_result[ids[1]] += fit_cb(X_fit,
                                           y_fit,
                                           X_val,
                                           y_val,
                                           counter,
                                           cb_path,
                                           name='cb')

        del X_fit, X_val, y_fit, y_val
        gc.collect()

    if TRAIN_LGB:
        rmse_lgb = round(sqrt(mean_squared_error(y_df, lgb_cv_result)), 4)
        print('>> LightGBM VAL RMSE:', rmse_lgb)

    if TRAIN_XGB:
        rmse_xgb = round(sqrt(mean_squared_error(y_df, xgb_cv_result)), 4)
        print('>> XGBoost  VAL RMSE:', rmse_xgb)

    if TRAIN_CB:
        rmse_cb = round(sqrt(mean_squared_error(y_df, cb_cv_result)), 4)
        print('>> Catboost VAL RMSE:', rmse_cb)

    if TRAIN_LGB and TRAIN_XGB and TRAIN_CB:
        rmse_mean = round(
            sqrt(
                mean_squared_error(
                    y_df, (lgb_cv_result + xgb_cv_result + cb_cv_result) / 3)),
            4)
        print('>> Mean XGBoost+Catboost+LightGBM, VAL RMSE:', rmse_mean)

    if TRAIN_LGB and TRAIN_CB:
        rmse_mean_lgb_cb = round(
            sqrt(mean_squared_error(y_df, (lgb_cv_result + cb_cv_result) / 2)),
            4)
        print('>> Mean Catboost+LightGBM VAL RMSE:', rmse_mean_lgb_cb)

    if TRAIN_LGB and TRAIN_XGB:
        rmse_mean_lgb_xgb = round(
            sqrt(mean_squared_error(y_df,
                                    (lgb_cv_result + xgb_cv_result) / 2)), 4)
        print('>> Mean XGBoost+LightGBM VAL RMSE:', rmse_mean_lgb_xgb)

    return 0
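# Hypothetical invocation (paths are illustrative; NUM_FOLDS, the TRAIN_* flags,
# preprocess_data and the fit_* helpers are assumed to be defined in this module):
if __name__ == '__main__':
    train_stage('data/train.csv', 'models/lgb/', 'models/xgb/', 'models/cb/')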
Example #17
beta = np.zeros((P))
beta[causal_ind] = 1.0
X = np.random.randn(N, P)
noise = np.random.randn(N)
y = X.dot(beta)

print("Initialize the model")
print("Option 1: use native glmnet `nfolds`")
model = glmnet(l1_ratio=0.5, n_folds=10)

print("Option 2: use `sklearn` `cv` syntax")
from sklearn.model_selection import KFold
n_folds = 10
kf = KFold(n_folds)

model = glmnet(l1_ratio=0.5, cv=kf.get_n_splits(y), keep=True)

print("Fit in sklearn style")
model.fit(X, y)

print("Predict in sklearn style")
y_hat = model.predict(X)
print("penalty", model.alpha_)

print("Use `.cross_val_score()` method in order to apply cross-validation metrics other than MSE")
from sklearn import metrics
print(model.cross_val_score(metrics.r2_score))

print("plot native R graphs")
model.rplot()
def main():
    parser = ArgumentParser()
    parser.add_argument("-m",
                        "--model",
                        default="vectorize",
                        help="Choose between 'vectorize' and 'doc_to_vec'")
    parser.add_argument(
        "-d",
        "--distribution-graph",
        default=False,
        help="Draw a graph of the distribution of the content item pageviews")
    parser.add_argument(
        "-i",
        "--feature-importance",
        default=False,
        action='store_true',
        help="Show graph of the importance of various features")
    parser.add_argument("-c",
                        "--confusion-matrix",
                        default=False,
                        action='store_true',
                        help="Show confusion matrices during training")
    parser.add_argument("-k",
                        "--k-fold",
                        default=False,
                        action='store_true',
                        help="Perform K-Fold")
    parser.add_argument("-t",
                        "--test",
                        default=False,
                        action='store_true',
                        help="Split into training and test")
    args = vars(parser.parse_args())

    model_file = "vectorize"
    if args["model"] != "vectorize":
        model_file = "doc_to_vec"
    with open("data/processed/" + model_file + "_X", 'rb') as fp:
        X = pickle.load(fp)
    with open("data/processed/" + model_file + "_y", 'rb') as fp:
        y = np.asarray(pickle.load(fp))

    if args["distribution_graph"]:
        pageviews = utils.load_pageviews()
        discretizer, view_numbers = utils.generate_discretizer(pageviews)
        draw_content_item_distribution_graph(pageviews, discretizer,
                                             view_numbers)

    if args["feature_importance"]:
        show_feature_importance(X, y)

    if args["k_fold"]:
        kf = KFold(n_splits=5)
        kf.get_n_splits(X)

        f1_scores = []
        accuracy_scores = []
        confusion_matrix = np.zeros((utils.number_bins(), utils.number_bins()))
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            f1_score, accuracy_score, fold_confusion_matrix = utils.train_and_test_logistic_regression(
                X_train, y_train, X_test, y_test, args["confusion_matrix"])
            confusion_matrix = confusion_matrix + fold_confusion_matrix
            f1_scores.append(f1_score)
            accuracy_scores.append(accuracy_score)

        if args["confusion_matrix"]:
            utils.plot_confusion_matrix(confusion_matrix)

        print("Average f1 score :" + str(np.mean(f1_scores)))
        print("Average accuracy score :" + str(np.mean(accuracy_scores)))

    if args["test"]:
        count = len(X)
        split_index = math.floor(count * 0.8)
        X_train = X[0:split_index]
        X_test = X[split_index:]
        y_train = y[0:split_index]
        y_test = y[split_index:]
        f1_score, accuracy_score, confusion_matrix = utils.train_and_test_logistic_regression(
            X_train, y_train, X_test, y_test, args["confusion_matrix"])
        confusion_matrix

        if args["confusion_matrix"]:
            utils.plot_confusion_matrix(confusion_matrix)

        print("Accuracy score for test data :" + str(accuracy_score))
        print("F1 score for test data :" + str(f1_score))

    model = utils.train_logistic_regression(X, y)
    model_filename = "logistic_regression_model.pkl"
    pickle.dump(model, open(model_filename, 'wb'))
    print("Model saved to " + model_filename)
Example #19
def main():
    # =============================================================================================================
    # VGG-16 ARCHITECTURE
    # =============================================================================================================
    model = Sequential()

    model.add(ZeroPadding2D((1, 1), input_shape=(20, 224, 224)))
    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(64, 3, 3, activation='relu', name='conv1_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(128, 3, 3, activation='relu', name='conv2_2'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(256, 3, 3, activation='relu', name='conv3_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv4_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_1'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_2'))
    model.add(ZeroPadding2D((1, 1)))
    model.add(Convolution2D(512, 3, 3, activation='relu', name='conv5_3'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

    model.add(Flatten())
    model.add(Dense(4096, name='fc6', init='glorot_uniform'))

    # =============================================================================================================
    # WEIGHT INITIALIZATION
    # =============================================================================================================
    layerscaffe = [
        'conv1_1', 'conv1_2', 'conv2_1', 'conv2_2', 'conv3_1', 'conv3_2',
        'conv3_3', 'conv4_1', 'conv4_2', 'conv4_3', 'conv5_1', 'conv5_2',
        'conv5_3', 'fc6', 'fc7', 'fc8'
    ]
    h5 = h5py.File(vgg_16_weights, 'r')  # open the weights file read-only

    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # Copy the weights stored in the 'vgg_16_weights' file to the feature extractor part of the VGG16
    for layer in layerscaffe[:-3]:
        w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
        w2 = np.transpose(np.asarray(w2), (0, 1, 2, 3))
        w2 = w2[:, :, ::-1, ::-1]
        b2 = np.asarray(b2)
        layer_dict[layer].W.set_value(w2)
        layer_dict[layer].b.set_value(b2)

    # Copy the weights of the first fully-connected layer (fc6)
    layer = layerscaffe[-3]
    w2, b2 = h5['data'][layer]['0'], h5['data'][layer]['1']
    w2 = np.transpose(np.asarray(w2), (1, 0))
    b2 = np.asarray(b2)
    layer_dict[layer].W.set_value(w2)
    layer_dict[layer].b.set_value(b2)

    adam = Adam(lr=learning_rate,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-08,
                decay=0.0005)
    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # =============================================================================================================
    # FEATURE EXTRACTION
    # =============================================================================================================
    if save_features:
        saveFeatures(model, features_file, labels_file)

    # =============================================================================================================
    # TRAINING
    # =============================================================================================================
    do_training = True
    compute_metrics = True
    threshold = 0.5

    if do_training:
        h5features = h5py.File(features_file, 'r')
        h5labels = h5py.File(labels_file, 'r')

        # X_full will contain all the feature vectors extracted from optical flow images
        X_full = h5features[features_key]
        _y_full = np.asarray(h5labels[labels_key])

        zeroes = np.asarray(np.where(_y_full == 0)[0])
        ones = np.asarray(np.where(_y_full == 1)[0])
        zeroes.sort()
        ones.sort()

        kf_falls = KFold(n_splits=5)
        kf_falls.get_n_splits(X_full[zeroes, ...])

        kf_nofalls = KFold(n_splits=5)
        kf_nofalls.get_n_splits(X_full[ones, ...])

        sensitivities = []
        specificities = []
        accuracies = []

        for (train_index_falls,
             test_index_falls), (train_index_nofalls,
                                 test_index_nofalls) in zip(
                                     kf_falls.split(X_full[zeroes, ...]),
                                     kf_nofalls.split(X_full[ones, ...])):

            train_index_falls = np.asarray(train_index_falls)
            test_index_falls = np.asarray(test_index_falls)
            train_index_nofalls = np.asarray(train_index_nofalls)
            test_index_nofalls = np.asarray(test_index_nofalls)
            train_index = np.concatenate(
                (train_index_falls, train_index_nofalls), axis=0)
            test_index = np.concatenate((test_index_falls, test_index_nofalls),
                                        axis=0)
            train_index.sort()
            test_index.sort()
            X = np.concatenate((X_full[train_index_falls,
                                       ...], X_full[train_index_nofalls, ...]))
            _y = np.concatenate((_y_full[train_index_falls,
                                         ...], _y_full[train_index_nofalls,
                                                       ...]))
            X2 = np.concatenate((X_full[test_index_falls,
                                        ...], X_full[test_index_nofalls, ...]))
            _y2 = np.concatenate((_y_full[test_index_falls,
                                          ...], _y_full[test_index_nofalls,
                                                        ...]))

            # Balance the number of positive and negative samples so that there is the same amount of each of them
            all0 = np.asarray(np.where(_y == 0)[0])
            all1 = np.asarray(np.where(_y == 1)[0])
            if len(all0) < len(all1):
                all1 = np.random.choice(all1, len(all0), replace=False)
            else:
                all0 = np.random.choice(all0, len(all1), replace=False)
            allin = np.concatenate((all0.flatten(), all1.flatten()))
            allin.sort()
            X = X[allin, ...]
            _y = _y[allin]

            # ==================== CLASSIFIER ========================
            extracted_features = Input(shape=(4096, ),
                                       dtype='float32',
                                       name='input')
            if batch_norm:
                x = BatchNormalization(axis=-1, momentum=0.99,
                                       epsilon=0.001)(extracted_features)
                x = ELU(alpha=1.0)(x)
            else:
                x = ELU(alpha=1.0)(extracted_features)

            x = Dropout(0.9)(x)
            x = Dense(4096, name='fc2', init='glorot_uniform')(x)
            if batch_norm:
                x = BatchNormalization(axis=-1, momentum=0.99,
                                       epsilon=0.001)(x)
                x = Activation('relu')(x)
            else:
                x = ELU(alpha=1.0)(x)

            x = Dropout(0.8)(x)
            x = Dense(1, name='predictions', init='glorot_uniform')(x)
            x = Activation('sigmoid')(x)

            classifier = Model(input=extracted_features,
                               output=x,
                               name='classifier')
            classifier.compile(optimizer=adam,
                               loss='binary_crossentropy',
                               metrics=['accuracy'])

            # ==================== TRAINING =======================
            # weighting of each class: only the fall class gets a different weight
            class_weight = {0: weight_0, 1: 1}
            # Batch training
            if mini_batch_size == 0:
                history = classifier.fit(X,
                                         _y,
                                         validation_data=(X2, _y2),
                                         batch_size=X.shape[0],
                                         nb_epoch=epochs,
                                         shuffle='batch',
                                         class_weight=class_weight)
            else:
                history = classifier.fit(X,
                                         _y,
                                         validation_data=(X2, _y2),
                                         batch_size=mini_batch_size,
                                         nb_epoch=epochs,
                                         shuffle='batch',
                                         class_weight=class_weight)
            plot_training_info(exp, ['accuracy', 'loss'], save_plots,
                               history.history)

            # ==================== EVALUATION ========================
            if compute_metrics:
                predicted = classifier.predict(np.asarray(X2))
                for i in range(len(predicted)):
                    if predicted[i] < threshold:
                        predicted[i] = 0
                    else:
                        predicted[i] = 1
                # Array of predictions 0/1
                predicted = np.asarray(predicted)
                # Compute metrics and print them
                cm = confusion_matrix(_y2, predicted, labels=[0, 1])
                tp = cm[0][0]
                fn = cm[0][1]
                fp = cm[1][0]
                tn = cm[1][1]
                tpr = tp / float(tp + fn)
                fpr = fp / float(fp + tn)
                fnr = fn / float(fn + tp)
                tnr = tn / float(tn + fp)
                precision = tp / float(tp + fp)
                recall = tp / float(tp + fn)
                specificity = tn / float(tn + fp)
                f1 = 2 * float(precision * recall) / float(precision + recall)
                accuracy = accuracy_score(_y2, predicted)

                print('TP: {}, TN: {}, FP: {}, FN: {}'.format(tp, tn, fp, fn))
                print('TPR: {}, TNR: {}, FPR: {}, FNR: {}'.format(
                    tpr, tnr, fpr, fnr))
                print('Sensitivity/Recall: {}'.format(recall))
                print('Specificity: {}'.format(specificity))
                print('Precision: {}'.format(precision))
                print('F1-measure: {}'.format(f1))
                print('Accuracy: {}'.format(accuracy))

                # Store the metrics for this epoch
                sensitivities.append(tp / float(tp + fn))
                specificities.append(tn / float(tn + fp))
                accuracies.append(accuracy)

        print('5-FOLD CROSS-VALIDATION RESULTS ===================')
        print("Sensitivity: %.2f%% (+/- %.2f%%)" %
              (np.mean(sensitivities), np.std(sensitivities)))
        print("Specificity: %.2f%% (+/- %.2f%%)" %
              (np.mean(specificities), np.std(specificities)))
        print("Accuracy: %.2f%% (+/- %.2f%%)" %
              (np.mean(accuracies), np.std(accuracies)))
Example #20
saver_dir_res = 'Results'
file_name = os.path.join(
    saver_dir_res,
    'Results_ger_age_epoch_{}_model_no_{}.xls'.format(epochs, model_no))

saver_dir_models = 'Trained_models/Start_ger_age'
if not os.path.exists(saver_dir_models):
    os.mkdir(saver_dir_models)

if not os.path.exists(saver_dir_res):
    os.mkdir(saver_dir_res)

data, atribute, sensitive, output, pr_gr, un_gr = german_dataset()
skf = KFold(n_splits=10)
skf.get_n_splits(atribute, output)

inp = atribute.shape[1]
AUC_y = np.zeros(model_no)
AUC_A = np.zeros(model_no)

wb = Workbook()

columns = [
    "AUC_y", "AUC_A", 'bal_acc', 'avg_odds_diff', 'disp_imp', 'stat_par_diff',
    'eq_opp_diff', 'theil_ind'
]

alpha = np.linspace(0.1, 2.5, 5)

sheets = [wb.add_sheet('{}'.format(i)) for i in alpha]
Example #21
def main():
    #########################################################
    # DATA PREPARATION
    # The train set has 60k rows and 784 columns, so its shape is (60k, 784).
    # Each row is a 28 by 28 pixel picture.
    # I will reshape the train set to (60k, 1, 28, 28), i.e. each sample becomes a single-channel 28 by 28 matrix of pixel values.
    # Same for the test set.
    #########################################################
    y_train_CNN = train.iloc[:, 0].values.astype(
        'int32')  # only the labels, i.e. the target digits
    X_train_CNN = np.array(train.iloc[:, 1:].values).reshape(
        train.shape[0], 1, 28,
        28).astype(np.uint8)  # reshape to be [samples][pixels][width][height]
    print('train shape after reshape: {}'.format(X_train_CNN.shape))

    y_test_CNN = test.iloc[:, 0].values.astype(
        'int32')  # only the labels, i.e. the target digits
    X_test_CNN = np.array(test.iloc[:, 1:].values).reshape(
        (test.shape[0], 1, 28, 28)).astype(np.uint8)
    print('test shape after reshape: {}'.format(X_test_CNN.shape))

    # normalize inputs from 0-255 to 0-1
    X_train_CNN = X_train_CNN / 255
    X_test_CNN = X_test_CNN / 255

    #scaler = StandardScaler()
    #X_train_CNN = scaler.fit_transform(X_train_CNN)
    #X_test_CNN = scaler.fit_transform(X_test_CNN)

    # one hot encode outputs
    y_train_CNN = to_categorical(y_train_CNN)
    y_test_CNN = to_categorical(y_test_CNN)
    num_classes = y_train_CNN.shape[1]

    X_train = X_train_CNN
    X_val = X_test_CNN
    y_train = y_train_CNN
    y_val = y_test_CNN

    #########################################################
    # BUILD THE MODEL AND EVALUATE IT USING K-FOLD
    #########################################################

    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    kf.get_n_splits(X_train)

    acc_scores = list()

    for fold, (train_index, test_index) in enumerate(kf.split(X_train)):
        print('\n Fold %d' % (fold))

        X_tr, X_v = X_train[train_index], X_train[test_index]
        y_tr, y_v = y_train[train_index], y_train[test_index]
        # build the model
        model = model_cnn(num_classes)
        # fit model
        model.fit(X_tr,
                  y_tr,
                  epochs=epochs,
                  validation_data=(X_v, y_v),
                  verbose=2,
                  batch_size=batch_size,
                  callbacks=callbacks,
                  shuffle=True)

        acc = model.evaluate(X_v, y_v, verbose=0)
        acc_scores.append(acc[1])

        print('Fold %d: Accuracy %.2f%%' % (fold, acc[1] * 100))

    print('Accuracy scores: ', acc_scores)

    mean_acc = mean(acc_scores)
    standard_deviation_acc = mt.sqrt(
        sum_of_square_deviation(acc_scores, mean_acc))

    print('=====================')
    print('Mean Accuracy %f' % mean_acc)
    print('=====================')
    print('=====================')
    print('Stdev Accuracy %f' % standard_deviation_acc)
    print('=====================')

    model = model_cnn(num_classes)
    # Fit the final model
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=callbacks,
                        verbose=2)

    # Final evaluation of the model
    scores = model.evaluate(X_val, y_val, verbose=0)
    print("Error: %.2f%%" % (100 - scores[1] * 100))
    print("Accuracy: %.2f%%" % (scores[1] * 100))

    # summarize history for accuracy
    fig_acc = plt.figure(figsize=(10, 10))
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    fig_acc.savefig("../../Output/model_accuracy_fm_cnn.png")
Example #22
def ssnal_elastic_path(A,
                       b,
                       c_lam_vec=None,
                       alpha=0.9,
                       x0=None,
                       y0=None,
                       z0=None,
                       Aty0=None,
                       max_selected=None,
                       cv=False,
                       n_folds=10,
                       sgm=5e-3,
                       sgm_increase=5,
                       sgm_change=1,
                       step_reduce=0.5,
                       mu=0.2,
                       tol_ssn=1e-6,
                       tol_ssnal=1e-6,
                       maxiter_ssn=50,
                       maxiter_ssnal=100,
                       use_cg=True,
                       r_exact=2e4,
                       plot=False,
                       print_lev=2):
    """
    --------------------------------------------------------------------------
    ssnal algorithm to solve the elastic net for a list of lambda1 and lambda2
    --------------------------------------------------------------------------

    ----------------------------------------------------------------------------------------------------------------------
    :param A: design matrix (m x n)
    :param b: response vector (m, )
    :param c_lam_vec: np.array to determine all the values of lambda1 -- lambda1 = c_lam_vec * lambda1_max
    :param alpha: factor used to determine lambda2 -- lam2 = (1 - alpha) * lam1
    :param max_selected: if given, the algorithm stops once it selects more than max_selected features
    :param x0: initial value for the lagrangian multiplier (variable of the primal problem) (n, ) -- vector of 0 if not given
    :param y0: initial value for the first variable of the dual problem (m, ) -- vector of 0 if not given
    :param z0: initial value for the second variable of the dual problem (n, ) -- vector of 0 if not given
    :param Aty0: np.dot(A.transpose(), y0) (n, )
    :param cv: True/False. If true, cross-validation is performed
    :param n_folds: number of folds to perform the cross validation
    :param sgm: starting value of the augmented lagrangian parameter sigma
    :param sgm_increase: increasing factor of sigma
    :param sgm_change: we increase sgm -- sgm *= sgm_increase -- every sgm_change iterations
    :param step_reduce: dividing factor of the step size during the linesearch
    :param mu: multiplicative factor for the lhs of the linesearch condition
    :param tol_ssn: tolerance for the ssn algorithm
    :param tol_ssnal: global tolerance of the ssnal algorithm
    :param maxiter_ssn: maximum number of iterations for ssn
    :param maxiter_ssnal: maximum number of global iterations
    :param use_cg: True/False. If true, the conjugate gradient method is used to find the direction of the ssn
    :param r_exact: number of features such that we start using the exact method
    :param print_lev: different level of printing (0, 1, 2, 3, 4)
    :param plot: True/False. If true a plot of r_lm, gcv, extended bic and cv (if cv == True) is displayed
    ----------------------------------------------------------------------------------------------------------------------

    ---------------------------------------------------------------------------------------
    :return[0] aic_vec: aic values for each lambda1
    :return[1] ebic_vec: ebic values for each lambda1
    :return[2] gcv_vec: gcv values for each lambda1
    :return[3] cv_vec: cv values for each lambda1
    :return[4] r_vec: number of selected features for each lambda1
    :return[5] r_lm_vec: number of selected features by the debiased model for each lambda1
    :return[6] iter_vec: number of ssnal iterations for each lambda1
    :return[7] times_vec: ssnal time for each lambda1
    :return[8] full_time: time for fitting the full lambda for all the sequence of lambda1
    :return[9] cv_time: time for cross validation
    :return[10] total_time: total time
    ---------------------------------------------------------------------------------------

    REMEMBER: the output of ssnal_elastic_core is a list which contains:
    --------------------------------------------------------------------------------------------------
    :return[0] x: optimal value of the primal variable
    :return[1] y: optimal value of the first dual variable
    :return[2] z: optimal value of the second dual variable
    :return[3] x_lm: linear regression (debiased) estimates on the selected features
    :return[4] r: number of selected features
    :return[5] r_lm: number of selected features by the debiased model
    :return[6] aic: aic computed on the debiased estimates
    :return[7] ebic: extended bic computed on the debiased estimates
    :return[8] gcv: gcv computed on the debiased estimates
    :return[9] sgm: final value of the augmented lagrangian parameter sigma
    :return[10] lam1: lasso penalization
    :return[11] lam2: ridge penalization
    :return[12] convergence_ssnal: True/False. If true, the ssnal has converged
    :return[13] ssnal_time: total time of ssnal
    :return[14] it_ssnal: total ssnal's iteration
    :return[15] Aty: np.dot(A.transpose(), y) computed at the optimal y. Useful to implement warmstart
    --------------------------------------------------------------------------------------------------

    """

    # -------------------------- #
    #    initialize variables    #
    # -------------------------- #

    m, n = A.shape

    lam1_max = LA.norm(np.dot(A.transpose(), b), np.inf) / alpha

    if x0 is None:
        x0 = np.zeros((n, ))
    if y0 is None:
        y0 = np.zeros((m, ))
    if z0 is None:
        z0 = np.zeros((n, ))

    if max_selected is None:
        max_selected = n

    if c_lam_vec is None:
        c_lam_vec = np.logspace(start=3, stop=1, num=100, base=10.0) / 1000

    n_lam1 = c_lam_vec.shape[0]

    sgm0 = sgm

    # ---------------------- #
    #    initialize flags    #
    # ---------------------- #

    convergence = True
    reached_max = False

    n_lam1_stop = n_lam1

    # ---------------------------- #
    #    create output matrices    #
    # ---------------------------- #

    aic_vec, ebic_vec, gcv_vec = -np.ones([n_lam1]), -np.ones(
        [n_lam1]), -np.ones([n_lam1])
    times_vec, r_vec, r_lm_vec, iter_vec = -np.ones([n_lam1]), -np.ones(
        [n_lam1]), -np.ones([n_lam1]), -np.ones([n_lam1])

    # ---------------------- #
    #    solve full model    #
    # ---------------------- #

    if print_lev > 0:

        print()
        print('--------------------------------------------------')
        print('  solving full model  ')
        print('--------------------------------------------------')

    start_full = time.time()

    for i in range(n_lam1):

        lam1 = alpha * c_lam_vec[i] * lam1_max
        lam2 = (1 - alpha) * c_lam_vec[i] * lam1_max

        if print_lev > 2:
            print(
                '--------------------------------------------------------------------------------------------------'
            )
            print(
                ' FULL MODEL:  lambda1 (ratio) = %.2e  (%.2e)  |  lambda2 = %.2e  |  sigma0 = %.2e'
                % (lam1, c_lam_vec[i], lam2, sgm))
            print(
                '--------------------------------------------------------------------------------------------------'
            )

        # ------------------- #
        #    perform ssnal    #
        # ------------------- #

        fit = ssnal_elastic_core(A=A,
                                 b=b,
                                 lam1=lam1,
                                 lam2=lam2,
                                 x0=x0,
                                 y0=y0,
                                 z0=z0,
                                 Aty0=Aty0,
                                 sgm=sgm,
                                 sgm_increase=sgm_increase,
                                 sgm_change=sgm_change,
                                 step_reduce=step_reduce,
                                 mu=mu,
                                 tol_ssn=tol_ssn,
                                 tol_ssnal=tol_ssnal,
                                 maxiter_ssn=maxiter_ssn,
                                 maxiter_ssnal=maxiter_ssnal,
                                 use_cg=use_cg,
                                 r_exact=r_exact,
                                 print_lev=print_lev - 3)

        # ----------------------- #
        #    check convergence    #
        # ----------------------- #

        if not fit[10]:
            convergence = False
            break

        # ---------------------------- #
        #    update starting values    #
        # ---------------------------- #

        x0, y0, z0, Aty0, sgm = fit[0], fit[1], fit[2], fit[15], fit[9]

        # ---------------------------- #
        #    update output matrices    #
        # ---------------------------- #

        times_vec[i], r_vec[i], r_lm_vec[i], iter_vec[i] = fit[13], fit[
            4], fit[5], fit[14]

        r_lm = fit[5]
        if r_lm > 0:
            aic_vec[i], ebic_vec[i], gcv_vec[i] = fit[6], fit[7], fit[8]

        # --------------------------------------- #
        #    check number of selected features    #
        # --------------------------------------- #

        r_lm = fit[5]
        if r_lm > max_selected:
            n_lam1_stop = i + 1
            reached_max = True
            break

    # ------------------- #
    #    end full model   #
    # ------------------- #

    full_time = time.time() - start_full

    if not convergence:
        print('--------------------------------------------------')
        print(' ssnal has not converged for lam1 = %.4f, lam2 = %.4f ' %
              (lam1, lam2))
        print('--------------------------------------------------')

    if reached_max:
        print('--------------------------------------------------')
        print(' max number of features has been selected')
        print('--------------------------------------------------')

    # -------------- #
    #    start cv    #
    # -------------- #

    cv_time = 0
    cv_mat = -np.ones([n_lam1_stop, n_folds])

    if cv and convergence:

        print('--------------------------------------------------')
        print('  performing cv  ')
        print('--------------------------------------------------')

        x0_cv, z0_cv = np.zeros((n, )), np.zeros((n, ))
        Aty0_cv = None
        sgm_cv = sgm0

        fold = 0

        start_cv = time.time()

        # ------------- #
        #    split A    #
        # ------------- #

        kf = KFold(n_splits=n_folds)
        kf.get_n_splits(A)

        # -------------------- #
        #    loop for folds    #
        # -------------------- #

        for train_index, test_index in kf.split(A):

            A_train, A_test = A[train_index], A[test_index]
            b_train, b_test = b[train_index], b[test_index]

            y0_cv = np.zeros((np.shape(train_index)[0], ))

            # -------------------- #
            #    loop for lam1    #
            # -------------------- #

            for i_cv in tqdm(range(n_lam1_stop)):

                lam1 = c_lam_vec[i_cv] * lam1_max
                lam2 = (1 - alpha) * lam1

                # ------------------- #
                #    perform ssnal    #
                # ------------------- #

                fit_cv = ssnal_elastic_core(A=A_train,
                                            b=b_train,
                                            lam1=lam1,
                                            lam2=lam2,
                                            x0=x0_cv,
                                            y0=y0_cv,
                                            z0=z0_cv,
                                            Aty0=Aty0_cv,
                                            sgm=sgm_cv,
                                            sgm_increase=sgm_increase,
                                            sgm_change=sgm_change,
                                            step_reduce=step_reduce,
                                            mu=mu,
                                            tol_ssn=tol_ssn,
                                            tol_ssnal=tol_ssnal,
                                            maxiter_ssn=maxiter_ssn,
                                            maxiter_ssnal=maxiter_ssnal,
                                            use_cg=use_cg,
                                            r_exact=r_exact,
                                            print_lev=0)

                # ------------------- #
                #    update cv mat    #
                # ------------------- #

                cv_mat[i_cv,
                       fold] = LA.norm(np.dot(A_test, fit_cv[3]) - b_test)**2

                # ---------------------------- #
                #    update starting values    #
                # ---------------------------- #

                if i_cv == n_lam1_stop - 1:
                    # last lam1 of this fold: reset the starting values for the next fold
                    x0_cv, y0_cv, z0_cv, Aty0_cv, sgm_cv = None, None, None, None, sgm0

                else:
                    # warm start the next lam1 from the current solution
                    x0_cv, y0_cv, z0_cv, Aty0_cv, sgm_cv = \
                        fit_cv[0], fit_cv[1], fit_cv[2], fit_cv[15], fit_cv[9]

            # ------------------------ #
            #    end loop for lam1    #
            # ------------------------ #

            fold += 1

        # ------------ #
        #    end cv    #
        # ------------ #

        cv_time = time.time() - start_cv

    # ---------------------------- #
    #    printing final results    #
    # ---------------------------- #

    if cv:
        cv_vec = cv_mat.mean(1) / m
    else:
        cv_vec = -np.ones([n_lam1_stop])

    total_time = full_time + cv_time

    time.sleep(0.1)

    print('')
    print('')
    print('------------------------------------------------------------')
    print(' total time:  %.4f' % total_time)

    if cv:
        print('------------------------------------------------------------')
        print('  full time:  %.4f' % full_time)
        print('------------------------------------------------------------')
        print('  cv time:    %.4f' % cv_time)

    print('------------------------------------------------------------')

    if print_lev > 1:

        print('')
        # print('------------------------------------------------------------')
        print(
            '    c_lam    lam1      lam2    r_lm    gcv     ebic     cv      ')
        print('------------------------------------------------------------')

        np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        print(
            np.stack((c_lam_vec[:n_lam1_stop],
                      alpha * c_lam_vec[:n_lam1_stop] * lam1_max,
                      (1 - alpha) * c_lam_vec[:n_lam1_stop] * lam1_max,
                      r_lm_vec[:n_lam1_stop], gcv_vec[:n_lam1_stop],
                      ebic_vec[:n_lam1_stop], cv_vec), -1))
        print('\n')

    if plot:
        plot_cv_ssnal_elstic(r_lm_vec, ebic_vec, gcv_vec, cv_vec, alpha,
                             c_lam_vec)

    return aic_vec[:n_lam1_stop], \
           ebic_vec[:n_lam1_stop], \
           gcv_vec[:n_lam1_stop], \
           cv_vec, \
           r_vec[:n_lam1_stop], \
           r_lm_vec[:n_lam1_stop], \
           iter_vec[:n_lam1_stop], \
           times_vec[:n_lam1_stop], \
           full_time, cv_time, total_time
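# Note: the CV block above refits the whole lam1 path inside every fold, warm
# starting each fit from the previous one, and stores the held-out squared error
# in cv_mat; the CV curve is the per-row mean divided by m. A minimal sketch of
# the same pattern with a generic sklearn estimator (Ridge and the synthetic
# data are stand-ins, not part of the code above):
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
A_demo, b_demo = rng.randn(100, 20), rng.randn(100)
alphas = np.logspace(-3, 2, 10)              # the regularization path
n_folds_demo = 5

kf_demo = KFold(n_splits=n_folds_demo)
cv_mat_demo = np.zeros((len(alphas), n_folds_demo))

for fold, (train_index, test_index) in enumerate(kf_demo.split(A_demo)):
    A_tr, A_te = A_demo[train_index], A_demo[test_index]
    b_tr, b_te = b_demo[train_index], b_demo[test_index]
    for i, alpha in enumerate(alphas):
        model = Ridge(alpha=alpha).fit(A_tr, b_tr)
        cv_mat_demo[i, fold] = np.linalg.norm(model.predict(A_te) - b_te) ** 2

cv_vec_demo = cv_mat_demo.mean(axis=1) / A_demo.shape[0]   # mean held-out error per alpha
best_alpha = alphas[np.argmin(cv_vec_demo)]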
Ejemplo n.º 23
0
def main(batch_size):
    # read train data

    train_input = np.genfromtxt('ctg_data_cleaned.csv', delimiter=',')
    trainX, train_Y = train_input[1:, :21], train_input[1:, -1].astype(int)
    trainX = scale(trainX, np.min(trainX, axis=0), np.max(trainX, axis=0))

    trainY = np.zeros((train_Y.shape[0], NUM_CLASSES))
    trainY[np.arange(train_Y.shape[0]), train_Y - 1] = 1  # one hot matrix

    # experiment with small datasets
    # trainX = trainX[:1000]
    # trainY = trainY[:1000]

    # split the test and training data into 70:30
    trainX, testX, trainY, testY = train_test_split(trainX, trainY, test_size=0.3, shuffle=True)

    # n = trainX.shape[0]
    # print(n)

    # Create the model
    x = tf.placeholder(tf.float32, [None, NUM_FEATURES])
    y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES])

    logits, h_weights, weights = fnn(x, num_neurons)

    # Build the graph for the deep net

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_, logits=logits)
    L2_regularization = tf.nn.l2_loss(h_weights) + tf.nn.l2_loss(weights)
    loss = tf.reduce_mean(cross_entropy + beta * L2_regularization)

    # Create the gradient descent optimizer with the given learning rate.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = optimizer.minimize(loss, global_step=global_step)

    correct_prediction = tf.cast(tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1)), tf.float32)
    accuracy = tf.reduce_mean(correct_prediction)

    kf = KFold(n_splits=5, shuffle=True)
    kf.get_n_splits(trainX)
    # print(kf)
    time_taken = []

    for train_index, test_index in kf.split(trainX):
        # print("TRAIN:", train_index, "TEST:", test_index)
        trainX_, testX_ = trainX[train_index], trainX[test_index]
        trainY_, testY_ = trainY[train_index], trainY[test_index]

        N = len(trainX_)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            train_acc = []
            test_acc = []

            for i in range(epochs):
                start_time = time.time()

                for start, end in zip(range(0, N, batch_size), range(batch_size, N, batch_size)):
                    train_op.run(feed_dict={x: trainX_[start:end], y_: trainY_[start:end]})

                end_time = time.time()

                duration = end_time - start_time

                train_acc.append(accuracy.eval(feed_dict={x: trainX_, y_: trainY_}))
                test_acc.append(accuracy.eval(feed_dict={x: testX_, y_: testY_}))

                time_taken.append(duration)
                # print(time_taken)

                if i % 100 == 0:
                    print('iter %d: training accuracy %g' % (i, train_acc[i]))
                    print('iter %d: test accuracy %g' % (i, test_acc[i]))
                    print(duration, '\n')

    avg_time_taken = np.mean(np.array(time_taken))

    return avg_time_taken
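# Note: the mini-batch loop above, zip(range(0, N, batch_size), range(batch_size, N, batch_size)),
# drops the final batch, because the end points come from range(batch_size, N, batch_size),
# which never reaches N. A small alternative (hypothetical helper, not part of the
# example) that keeps the remainder:
def batch_ranges(n, batch_size):
    for start in range(0, n, batch_size):
        yield start, min(start + batch_size, n)

# e.g. with n=10 and batch_size=4 this yields (0, 4), (4, 8), (8, 10),
# whereas the zip-based version stops after (4, 8).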
Ejemplo n.º 24
0
def train(n_components, alpha, reg, n_samples):
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=n_samples)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:,0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:,1]))))
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)
    ratings = scipy.sparse.dok_matrix(ratings)
    if not n_components or not alpha or not reg:
        alphas = [0.01, 0.1, 1, 10]
        regs = [0.01, 0.1]
        components = [5, 10, 15, 20, 30]
        print("Finding optimal parameters...")
        best_loss = float('inf')
        best_k, best_alpha, best_reg = 0, 0, 0
        for n_components in components:
            for alpha in alphas:
                for reg in regs:
                    mean_loss = 0.0
                    print("n_components: {}, alpha: {}, reg: {}".format(n_components, alpha, reg))
                    for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
                        print("Fold {}".format(k))
                        test_indices = rating_indices[test_index]
                        test_indices = test_indices[:,0], test_indices[:,1], test_indices[:,2]
                        user_test_indices, item_test_indices = test_indices[0], test_indices[1]
                        data_train = scipy.sparse.lil_matrix(ratings)
                        data_train[user_test_indices, item_test_indices] = 0
                        data_train = scipy.sparse.csr_matrix(data_train)  # keep the masked copy (cf. the cv branch below)
                        start = time.time()
                        print("Fitting...")
                        nmf = NMF(n_components=n_components, alpha=alpha, l1_ratio=reg, init='nndsvd', tol=0.001, verbose=1)
                        P = nmf.fit_transform(data_train)
                        Q = nmf.components_
                        acc, loss = evaluate(P, Q, test_indices)
                        print("Elapsed time: {:.1f}s".format(time.time()-start))
                        print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
                        mean_loss = (mean_loss*k + loss) / (k+1)
                    if mean_loss < best_loss:
                        best_loss = mean_loss
                        best_k = n_components
                        best_alpha = alpha
                        best_reg = reg
        print("best k: {}, best alpha: {}, best reg: {}, best loss: {:.4f}".format(best_k, best_alpha, best_reg, best_loss))
    else:
        print("Performing cross validation...")
        mean_acc = 0.0
        mean_loss = 0.0
        for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
            print("Fold {}".format(k))
            test_indices = rating_indices[test_index]
            test_indices = test_indices[:,0], test_indices[:,1], test_indices[:,2]
            user_test_indices, item_test_indices = test_indices[0], test_indices[1]
            data_train = scipy.sparse.lil_matrix(ratings)
            data_train[user_test_indices, item_test_indices] = 0
            data_train = scipy.sparse.csr_matrix(data_train)
            start = time.time()
            print("Fitting...")
            nmf = NMF(n_components=n_components, alpha=alpha, l1_ratio=reg, init='nndsvd', tol=0.001, verbose=1)
            P = nmf.fit_transform(data_train)   
            Q = nmf.components_
            acc, loss = evaluate(P, Q, test_indices)
            print("Elapsed time: {:.4f}".format(time.time()-start))
            print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
            mean_acc = (mean_acc*k + acc) / (k+1)  
            mean_loss = (mean_loss*k + loss) / (k+1)
        print("mean loss: {:.4f} - mean acc: {:.4f}".format(mean_loss, mean_acc))
Ejemplo n.º 25
0
            
    
    print("Epoch: %d, Train: %.3f, Val: %.3f, Val IOU: %.3f" % (e, np.mean(train_loss), np.mean(val_loss), np.mean(val_iou)))
    if np.mean(val_iou) > best_iou:
        torch.save(model.state_dict(), '../torch_parameters/background_classifier.pt')  # save best weights so far
        i = 0  # reset the patience counter
        best_iou = np.mean(val_iou)  # update the best validation IOU
    #elif i > patience:
    #    break
    
    scheduler.step()


#load best model
model = get_model(num_classes = 1, num_filters = 32, pretrained = True)
model.load_state_dict(torch.load('../torch_parameters/background_classifier.pt'))

#test predictions
model.train(False)
all_predictions = []
with torch.no_grad():
    for image in tqdm(data.DataLoader(test_dataset, batch_size = 100)):
        image = image[0].type(torch.FloatTensor).cuda()
        y_pred = model(image).cpu().data.numpy()
        all_predictions.append(y_pred)
all_predictions_stacked = np.vstack(all_predictions)[:, 0, :, :]/fold.get_n_splits()




for i in to_dummy:
    dummies = pd.get_dummies(X[i], prefix=i)
    # dummies = dummies.iloc[:, 1:]  # optionally drop the first level
    X = X.drop(columns=i)            # replace the categorical column with its dummies
    X = pd.concat([dummies, X], axis=1)

num_cols = [
    i for i in df.columns
    if i not in to_dummy + ['heating_load', 'cooling_load']
]

from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
print("Using ", kf.get_n_splits(X), " folds")


def Model(regressor, f):

    avg_r2_train = []
    avg_r2_test = []
    avg_meanS_train = []
    avg_meanS_test = []

    avg_r2_train_y2 = []
    avg_r2_test_y2 = []
    avg_meanS_train_y2 = []
    avg_meanS_test_y2 = []
    from sklearn.preprocessing import StandardScaler
    i = 1.0
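# Note: the Model helper is cut off here. One detail relevant to the fold loop it
# presumably runs: KFold.split yields positional indices, so a pandas DataFrame
# has to be indexed with .iloc inside the loop. A minimal sketch that reuses the
# KFold imported above (X_df and y_ser are assumed to be a feature DataFrame and
# a target Series, not names from the example):
def iterate_folds(X_df, y_ser, n_splits=10):
    kf_local = KFold(n_splits=n_splits)
    for train_index, test_index in kf_local.split(X_df):
        X_tr, X_te = X_df.iloc[train_index], X_df.iloc[test_index]
        y_tr, y_te = y_ser.iloc[train_index], y_ser.iloc[test_index]
        yield X_tr, X_te, y_tr, y_te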
Ejemplo n.º 27
0
    temp.append(t)
x_post = np.array(list(temp), dtype=np.int32)

ori_id = np.array(list(ori_id))

# Randomly shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_pre_shuffled = x_pre[shuffle_indices]
x_post_shuffled = x_post[shuffle_indices]
y_shuffled = y[shuffle_indices]
ori_id_shuffled = ori_id[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
kf = KFold(n_splits=5)
kf.get_n_splits(x_pre_shuffled)

#dev_sample_index = int(0.2 * float(len(y)))

avg_accuracy = 0.0

split = 0
for train_index, test_index in kf.split(x_pre_shuffled):
    out.write("*****************************************\n")
    out.write("*****************************************\n")
    out.write("*****************************************\n")
    out.write("This is the {} split\n".format(split + 1))
    split += 1
    out.write("*****************************************\n")
    x_pre_train, x_pre_test = x_pre_shuffled[train_index], x_pre_shuffled[test_index]
Ejemplo n.º 28
0
def train(data, pre_embedding, config, output_dir):
    # x, y, x_test, y_test, vocab_processor
    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=config['allow_soft_placement'],
            log_device_placement=config['log_device_placement'])
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            model_config = _define_model_config(data, pre_embedding, config)
            cnn = build_model(model_config)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.join(output_dir, timestamp)
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.join(out_dir, "checkpoints")
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=config['num_checkpoints'])

            # Write vocabulary
            data['vocab_processor'].save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: config['dropout_keep_prob']
                }

                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)

                time_str = datetime.datetime.now().isoformat()
                print("Train: {}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("Test: {}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            def train_with_batch(saver, batches, x_dev, y_dev, evaluate_every,
                                 checkpoint_every, checkpoint_prefix):
                """
                Train model with batches
                """
                for batch in batches:
                    x_batch, y_batch = zip(*batch)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % evaluate_every == 0:
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)

            # Generate batches
            use_k_fold = config['use_k_fold']
            if use_k_fold:
                kf = KFold(n_splits=10)
                fold_size = kf.get_n_splits() - 1
                for train_idx, dev_idx in kf.split(data['x_train'], data['y_train']):
                    x_train = data['x_train'][train_idx]
                    x_dev = data['x_train'][dev_idx]

                    y_train = data['y_train'][train_idx]
                    y_dev = data['y_train'][dev_idx]

                    # Generate batches
                    batches = batch_iter(list(zip(x_train, y_train)),
                                         config['batch_size'],
                                         config['num_epochs'])
                    # Training loop. For each batch...
                    train_with_batch(saver, batches, x_dev, y_dev, fold_size,
                                     fold_size, checkpoint_prefix)
            else:
                batches = batch_iter(
                    list(zip(data['x_train'], data['y_train'])),
                    config['batch_size'], config['num_epochs'])
                train_with_batch(saver, batches, data['x_test'],
                                 data['y_test'], config['evaluate_every'],
                                 config['checkpoint_every'], checkpoint_prefix)
                # save predict results
                feed_dict = {
                    cnn.input_x: data['x_test'],
                    cnn.input_y: data['y_test'],
                    cnn.dropout_keep_prob: 1.0
                }

            predictions, scores, input_y = sess.run(
                [cnn.predictions, cnn.scores, cnn.input_y], feed_dict)
            pickle.dump(scores,
                        open(os.path.join(checkpoint_dir, 'scores.pkl'), 'wb'))
            pickle.dump(
                predictions,
                open(os.path.join(checkpoint_dir, 'predictions.pkl'), 'wb'))
            pickle.dump(
                input_y, open(os.path.join(checkpoint_dir, 'input_y.pkl'),
                              'wb'))

            # save model
            current_step = tf.train.global_step(sess, global_step)
            print('checkpoint_prefix: ', checkpoint_prefix)
            print('current_step:', current_step)
            path = saver.save(sess,
                              checkpoint_prefix,
                              global_step=current_step)
            print("Saved model checkpoint to {}\n".format(path))
Ejemplo n.º 29
0
def RESNET_onehot(Dropout1=0, Epochs=20, Batch_size=64):
    # The optimizer of choice is Adam.
    # The loss function is sparse_categorical_crossentropy; an alternative is
    # categorical_crossentropy. The difference is the format of the true labels:
    # sparse_categorical takes integer labels, e.g. [1, 2, 3, 4], while
    # categorical takes one-hot encoded labels (see the short sketch after this example).
    
    Feature_test = np.load("../../data_all/TCRA_train_feature_array.npy")    
    Label_array = np.load("../../data_all/TCRA_train_label_array.npy")
       
    X = Feature_test  # [:, 0:29, :]  # extract the one-hot features
    #print(X[0])
    Y = Label_array#[:,1]

    X = X.reshape(len(X),-1)
    #loo = LeaveOneOut()
    
    kf = KFold(n_splits=5,shuffle=True,random_state=0)
    kf.get_n_splits(X)
    TN = FP = FN = TP = 0
    aa = 1 

    for train_index, test_index in kf.split(X):
        np.random.shuffle(train_index)
        np.random.shuffle(test_index)
        
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        X_train= X_train.reshape([len(X_train),29,20,2])
        X_test = X_test.reshape([len(X_test),29,20,2])
        X_test=tf.cast(X_test, tf.float32)
        info = "using the one_hoot th model\n"
        modelfile = './tra_resnet_model.h5'
        Y_pred = resnet_attention_train_predict(29, 19 ,modelfile,info,2,X_train, Y_train,X_test, Y_test)
        
        
        Y_pred = np.argmax(Y_pred, axis=-1)
        Y_test = np.argmax(Y_test, axis=-1)

        confusion_matrix1 = confusion_matrix(Y_test, Y_pred)

        TP += confusion_matrix1[0,0]
        FN += confusion_matrix1[0,1]
        FP += confusion_matrix1[1,0]
        TN += confusion_matrix1[1,1]
        
#         accuracy = accuracy_score(Y_test, Y_pred)    # accuracy
#         precision = precision_score(Y_test, Y_pred)  # precision
#         recall = recall_score(Y_test, Y_pred)        # recall
#         f1 = f1_score(Y_test, Y_pred)                # F1

#         print('confusion matrix\n', confusion_matrix1,
#               '\naccuracy ACC:', accuracy,
#               '\nprecision:', precision,
#               '\nrecall:', recall,
#               '\nF1:', f1,
#              )

#         y_predict = model.predict(X_test)

#         y_probs = model.predict_proba(X_test)  # prediction scores from the model
#         #print(y_probs)

#         fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_probs)
#         roc_auc = auc(fpr, tpr)  # AUC is the area under the ROC curve
#         # plot the ROC curve
#         plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
#         plt.legend(loc='lower right')
#         plt.plot([0, 1], [0, 1], 'r--')
#         plt.xlim([-0.1, 1.1])
#         plt.ylim([-0.1, 1.1])
#         plt.xlabel('False Positive Rate')  # the x axis is fpr
#         plt.ylabel('True Positive Rate')   # the y axis is tpr
#         plt.title('Receiver operating characteristic example')
#         plt.show()
        
        #model.save('./data_625/model_'+str(aa)+'.h5')
        print(aa)
        
        
        if aa == 1:
            Y_test_all = Y_test
            Y_pred_all = Y_pred
        else:
            Y_test_all = np.append(Y_test_all, Y_test, axis=0)
            Y_pred_all = np.append(Y_pred_all, Y_pred, axis=0) 

        aa += 1
        
        
    print('\n\nOverall confusion matrix')
    print(TP,FN)
    print(FP,TN)
    
    #print(Y_test_all[0])
            
    accuracy = accuracy_score(Y_test_all, Y_pred_all)    # accuracy
    precision = precision_score(Y_test_all, Y_pred_all)  # precision
    recall = recall_score(Y_test_all, Y_pred_all)        # recall
    f1 = f1_score(Y_test_all, Y_pred_all)                # F1
    
    MCC = matthews_corrcoef(Y_test_all,Y_pred_all) #MCC

   
    
    print('\nAccuracy (ACC):', accuracy,
          '\nPrecision:', precision,
          '\nRecall:', recall,
          '\nF1:', f1,
          '\nMCC:', MCC
         )
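# A short sketch of the label-format difference described in the comments at the
# top of this example (illustrative values; assumes eager TensorFlow 2.x):
import numpy as np
import tensorflow as tf

probs = tf.constant([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1]], dtype=tf.float32)
y_int = np.array([0, 1])                   # integer class indices
y_onehot = tf.one_hot(y_int, depth=3)      # the same labels, one-hot encoded

sparse = tf.keras.losses.sparse_categorical_crossentropy(y_int, probs)
dense = tf.keras.losses.categorical_crossentropy(y_onehot, probs)
# both evaluate to [-log(0.7), -log(0.8)]; only the label encoding differs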
    def find_best_params_GridSearchCV(self, gridsearch, X_local, y_local):
        """
        Find the best set of hyperparameters for one grid-search round,
        evaluated across several folds.
        """
        """ Generate all combinations of hyperparameters """
        hyperparam_space = list(ParameterGrid(gridsearch))

        if len(hyperparam_space) == 1:
            return hyperparam_space[0]
        """ Splits local folds """
        kf = KFold(n_splits=self.cross_validation)
        kf.get_n_splits(self.X_train)

        folds = []
        for train_index, test_index in kf.split(X_local):
            folds.append([pd.Series(train_index), pd.Series(test_index)])

        n_folds = len(folds)
        n_combinations = len(hyperparam_space)

        if self.verbose:
            print("Fitting %s models...\n" % n_combinations)
        """ Performs gridsearch """
        #Stores performance, Stores classification reports
        performance = []
        for params_it, params in enumerate(hyperparam_space):
            time_start = time.time()

            #Evaluation rounds
            local_results = []
            for fold_it, fold in enumerate(folds):
                X_train = X_local.iloc[fold[0]]
                X_test = X_local.iloc[fold[1]]
                y_train = y_local.iloc[fold[0]]
                y_test = y_local.iloc[fold[1]]

                params['eval_set'] = [(X_test, y_test)]

                alg = xgbw.XGBWrapper(**params)
                alg.fit(X_train, y_train)

                pred_test = alg.predict(X_test)

                local_report = class_report(y_true=y_test, y_pred=pred_test)

                local_results.append(local_report)

            #Stores performance evaluation given the performance-policy
            metric, statistic = self.performance_politic.split('__')
            local_performance = []

            for local_report in local_results:
                local_report = local_report.drop('Global')
                metric_results = local_report[metric]

                if statistic == 'min':
                    metric_stat = metric_results.min()
                elif statistic == 'max':
                    metric_stat = metric_results.max()
                elif statistic == 'mean':
                    metric_stat = metric_results.mean()

                local_performance.append(metric_stat)

            local_performance = pd.Series(local_performance)
            performance.append(local_performance)

            time_end = time.time()
            elapsed_time = (time_end - time_start)

            if self.gridsearch_verbose:
                msg = "%s of %s - %s: %s  - %s s" % (
                    (params_it + 1), n_combinations, self.performance_politic,
                    round(local_performance.mean(), 4), round(elapsed_time, 2))
                if self.verbose:
                    print(msg)

                for param_name in params.keys():
                    if param_name != 'eval_set':
                        msg = "\t%s: %r" % (param_name, params[param_name])
                        if self.verbose:
                            print(msg)
                if self.verbose:
                    print('')

        performance = pd.DataFrame(performance)

        mean_performance = performance.mean(axis=1)
        idx_best = mean_performance.idxmax()
        best_parameters = hyperparam_space[idx_best]

        return best_parameters
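# Note: ParameterGrid, used above to enumerate hyperparam_space, expands a dict
# of lists into every combination. A small, self-contained illustration (the
# parameter names below are made up):
from sklearn.model_selection import ParameterGrid

demo_grid = {'max_depth': [3, 5], 'learning_rate': [0.1, 0.3]}
demo_space = list(ParameterGrid(demo_grid))
# demo_space now holds 4 dicts, one per (learning_rate, max_depth) combination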
Ejemplo n.º 31
0
from random import shuffle
from torch.autograd import Variable
from sklearn.model_selection import KFold

# add manual_seed
torch.manual_seed(1)

# pair each text with its label so they can be shuffled and split together
shuffled = [[D.text_and_label[0][i], D.text_and_label[1][i]]
            for i in range(len(D.text_and_label[0]))]
shuffle(shuffled)

# definition for K-Folds

kf = KFold(n_splits=10)
kf.get_n_splits(shuffled)

scores = []
k = 0
print('Training Start ...')
for train_index, test_index in kf.split(shuffled):
    print('now : [%d] Fold' % k)
    CNN = Net.CnnNetwork()
    optimizer = optim.SGD(CNN.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.BCELoss()
    CNN.cuda()

    k = k + 1
    # @ Cross Validation : Train Part
    CNN.train()
    for epoch in range(10):
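# Note: the fragment above is cut off inside the training loop. One detail worth
# flagging: kf.split returns NumPy index arrays, and `shuffled` is a plain Python
# list, so it cannot be fancy-indexed directly. A minimal sketch of two common
# workarounds (the data below is made up):
import numpy as np
from sklearn.model_selection import KFold

pairs = [['text %d' % i, i % 2] for i in range(20)]   # stand-in (text, label) pairs
kf_demo = KFold(n_splits=10)

for train_index, test_index in kf_demo.split(pairs):
    train_fold = [pairs[i] for i in train_index]            # list comprehension
    test_fold = np.array(pairs, dtype=object)[test_index]   # or fancy-index an object array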
Ejemplo n.º 32
0
from sklearn.model_selection import KFold
from Models import naive_bayes, logistic_regression, knn, cart
from LoadDataset import load_dataset

data, target = load_dataset("peersim.csv")

kf = KFold(n_splits=10)
kf.get_n_splits(data)

print(kf)

for train_index, test_index in kf.split(data):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]
Ejemplo n.º 33
0
indices = np.column_stack(indices)
indices = pd.DataFrame(indices)
test = pd.read_csv(path+'test_start.csv')

test = pd.concat([test, indices], axis=1)
dtest = xgb.DMatrix(test)

# y_pred = model.predict(dtest)
#
# test = pd.read_csv(path+'test_start.csv')
# output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
# output.to_csv(path+'xgboost-depth{}-pca-ica.csv'.format(xgb_params['max_depth']), index=False)

n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(train)
# dtest = xgb.DMatrix(test)
predictions = np.zeros((test.shape[0], n_splits))
score = 0

oof_predictions = np.zeros(train.shape[0])
for fold, (train_index, test_index) in enumerate(kf.split(train)):
    X_train, X_valid = train.iloc[train_index, :], train.iloc[test_index, :]
    y_train, y_valid = y[train_index], y[test_index]

    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_valid, label=y_valid)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    model = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=False)
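# Note: this excerpt allocates `predictions` and `oof_predictions` but is cut off
# before they are filled. A self-contained sketch of the usual out-of-fold
# bookkeeping, with LinearRegression standing in for the xgboost model and
# synthetic arrays standing in for the train/test data:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
X_all, y_all = rng.randn(60, 4), rng.randn(60)
X_holdout = rng.randn(10, 4)

n_splits_demo = 5
kf_demo = KFold(n_splits=n_splits_demo)
oof = np.zeros(X_all.shape[0])
fold_preds = np.zeros((X_holdout.shape[0], n_splits_demo))

for fold, (tr_idx, va_idx) in enumerate(kf_demo.split(X_all)):
    reg = LinearRegression().fit(X_all[tr_idx], y_all[tr_idx])
    oof[va_idx] = reg.predict(X_all[va_idx])         # out-of-fold predictions
    fold_preds[:, fold] = reg.predict(X_holdout)     # per-fold test predictions

final_pred = fold_preds.mean(axis=1)                 # average the fold models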
# In[12]:

scores = []

# In[13]:


def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return abs(model.score(X_test, y_test))


# In[14]:

kf = KFold(n_splits=5)
kf.get_n_splits(X_train)

# In[15]:

for train_index, test_index in kf.split(df):
    x_train, x_test = df[train_index], df[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    scores.append(get_score(regressor, x_train, x_test, y_train, y_test))

# In[16]:

scores

# In[17]:
Ejemplo n.º 35
0
    clf = model()
    # train the model
    clf.fit(X_train, y_train)
    # to compute training error, first make predictions on training set
    y_hat_train = clf.predict(X_train)
    # then compare our prediction with true labels using the metric
    training_error = r2_score(y_train, y_hat_train)


    # CROSS-VALIDATION ERROR
    from sklearn.model_selection import KFold
    from numpy import zeros, mean
    # 3-fold cross-validation
    n = 3
    kf = KFold(n_splits=n)
    kf.get_n_splits(X_train)
    i=0
    scores = zeros(n)
    for train_index, test_index in kf.split(X_train):
        Xtr, Xva = X_train[train_index], X_train[test_index]
        Ytr, Yva = y_train[train_index], y_train[test_index]
        M = model()
        M.fit(Xtr, Ytr)
        Yhat = M.predict(Xva)
        scores[i] = r2_score(Yva, Yhat)
        print ('Fold', i+1, 'example metric = ', scores[i])
        i=i+1
    cross_validation_error = mean(scores)

    # Print results
    print("\nThe scores are: ")
 ## reshape again to (-1, size, size, 1); the trailing 1 is the single grayscale channel
 #########
 images = labeledData(data, files, sz)
 shuffle(images)
 img_data = np.array([i[0] for i in images]).reshape(-1,sz,sz,1)
 lbl_data = np.array([i[1] for i in images])
 #########
 
  
 kernel = 3
 strides = 2
 seed = 5
 
 #Split k-fold into desired number n_splits. 
 kfold = KFold(n_splits=12, shuffle=True, random_state=seed)
 kfold.get_n_splits(img_data)
 
 #Store-variables
 scores = []
 losses = []
 probabilities = []
 classes = []
 label_data_1D = []
 
 #Run validation
 #IMPORTANT NOTE: Validation will take a very long time. 
 #It may appear frozen but is in fact processing. 
 for train, test in kfold.split(img_data):
     #Use Keras Sequential and add convolution layers
     model = Sequential()