def train_dev_split_cv(df, k_folds=None, match_num=None):
    """
    Split a data frame into train/dev folds using grouped cross validation.

    Groups are taken from the ``PID`` column, so all rows of one
    participant land in the same fold.
    To-do: integrate more CV methods, e.g., loo, lop, etc
    :param df: data frame with a ``target`` column and a ``PID`` column
    :param k_folds: number of folds (falls back to 5 when falsy)
    :param match_num: unused; kept for interface compatibility
    :return: list of dicts with keys ``data_train``, ``data_test``,
             ``target_train`` and ``target_test``
    """
    features = df.iloc[:, df.columns != 'target']
    labels = df.target.to_frame()

    if not k_folds:
        print('Applied 5-fold cross validation by default')
        k_folds = 5

    # K Fold CV, grouped by participant id
    splitter = GroupKFold(n_splits=k_folds)
    splitter.get_n_splits(features, labels, groups=features.PID)

    folds = []
    for idx_train, idx_test in splitter.split(features, labels,
                                              groups=features.PID):
        folds.append({
            'data_train': features.iloc[idx_train],
            'data_test': features.iloc[idx_test],
            'target_train': labels.iloc[idx_train],
            'target_test': labels.iloc[idx_test],
        })
    return folds
# Example #2
def group_k_fold(make_model, feature_vector):
    """Run 5-fold grouped CV (grouped by PDB id) and collect out-of-fold
    predictions for every sample.

    ``make_model`` is a zero-argument factory returning a fresh, unfitted
    classifier for each fold. Per-fold precision/recall/F1 is printed.
    Returns a ClassificationResult over the full target vector.
    """
    splitter = GroupKFold(n_splits=5)
    splitter.get_n_splits(feature_vector.features, feature_vector.target,
                          feature_vector.pdb_ids)

    # out-of-fold prediction buffers, aligned with the target vector
    proba_out = np.zeros_like(feature_vector.target, dtype=np.float32)
    label_out = np.zeros_like(feature_vector.target, dtype=np.float32)

    for idx_train, idx_test in splitter.split(feature_vector.features,
                                              feature_vector.target,
                                              feature_vector.pdb_ids):
        X_tr = feature_vector.features[idx_train]
        X_te = feature_vector.features[idx_test]
        y_tr = feature_vector.target[idx_train]
        y_te = feature_vector.target[idx_test]

        clf = make_model()
        clf.fit(X_tr, y_tr)
        fold_pred = clf.predict(X_te)

        label_out[idx_test] = fold_pred
        proba_out[idx_test] = clf.predict_proba(X_te)[:, 1]

        print(precision_recall_fscore_support(y_te, fold_pred))

    return ClassificationResult(target=feature_vector.target,
                                predicted=label_out,
                                predicted_proba=proba_out)
# Example #3
def GroupKFold_Amir(input, n_splits):
    """Build a GroupKFold split generator for a landmarks dataset.

    Labels come from the ``KL`` column and groups from the ``ID`` column
    of ``input.landmarks_frame``. The fitted splitter is printed before
    the split generator is returned.
    """
    dataset = input
    labels = dataset.landmarks_frame.KL[:].reset_index(drop=True)
    groups = dataset.landmarks_frame.ID[:]
    splitter = GroupKFold(n_splits)
    splitter.get_n_splits(dataset, labels, groups)
    print(splitter)
    return splitter.split(dataset, labels, groups)
# Example #4
def group_test(X, y, model, groups):
    """Evaluate one classifier with 10-fold grouped cross validation.

    :param X: feature data frame, indexed positionally via ``iloc``
    :param y: labels, positionally indexable
    :param model: one of 'svm', 'rf_extra', 'rf', 'nb', 'lr', 'nn'
    :param groups: group labels keeping related rows in the same fold
    :return: mean accuracy over folds (NaN if ``model`` is unrecognised,
             matching the original silent-skip behaviour)
    """
    # Factories so a fresh, unfitted estimator is built for every fold.
    # Replaces the original if/elif chain and fixes the shadowed builtin
    # ``id`` / duplicated append loops.
    classifiers = {
        'svm': lambda: SVC(gamma='auto'),
        'rf_extra': lambda: ExtraTreesClassifier(n_estimators=100),
        'rf': lambda: RandomForestClassifier(n_estimators=100),
        'nb': lambda: GaussianNB(),
        'lr': lambda: LogisticRegression(solver='lbfgs'),
    }

    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []

    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = [X.iloc[i] for i in train_index]
        X_test = [X.iloc[i] for i in test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]

        if model == 'nn':
            # 'nn' uses its own train/score helper instead of an estimator
            acc_arr.append(nn(X_train, X_test, y_train, y_test))
        elif model in classifiers:
            clf = classifiers[model]().fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        # unknown model names are skipped, as before (np.mean([]) -> NaN)
    return np.mean(acc_arr)
# Example #5
def get_grouped_k_fold_splits(confused_list, not_confused_list, num_folds):
    """ Splits data ensuring no users have data in training and eval sets.

        Args:
            confused_list (list): list of data item names labelled as confused
            not_confused_list (list): list of data item names labelled as not_confused
            num_folds (int): number of folds for cross validation.

        Returns: (in following order)
            train_confused_splits (list): each element is a list containing the
                file names of the data items for this partition of the dataset
            test_confused_splits (list): as above
            train_not_confused_splits (list): as above
            test_not_confused_splits (list): as above
    """

    def _class_splits(item_list):
        """Grouped K-fold (train, test) partitions for one class's items.

        Extracted to remove the original copy-pasted duplication between
        the confused and not_confused branches.
        """
        # group (userID) per item, derived from the file-name prefix
        groups = [uid.split('_')[0][:-1] for uid in item_list]
        # labels are irrelevant for GroupKFold, but the API requires them
        dummy_y = [1] * len(item_list)
        gkf = GroupKFold(n_splits=num_folds)
        gkf.get_n_splits(X=item_list, y=dummy_y, groups=groups)
        train_splits, test_splits = [], []
        for train, test in gkf.split(X=item_list, y=dummy_y, groups=groups):
            train_splits.append([item_list[i] for i in train])
            test_splits.append([item_list[i] for i in test])
        return train_splits, test_splits

    train_confused_splits, test_confused_splits = _class_splits(confused_list)
    train_not_confused_splits, test_not_confused_splits = _class_splits(
        not_confused_list)

    return (train_confused_splits, test_confused_splits,
            train_not_confused_splits, test_not_confused_splits)
# Example #6
def group_test_3(pre_x, kmeans_labels, names, num_dic, groups, num_vars,
                 meta_i):
    """For each of ``meta_i`` k-means feature clusters, pick the member
    feature with the highest importance score, then evaluate that feature
    subset with a RandomForest under 10-fold grouped cross validation.

    :param pre_x: full feature matrix (samples x features)
    :param kmeans_labels: cluster id per feature column
    :param names: feature names, indexed like the columns of ``pre_x``
    :param num_dic: mapping feature index -> importance score
    :param groups: group labels for GroupKFold
    :param num_vars: number of feature columns to scan
    :param meta_i: number of clusters / features to select
    :return: (mean CV accuracy, list of chosen feature names)

    NOTE(review): ``y`` is neither a parameter nor a local — this relies
    on a module-level global ``y`` existing at call time; confirm against
    the full script.
    """
    chosen_vars = np.zeros(meta_i)
    chosen_values = np.zeros(
        meta_i) - 1  #subtract one so that decent negative values can be chosen
    # print('===')
    for j in range(meta_i):
        # old_val=np.inf*-1
        # print(j)
        for i in range(num_vars):  #clean this routine up? check for errors?
            clust = kmeans_labels[i]
            if clust == j:
                # print(names[i])
                new_val = num_dic[i]
                old_val = chosen_values[clust]
                if old_val < new_val:
                    # print(names[i],old_val,new_val)
                    # keep the highest-scoring feature seen for this cluster
                    chosen_vars[clust] = int(i)
                    chosen_values[clust] = new_val
    # print(chosen_vars)
    # print(type(chosen_vars))
    chosen_works = []
    chosen_names = []
    for qq in list(chosen_vars):
        chosen_names.append(names[int(qq)])
        chosen_works.append(int(qq))
    # restrict the feature matrix to the selected columns
    X = pre_x[:, chosen_works]
    group_kfold = GroupKFold(n_splits=10)  #make new func
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        # print(train_index)
        # print(test_index)
        for id in train_index:
            X_train.append(X[id])
        for id in test_index:
            X_test.append(X[id])
        for id in train_index:
            y_train.append(y[id])
        for id in test_index:
            y_test.append(y[id])
        clf = RandomForestClassifier().fit(X_train, y_train)
        tmp_score = clf.score(X_test, y_test)
        acc_arr.append(tmp_score)
    return np.mean(acc_arr), chosen_names
def split_data(input_file, output_dir, seed, n_folds):
    """Write grouped K-fold train/test TSV files, one folder per fold.

    Rows are grouped by ``hearing_id`` so one hearing never spans both
    the train and test files of a fold. ``COLUMN_NAMES`` selects the
    columns written out.
    """
    df = pd.read_csv(input_file, sep='\t', dtype=str)

    # shuffle rows of dataframe several times
    for _ in range(5):
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # map each hearing_id to a numeric group index
    hearing_to_num = {
        hearing_id: idx
        for idx, hearing_id in enumerate(df['hearing_id'].unique())
    }
    df['hearing_num'] = df['hearing_id'].map(hearing_to_num)
    group_idxs = df['hearing_num'].values

    outer_cv = GroupKFold(n_splits=n_folds)
    # split row indices into K partitions for the outer CV
    indeces = df.index.values
    for i, (train_index, test_index) in enumerate(
            outer_cv.split(indeces, indeces, groups=group_idxs)):
        print('Fold: ', str(i), '/', str(outer_cv.get_n_splits() - 1))
        fold_dir = os.path.join(output_dir, 'fold' + str(i))
        Path(fold_dir).mkdir(parents=True, exist_ok=True)

        df.loc[train_index][COLUMN_NAMES].to_csv(
            os.path.join(fold_dir, 'train.tsv'), sep='\t', index=False)
        df.loc[test_index][COLUMN_NAMES].to_csv(
            os.path.join(fold_dir, 'test.tsv'), sep='\t', index=False)
def kfold_holdout(X, y, groups, splits=5):
    """Collect grouped K-fold train/validation slices into a Data object.

    Fold indices are shuffled in place before slicing so samples within
    a fold appear in random order.
    """
    splitter = GroupKFold(n_splits=splits)
    splitter.get_n_splits(X, y, groups)

    d_obj = Data(splits=splits, holdout=False)

    for idx_train, idx_val in splitter.split(X, y, groups):
        # in-place shuffling of the fold index arrays
        shuffle(idx_train)
        shuffle(idx_val)
        d_obj.Xs_train.append(X[idx_train])
        d_obj.Xs_val.append(X[idx_val])
        d_obj.ys_train.append(y[idx_train])
        d_obj.ys_val.append(y[idx_val])

    return d_obj
def plot_roc_with_cv(classifier, X, y, groups, cv=6):
    """
    Plot the ROC curve with k fold cross validation

    Fits ``classifier`` on each grouped fold, draws the per-fold ROC
    curves, the chance diagonal, and the mean ROC with a +/- 1 std band,
    then shows the figure. Returns nothing.

    :param classifier: estimator providing fit() and predict_proba()
    :param X: feature array, sliceable by fold indices
    :param y: binary labels (predict_proba column 1 is the positive class)
    :param groups: group labels for GroupKFold
    :param cv: number of folds
    """
    cv = GroupKFold(n_splits=cv)
    cv.get_n_splits(X, y, groups)
    plt.figure(figsize=(8,7))
    tprs = []
    aucs = []
    # common FPR grid every fold's TPR curve is interpolated onto
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train, test in cv.split(X, y, groups):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        # force each interpolated curve to start at (0, 0)
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        i += 1   
    # chance diagonal for reference
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    # force the mean curve to end at (1, 1)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    # +/- one standard deviation band around the mean TPR, clipped to [0, 1]
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, 
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.show()
# Example #10
def get_grouped_splits(confused_items, not_confused_items, k):
    """ Splits data ensuring no users have data in training and eval sets.

        Args:
            confused_items (list): data item names labelled as confused
            not_confused_items (list): data item names labelled as not_confused
            k (int): number of folds for cross validation.

        Returns: (in following order)
            train_confused_splits (list): each element is a list containing the
                file names of the data items for this partition of the dataset
            test_confused_splits (list): as above
            train_not_confused_splits (list): as above
            test_not_confused_splits (list): as above
    """

    # combined item list: confused first, then not_confused
    items = confused_items + not_confused_items
    # "group" (userID) per item, derived from the file-name prefix
    groups = [uid.split('_')[0][:-1] for uid in items]
    # class label per item: 0 = confused, 1 = not confused
    dummy_y = [0] * len(confused_items) + [1] * len(not_confused_items)

    train_confused_splits = []
    test_confused_splits = []
    train_not_confused_splits = []
    test_not_confused_splits = []

    gkf = GroupKFold(n_splits=k)
    gkf.get_n_splits(X=items, y=dummy_y, groups=groups)
    for train, test in gkf.split(X=items, y=dummy_y, groups=groups):
        train_confused_splits.append(
            [items[i] for i in train if dummy_y[i] == 0])
        test_confused_splits.append(
            [items[i] for i in test if dummy_y[i] == 0])
        train_not_confused_splits.append(
            [items[i] for i in train if dummy_y[i] == 1])
        test_not_confused_splits.append(
            [items[i] for i in test if dummy_y[i] == 1])

    return (train_confused_splits, test_confused_splits,
            train_not_confused_splits, test_not_confused_splits)
# Example #11
def kfold_holdout(X, y, groups, splits, holdout):
    """Build grouped K-fold train/validation folds, optionally reserving
    the first fold's test indices as a holdout set.

    :param X: feature array, index-sliceable (e.g. numpy array)
    :param y: labels aligned with ``X``
    :param groups: group labels keeping related samples in one fold
    :param splits: number of GroupKFold splits
    :param holdout: when True, the first fold becomes the holdout split
        and its test indices are removed from later training folds
    :return: populated ``Data`` object
    :raises ValueError: if ``holdout`` is neither True nor False
        (was: print + exit(), which hid the error from callers)
    """
    group_kfold = GroupKFold(n_splits=splits)
    group_kfold.get_n_splits(X, y, groups)

    d_obj = Data(splits=splits, holdout=holdout)
    holdout_test_index = set()

    for train_index, test_index in group_kfold.split(X, y, groups):
        # inplace shuffling of the fold index arrays
        shuffle(train_index)
        shuffle(test_index)
        # generate folds
        if holdout == True:
            if d_obj.X_test_holdout is None:
                # the first fold is reserved for testing only
                d_obj.X_train_holdout, d_obj.X_test_holdout = X[
                    train_index], X[test_index]
                d_obj.y_train_holdout, d_obj.y_test_holdout = y[
                    train_index], y[test_index]
                holdout_test_index = set(test_index)
            else:
                # drop holdout indices re-occurring in this train fold;
                # set membership replaces the original O(n*m) list scan
                train_index = [
                    x for x in train_index if x not in holdout_test_index
                ]
                d_obj.Xs_train.append(X[train_index])
                d_obj.Xs_val.append(X[test_index])
                d_obj.ys_train.append(y[train_index])
                d_obj.ys_val.append(y[test_index])

        elif holdout == False:
            d_obj.Xs_train.append(X[train_index])
            d_obj.Xs_val.append(X[test_index])
            d_obj.ys_train.append(y[train_index])
            d_obj.ys_val.append(y[test_index])
        else:
            raise ValueError(
                "holdout must be True or False, got %r" % (holdout,))

    return d_obj
# Example #12
def train_test_split_KFold(obj):
    """Reserve the final GroupKFold fold (grouped by MRN) as a hold-out
    set on ``obj`` and keep the remaining samples for cross validation.

    Mutates ``obj`` in place: sets the ``*_hold_out`` attributes and
    shrinks ``X``/``Y``/``MRNs``/``entryDates`` to the non-holdout rows.
    """
    from sklearn.model_selection import GroupKFold
    splitter = GroupKFold(n_splits=5)
    splitter.get_n_splits(obj.X, obj.Y, obj.MRNs.astype(int))

    for fold_no, (main_index, hold_out_index) in enumerate(
            splitter.split(obj.X, obj.Y, obj.MRNs.astype(int))):
        print(main_index)
        if fold_no == 4:  # only the last of the 5 folds is kept
            obj.X_hold_out = obj.X[hold_out_index, :]
            obj.Y_hold_out = obj.Y[hold_out_index]
            obj.hold_out_MRNs = obj.MRNs[hold_out_index]
            obj.hold_out_entryDates = obj.entryDates[hold_out_index]
            obj.hold_out_indices = hold_out_index

            obj.X = obj.X[main_index, :]
            obj.Y = obj.Y[main_index]
            obj.MRNs = obj.MRNs[main_index]
            obj.entryDates = obj.entryDates[main_index]
            obj.cv_indices = main_index
# Example #13
    def create_training_files(data_path, num_folds, training_folder,
                              full_train_path):
        """
        Load manually labelled tokens from an Excel sheet and write tab
        delimited files for training a Stanford CRF NER model.
        Blank rows import as NaN, which act as blank separators between
        "documents" in the training files.
        Writes one folder per CV fold (train.tsv / test.tsv / labels.tsv)
        under ``training_folder``, grouped by the OG_Text column, plus
        the full training file at ``full_train_path``.
        """
        df = pd.read_excel(data_path, sheet_name='Tokens')
        splitter = GroupKFold(n_splits=num_folds)
        splitter.get_n_splits()

        def _dump(frame, path, columns):
            # shared TSV writer: selected columns, no index, no header
            frame.to_csv(path, columns=columns, sep='\t', index=False,
                         header=False)

        for fold, (train_idx, test_idx) in enumerate(
                splitter.split(df.Token, df.Label, df.OG_Text), start=1):
            cv_folder = os.path.join(training_folder, f'fold_{fold}')
            os.makedirs(cv_folder, exist_ok=True)
            _dump(df.iloc[train_idx], os.path.join(cv_folder, 'train.tsv'),
                  ['Token', 'Label'])
            _dump(df.iloc[test_idx], os.path.join(cv_folder, 'test.tsv'),
                  ['Token'])
            _dump(df.iloc[test_idx], os.path.join(cv_folder, 'labels.tsv'),
                  ['Token', 'Label'])

        _dump(df, full_train_path, ['Token', 'Label'])
# Example #14
# NOTE(review): `d` is defined above this excerpt — presumably a pandas
# DataFrame whose first column is the label; confirm in the full script.
d = d.drop(["speaker"], axis=1)
# column 0 -> labels, remaining columns -> features
X, y = d.iloc[:, 1:].values, d.iloc[:, 0].values
# 1 Layer


def flatten(mylist): return [item for sublist in mylist for item in sublist]


# Accumulators for per-fold ROC statistics of model 1.
cmodel1_tprs = []
cmodel1_aucs = []
cmodel1_resultsA = []
cmodel1_resultsB = []
mean_fpr = np.linspace(0, 1, 100)

# Grouped 5-fold CV; `speaker` holds the per-sample group label
# (defined above this excerpt — verify against the full script).
group_kfold = GroupKFold(n_splits=5)
group_kfold.get_n_splits(X, y, speaker)
print(group_kfold,)
model1_cvscores = []
model1_history_main = []  # This will save the results from all cross-validations

# SGD with Nesterov momentum for the Keras model below.
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

# MODEL 1
MODEL_NO = "1"


# Feed-forward binary classifier: 24 inputs -> 300 -> 300 -> 1 (sigmoid).
model1 = Sequential()
model1.add(Dense(300, input_dim=24, activation='relu'))
model1.add(Dense(300, activation='relu'))
model1.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

# NOTE(review): X/y are re-assigned here from a CSV, discarding the values
# set above — looks like two pasted experiments; confirm intent.
X, y, names, groups, _ = ml_data_parser('30_data.csv')
print('i,random_forest,svm,naiive_bayes,logistic_reg,extra_random_forest')
cv_num = 10

# Sweep over feature-count prefixes: evaluate using the first i features.
for i in range(1, len(names)):
    new_x = X[:, :i]
    # f=[np.shape(new_x)[0]]
    # f.append(np.shape(new_x)[1])
    # for j in range(i):
    # f.append(names[j])
    # print(','.join(map(str,f)))
    # x_train,x_test,y_train,y_test=train_test_split(new_x,y,test_size=.33)
    group_kfold = GroupKFold(n_splits=cv_num)
    group_kfold.get_n_splits(new_x, y, groups)
    xlen = len(names)
    rfacc = []
    svmacc = []
    nbacc = []
    lracc = []
    extrarfacc = []
    for train_index, test_index in group_kfold.split(new_x, y, groups):
        x_train = []
        x_test = []
        y_train = []
        y_test = []
        # print(train_index)
        # print(test_index)
        # NOTE(review): this excerpt is truncated — the loop body
        # continues beyond what is shown here.
        for id in train_index:
            x_train.append(new_x[id])
# Example #16
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(features_train)

# Apply the train-set scaling to both splits.
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
y_train = labels_train.flatten()
y_test = labels_test.flatten()

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix 
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
# Grouped 4-fold CV; `groups_train` is defined above this excerpt (verify).
group_kfold = GroupKFold(n_splits=4)
group_kfold.get_n_splits(X_train, y_train, groups_train)

# MLP hyper-parameter grid: SGD (with/without momentum), Adam, LBFGS.
tuned_parameters = [{'solver': ['sgd'], 'momentum': [0.3,0.6,0.9],
           'learning_rate_init': [0.01,0.02,0.05,0.1,0.2,0.5],'nesterovs_momentum': [False,True],
           'learning_rate': ['constant','invscaling','adaptive']},
                    {'solver': ['sgd'], 'momentum': [0],
           'learning_rate_init': [0.01,0.02,0.05,0.1,0.2,0.5]},
                    {'solver': ['adam'],
           'learning_rate_init': [0.01,0.02,0.05,0.1,0.2,0.5]},
                    {'solver': ['lbfgs']}]

scores = ['precision', 'recall', 'f1']

# One tuning run per scoring metric (excerpt truncated below).
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
# Example #17
# Columns 0-48 are features; column 49 holds the raw label strings.
toronto.X = toronto.df.iloc[:, 0:49].values
toronto.Y = toronto.df.iloc[:, 49].values
# Normalise label strings: 'F 4' -> 4, 'F 1' -> 0, 'F 0' -> 0, cast to int.
toronto.Y = nd.replace(
    nd.replace(nd.replace(toronto.Y.astype(str), 'F 4', '4'), 'F 1', '0'),
    'F 0', '0').astype(int)
toronto.MRNs = toronto.df.iloc[:, 51]
toronto.entryDates = toronto.df.iloc[:, 52]
toronto.split = 'groupKFold'  # KFold # groupKFold
dft = toronto.df

from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold

# Group by patient MRN so one patient never spans train and test.
kf = GroupKFold(n_splits=10)
normalKF = KFold(n_splits=10, shuffle=True, random_state=0)
kf.get_n_splits(toronto.X, toronto.Y, toronto.MRNs.astype(int))

# SVM hyper-parameters (this excerpt is truncated inside rfcObj.params).
svmObj.params = {
    'method': 'label',
    'threshold': 0.40,
    'C': 0.5,
    'gamma': 'auto',
    'kernel': 'rbf',
    'degree': 3,
    'coef0': 0.5,
    'shrinking': True,
    'tol': 0.001
}
rfcObj.params = {
    'n_estimators': 100,
    'criterion': 'entropy',
# Example #18
def main():
    """Train a maskNet segmentation model with 4-fold grouped CV.

    Loads 2-D tumour slices and masks from an HDF5 dataset, splits the
    samples by case id (GroupKFold, so one case never spans train and
    test), and trains one Keras model per fold with checkpointing, early
    stopping, CSV logging and LR-on-plateau reduction. Command-line
    options control epochs, batch size, learning rate, patience,
    verbosity and resuming from saved weights.
    """

    parser = OptionParser()

    parser.add_option("-l", "--load_model", dest="loadModel", default=False)
    parser.add_option("-c",
                      "--continue",
                      dest="continueTraining",
                      action='store_true',
                      default=False)

    parser.add_option("--vb",
                      dest="verbose",
                      action='store_true',
                      default=False)

    parser.add_option("-e", "--epochs", dest="epochs", default=100)
    parser.add_option("--bs", "--batchSize", dest="batchSize", default=5)
    parser.add_option("--lr",
                      "--learning_rate",
                      dest="learningRate",
                      default=0.001)
    parser.add_option("-p", "--patience", dest="patience", default=70)

    (options, args) = parser.parse_args()

    # database to use
    dbPath = '../dbHdf5/dataset1_2d_onlyTumor_cropped_x-75-425_y-75-425.hdf5'

    modelDir = '../models'  # where models are saved
    modelArch = 'maskNet002'  # model architecture to use from modelLib.py
    modelName = 'maskNet002_007'  # name to save model with

    epochs = int(options.epochs)
    batchSize = int(options.batchSize)

    modelFolder = os.path.join(modelDir, modelName)
    weightsFolder = os.path.join(modelFolder, "weights")
    ensureDir(weightsFolder)

    notes = "Model trained on augmented data (hor flip, ver flip and elastic). Using Dice Coeff Loss"

    # record the run configuration next to the model
    # (note: `df` here is a file handle, not a DataFrame)
    with open(os.path.join(modelFolder, "trainingData.txt"), "w") as df:
        df.write("Dataset\t%s\n" % dbPath)
        df.write("Architecture\t%s\n" % modelArch)
        df.write("Batch Size\t%s\n" % batchSize)
        df.write("Notes\t%s\n" % notes)

    # load slices, masks and case ids fully into memory, then close the db
    db = h5py.File(dbPath, 'r')
    X = db['slice'][...]
    X = np.float32(X)
    X = np.expand_dims(X, -1)  # add channel dimension

    Y = db['mask'][...]
    Y = np.expand_dims(Y, -1)  # add channel dimension
    Y = np.float32(Y)

    cases = db['case'][...]

    db.close()

    # group folds by case id so a case never spans train and test
    group_kfold = GroupKFold(n_splits=4)
    group_kfold.get_n_splits(X, Y, cases)

    kdx = 0  # 1-based fold counter
    for train_index, test_index in group_kfold.split(X, Y, cases):
        kdx += 1
        X_train = X[train_index]
        Y_train = Y[train_index]
        X_test = X[test_index]
        Y_test = Y[test_index]

        # append this fold's case membership to the run record
        with open(os.path.join(modelFolder, "trainingData.txt"), "a") as df:
            df.write("\nTraining Cases for CV-%d (%d)\t" %
                     (kdx, len(train_index)))
            df.write("\t".join(np.unique(cases[train_index])))
            df.write("\n")
            df.write("Test Cases for CV-%d (%d)\t" % (kdx, len(test_index)))
            df.write("\t".join(np.unique(cases[test_index])))
            df.write("\n")

        bestModelPath = os.path.join(weightsFolder,
                                     "best_fold_%02d.hdf5" % kdx)
        ensureDir(bestModelPath)

        # creating model
        model = makeModel(modelArch, verbose=options.verbose)
        model.save(os.path.join(modelFolder, modelName + '.h5'))

        adam = Adam(lr=float(options.learningRate),
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=1e-06,
                    decay=0.00001)

        model.compile(loss=[customLoss.dice_coef_loss], optimizer=adam)

        # loading model
        if options.loadModel:
            print("\n\nLoading Model Weights:\t %s" % modelName)
            model = load_model(bestModelPath)
            # resume epoch count from the number of logged rows
            log = np.genfromtxt(os.path.join(modelFolder,
                                             modelName + '_trainingLog.csv'),
                                delimiter=',',
                                dtype=str)[1:, 0]
            epochStart = len(log)
        else:
            epochStart = 0

        print("\nCross Validation Fold : %02d \n" % kdx)
        # totalSamples = getSampleCount(dbPath,'slice')

        # trainGen = dataGenerator(dbPath,'slice','mask',batchSize,extendDim=True)
        # nTrainSamples = totalSamples

        # callbacks
        # check1: snapshot every improvement with epoch/loss in the name
        check1 = ModelCheckpoint(os.path.join(
            weightsFolder, modelName + "_fold_%02d" % kdx +
            "_{epoch:02d}-loss-{val_loss:.3f}.hdf5"),
                                 monitor='val_loss',
                                 save_best_only=True,
                                 mode='auto')
        # check2: always keep the single best weights for this fold
        check2 = ModelCheckpoint(bestModelPath,
                                 monitor='val_loss',
                                 save_best_only=True,
                                 mode='auto')
        check3 = EarlyStopping(monitor='val_loss',
                               min_delta=0.01,
                               patience=int(options.patience),
                               verbose=0,
                               mode='auto')
        check4 = CSVLogger(os.path.join(modelFolder,
                                        modelName + '_trainingLog.csv'),
                           separator=',',
                           append=True)
        check5 = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   patience=int(options.patience),
                                   verbose=1,
                                   mode='auto',
                                   epsilon=0.0001,
                                   cooldown=0,
                                   min_lr=1e-10)

        print("\nInitiating Training:\n")
        # trained_model = model.fit_generator(trainGen, steps_per_epoch=(nTrainSamples // batchSize), epochs=epochs, initial_epoch=epochStart,
        # 									callbacks=[check1,check2,check3,check4,check5], verbose=1)

        model.fit(X_train,
                  Y_train,
                  validation_data=(X_test, Y_test),
                  batch_size=batchSize,
                  epochs=epochs,
                  initial_epoch=epochStart,
                  callbacks=[check1, check2, check3, check4, check5],
                  verbose=1)

        # free this fold's arrays before the next fold is built
        del X_test
        del X_train
        del Y_test
        del Y_train
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

# Load pre-extracted action features (X), labels (y) and group ids (z).
with open('/home/kamer/notebooks/data/G9_data/action_data.pkl', 'rb') as f:
    X, y, z = cPickle.load(f)

# Quantile-normalise each 128-dim channel, preserving the (N, 8, 128) shape.
X_transformer = QuantileTransformer(output_distribution='uniform')
X = X_transformer.fit_transform(X.reshape(-1, 128)).reshape(-1, 8, 128)

from sklearn.model_selection import GroupKFold

group_kfold = GroupKFold(n_splits=5)
group_kfold.get_n_splits(X, y, z)

from sklearn.utils.class_weight import compute_class_weight

epochs = 50

all_preds = []
all_targets = []

# One freshly initialised model per fold (excerpt truncated mid-statement
# below at `cw = torch.Tensor(`).
for train_index, test_index in group_kfold.split(X, y, z):

    model = Arch2(in_channels=8, out_channels=6, gap_size=128)
    model.to(torch.device('cuda'))

    optimizer = AdamW(params=model.parameters(), lr=1e-4)  #2e-4

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    cw = torch.Tensor(
# Example #20
                 axis=1)

# NOTE(review): the call that `axis=1)` above closes was lost when this
# excerpt was extracted — its opening statement is missing.
# Build the response matrix: one row per data row, from columns 4 onward.
respList = np.array([list(data.iloc[0, 4:])])
for i in range(1, len(data)):
    respList = np.append(respList, [list(data.iloc[i, 4:])], axis=0)
print(respList.shape)

docs = list(utterances.values)
groups = data['ObsID']  # KFOLD ADDITION
group_kfold = GroupKFold(n_splits=5)  # KFOLD ADDITION

# vectorize bag of words
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(docs)
docs2 = vectorizer.transform(docs).toarray()
group_kfold.get_n_splits(docs2, respList, groups)  # KFOLD ADDITION

score_array = []  # KFOLD ADDITION
acc_array = []  # KFOLD ADDITION
roc_array = []  # KFOLD ADDITION

for train_index, test_index in group_kfold.split(docs2, respList,
                                                 groups):  # KFOLD ADDITION
    print("TRAIN:", train_index, "TEST:", test_index)  # KFOLD ADDITION
    X_train, X_test = docs2[train_index], docs2[test_index]  # KFOLD ADDITION
    y_train, y_test = respList[train_index], respList[
        test_index]  # KFOLD ADDITION
    print(X_train, X_test, y_train, y_test)  # KFOLD ADDITION

    # X_train, X_test, y_train, y_test = train_test_split(docs2, respList, test_size=0.2)
# Example #21
# Rank features with MultiSURF, tracking the single highest-scoring one.
fs = MultiSURF().fit(X, y)
ms_array = list(fs.feature_importances_)
feature_importance = {}
num_dic = {}
trans_x = np.transpose(X)
max_val = 0
for i in range(num_vars):
    feature_importance[names[i]] = ms_array[i]
    num_dic[i] = ms_array[i]
    if max_val < num_dic[i]:
        max_val = num_dic[i]
        best_feature = i
# Repeat 10 times: grouped 10-fold CV accuracy using only the best feature.
for a in range(10):
    x1 = X[:, best_feature].reshape(-1, 1)
    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(x1, y, groups)
    acc_arr = []
    # NOTE(review): split() is called on the full X while training uses
    # x1 — row indices still align, but this looks unintentional; verify.
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for id in train_index:
            X_train.append(x1[id])
        for id in test_index:
            X_test.append(x1[id])
        for id in train_index:
            y_train.append(y[id])
        for id in test_index:
            y_test.append(y[id])
        clf = RandomForestClassifier().fit(X_train, y_train)
def main():
    """Train one LightGBM regressor per coupling type with GroupKFold CV.

    For each coupling type in CTYPES: load the preprocessed train/test
    feature tables, merge in the scalar-coupling contribution targets,
    label-encode the categorical atom columns, drop type-dependent index
    columns, and hand everything to train_model_regression. Results are
    pickled under RESULT_PATH.

    Relies on module-level configuration (CTYPES, PARAMS, SEED, CPU, ITER,
    N_FOLD, DATA_PATH, ORIGIN_PATH, RESULT_PATH, opt) and helpers
    (reduce_mem_usage, label_encode, train_model_regression) defined
    elsewhere in this file.
    """
    #%%
    n_estimators_default = ITER
    n_fold = N_FOLD


    #%%
    for t in CTYPES:
        params = PARAMS[t]
        params['random_state'] = SEED
        params['num_threads'] = CPU

        # Train set
        X = pd.read_csv(DATA_PATH/'train'/f'{t}_full.csv', index_col=0)
        X = reduce_mem_usage(X)
        y_all = pd.read_csv(ORIGIN_PATH/'scalar_coupling_contributions.csv').drop('type', axis=1)
        y_all = reduce_mem_usage(y_all)
        X = X.merge(y_all, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
        # One candidate target per contribution term, plus their sum.
        ys = {
            'sum': X['scalar_coupling_constant'],
            'fc': X['fc'],
            'sd': X['sd'],
            'pso': X['pso'],
            'dso': X['dso'],
        }
        X = X.drop(['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'], axis=1)

        X_test = pd.read_csv(DATA_PATH/'test'/f'{t}_full.csv', index_col=0)
        X_test = reduce_mem_usage(X_test)

        # Concatenate train+test so categorical encodings share one vocabulary.
        X_all = pd.concat([X, X_test])
        cat_features = []
        for col in X_all.columns:
            if col[-5:] == '_atom' or col in ['atom_A', 'atom_B']:
                cat_features.append(col)
        print(cat_features)
        for col in cat_features:
            print(col)
            X_all[col] = label_encode(X_all[col])


        # Split the combined frame back into train and test parts.
        X = X_all.iloc[:len(X)]
        X_test = X_all.iloc[len(X):]
        del X_all; gc.collect()


        index_train = X['id']
        # Group by molecule so GroupKFold never splits a molecule across folds.
        groups = X['molecule_name']
        # 1J/2J/3J couplings carry different numbers of intermediate atoms,
        # hence the different sets of index columns to drop.
        if t[:2] == '1J':
            X_t = X.drop(['atom_index_0','atom_index_1','id', 'type', 'molecule_name'],axis=1)
        elif t[:2] == '2J':
            X_t = X.drop(['atom_index_0','atom_index_1','atom_index_A','id', 'type', 'molecule_name'],axis=1)
        elif t[:2] == '3J':
            X_t = X.drop(['atom_index_0','atom_index_1','atom_index_A','atom_index_B','id','type', 'molecule_name'],axis=1)

        index_test = X_test['id']
        if t[:2] == '1J':
            X_test_t = X_test.drop(['atom_index_0','atom_index_1','id', 'type', 'molecule_name'],axis=1)
        elif t[:2] == '2J':
            X_test_t = X_test.drop(['atom_index_0','atom_index_1','atom_index_A','id', 'type', 'molecule_name'],axis=1)
        elif t[:2] == '3J':
            X_test_t = X_test.drop(['atom_index_0','atom_index_1','atom_index_A','atom_index_B','id','type', 'molecule_name'],axis=1)


        params['categorical_feature'] = [X_t.columns.get_loc(x) for x in cat_features]

        for ytype, y_t in ys.items():
            # `res` is rebuilt each iteration, so every pickle below holds
            # exactly one (name, result_dict) tuple.
            res = []
            # opt.icm selects the contribution targets; otherwise only 'sum'.
            if opt.icm and ytype == 'sum':
                continue
            elif not opt.icm and ytype != 'sum':
                continue

            # Split data
            folds = GroupKFold(n_splits=n_fold)
            folds.get_n_splits(X_t,y_t,groups)

            # Train!
            print(f'Starting {t} / {ytype}')
            print(f'Params:\n{params}')
            result_dict_lgb3 = train_model_regression(X=X_t, X_test=X_test_t, y=y_t,
                                                      params=params, folds=folds, model_type='lgb', eval_metric='mae',
                                                      plot_feature_importance=True, verbose=1000, early_stopping_rounds=200,
                                                      n_estimators=n_estimators_default, groups=groups,
                                                      feature_importance_path=f'results/{t}.png')
            if opt.icm:
                res.append((f'{t}_{ytype}', result_dict_lgb3))
                with open(RESULT_PATH/f'{t}_{ytype}.pkl', 'wb') as f:
                    pickle.dump(res, f)
            else:
                res.append((t, result_dict_lgb3))
                with open(RESULT_PATH/f'{t}.pkl', 'wb') as f:
                    pickle.dump(res, f)
Exemple #23
0
def main():
    """Train a per-coupling-type DNN regressor with GroupKFold CV.

    Loads the preprocessed train/test feature tables for CTYPE, one-hot
    encodes the categorical atom columns, standardizes features, then
    trains a Simple_NN per fold (RAdam + CyclicLR + early stopping).
    Out-of-fold predictions and the fold-averaged test predictions are
    pickled to '{CTYPE}_DNN.pkl'.

    Relies on module-level configuration (CTYPE, DATA_PATH, N_FOLD,
    BATCH_SIZE, HIDDEN_DIM, EPOCH, DEFAULT_LR, EARLY_STOPPING_ROUNDS) and
    helpers (reduce_mem_usage, label_encode, Simple_NN, RAdam, CyclicLR,
    EarlyStopping) defined elsewhere in this file.
    """
    #%%
    X = pd.read_csv(DATA_PATH / 'train' / f'{CTYPE}_full.csv', index_col=0)
    X = reduce_mem_usage(X)
    y = X['scalar_coupling_constant']
    X = X.drop(['scalar_coupling_constant'], axis=1)

    X_test = pd.read_csv(DATA_PATH / 'test' / f'{CTYPE}_full.csv', index_col=0)
    X_test = reduce_mem_usage(X_test)

    #%%
    X = X.fillna(0)
    X_test = X_test.fillna(0)

    # Concatenate train+test so encodings/dummies share one vocabulary.
    X_all = pd.concat([X, X_test])
    cat_features = []
    for col in X_all.columns:
        if col[-5:] == '_atom' or col in ['atom_A', 'atom_B']:
            cat_features.append(col)
    print(cat_features)
    for col in cat_features:
        print(col)
        X_all[col] = label_encode(X_all[col])

    print('dummie', X_all.shape)
    X_all = pd.get_dummies(X_all,
                           columns=cat_features,
                           drop_first=True,
                           dummy_na=True)
    print('->', X_all.shape)

    # Split the combined frame back into train and test parts.
    X = X_all.iloc[:len(X)]
    X_test = X_all.iloc[len(X):]
    del X_all
    gc.collect()

    index_train = X['id']
    # Group by molecule so GroupKFold never splits a molecule across folds.
    groups = X['molecule_name']
    # 1J/2J/3J couplings carry different numbers of intermediate atoms,
    # hence the different sets of index columns to drop.
    if CTYPE[:2] == '1J':
        X_t = X.drop(
            ['atom_index_0', 'atom_index_1', 'id', 'type', 'molecule_name'],
            axis=1)
    elif CTYPE[:2] == '2J':
        X_t = X.drop([
            'atom_index_0', 'atom_index_1', 'atom_index_A', 'id', 'type',
            'molecule_name'
        ],
                     axis=1)
    elif CTYPE[:2] == '3J':
        X_t = X.drop([
            'atom_index_0', 'atom_index_1', 'atom_index_A', 'atom_index_B',
            'id', 'type', 'molecule_name'
        ],
                     axis=1)
    # Alias the target: `y` is reused as a batch variable in the training
    # loops below, so fold indexing goes through y_t.
    y_t = y

    index_test = X_test['id']
    if CTYPE[:2] == '1J':
        X_test_t = X_test.drop(
            ['atom_index_0', 'atom_index_1', 'id', 'type', 'molecule_name'],
            axis=1)
    elif CTYPE[:2] == '2J':
        X_test_t = X_test.drop([
            'atom_index_0', 'atom_index_1', 'atom_index_A', 'id', 'type',
            'molecule_name'
        ],
                               axis=1)
    elif CTYPE[:2] == '3J':
        X_test_t = X_test.drop([
            'atom_index_0', 'atom_index_1', 'atom_index_A', 'atom_index_B',
            'id', 'type', 'molecule_name'
        ],
                               axis=1)

    # Standardize with train statistics only.
    sc = StandardScaler()
    X_t = sc.fit_transform(X_t)
    X_test_t = sc.transform(X_test_t)

    #%%
    folds = GroupKFold(n_splits=N_FOLD)
    folds.get_n_splits(X_t, y_t, groups)
    fold_split = folds.split(X_t, y_t, groups)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # for test set prediction
    X_test_t = torch.tensor(X_test_t, dtype=torch.float).to(device)
    test_ds = torch.utils.data.TensorDataset(X_test_t)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    # Out-of-fold predictions, fold-averaged test predictions, per-fold losses.
    oof = np.zeros(len(X_t))
    prediction = np.zeros(len(X_test_t))
    avg_losses = []
    avg_val_losses = []

    for fold_i, (train_idx, valid_idx) in enumerate(fold_split):
        print(f'Fold {fold_i + 1} started at {time.ctime()}')

        # dataset
        X_train = torch.tensor(X_t[train_idx.astype(int)],
                               dtype=torch.float).to(device)
        X_valid = torch.tensor(X_t[valid_idx.astype(int)],
                               dtype=torch.float).to(device)
        y_train = torch.tensor(np.array(y_t)[train_idx.astype(int),
                                             np.newaxis],
                               dtype=torch.float).to(device)
        y_valid = torch.tensor(np.array(y_t)[valid_idx.astype(int),
                                             np.newaxis],
                               dtype=torch.float).to(device)
        train_ds = torch.utils.data.TensorDataset(X_train, y_train)
        valid_ds = torch.utils.data.TensorDataset(X_valid, y_valid)

        train_loader = torch.utils.data.DataLoader(train_ds,
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_ds,
                                                   batch_size=BATCH_SIZE,
                                                   shuffle=False)

        # define model each fold
        model = Simple_NN(X_train.shape[1],
                          HIDDEN_DIM,
                          activation=nn.LeakyReLU())
        model.to(device)

        # Huber-style loss for training; plain L1 only as the reported metric.
        # criterion = nn.L1Loss()
        criterion = nn.SmoothL1Loss()
        mae = nn.L1Loss()

        step_size = 5
        base_lr, max_lr = DEFAULT_LR, 5 * DEFAULT_LR
        #     optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=max_lr)
        optimizer = RAdam(filter(lambda p: p.requires_grad,
                                 model.parameters()),
                          lr=max_lr)
        scheduler = CyclicLR(optimizer,
                             base_lr=base_lr,
                             max_lr=max_lr,
                             step_size=step_size,
                             mode='exp_range',
                             gamma=0.99994)

        early_stopping = EarlyStopping(patience=EARLY_STOPPING_ROUNDS,
                                       verbose=True)
        best_weight = {'epoch': None, 'state_dict': None}

        if torch.cuda.device_count() > 1:
            print('{} gpus found.'.format(torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        for epoch in range(EPOCH):
            start_time = time.time()
            model.train()
            avg_loss = 0.
            avg_mae = 0.

            # train  (note: the batch variable `y` shadows the target Series
            # loaded above; that is why fold slicing uses y_t instead)
            for batch_i, (x, y) in enumerate(train_loader):
                y_pred = model(x)
                if scheduler:
                    scheduler.batch_step()
                loss = criterion(y_pred, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                avg_loss += loss.item() / len(train_loader)

            # valid
            model.eval()

            oof_fold = np.zeros(X_valid.size(0))
            prediction_fold = np.zeros(len(X_test_t))
            avg_val_loss = 0.

            for batch_i, (x, y) in enumerate(valid_loader):
                y_pred = model(x).detach()
                loss = criterion(y_pred, y)
                metric = mae(y_pred, y)
                avg_val_loss += loss.item() / len(valid_loader)
                avg_mae += metric.item() / len(valid_loader)

            elapsed_time = time.time() - start_time

            if early_stopping(avg_val_loss, model):  # score updated
                print(
                    'Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t MAE={:.4f} \t time={:.2f}s'
                    .format(epoch + 1, EPOCH, avg_loss, avg_val_loss, avg_mae,
                            elapsed_time))
                best_weight['epoch'] = epoch
                best_weight['state_dict'] = model.state_dict()
            if early_stopping.early_stop:
                print("Early stopping!")
                break

        avg_losses.append(avg_loss)
        avg_val_losses.append(avg_val_loss)

        # predict: reload the best checkpoint, then re-run validation and
        # test loaders to fill this fold's oof / test predictions.
        print('best epoch for fold {} is {}'.format(fold_i + 1,
                                                    best_weight['epoch'] + 1))
        model.load_state_dict(best_weight['state_dict'])
        for batch_i, (x, _) in enumerate(valid_loader):
            y_pred = model(x).detach()
            oof_fold[batch_i * BATCH_SIZE:(batch_i + 1) *
                     BATCH_SIZE] = y_pred.cpu().numpy()[:, 0]
        for batch_i, (x, ) in enumerate(test_loader):
            y_pred = model(x).detach()
            prediction_fold[batch_i * BATCH_SIZE:(batch_i + 1) *
                            BATCH_SIZE] = y_pred.cpu().numpy()[:, 0]

        oof[valid_idx] = oof_fold
        # Average test predictions across folds.
        prediction += prediction_fold / N_FOLD

    # results
    overall_mae = mean_absolute_error(oof, y_t.values)
    overall_logmae = np.log(overall_mae)

    print(
        'Overall \t loss={:.4f} \t val_loss={:.4f} \t MAE={:.4f} \t logMAE={:.4f}'
        .format(np.average(avg_losses), np.average(avg_val_losses),
                overall_mae, overall_logmae))

    res = []
    res_dict = {'oof': oof, 'prediction': prediction}
    res.append((CTYPE, res_dict))

    with open(f'{CTYPE}_DNN.pkl', 'wb') as f:
        pickle.dump(res, f)
Exemple #24
0
# Minimal demo of three scikit-learn splitters on a toy dataset:
# 20 samples (rows of `a`), binary labels `b` (first 10 are 0, last 10 are 1).
import numpy as np
a = np.ones([5, 20])
b = np.zeros([1, 20])
b[0, 10:] = 1
a = a.T  # -> (20, 5): samples as rows
b = b.T  # -> (20, 1)

####################
# train_test_split
####################
x_train, x_test, y_train, y_test = train_test_split(a,
                                                    b,
                                                    test_size=0.2,
                                                    shuffle=False)
print("y_train")
print(y_train)
print("y_test")
print(y_test)
####################
# KFold
####################
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(a):
    print("train_index:", train_index, ",test_index:", test_index)
####################
# GroupKFold
####################
# BUG FIX: GroupKFold.split() requires a `groups` argument and raises
# ValueError when it is omitted, so the original `kf2.split(a, b)` crashed.
# Assign each block of 4 consecutive samples to one of 5 groups for the demo.
groups = np.repeat(np.arange(5), 4)
kf2 = GroupKFold(n_splits=5)
res2 = kf2.get_n_splits(a, b, groups)
for train_index, test_index in kf2.split(a, b, groups):
    print("train_index:", train_index, ",test_index:", test_index)
Exemple #25
0
# Instructional template: how to retrofit GroupKFold into an existing
# training script. NOTE(review): `score`, `acc` and `y_pred` referenced at
# the bottom must be produced by the user's own training code inserted into
# the loop body -- as written, this template does not run on its own.

# import these:
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold  # StratifiedGroupKFold could be used in the future

# after data processing, add these:
groups = data['ObsID']  # selects column to group by
group_kfold = GroupKFold(n_splits=5)  # set number of splits
group_kfold.get_n_splits(
    docs2, respList, groups
)  # split where docs2 = utterance values, respList = 7 classifier columns

# loop through each of the 5 splits:

score_array = []
acc_array = []
roc_array = []

for train_index, test_index in group_kfold.split(docs2, respList, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = docs2[train_index], docs2[test_index]
    y_train, y_test = respList[train_index], respList[test_index]
    print(X_train, X_test, y_train, y_test)

    # Remove your prior split ie X_train, X_test, y_train, y_test = train_test_split(docs2, respList, test_size=0.2)
    # Run prior training as before

    # Append new values in each loop
    score_array.append(score)  # KFOLD ADDITION
    acc_array.append(acc)  # KFOLD ADDITION
    roc_array.append(roc_auc_score(y_test, y_pred,
                                   multi_class='ovr'))  # KFOLD ADDITION
Exemple #26
0
def group_test_2(pre_x, kmeans_labels, names, num_dic, groups, num_vars,
                 meta_i):
    """Score a RandomForest on one representative feature per k-means cluster.

    For each of the ``meta_i`` clusters, keep the single feature with the
    largest importance score, then evaluate a RandomForestClassifier on that
    reduced feature set with 3-fold GroupKFold cross-validation.

    :param pre_x: feature matrix (n_samples x n_features), column-indexable
    :param kmeans_labels: cluster label of each of the num_vars features
    :param names: feature names, parallel to the feature columns
    :param num_dic: mapping of feature index -> importance score
    :param groups: per-sample group labels handed to GroupKFold
    :param num_vars: number of candidate features to scan
    :param meta_i: number of clusters (size of the selection)
    :return: (mean CV accuracy, list of chosen feature names)

    NOTE(review): the target ``y`` is read from an enclosing/global scope
    rather than passed in -- confirm it is defined wherever this is called.
    """
    # For each cluster keep the feature with the largest importance score.
    chosen_vars = np.zeros(meta_i)
    chosen_values = np.zeros(meta_i)
    for i in range(num_vars):
        clust = kmeans_labels[i]
        if chosen_values[clust] < num_dic[i]:
            chosen_vars[clust] = int(i)
            chosen_values[clust] = num_dic[i]

    chosen_works = [int(qq) for qq in chosen_vars]
    chosen_names = [names[idx] for idx in chosen_works]
    X = pre_x[:, chosen_works]

    # Group-aware 3-fold CV: samples of one group never span train/test.
    group_kfold = GroupKFold(n_splits=3)
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        # Comprehensions replace the original per-index append loops
        # (same contents, and no longer shadow the builtin ``id``).
        X_train = [X[idx] for idx in train_index]
        X_test = [X[idx] for idx in test_index]
        y_train = [y[idx] for idx in train_index]
        y_test = [y[idx] for idx in test_index]

        clf = RandomForestClassifier().fit(X_train, y_train)
        acc_arr.append(clf.score(X_test, y_test))
    return np.mean(acc_arr), chosen_names
Exemple #27
0
# Load pre-split train matrices for the current dataset.
# NOTE(review): `my_dat`, `grps`, `run_acc`, `predictions` and `y` (at the
# kappa line) are defined elsewhere in the original script; this scrap also
# appears to fuse two snippets (the fold loop below ends abruptly).
X_train = np.loadtxt("data/05_train_df/%s_x_train.csv" % my_dat,
                     skiprows=1,
                     delimiter=",")
Y_train = np.loadtxt("data/05_train_df/%s_y_train.csv" % my_dat,
                     skiprows=1,
                     delimiter=",")

# do a different split to try these
X = X_train
y = Y_train
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify = y, random_state=28)
fold_brk = pd.read_csv("data/05_train_df/%s_folds.csv" % my_dat)

# Grouped 3-fold CV; note X_train/y_train are rebound to fold slices here.
group_kfold = GroupKFold(n_splits=3)
group_kfold.get_n_splits(X, y, grps)
for train_index, test_index in group_kfold.split(X, y, grps):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    print(X_train.shape)
    print(X_valid.shape)
    rf_acc = run_acc(RandomForestClassifier(), "RandomForest")
    #print(X_train, X_test, y_train, y_test)

# Alternative split driven by a precomputed partition column (folds 1..6).
for fold in range(1, 7):
    train_idx = [x != fold for x in fold_brk['partition'].tolist()]
    valid_idx = [x == fold for x in fold_brk['partition'].tolist()]

    X_train = X[train_idx, :]
    y_train = y[train_idx]
# And test those predictions
kappa = cohen_kappa_score(y, predictions)

# Print it up
print("model kappa using XGBClassifier: %.2f" % kappa)

####################################################################################################
# Question 7
####################################################################################################

from sklearn.model_selection import GroupKFold

# split our data
gkf = GroupKFold(n_splits=10)
# NOTE(review): GroupKFold.get_n_splits ignores its arguments and simply
# returns n_splits, so the literal 10 passed here has no effect.
gkf.get_n_splits(10)

# Create a list of unique users and their indices: each student is tagged
# with the row index of their first occurrence, used as the group label.
group_dict = {}
groups = np.array([])

for index, row in df_dummies.iterrows():
    student_id = row['STUDENTID']
    if student_id not in group_dict:
        group_dict[student_id] = index
    groups = np.append(groups, group_dict[student_id])

# train and test all the data
kappa_sum = 0
print("Decision Tree")
for i, data_folds in enumerate(gkf.split(x, y, groups=groups)):
Exemple #29
0
# Standardize features with statistics from the training split only, then
# set up a 4-fold grouped CV for the grid search below.
# NOTE(review): features_train/features_test, labels_train/labels_test and
# groups_train are defined earlier in the original script.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(features_train)

X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
# flatten() turns (n, 1) label arrays into the 1-D shape sklearn expects.
y_train = labels_train.flatten()
y_test = labels_test.flatten()

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
group_kfold = GroupKFold(n_splits=4)
group_kfold.get_n_splits(X_train, y_train, groups_train)

tuned_parameters = [{
    'hidden_layer_sizes': [[6, 6, 6]]
}, {
    'hidden_layer_sizes': [[3, 3, 3]]
}, {
    'hidden_layer_sizes': [[5, 5, 5]]
}, {
    'hidden_layer_sizes': [[6, 6]]
}, {
    'hidden_layer_sizes': [[5, 5]]
}, {
    'hidden_layer_sizes': [[3, 3]]
}, {
    'hidden_layer_sizes': [[6, 5, 3]]
                y = np.hstack((y, i_class_label * np.ones(
                    (length(state_data), ), dtype='int')))
            # Update class label
            i_class_label += int(1)

        # Transpose
        x = x.transpose((1, 0))

        #%% Plot 3D backscatter values

        # Plot
        #labels_dict = None # dict((['live', 'defo'], ['live', 'defo']))
        #modalitypoints3d('reciprocity', x, y, labels_dict=labels_dict, title=dataset_use)

        #%% Classify
        group_kfold.get_n_splits(X=x, y=y, groups=groups)
        # Cross validate - kNN - All data
        knn_all = KNeighborsClassifier(n_neighbors=knn_k)
        knn_scores_all = cross_val_score(knn_all,
                                         x,
                                         y,
                                         groups=groups,
                                         cv=crossval_use)
        #knn_scores_all = cross_val_score(knn_all, x, y, cv=crossval_kfold)
        #print('kNN - ' + dataset_use + ' :')
        #print(np.mean(knn_scores_all))
        knn_mean_acc[dataset_use] = np.mean(knn_scores_all)
        knn_all_acc[dataset_use] = knn_scores_all

        rf_all = RandomForestClassifier(n_estimators=rf_ntrees, random_state=0)
        rf_scores_all = cross_val_score(rf_all,