    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            print('Need scikit-learn for this functionality')
            raise

        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        X = torch.randn(self.class_vector.size(0), 2).numpy()
        y = self.class_vector.numpy()
        s.get_n_splits(X, y)

        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])
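
This method usually belongs to a custom PyTorch sampler. Below is a minimal, self-contained sketch of a plausible enclosing class; the class name, constructor, and the __iter__/__len__ methods are assumptions added for illustration, not taken from the snippet above.

import numpy as np
import torch
from torch.utils.data.sampler import Sampler
from sklearn.model_selection import StratifiedShuffleSplit


class StratifiedSampler(Sampler):
    """Sketch: yields dataset indices produced by one stratified shuffle split."""

    def __init__(self, class_vector, n_splits=1):
        self.class_vector = class_vector  # 1-D tensor of class labels
        self.n_splits = n_splits

    def gen_sample_array(self):
        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        X = torch.randn(self.class_vector.size(0), 2).numpy()  # dummy features; only y matters
        y = self.class_vector.numpy()
        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])  # a stratified shuffle of all indices

    def __iter__(self):
        return iter(self.gen_sample_array())

    def __len__(self):
        return len(self.class_vector)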
Example No. 2
    def generate_balanced_splits(cls, samples, labels):
        # random_state=None gives a different shuffle on every call; pass an integer seed for reproducibility
        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=test_set_percentage,  # expected from the enclosing scope
                                     random_state=None)
        sss.get_n_splits(samples, labels)
        for train_index, test_index in sss.split(samples, labels):
            train_set = samples[train_index]
            train_labels = labels[train_index]
            test_set = samples[test_index]
            test_labels = labels[test_index]

        return train_set, train_labels, test_set, test_labels
Example No. 3
def doExp(datasetPath,
          epsilon,
          varianceRatio,
          n_trails,
          numOfDimensions,
          logPath,
          isLinearSVM=True):
    if os.path.basename(datasetPath).endswith('npy'):
        data = np.load(datasetPath)
    else:
        #data = np.loadtxt(datasetPath, delimiter=",");
        data = pd.read_csv(datasetPath, delimiter=",", header=None).values
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data[:, 1:])
    globalPCA = PCAImpl(data_std)

    numOfFeature = data.shape[1] - 1
    largestReducedFeature = globalPCA.getNumOfPCwithKPercentVariance(
        varianceRatio)
    print "%d/%d dimensions captures %.2f variance." % (
        largestReducedFeature, numOfFeature, varianceRatio)
    xDimensions = None
    if numOfDimensions > numOfFeature:
        xDimensions = np.arange(1, numOfFeature)
        largestReducedFeature = numOfFeature
    else:
        xDimensions = np.arange(
            1, largestReducedFeature,
            max(largestReducedFeature // numOfDimensions, 1))

    cprResult = []
    rs = StratifiedShuffleSplit(n_splits=n_trails,
                                test_size=.2,
                                random_state=0)
    rs.get_n_splits(data[:, 1:], data[:, 0])

    for train_index, test_index in rs.split(data[:, 1:], data[:, 0]):
        trainingData = data[train_index]
        testingData = data[test_index]

        tmpResult = singleExp(xDimensions, trainingData, testingData,
                              largestReducedFeature, epsilon, isLinearSVM)
        with open(logPath, "a") as f:
            np.savetxt(f, tmpResult, delimiter=",", fmt='%1.3f')
        cprResult.append(tmpResult)

    cprResult = np.vstack(cprResult)
    for result in cprResult:
        print(','.join(['%.3f' % num for num in result]))

    return cprResult
Example No. 4
def split(dpath, proc_data_path):
    make_dir(proc_data_path + 'models')
    make_dir(proc_data_path + 'data/test')
    make_dir(proc_data_path + 'data/train/0')  #not
    make_dir(proc_data_path + 'data/val/0')  #not
    make_dir(proc_data_path + 'data/train/1')  #open
    make_dir(proc_data_path + 'data/val/1')  #open
    make_dir(proc_data_path + 'data/train/2')  #checked
    make_dir(proc_data_path + 'data/val/2')  #checked

    imgs_path = glob.glob(dpath + "/**/*.png", recursive=True)
    labels = []
    for img in imgs_path:
        if img.find("not") != -1:
            labels.append(0)
        elif img.find("open") != -1:
            labels.append(1)
        else:
            labels.append(2)
    imgs_path = np.array(imgs_path)
    labels = np.array(labels)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    sss.get_n_splits(imgs_path, labels)
    for train_index, test_index in sss.split(imgs_path, labels):
        X_train, X_test = imgs_path[train_index], imgs_path[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        t_cnt_per_cls = int((y_test.shape[0] / 3) / 2)

        tests = list(
            np.concatenate([(X_train[y_train == 0])[0:t_cnt_per_cls],
                            (X_train[y_train == 1])[0:t_cnt_per_cls],
                            (X_train[y_train == 2])[0:t_cnt_per_cls]]))

        for img_i, img in enumerate(X_train):
            img_name = (img.strip().replace("\\", "/").split("/"))[-1]
            if img_name[0] == '_':
                continue
            if img in tests:
                dst = proc_data_path + 'data/test/' + str(
                    img_i) + "_" + img_name
            else:
                dst = proc_data_path + 'data/train/' + str(
                    y_train[img_i]) + "/" + str(img_i) + "_" + img_name
            copyfile(img, dst)
        for img_i, img in enumerate(X_test):
            img_name = (img.strip().replace("\\", "/").split("/"))[-1]
            if img_name[0] == '_':
                continue
            dst = proc_data_path + 'data/val/' + str(
                y_test[img_i]) + "/" + str(img_i) + "_" + img_name
            copyfile(img, dst)
Example No. 5
    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            print('Need scikit-learn for this functionality')
            raise
        import numpy as np
        
        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        X = th.randn(self.class_vector.size(0),2).numpy()
        y = self.class_vector.numpy()
        s.get_n_splits(X, y)

        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])
Example No. 6
def main(_):
    paths, labels = None, None
    dirname, _ = ospath.split(ospath.abspath(__file__))

    try:
        data_dir = dirname + '/../../data/cells'
        paths, labels = import_data(data_dir=data_dir,
                                    in_memory=False,
                                    extension=args.extension)

        monitored_data, monitored_label, unmonitored_data = split_mon_unmon(
            paths, labels)
        monitored_data, monitored_label, unmonitored_data = np.array(
            monitored_data), np.array(monitored_label), np.array(
                unmonitored_data)

        helpers.shuffle_data(unmonitored_data)
        unmon_train, unmon_test = unmonitored_data[:int(
            (1 - TEST_SIZE) * len(unmonitored_data))], unmonitored_data[int(
                (1 - TEST_SIZE) * len(unmonitored_data)):]

        sss = StratifiedShuffleSplit(n_splits=1,
                                     test_size=TEST_SIZE,
                                     random_state=123)
        sss.get_n_splits(monitored_data, monitored_label)

        for train_index, test_index in sss.split(monitored_data,
                                                 monitored_label):
            X_train, X_test = monitored_data[train_index], monitored_data[
                test_index]
            y_train, y_test = monitored_label[train_index], monitored_label[
                test_index]

            X_train = np.append(X_train, unmon_train)
            X_test = np.append(X_test, unmon_test)

            y_train = np.append(y_train, [-1] * len(unmon_train))
            y_test = np.append(y_test, [-1] * len(unmon_test))

            store_data(X_test, 'X_test')
            store_data(y_test, 'y_test')

            stdout.write("Training on data...\n")
            run_model(X_train, in_memory=False)
            stdout.write("Finished running model.")
            break

    except KeyboardInterrupt:
        stdout.write("Interrupted, this might take a while...\n")
        exit(0)
Example No. 7
    def ratio_data_loader(self):
        """

        :return: train_data and test_data updated
        """
        test_size = 0.1
        num_sol = 100
        num_of_features = 200

        pair_num = int(self.full_data[0].shape[0] /
                       num_sol)  # number of groups, each containing num_sol solutions

        X_TR = []
        X_TS = []
        Y_TR = []
        Y_TS = []

        dataset_X = self.full_data[0].reshape(pair_num, num_sol,
                                              num_of_features)
        dataset_Y = self.full_data[1].reshape(pair_num, num_sol)
        for idx, pair_X in enumerate(dataset_X):
            pair_Y = dataset_Y[idx]
            stratSplit = StratifiedShuffleSplit(n_splits=1,
                                                test_size=test_size,
                                                random_state=42)
            stratSplit.get_n_splits(pair_X, pair_Y)
            for train_idx, test_idx in stratSplit.split(pair_X, pair_Y):
                X_train = pair_X[train_idx]
                Y_train = pair_Y[train_idx]

                X_test = pair_X[test_idx]
                Y_test = pair_Y[test_idx]

                X_TR.append(X_train)
                X_TS.append(X_test)
                Y_TR.append(Y_train)
                Y_TS.append(Y_test)
        X_TR = np.array(X_TR).reshape(-1, num_of_features)
        X_TS = np.array(X_TS).reshape(-1, num_of_features)
        Y_TR = np.array(Y_TR).reshape(-1, 1)
        Y_TS = np.array(Y_TS).reshape(-1, 1)

        self.train_data = (X_TR, Y_TR)
        self.test_data = (X_TS, Y_TS)

        print("train data shape: ", X_TR.shape)
        print("test data shape: ", X_TS.shape)
Example No. 8
def CrossValidation(model):
    S = 5
    sss = StratifiedShuffleSplit(n_splits=S, test_size=0.3)
    sss.get_n_splits(x, y)
    f, acc = 0, 0
    for train_index, test_index in sss.split(x, y):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        print(x_train.shape, x_test.shape)
        model.fit(x_train, y_train)
        ypred = np.where(model.predict(x_test) > 0.5, 1, 0)
        f += f1_score(y_test, ypred) / S
        acc += accuracy_score(y_test, ypred) / S
    print(model)
    print(f, acc)
Example No. 9
    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ModuleNotFoundError:
            print('Need scikit-learn for this functionality')
            raise
        except Exception:
            print('There is a problem with your scikit-learn installation')
            raise

        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        X = th.randn(self.class_vector.size(0),2).numpy()
        y = self.class_vector.numpy()
        s.get_n_splits(X, y)

        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])
Example No. 10
def get_stratified_sample(X,y,verbose=True,test_size=.2):
    """
    return stratified sampled X and y
    X : x matrix(input)
    y : y matrix(output)
    test_size : fration of total data in test set
    """
    sss = StratifiedShuffleSplit(n_splits=10, test_size=test_size, random_state=0)
    sss.get_n_splits(X, y)
    print(sss)       
    for train_index, test_index in sss.split(X, y):
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return [X_train,X_test,y_train,y_test]
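
A short, hedged usage sketch of the function above with made-up data; it assumes numpy and StratifiedShuffleSplit are already imported at module level, as the function body requires.

import numpy as np

X_demo = np.random.randn(50, 3)               # made-up features
y_demo = np.array([0] * 25 + [1] * 25)        # made-up balanced labels
X_tr, X_te, y_tr, y_te = get_stratified_sample(X_demo, y_demo, verbose=False, test_size=0.2)
print(X_tr.shape, X_te.shape)                 # (40, 3) (10, 3); only the last of the 10 splits is returned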
Example No. 11
def metrics(X, Y, n_classes, norm):

    n_iter = 3000
    # Ten-fold cross-validation

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    sss.get_n_splits(X, Y)

    ms = np.zeros((n_classes, n_classes))

    for train_index, test_index in sss.split(X, Y):
        print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        if norm:
            x_train = normalize(x_train)
            x_test = normalize(x_test)

        mlp = MLPClassifier(hidden_layer_sizes=(16, 16, 16), max_iter=n_iter)
        mlp.fit(x_train, y_train)

        predictions = mlp.predict(x_test)
        ms += confusion_matrix(y_test, predictions)

    print("10 Foldcross Validation \n", ms)
    print("Accuracy: ", sum(ms.diagonal()) / np.sum(ms))

    # Confusion matrix

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4)

    if norm:
        X_train = normalize(X_train)
        X_test = normalize(X_test)

    mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=n_iter)
    mlp.fit(X_train, Y_train)

    predictions = mlp.predict(X_test)
    cm = confusion_matrix(Y_test, predictions)

    print("\nConfusion Matrix \n", cm)
    print("Accuracy: ", sum(cm.diagonal()) / np.sum(cm))

    print("\nKappa Score\n", cohen_kappa_score(Y_test, predictions))
Example No. 12
    def compute(self,X,y,C,gamma, test_size=0.3, n_iterations = 5, training_set_minsize = 10, learning_curves_step = 20):   
        
        assert len(X)==len(y)
        assert len(y)>training_set_minsize
        
        assert isinstance(C, (int, float))
        assert isinstance(gamma, (int, float))
        
        train_size=int( round( (1-test_size) * len(y) ))   
                                                  
        set_ripartitions = StratifiedShuffleSplit(n_splits=n_iterations, 
                                                  test_size = test_size)			# CORRECT!
                                                    
        n_iter=set_ripartitions.get_n_splits(X, y)									# CORRECT!
        
        n_samples=X.shape[0]
        n_features=X.shape[1]
        
        m_list=range(training_set_minsize,train_size,learning_curves_step)
            
        tr_errors=np.zeros((len(m_list),1),dtype=float)
        cv_errors=np.zeros((len(m_list),1),dtype=float)
        
        for train,test in set_ripartitions.split(X, y):								# CORRECT!
            X_tr,X_cv,y_tr,y_cv =X[train],X[test],y[train],y[test]
        
            idx=0
            for m in m_list:
                
                y_mask = self.stratifiedShuffleMask(y_tr,m)
                x_mask = np.kron(np.ones((n_features,1)),y_mask).T
            
                reduced_X = X_tr[x_mask!=0].reshape(m,n_features)
                reduced_y = y_tr[y_mask!=0]


				"""
				TODO: Exercise 3
				Read the code of the current method "compute" and understand what is
				happening. Once you have understood the code, try to understand the
				meaning of the stratifiedShuffleMask method. What is that method suppose 
				to do? What do reduced_X and reduced_y contain?
				Then, compute for each m, the training error and the cross-validation error
				averaged by the different re-arranged dataset ripartitions, and store them 
				relatively in the tr_errors and cv_errors numpy vectors, order by the idx index.
				"""
            
                raise Exception("One last effort! It is the last exercise.")
            
                
                
				idx+=1
                
                result=dict()
                result["m_list"]=m_list
                result["tr_errors"]=tr_errors
                result["cv_errors"]=cv_errors
Example No. 13
def eval_model(X, y, args):
    from sklearn.model_selection import StratifiedShuffleSplit
    kf = StratifiedShuffleSplit(n_splits=args.cv, random_state=args.samples[0])
    kf.get_n_splits(X, y)

    partition_idx = 0
    for train_idx, test_idx in kf.split(X, y):
        partition_idx += 1
        (x_train,
         y_train), (x_test, y_test) = load_partition(train_idx, test_idx, X, y)

        calls = get_callbacks(args, partition_idx)

        for s_idx, seed in enumerate(args.samples):
            print('{} Training with SEED {}'.format(s_idx, seed))
            weight_file_name = '{}-{}-partition_{}-seed_{}'.format(
                args.model_type, args.timestamp, partition_idx,
                s_idx) + '-epoch_{epoch:02d}-loss_{val_loss:.2f}.hdf5'
Example No. 14
def searchMethodFun():
    weightRange = [
        dict(zip(range(0, 2), (1, values))) for values in range(14, 24)
    ]
    param_gridA, method = model_params(methodChoice)
    pipeA = pipeline.make_pipeline(preprocessing.StandardScaler(), method)
    # Produce the specified number of independent train/test splits: shuffle all samples first, then carve out train/test pairs
    # The splits are stratified: each split keeps the per-class sample proportions of the full dataset
    fscore = make_scorer(scoring_method, pos_label=1)
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
    sss.get_n_splits(y_train)
    searchMethod = GridSearchCV(pipeA,
                                param_grid=param_gridA,
                                scoring=fscore,
                                cv=sss,
                                n_jobs=1)
    # searchMethod = RandomizedSearchCV(pipeA, param_distributions=param_gridA,n_iter=20)
    return searchMethod
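
The translated comments above describe the key property of StratifiedShuffleSplit; a tiny self-contained check of that property with made-up data (not part of the original function):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y_demo = np.array([0] * 80 + [1] * 20)         # 80/20 class balance
X_demo = np.zeros((len(y_demo), 1))            # features are irrelevant to stratification

sss_demo = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
for tr_idx, te_idx in sss_demo.split(X_demo, y_demo):
    # both halves keep roughly the original 80/20 class ratio
    print(np.bincount(y_demo[tr_idx]), np.bincount(y_demo[te_idx]))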
Example No. 15
    def train_model(self):
        from keras import callbacks as C
        from sklearn.model_selection import StratifiedShuffleSplit

        self.compile()

        kf = StratifiedShuffleSplit(n_splits=1, random_state=13, test_size=0.1)
        kf.get_n_splits(self.x_train, self.y_train)

        for t_index, v_index in kf.split(self.x_train, self.y_train):
            X_train, X_val = self.x_train[t_index], self.x_train[v_index]
            Y_train, Y_val = self.y_train[t_index], self.y_train[v_index]

            val_data = (X_val, Y_val)

            self.fit(X_train, Y_train, val_data)

            return self.model
Example No. 16
    def __init__(self, dataset_name, split, out_path):
        uea_ucr_datasets.list_datasets()
        if split == 'testing':
            data = uea_ucr_datasets.Dataset(dataset_name, train=False)
            self.X, self.y = _to_array(data)
        elif split in ['training', 'validation']:
            validation_split_file = os.path.join(out_path,
                                                 'validation_split_file.pkl')
            data = uea_ucr_datasets.Dataset(dataset_name, train=True)
            X, y = _to_array(data)
            if not os.path.isfile(validation_split_file):
                print('Generating stratified training/validation split...')
                #now create the splits:
                from sklearn.model_selection import StratifiedShuffleSplit
                sss = StratifiedShuffleSplit(n_splits=1,
                                             test_size=0.2,
                                             random_state=42)
                # For simpler NaN handling, dummy data could be used for splitting,
                # as only the labels are relevant:
                # X_dummy = np.zeros([len(y), 2])
                sss.get_n_splits(X, y)

                training_indices, validation_indices = next(sss.split(X, y))
                split_dict = {
                    'training': training_indices,
                    'validation': validation_indices
                }
                #save the split ids
                if not os.path.exists(out_path):
                    os.makedirs(out_path, exist_ok=True)
                with open(validation_split_file, 'wb') as f:
                    pickle.dump(split_dict,
                                f)  #protocol=pickle.HIGHEST_PROTOCOL)
            else:
                print('Loading stratified training/validation split.')
                with open(validation_split_file, 'rb') as f:
                    split_dict = pickle.load(f)
            indices = split_dict[split]
            self.X = X[indices]  #subsetting the split
            self.y = y[indices]
        else:
            raise ValueError('Provided split not available.',
                             'Use any of [training, validation, testing]')
Example No. 17
def CrossValidate_LR(X, y, config, output_layer_size):
    kf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    kf.get_n_splits(X, y)

    config["loss"] = "ce"
    config["seed"] = 1234
    config.update({
        "input_layer_size": X.shape[1],
        "output_layer_size": output_layer_size
    })

    lr = [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
    opt = ["adam", "nag", "momentum", "gd"]

    # lr = [0.1]
    # opt = ["nag"]

    f = open('LR_ModelSelection.txt', 'w')
    f.write('lr\tOptimizer\tCV\tAccuracy\tPrecision\tRecall\tF1\n')

    for l in lr:
        for o in opt:
            config["lr"] = l
            config["opt"] = o

            i = 0
            for train_index, val_index in kf.split(X, y):
                print('lr {}, CV={}...............\n'.format(l, i))

                X_train, X_val = X[train_index], X[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # Configuring the neural network with the hyperparameters for Logistic Regression
                LR = LogisticRegression(config)

                i += 1
                # Train, validate and test
                Accuracy, Precision, Recall, F1 = LR.Train_LR(
                    X_train, y_train, X_val, y_val)

                f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                    l, o, i, Accuracy, Precision, Recall, F1))

    f.close()
Example No. 18
    def stratified_split(self, X, y, n_splits=10, test_size=0.2):
        """ Sklearn function to do stratified splitting of the input
            @param: X List of text values for train/test
            @param: y List of expected labels for train/test
            @param: n_splits Number of splits to return the data in (default 10)
            @param: test_size Size of data to hold for testing purposes (default 0.2)

            return List of dictionaries separated by keys train, test
            """
        skf = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)
        skf.get_n_splits(X, y)
        splits = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
            y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
            # add augmentation code here
            splits.append({"train": {"X": X_train, "y": y_train},
                           "test": {"X": X_test, "y": y_test}})
        return splits
Example No. 19
def Check_model_on_random_splits(
    X,
    y,
    supp,
    n_splits,
    verbose,
    stats_model=True
    #                                  , gain_charts = False
):

    X_small = X[supp]
    sss = StratifiedShuffleSplit(n_splits=n_splits,
                                 test_size=.5,
                                 random_state=1234)
    sss.get_n_splits(X_small, y)
    out_arr = []
    for train_index, test_index in sss.split(X_small, y):
        X_train, X_test = X_small.iloc[train_index], X_small.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        out = run_WOE(X_train,
                      X_test,
                      y_train,
                      y_test,
                      max_bin=5,
                      keep_all=True)
        X_WOE_tranformed_train, X_WOE_tranformed_test, WOE, WOE_concise = out[
            0], out[1], out[2], out[3]
        try:
            out = Model_on_vars(
                supp_var=supp,
                X_WOE_tranformed_train=X_WOE_tranformed_train,
                X_WOE_tranformed_test=X_WOE_tranformed_test,
                y_train=y_train,
                y_test=y_test,
                verbose=verbose,
                stats_model=stats_model
                #                                 , gain_charts = gain_charts
            )
            out_arr.append(out)
        except Exception:
            print('whoops')
    return out_arr
Example No. 20
def make_dataset(full_path, seed):
    full_file_names = get_file_names(full_path)
    X = []
    X_flattened = []
    X_flattened_6 = []
    y_names = []
    y_numbers = []
    for folder_number in range(9):
        for filename in range(len(full_file_names[folder_number])):
            path = full_file_names[folder_number][filename]
            sample_data = genfromtxt(path, delimiter=',')
            X.append(sample_data)
            l = [x[0:6] for x in sample_data]
            X_flattened_6.append(list(chain.from_iterable(l)))
            X_flattened.append(list(chain.from_iterable(sample_data)))
            label = path.split('/')
            label = label[len(label) - 1].split('.')[0]
            label = label[0:len(label) - 1]
            y_names.append(label)
            y_numbers.append(conversion(label))

    X_to_use = X_flattened
    y_numbers = conversion_one_hot(y_numbers)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=seed)
    sss.get_n_splits(X_to_use, y_numbers)
    for train_index, test_index in sss.split(X_to_use, y_numbers):
        train_X, test_X = np.array(X_to_use,
                                   dtype=np.float32)[train_index], np.array(
                                       X_to_use, dtype=np.float32)[test_index]
        train_y, test_y = np.array(y_numbers,
                                   dtype=np.float32)[train_index], np.array(
                                       y_numbers, dtype=np.float32)[test_index]

    ssss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
    ssss.get_n_splits(test_X, test_y)
    for test_index, validate_index in ssss.split(test_X, test_y):
        test_X, validate_X = test_X[test_index], test_X[validate_index]
        test_y, validate_y = test_y[test_index], test_y[validate_index]

    print("train size: ", len(train_X))
    print("validate size: ", len(validate_X))
    print("test_ size: ", len(test_X))
    return train_X, validate_X, test_X, train_y, validate_y, test_y
Example No. 21
def test_imb_performance():
    from maatpy.dataset import simulate_dataset
    from sklearn.metrics import cohen_kappa_score
    from sklearn.model_selection import StratifiedShuffleSplit
    imb = simulate_dataset(n_samples=100, n_features=2, n_classes=2, weights=[0.9, 0.1], random_state=0)
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    sss.get_n_splits(imb.data, imb.target)
    for train_index, test_index in sss.split(imb.data, imb.target):
        X_train, X_test = imb.data[train_index], imb.data[test_index]
        y_train, y_test = imb.target[train_index], imb.target[test_index]
    adaboost = AdaBoostClassifier(random_state=0)
    adaboost.fit(X_train, y_train)
    adaboost_score = cohen_kappa_score(adaboost.predict(X_test), y_test)


    clf = SMOTEBoost(random_state=0)
    clf.fit(X_train, y_train)
    score = cohen_kappa_score(clf.predict(X_test), y_test)
    assert score >= adaboost_score, "Failed with score = %f; AdaBoostClassifier score= %f" % (score, adaboost_score)
Example No. 22
    def gen_sample_array(self):
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            print('Need scikit-learn for this functionality')
            raise
        import numpy as np
        cn = []
        ad = []
        emci = []
        lmci = []
        s = StratifiedShuffleSplit(n_splits=self.n_splits,
                                   test_size=0.25,
                                   random_state=19)
        X = torch.randn(self.class_vector.size(0), 4).numpy()
        y = self.class_vector.numpy()
        s.get_n_splits(X, y)

        train_index, test_index = next(s.split(X, y))
        indices = np.hstack([train_index, test_index])

        # indices is a permutation of all sample positions, so indexing it again
        # (indices[i]) still visits every sample exactly once, just in shuffled order
        for i in indices:
            if y[indices[i]] == 0:
                cn.append(indices[i])
            elif y[indices[i]] == 1:
                ad.append(indices[i])
            elif y[indices[i]] == 2:
                emci.append(indices[i])
            else:
                lmci.append(indices[i])

        new_indices = []
        for i in range(s.get_n_splits(X, y)):
            new_indices.append(cn[i])
            new_indices.append(cn[i])
            new_indices.append(ad[i])
            new_indices.append(ad[i])
            new_indices.append(emci[i])
            new_indices.append(emci[i])
            new_indices.append(lmci[i])
            new_indices.append(lmci[i])

        return new_indices
Example No. 23
def split_data(client, data, n=1):
    """
    Specifications:
        Splits data into n stratified samples (1 by default)

    Args:
        client (TYPE): ...
        data (dict): {data:label}
        n (int, optional): number of splits

    Yields:
        tuple of 2 dicts: train and test dictionary
    """

    data_points, labels = zip(*data.items())

    #Dict to enumerate labels
    enumeration = client.collection("meta_data1").document(
        "s2i").get().to_dict()

    labels = [enumeration[label] for label in labels]
    sss = StratifiedShuffleSplit(n_splits=n, test_size=0.3, random_state=0)
    sss.get_n_splits(data_points, labels)

    for train_index, test_index in sss.split(data_points, labels):
        train_data = []
        train_labels = []
        test_data = []
        test_labels = []

        for x in train_index:
            train_data.append(data_points[x])
            train_labels.append(labels[x])

        for y in test_index:
            test_data.append(data_points[y])
            test_labels.append(labels[y])

        data_train = dict(zip(train_data, train_labels))
        data_test = dict(zip(test_data, test_labels))

        yield data_train, data_test
Example No. 24
def train_and_cross_validate(sizes, num_hidden=8, n_epochs=50000, eta=0.01):

    X_train, X_test, y_train, y_test, X, Y = prepare_data()
    get_feature_importance(X, Y)
    nn = NN(sizes)
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=0)

    sss.get_n_splits(X, Y)

    j = 0
    for train_index, test_index in sss.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        X_train = np.array(X_train, dtype=np.float32)
        y_train = np.array(y_train, dtype=np.float32)
        train(nn, X_train, y_train, X_test, y_test, j)
        test(nn, X_test, y_test)
        np.savetxt("nn_network_weights.txt", nn._W)
        np.savetxt("nn_network_biases.txt", nn._b)
        j = j + 1
Example No. 25
def take_stratified_split(_df,
                          target,
                          n_splits=10,
                          valid_size=None,
                          test_idx=None):
    df = _df.copy()
    if test_idx is not None:
        df_test = df.iloc[test_idx].copy()
        df = df[~df.index.isin(test_idx)]

    y = df[target]
    seed = 1234

    if n_splits == 1:  # take train valid split
        assert valid_size is not None
        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=valid_size,
                                     random_state=seed)
        sss.get_n_splits(df, y)
        train_index, valid_index = next(sss.split(df, y))
        df.loc[train_index, 'Fold'] = 0  # fold 0 is train data
        df.loc[valid_index, 'Fold'] = 1  # fold 1 is validation data
        if test_idx is not None:
            df_test['Fold'] = 2  # fold 2 is test data
            df = pd.concat([df, df_test], ignore_index=True)

    else:
        assert valid_size is None
        skf = StratifiedKFold(n_splits=n_splits,
                              random_state=seed,
                              shuffle=True)
        for i, (_, valid_index) in enumerate(skf.split(df, y)):
            df.loc[valid_index, 'Fold'] = i
        if test_idx is not None:
            df_test['Fold'] = i + 1  # the fold with the largest value corresponds to the test data
            df = pd.concat([df, df_test], ignore_index=True)

    df.Fold = df.Fold.astype(int)

    return df
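
A hedged usage sketch of take_stratified_split with a tiny made-up DataFrame; it assumes pandas, StratifiedShuffleSplit, and StratifiedKFold are imported at module level, as the function requires.

import pandas as pd

df_demo = pd.DataFrame({'feature': range(20), 'target': [0, 1] * 10})
folded = take_stratified_split(df_demo, target='target', n_splits=1, valid_size=0.25)
print(folded['Fold'].value_counts())   # Fold 0 = training rows, Fold 1 = validation rows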
Example No. 26
def crossValPrediction(otu_use,
                       y,
                       max_depth=10,
                       n_estimators=65,
                       weight=5,
                       plot=False,
                       plot_pr=False,
                       folds=5):
    kf = StratifiedShuffleSplit(n_splits=folds)
    kf.get_n_splits(otu_use, y)

    auc_crossVal = []
    auc_prec_crossVal = []
    f1_crossVal = []
    feat_imp_crossVal = []
    i = 0
    for train_index, val_index in kf.split(otu_use, y):
        otu_train = otu_use.iloc[train_index, :]
        otu_val = otu_use.iloc[val_index, :]
        y_train = np.array(y)[train_index]
        y_val = np.array(y)[val_index]

        plt.subplot(1, 2, 1)
        m, auc, auc_train, fpr, tpr, prec, f1, f2, feat_imp = predictIBD(
            otu_train,
            y_train,
            otu_val,
            y_val,
            max_depth=max_depth,
            n_estimators=n_estimators,
            weight=weight,
            plot=plot,
            plot_pr=plot_pr,
            feat_imp=True)
        auc_crossVal.append(auc)
        auc_prec_crossVal.append(prec)
        f1_crossVal.append(f1)
        feat_imp_crossVal.append(feat_imp)

        i = i + 1
    return (auc_crossVal, auc_prec_crossVal, f1_crossVal, feat_imp_crossVal)
Example No. 27
    def execute(self, params, **kwargs):
        from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV

        train_no_na = self.marvin_initial_dataset['train'][
            params["pred_cols"] + [params["dep_var"]]].dropna()

        print("Length: {}".format(len(train_no_na)))

        # Feature Engineering
        data_X = train_no_na[params["pred_cols"]]
        data_X.loc[:, 'Sex'] = data_X.loc[:, 'Sex'].map({
            'male': 1,
            'female': 0
        })
        data_y = train_no_na[params["dep_var"]]

        # Prepare for Stratified Shuffle Split
        sss = StratifiedShuffleSplit(n_splits=5, test_size=.6, random_state=0)
        sss.get_n_splits(data_X, data_y)

        # Get Test Dataset
        test_no_na = self.marvin_initial_dataset['test'][
            params["pred_cols"]].dropna()

        print("Length: {}".format(len(test_no_na)))

        # Feature Engineering
        test_X = test_no_na[params["pred_cols"]]
        test_X.loc[:, 'Sex'] = test_X.loc[:, 'Sex'].map({
            'male': 1,
            'female': 0
        })

        self.marvin_dataset = {
            'X_train': data_X,
            'y_train': data_y,
            'X_test': test_X,
            'sss': sss
        }

        print("Preparation is Done!!!!")
Example No. 28
def find_best(classifier, parameters):

    estimators = [('select',SelectKBest()), ('clf',classifier)]
    pipe = Pipeline(estimators)

    #pp.pprint(sorted(pipe.get_params().keys()))
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.9, random_state=42)
    sss.get_n_splits(features,labels)

    result = GridSearchCV(pipe, parameters, cv = sss)
    result.fit(features, labels)
    clf = result.best_estimator_

    print(result.best_params_)

    my_features_list = [features_list[i+1] for i in clf.named_steps['select'].get_support(indices=True)]
    my_features_list.insert(0, "poi")

    print(my_features_list)
    dump_classifier_and_data(clf, my_dataset, my_features_list)
    test_classifier(clf, my_dataset, my_features_list)
Example No. 29
def train_test_split(X, y, rnd_seed):
    """
    split the features and the labels according to the indices
    :param X: feature set, should be array or list
    :param y: labels, should be array or list
    :param rnd_seed: random seed
    """
    # generate indices for the train and test set
    indices = [i for i in range(len(y))]
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=rnd_seed)
    sss.get_n_splits(indices, y)
    train_indices, test_indices = next(sss.split(indices, y))

    # train/test split
    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]

    y_train = [y[i] for i in train_indices]
    y_test = [y[i] for i in test_indices]

    return X_train, X_test, y_train, y_test
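
The train_test_split defined above (not sklearn's) relies on a module-level test_size constant and a StratifiedShuffleSplit import that are not shown in the snippet; the hedged sketch below assumes them in order to illustrate a call with made-up data.

from sklearn.model_selection import StratifiedShuffleSplit

test_size = 0.25                      # assumed module-level constant, not shown above
X_demo = [[i] for i in range(40)]     # made-up features
y_demo = [0] * 20 + [1] * 20          # made-up labels
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, rnd_seed=0)
print(len(X_tr), len(X_te))           # 30 10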
Example No. 30
    def get_partition(self):
        
        df = self.get_binarized_data()
        ids = df.index 
        labels = df.values.flatten()
      
        sss = StratifiedShuffleSplit(n_splits=1, test_size= self.config.val_size)
        sss.get_n_splits(ids, labels)
        for train_index, test_index in sss.split(ids, labels):
            ids_train, ids_val = ids[train_index], ids[test_index]
            y_train, y_val = labels[train_index], labels[test_index]    
        
        test_data = pd.read_table('MICCAI_Test.txt', index_col = 0, delim_whitespace = True, header = 0)

        ids_test = test_data.index
        y_test = test_data.apply(self.le.fit_transform).values.flatten()

        partition_ids = {'train': list(ids_train), 'val': list(ids_val), 'test': list(ids_test)}        
        partition_labels = {'train': list(y_train), 'val': list(y_val), 'test': list(y_test)}
    
        return partition_ids, partition_labels  
Example No. 31
def StraitKFold(classifier, X, y):
    skf = StratifiedShuffleSplit(n_splits = 3)
    skf.get_n_splits(X, y)
    y_scores = pd.DataFrame()
    y_tests = pd.DataFrame()
    y_pred = pd.DataFrame() 
    f1 = np.array([])
    acc = np.array([])
    n = 0
    for train_index, test_index in skf.split(X, y):
        classifier.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        y_scores['fold_'+str(n)] = classifier.decision_function(X.iloc[test_index, :])
        y_pred['fold_'+str(n)] = classifier.predict(X.iloc[test_index, :])
        y_tests['fold_'+str(n)] = y.iloc[test_index, 0].values
        f1 = np.append(f1, metrics.f1_score(y.iloc[test_index, 0], y_pred.iloc[:,n]))
        acc = np.append(acc, metrics.accuracy_score(y.iloc[test_index, 0], y_pred.iloc[:, n]))
        n += 1
    f1_score = np.mean(f1)
    accuracy = np.mean(acc)
    print('mean accuracy score: '+str(accuracy))
    print('mean f1 score: '+str(f1_score))
    return y_scores, y_tests, accuracy, f1_score
Example No. 32
            'low_interest_manager_id',
            'low_interest_building_id',
            'low_interest_display_address',
            'n_listings_of_manager']

features.extend(common_managers)
print(f'number of features: {len(features)}')

X = sparse.hstack([train[features], train_ft_tfidf_transformed]).tocsr()
train['interest_level'] = train.interest_level.apply(lambda x: set_int_for_category(x))
y = train['interest_level']



sss = StratifiedShuffleSplit(n_splits = 2, test_size=0.35, random_state=0)
sss.get_n_splits(X=X, y=y)

for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_validate = X[train_index], X[test_index]
    y_train, y_validate = y.iloc[train_index], y.iloc[test_index]

# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_validate, label=y_validate, reference=lgb_train)

print('data set has been setup')

# specify your configurations as a dict
params = {
    'task': 'train',
Example No. 33
            w_featdim = random.choice((64, 128, 256)),
            w_featdrop = random.choice((0.1, 0.2, 0.5)),
            rnn = random.choice(('GRU', 'LSTM')))
        
        return o, hash(str(o))
    def __str__(self):
        str = ""
        for attr in self.__slots__:
            str += '{}={}, '.format(attr, getattr(self, attr))
        return str[:-2]

data = load(opt.input_prefix)
nepoch = 40

ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
ssp.get_n_splits(data.docs, data.labels)
trn_idx, dev_idx = list(ssp.split(data.docs, data.labels))[0]

trn_labels = to_categorical(np.array(data.labels)[trn_idx])
dev_labels = to_categorical(np.array(data.labels)[dev_idx])

search_iter = 1000
search_done = set()
for _ in range(search_iter):
    o, h = Options.sample()
    if h in search_done: continue
    search_done.add(h)

    if not o.c_maxlen:
        o.c_maxlen = np.max(data.len_char)
    c_vocab = Counter({k:v for k,v in data.chars.items() if v > o.c_cutoff})
Example No. 34
    from sklearn.ensemble import RandomForestClassifier
    m = RandomForestClassifier(class_weight=opt.class_weight,n_estimators=300,random_state=seed)
else:
    from sklearn.svm import LinearSVC
    m = LinearSVC(dual=True, C=opt.C, verbose=0,
            class_weight=opt.class_weight)
   
if opt.mult_class == 'ovo':
    mc = OneVsOneClassifier
elif opt.mult_class == 'ovr':
    mc = OneVsRestClassifier
if opt.classifier != 'rf':
    m = mc(m, n_jobs=opt.n_jobs)

ssp = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
ssp.get_n_splits(docs, labels)
trn_idx, dev_idx = list(ssp.split(docs, labels))[0]

acc = []
f1M = []

train_docs = docs[trn_idx]
train_labels = labels[trn_idx]
dev_docs = docs[dev_idx]
dev_labels = labels[dev_idx]
split_size = round(len(trn_idx)/10)
for i in range(10):
    info("training up to {}".format((i+1)*split_size))
    m.fit(train_docs[0:(i+1)*split_size], train_labels[0:(i+1)*split_size])
    pred = m.predict(dev_docs)
    acc.append(accuracy_score(dev_labels, pred))