def featImpMDA_Clustered(clf, X, y, clstrs, n_splits=10):
    # clustered MDA: shuffle whole clusters of features and measure the drop in OOS log-loss
    # (assumes pandas as pd and numpy as np are imported at module level)
    from sklearn.metrics import log_loss
    from sklearn.model_selection import KFold

    cvGen = KFold(n_splits=n_splits)
    scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(
        columns=clstrs.keys())
    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0 = X.iloc[train, :], y.iloc[train]
        X1, y1 = X.iloc[test, :], y.iloc[test]
        fit = clf.fit(X=X0, y=y0)
        prob = fit.predict_proba(X1)
        scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_)
        for j in scr1.columns:
            X1_ = X1.copy(deep=True)
            for k in clstrs[j]:
                np.random.shuffle(X1_[k].values)  # shuffle clusters
            prob = fit.predict_proba(X1_)
            scr1.loc[i, j] = -log_loss(y1, prob, labels=clf.classes_)
    imp = (-1 * scr1).add(scr0, axis=0)
    imp = imp / (-1 * scr1)
    imp = pd.concat(
        {
            'mean': imp.mean(),
            'std': imp.std() * imp.shape[0]**-.5
        }, axis=1)
    imp.index = ['C_' + str(i) for i in imp.index]
    return imp
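
# A minimal usage sketch for the function above (added for illustration, not
# from the original repo): six synthetic features grouped into two clusters.
# The feature names, cluster labels and the RandomForest choice are assumptions.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_arr, y_arr = make_classification(n_samples=200, n_features=6, random_state=0)
X_df = pd.DataFrame(X_arr, columns=['f0', 'f1', 'f2', 'f3', 'f4', 'f5'])
y_sr = pd.Series(y_arr)
clstrs = {0: ['f0', 'f1', 'f2'], 1: ['f3', 'f4', 'f5']}  # cluster label -> feature columns
clf = RandomForestClassifier(n_estimators=50, random_state=0)
print(featImpMDA_Clustered(clf, X_df, y_sr, clstrs, n_splits=5))  # rows C_0 and C_1, columns 'mean' and 'std'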
Example #2
def kfolded(data, folds, seed=1337):
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)  # shuffle must be enabled for random_state to take effect

    for i, (train_index, test_index) in enumerate(kf.split(data)):
        if len(data.shape) > 1:
            yield (data[train_index, :], data[test_index, :], i)
        else:
            yield (data[train_index], data[test_index], i)
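
# A short usage sketch (illustrative, not from the original source); the KFold
# import below is the module-level one the generator above relies on.
import numpy as np
from sklearn.model_selection import KFold

data = np.arange(40).reshape(20, 2)
for train_part, test_part, fold_idx in kfolded(data, folds=4, seed=1337):
    print(fold_idx, train_part.shape, test_part.shape)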
Example #3
def cv(config):
    np.random.seed(3435)

    data_file = config.get('data_file')
    with open(data_file, "rb") as f:
        (datasets, targets, vocab) = pickle.load(f)
    logger.info('Loaded vect_file: %s', data_file)

    if config.get('vector_type') == 'word2vec':
        w2v_file = config.get('w2v_file')
        w2v = load_bin_vec(w2v_file, vocab)
        add_unknown_words(w2v, vocab)
        initialW = []
        for entry in sorted(vocab.items(), key=lambda x: x[1]):
            initialW.append(w2v[entry[0]])
        initialW = np.array(initialW)
        logger.info('Loaded word2vec: %s', w2v_file)
    else:
        initialW = None

    model_config = {}
    model_config.update(config.get('model'))
    model_config['batch_size'] = config.get('batch_size')
    model_config['epoch'] = config.get('epoch')
    model_config['gpu'] = config.get('gpu')
    model_config['non_static'] = config.get('non_static')
    model_config['n_vocab'] = len(vocab)
    model_config['doc_length'] = datasets.shape[1]
    model_config['initialW'] = initialW
    for phase in range(1, config.get('phase') + 1):
        logger.info('Cross Validation: %d/%d', phase, config.get('phase'))
        kf = KFold(n_splits=config.get('split'))
        for train_index, test_index in kf.split(datasets):
            train_index = np.random.permutation(train_index)
            X_train = datasets[train_index]
            Y_train = targets[train_index]
            X_test = datasets[test_index]
            Y_test = targets[test_index]

            logger.info('Fitting: %s -> %s', X_train.shape, Y_train.shape)
            (_, clf) = create_classifier(**model_config)
            clf.fit(X_train,
                    Y_train,
                    dataset_creator=lambda X, y, model: XyDataset(
                        X=X, y=y, model=model, X_dtype=np.int32))

            logger.info('Predicting: %s -> %s', X_test.shape, Y_test.shape)
            preds = clf.predict(X_test,
                                dataset_creator=lambda X, model: XyDataset(
                                    X=X, model=model, X_dtype=np.int32))

            logger.info('accuracy: {0}'.format(accuracy_score(Y_test, preds)))
            if config.get('fold_out'):
                break

    logger.info('Done')
Example #4
class HybridCV(with_metaclass(ABCMeta, BaseCrossValidator)):
    @abstractmethod
    def choose_loo(self, X, y, groups=None):
        """Return the indices that should each form their own leave-one-out test fold."""
        pass
    
    def __init__(self, n_folds, shuffle=True, **kwargs):
        self.n_folds = n_folds
        self.base_kfold = KFold(self.n_folds, shuffle=shuffle, **kwargs)
    
    @abstractmethod
    def get_n_splits(self, X, y, groups):
        pass
    
    def _iter_test_masks(self, X=None, y=None, groups=None):
        loo_indices = self.choose_loo(X, y, groups)
        not_loo_mask = np.ones(X.shape[0], dtype=bool)
        for idx in loo_indices:
            result = np.zeros(X.shape[0], dtype=bool)
            result[idx] = True
            not_loo_mask[idx] = False
            yield result
        for result in self.base_kfold._iter_test_masks(X=X, y=y, groups=groups):
            result_ = (result & not_loo_mask)
            if np.any(result_):
                yield result_  # skip folds emptied by the leave-one-out indices
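
# A hedged sketch of a concrete subclass (assumed, not from the original
# source): each sample of the rarest class becomes its own leave-one-out test
# fold, and the remaining samples are covered by the base KFold masks.
import numpy as np

class RareClassLOO(HybridCV):
    def choose_loo(self, X, y, groups=None):
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        rare = classes[np.argmin(counts)]  # least frequent class
        return np.flatnonzero(y == rare)

    def get_n_splits(self, X=None, y=None, groups=None):
        # one LOO fold per rare sample plus the regular K folds
        return len(self.choose_loo(X, y, groups)) + self.n_folds

# e.g. folds = list(RareClassLOO(n_folds=3, random_state=0).split(X, y))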
Example #5
def linearRegression(data, targets):
    scores = []
    # model_selection API; the old cross_validation.KFold(n, n_folds=...) signature no longer exists
    sixFoldCrossValid = KFold(n_splits=6, shuffle=True, random_state=None)
    i = 0
    for train_index, test_index in sixFoldCrossValid.split(data):
        # split the data into training and testing sets
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = targets[train_index], targets[test_index]
        
        # execute the underlying linear regression
        linRegress = lm.LinearRegression()
        linRegress.fit(data_train, target_train)
        targetPredicted = linRegress.predict(data_test)
        scores.append(linRegress.score(data_test, target_test))
        
        # make plot of true vs predicted reactivity
        plt.scatter(targetPredicted, target_test) #, color='b', s=121/2, alpha=.4)
        axes = plt.gca()
        slope, intercept, r_value, p_value, std_err = stats.linregress(targetPredicted,target_test)
        rSquared = r_value**2
        plt.annotate(str(rSquared), xy=(1,4), xytext=(1, 4), textcoords='figure points')
        m, b = np.polyfit(targetPredicted, target_test, 1)
        X_plot = np.linspace(axes.get_xlim()[0],axes.get_xlim()[1],100)
        plt.plot(X_plot, m*X_plot + b, '-')
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title('Scatter plot of true vs. predicted values')
        plt.savefig('linReg%i.png' % i)
        plt.clf()
        i += 1
    meanScore = np.mean(scores)
    print ("prediction score: ", meanScore)
    return meanScore
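
# An illustrative driver (assumed, not from the original repo), together with
# the module-level imports the function above relies on (lm, plt, stats, np).
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
from scipy import stats
from sklearn.model_selection import KFold

rng = np.random.RandomState(0)
data = rng.rand(120, 4)
targets = data @ np.array([1.5, -2.0, 0.7, 3.0]) + 0.1 * rng.randn(120)
print(linearRegression(data, targets))  # also writes linReg0.png ... linReg5.png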
Example #6
def featImpMDA_Clustered(clf, X, y, clstrs, n_splits=10):
    """

    SNIPPET 6.5 Clustered MDA
    Args:
        clf:
        X:
        y:
        clstrs:
        n_splits:

    Returns:

    """
    from sklearn.metrics import log_loss
    from sklearn.model_selection import KFold

    cvGen = KFold(n_splits=n_splits)
    scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(columns=clstrs.keys())

    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0 = X.iloc[train, :], y.iloc[train]
        X1, y1 = X.iloc[test, :], y.iloc[test]
        fit = clf.fit(X=X0, y=y0)
        prob = fit.predict_proba(X1)

        scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_)

        for j in scr1.columns:
            X1_ = X1.copy(deep=True)

            for k in clstrs[j]:
                np.random.shuffle(X1_[k].values)  # shuffle clusters

            prob = fit.predict_proba(X1_)
            scr1.loc[i, j] = -log_loss(y1, prob, labels=clf.classes_)

    imp = (-1 * scr1).add(scr0, axis=0)
    imp /= -1 * scr1
    imp = pd.concat(
        {
            'mean': imp.mean(),
            'std': imp.std() * imp.shape[0]**-.5
        }, axis=1)
    imp.index = ['C_' + str(i) for i in imp.index]

    return imp
def check_cv2(cv=3, y=None, classifier=False, random_state=None):
    """Input checker utility for building a cross-validator

    NOTE: this is the same as sklearn.model_selection._split.check_cv but with an
    added ``random_state`` parameter, so that nested CV splits are reproducible.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    random_state : None, int or RandomState
        When shuffle=True, pseudo-random number generator state used for
        shuffling. If None, use default numpy RNG for shuffling.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = 3

    if isinstance(cv, numbers.Integral):
        if (classifier and (y is not None)
                and (type_of_target(y) in ('binary', 'multiclass'))):
            return StratifiedKFold(cv, random_state=random_state)
        else:
            return KFold(cv, random_state=random_state)

    if not hasattr(cv, 'split') or isinstance(cv, str):
        if not isinstance(cv, Iterable) or isinstance(cv, str):
            raise ValueError("Expected cv as an integer, cross-validation "
                             "object (from sklearn.model_selection) "
                             "or an iterable. Got %s." % cv)
        return _CVIterableWrapper(cv)

    return cv  # New style cv objects are passed without any modification
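
# A brief usage sketch (assumed, not from the original library), together with
# the imports the integer branch above depends on. An integer ``cv`` with a
# multiclass ``y`` and classifier=True yields StratifiedKFold, anything else a
# KFold; note that recent scikit-learn releases reject a non-None random_state
# unless shuffle=True is also set on the splitter.
import numbers
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils.multiclass import type_of_target

y_clf = np.array([0, 1, 2] * 10)
cv_clf = check_cv2(cv=5, y=y_clf, classifier=True)  # -> StratifiedKFold
cv_reg = check_cv2(cv=5, y=np.random.rand(30))      # -> KFold
print(type(cv_clf).__name__, type(cv_reg).__name__)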
Example #8
    def evalModel(self):
        Log(LOG_INFO) << "Evaluate CV score ..."

        kfold = KFold(n_splits=10, shuffle=False)
        res = cross_val_score(self.mlEngine.getEstimator(),
                              self.totalFeatureMatrix,
                              self.totalLabels,
                              cv=kfold,
                              n_jobs=-1)
        Log(LOG_INFO) << "CV accuracy: %f" % res.mean()
Example #9
def featImpMDA(clf, X, y, n_splits=10):
    """
    feat importance based on OOS score reduction
    SNIPPET 6.3 Implementation of MDA

    Args:
        clf:
        X:
        y:
        n_splits:

    Returns:

    """

    from sklearn.metrics import log_loss
    from sklearn.model_selection import KFold
    cvGen = KFold(n_splits=n_splits)
    scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(columns=X.columns)

    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0 = X.iloc[train, :], y.iloc[train]
        X1, y1 = X.iloc[test, :], y.iloc[test]
        fit = clf.fit(X=X0, y=y0)  # the fit occurs here

        prob = fit.predict_proba(X1)  # prediction before shuffling
        scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_)

        for j in X.columns:
            X1_ = X1.copy(deep=True)
            np.random.shuffle(X1_[j].values)  # shuffle one column
            prob = fit.predict_proba(X1_)  # prediction after shuffling
            scr1.loc[i, j] = -log_loss(y1, prob, labels=clf.classes_)

    imp = (-1 * scr1).add(scr0, axis=0)
    imp /= -1 * scr1
    imp = pd.concat(
        {
            'mean': imp.mean(),
            'std': imp.std() * imp.shape[0]**-.5
        }, axis=1)  # CLT
    return imp
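
# A small usage sketch (illustrative assumption): per-feature MDA importances
# for a random forest on synthetic data; every column of X gets a mean/std row.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_arr, y_arr = make_classification(n_samples=300, n_features=5, n_informative=3, random_state=1)
X_df = pd.DataFrame(X_arr, columns=['x%d' % i for i in range(5)])
y_sr = pd.Series(y_arr)
imp = featImpMDA(RandomForestClassifier(n_estimators=100, random_state=1), X_df, y_sr, n_splits=5)
print(imp.sort_values('mean', ascending=False))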
class StackingModel(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self, X, y):
        self.saved_model = [list() for i in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))

        for i, model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                oof_train[val_index, i] = renew_model.predict(X[val_index])

        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model
        ])  # inner stack needs a list; newer numpy rejects generators here
        return self.meta_model.predict(whole_test)

    def get_oof(self, X, y, test_X):
        oof = np.zeros((X.shape[0], len(self.mod)))
        test_single = np.zeros((test_X.shape[0], 5))
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(self.kf.split(X, y)):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean
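
# A compact usage sketch (assumed, not from the original repo). The sklearn.base
# and KFold imports are the module-level ones the class above relies on; inputs
# are plain numpy arrays because the class indexes X and y positionally.
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import KFold

X_arr, y_arr = make_regression(n_samples=200, n_features=8, noise=5.0, random_state=0)
stack = StackingModel(mod=[Ridge(alpha=1.0),
                           RandomForestRegressor(n_estimators=50, random_state=0)],
                      meta_model=LinearRegression())
stack.fit(X_arr, y_arr)
print(stack.predict(X_arr[:10]))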
def featImpMDA(clf, X, y, n_splits=10):
    # feature importance based on OOS score reduction
    cvGen = KFold(n_splits=n_splits)
    scr0, scr1 = pd.Series(dtype='float64'), pd.DataFrame(columns=X.columns)
    for i, (train, test) in enumerate(cvGen.split(X=X)):
        x0, y0 = X.iloc[train, :], y.iloc[train]
        x1, y1 = X.iloc[test, :], y.iloc[test]
        fit = clf.fit(X=x0, y=y0)  # the fit occurs here
        prob = fit.predict_proba(x1)  # prediction before shuffling
        scr0.loc[i] = -log_loss(y1, prob, labels=clf.classes_)
        for j in X.columns:
            X1_ = x1.copy(deep=True)
            np.random.shuffle(X1_[j].values)  # shuffle one column
            prob = fit.predict_proba(X1_)  # prediction after shuffling
            scr1.loc[i, j] = -log_loss(y1, prob, labels=clf.classes_)
    imp = (-1 * scr1).add(scr0, axis=0)
    imp = imp / (-1 * scr1)
    imp = pd.concat(
        {
            'mean': imp.mean(),
            'std': imp.std() * imp.shape[0]**-.5
        }, axis=1)  # CLT
    return imp
Example #12
def cross_validate_approximation(train_samples,
                                 train_vals,
                                 options,
                                 nfolds,
                                 method,
                                 random_folds=True):
    ntrain_samples = train_samples.shape[1]
    if random_folds != 'sklearn':
        fold_sample_indices = get_random_k_fold_sample_indices(
            ntrain_samples, nfolds, random_folds)
    else:
        from sklearn.model_selection import KFold
        sklearn_cv = KFold(nfolds)
        indices = np.arange(train_samples.shape[1])
        fold_sample_indices = [
            te for tr, te in sklearn_cv.split(train_vals, train_vals)
        ]

    approx_list = []
    residues_list = []
    cv_score = 0
    for kk in range(len(fold_sample_indices)):
        K = np.ones(ntrain_samples, dtype=bool)
        K[fold_sample_indices[kk]] = False
        train_samples_kk = train_samples[:, K]
        train_vals_kk = train_vals[K, :]
        test_samples_kk = train_samples[:, fold_sample_indices[kk]]
        test_vals_kk = train_vals[fold_sample_indices[kk]]
        approx_kk = approximate(train_samples_kk, train_vals_kk, method,
                                options).approx
        residues = approx_kk(test_samples_kk) - test_vals_kk
        approx_list.append(approx_kk)
        residues_list.append(residues)
        cv_score += np.sum(residues**2, axis=0)
    cv_score = np.sqrt(cv_score / ntrain_samples)
    return approx_list, residues_list, cv_score
def check_cv(cv=3, y=None, classifier=False):
    """Dask aware version of ``sklearn.model_selection.check_cv``

    Same as the scikit-learn version, but works if ``y`` is a dask object.
    """
    if cv is None:
        cv = 3

    # If ``cv`` is not an integer, the scikit-learn implementation doesn't
    # touch the ``y`` object, so passing on a dask object is fine
    if not isinstance(y, Base) or not isinstance(cv, numbers.Integral):
        return model_selection.check_cv(cv, y, classifier)

    if classifier:
        # ``y`` is a dask object. We need to compute the target type
        target_type = delayed(type_of_target, pure=True)(y).compute()
        if target_type in ('binary', 'multiclass'):
            return StratifiedKFold(cv)
    return KFold(cv)
def autoencoder_dim_tuning_graph():
    '''run the autoencoder with a variety of hidden layer dimensionalities and plot the cross
    validation errors for each
    '''

    data = read_atoms_data()
    scaledData = data / 10 - 0.5
    kFold = KFold(n_splits=5, shuffle=True)
    errors = []
    
#     for layer1Dim in range(6,16):
    for layer1Dim in range(4,5):
        print('LAYER 1 DIMENSIONALITY: ', layer1Dim)
        errors.append([])
        latentLayerDims = range(4,layer1Dim+1)
        for latentLayerDim in latentLayerDims:
            auto = Autoencoder(hiddenDims=[layer1Dim,latentLayerDim])
            errors[-1].append(-10.0 * np.mean(cross_val_score(auto, scaledData, cv=kFold)))
        plt.semilogy(latentLayerDims,errors[-1],label=layer1Dim)
    print(errors)
    def individual_training_executor(self, dim):
        # make a pipeline with preprocessing, autoencoder, regression
        scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
        autoencoder = Autoencoder(logPath=self.get_path(dim), hiddenDims=[50, dim], beta=0.1)
        mlPipeline = make_pipeline(scaler, autoencoder)

        # read in the data and train the autoencoder
        data, targets = self.read_mopac_reactivity_data()
        mlPipeline.fit(data, targets)

        # test the accuracy of an SVM on the transformed data using cross validation
        latent = mlPipeline.transform(data)
        regressor = SVR(C=10000)
        cross_validator = KFold(n_splits=5, shuffle=True, random_state=40)
        predictions = cross_val_predict(regressor, latent, targets, cv=cross_validator)

        # make a cross_val_predict-ed vs actual graph
        main.plotScatterPlot(targets, predictions, 'predictedVsActual')

        # print the cross validation actual and predicted targets to file
        actualThenPredicted = np.array([targets, predictions])
        np.savetxt('actualThenPredicted.txt', actualThenPredicted)
Example #16
def main():
    parser = make_parser()
    args = parser.parse_args()

    if args.tf_seed != -1:
        tf.random.set_random_seed(args.tf_seed)
    if not args.no_shuffle and args.shuffle_seed != -1:
        np.random.seed(args.shuffle_seed)

    # load data
    train_x, train_y = read_input_data(args.train_h5)
    test_x, test_y = read_input_data(args.test_h5) # used as val

    # SpaceNet
    all_ids = np.array(generate_ids(args.data_dirs, None))
    kfold = KFold(n_splits=2, shuffle=True)  # args.n_folds
    splits = [s for s in kfold.split(all_ids)]
    folds = [int(f) for f in '0'.split(",")]
    fold = folds[0]
    train_ind, test_ind = splits[fold]
    train_ids = all_ids[train_ind]
    val_ids = all_ids[test_ind]
    masks_dict = get_groundtruth(args.data_dirs)

    # Returns normalized to interval [-1, 1]
    train_generator = MULSpacenetDataset(
        data_dirs=args.data_dirs,
        wdata_dir=args.wdata_dir,
        image_ids=train_ids,
        batch_size=args.train_batch_size,
        crop_shape=(args.crop_size, args.crop_size),
        seed=777,
        image_name_template='PS-MS/SN3_roads_train_AOI_5_Khartoum_PS-MS_{id}.tif',
        masks_dict=masks_dict
    )

    val_generator = MULSpacenetDataset(
        data_dirs=args.data_dirs,
        wdata_dir=args.wdata_dir,
        image_ids=val_ids,
        batch_size=args.test_batch_size,
        crop_shape=(args.crop_size, args.crop_size),
        seed=777,
        image_name_template='PS-MS/SN3_roads_train_AOI_5_Khartoum_PS-MS_{id}.tif',
        masks_dict=masks_dict
    )

    # train_x in shape (batch_size, width, height, channels) = (train_batch_size, crop_size, crop_size, 12)
    # train_x, train_y = next(train_generator)
    # train_generator.reset()
    # test_x, test_y = next(val_generator)
    # val_generator.reset()
    # train_y_shape = train_y.shape


    images_scale = np.max(train_x)
    if images_scale > 1:
        print('Normalizing images by a factor of {}'.format(images_scale))
        train_x = train_x / images_scale
        test_x = test_x / images_scale

    if args.test_batch_size == 0:
        args.test_batch_size = test_y.shape[0]

    print('Data shapes:', train_x.shape, train_y.shape, test_x.shape, test_y.shape)

    if train_y.shape[0] % args.train_batch_size != 0:
        print("WARNING batch size doesn't divide train set evenly")
    if train_y.shape[0] % args.large_batch_size != 0:
        print("WARNING large batch size doesn't divide train set evenly")
    if test_y.shape[0] % args.test_batch_size != 0:
        print("WARNING batch size doesn't divide test set evenly")

    # build model
    if args.arch == 'linknet':
        model = network_builders.build_linknet()
    elif args.arch == 'fc':
        model = network_builders.build_network_fc(args)
    elif args.arch == 'fc_cust':
        model = network_builders.build_fc_adjustable(args)
    elif args.arch == 'lenet':
        model = network_builders.build_lenet_conv(args)
    elif args.arch == 'allcnn':
        model = network_builders.build_all_cnn(args)
    elif args.arch == 'resnet':
        model = network_builders.build_resnet(args)
    elif args.arch == 'vgg':
        model = network_builders.build_vgg_half(args)
    else:
        raise Error("Unknown architeciture {}".format(args.arch))

    init_model(model, args)
    define_training(model, args)

    sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
    sess.run(tf.global_variables_initializer())

    if args.init_weights_h5:
        load_initial_weights(sess, model, args)

    for collection in ['tb_train_step']: # 'eval_train' and 'eval_test' added manually later
        tf.summary.scalar(collection + '_acc', model.accuracy, collections=[collection])
        tf.summary.scalar(collection + '_loss', model.loss, collections=[collection])

    tb_writer, hf = None, None
    dsets = {}
    if args.output_dir:
        tb_writer = tf.summary.FileWriter(args.output_dir, sess.graph)
        # set up output for gradients/weights
        if args.save_weights:
            dim_sum = sum([tf.size(var).eval() for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)])
            total_iters = args.num_epochs * int(train_y.shape[0] / args.train_batch_size)
            total_chunks = int(total_iters / args.save_every)
            hf = h5py.File(args.output_dir + '/weights', 'w-')

            # write metadata
            var_shapes = np.string_(';'.join([str(var.get_shape()) for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]))
            hf.attrs['var_shapes'] = var_shapes
            var_names = np.string_(';'.join([str(var.name) for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]))
            hf.attrs['var_names'] = var_names

            # all individual weights at every iteration, where all_weights[i] = weights before iteration i:
            dsets['all_weights'] = hf.create_dataset('all_weights', (total_chunks + 1, dim_sum), dtype='f8', compression='gzip')
            print(f'all_weights shape: ({total_chunks + 1}, {dim_sum})')
        if args.save_training_grads:
            dsets['training_grads'] = hf.create_dataset('training_grads', (total_chunks, dim_sum), dtype='f8', compression='gzip')

    ########## Run main thing ##########
    print('=' * 100)
    train_and_eval(sess, model, train_x, train_y, test_x, test_y, tb_writer, dsets, args)
    # train_and_eval(sess, model, train_y_shape, train_generator, val_generator, tb_writer, dsets, args)

    if tb_writer:
        tb_writer.close()
    if hf:
        hf.close()
Example #17
def feature_importance_mda(classifier,
                           X,
                           y,
                           n_splits=10,
                           plot=False,
                           figsize=(10, 10)):
    """
    Feature importance based on out-of-sample Mean-Decrease Accuracy (MDA).
    
    Arguments
    ---------
    classifier : tree classifier
      Tree classifier to apply on data.
    X : pandas.DataFrame
      Data Frame with features as columns and samples as rows.
    y : pandas.Series
      Series containing class membership.
    n_splits : int
      Number of cross-validation folds.
    plot : bool
      Option to plot feature importance.
    figsize : (float, float)
      Dimensions of the plot.
    
    Returns
    -------
    pandas.DataFrame
      Data frame with features importance.
    
    Notes
    -----
      Function adapted from "Machine Learning for Asset Managers",
      Marcos López de Prado (2020).
    """

    # Checks
    if not isinstance(X, pd.DataFrame):
        raise AssertionError("X must be pandas.DataFrame.")
    if not isinstance(y, pd.Series):
        raise AssertionError("y must be pandas.Series.")

    # Generate K-fold cross validation
    cv_gen = KFold(n_splits=n_splits)
    scr0 = pd.Series(dtype='float64')
    scr1 = pd.DataFrame(columns=X.columns)

    # Loop over the folds:
    for i, (train, test) in enumerate(cv_gen.split(X=X)):

        # Train/Test split
        X0, y0 = X.iloc[train, :], y.iloc[train]
        X1, y1 = X.iloc[test, :], y.iloc[test]

        # Fit
        fit = classifier.fit(X=X0, y=y0)

        # Prediction before shuffling
        prob = fit.predict_proba(X1)
        scr0.loc[i] = -log_loss(y1, prob, labels=classifier.classes_)

        for j in X.columns:
            X1_ = X1.copy(deep=True)

            # Shuffle one column
            np.random.shuffle(X1_[j].values)

            # Prediction after shuffling
            prob = fit.predict_proba(X1_)
            scr1.loc[i, j] = -log_loss(y1, prob, labels=classifier.classes_)

    fimp_df = (-1 * scr1).add(scr0, axis=0)
    fimp_df = fimp_df / (-1 * scr1)
    fimp_df = pd.concat(
        {
            'Importance Mean': fimp_df.mean(),
            'Importance Std': fimp_df.std() * fimp_df.shape[0]**-.5
        },
        axis=1)

    # Sort values
    sorted_fimp = fimp_df.sort_values(by='Importance Mean')

    # Plot
    if plot:
        plt.figure(figsize=figsize)
        plt.title(
            "Feature importance based on out-of-sample Mean-Decrease Accuracy (MDA)."
        )
        plt.barh(y=sorted_fimp.index,
                 width=sorted_fimp['Importance Mean'],
                 xerr=sorted_fimp['Importance Std'])
        plt.show()

    return sorted_fimp


#---------#---------#---------#---------#---------#---------#---------#---------#---------#
Example #18
def cross_validation():
    """
    Computes multiple classifiers, saves scores, plots data file

    :return: Dict with classifiers scores
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(),
        SGDClassifier(),
        LogisticRegression()
    ]
    names = [
        "Nearest Neighbors",
        "Linear SVM",
        "RBF SVM",
        "Gaussian Process",
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "QDA",
        "SGD",
        "Logistic Regression"
    ]
    # Splitting data for cross validation
    kf = KFold(n_splits=5, shuffle=True)
    splits = list(kf.split(X))
    cv_result = {}
    # Iterate through classifiers, saving each score to a dict
    for classifier, name in zip(classifiers, names):
        model = classifier
        model_accuracy = []
        model_precision = []
        model_recall = []
        model_f1score = []
        for split in splits:
            train_indices, test_indices = split
            X_train = X[train_indices]
            X_test = X[test_indices]
            y_train = y[train_indices]
            y_test = y[test_indices]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            model_accuracy.append(accuracy_score(y_test, y_pred))
            model_precision.append(precision_score(y_test, y_pred, average='weighted'))
            model_recall.append(recall_score(y_test, y_pred, average='weighted'))
            model_f1score.append(f1_score(y_test, y_pred, average='weighted'))
        cv_result[name] = [model_accuracy, model_precision, model_recall, model_f1score]
    # Saving to a data file
    export_df = pd.DataFrame()
    for key, values in cv_result.items():
        export_df[f'{key}'] = values
    export_df.to_csv('data/cv_result.csv')

    return cv_result
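
# A tiny follow-up sketch (assumed): average the per-fold scores collected above
# into one row per classifier. It presumes the module-level X and y that
# cross_validation() reads are already defined.
import numpy as np
import pandas as pd

cv_result = cross_validation()
summary = pd.DataFrame(
    {name: [np.mean(m) for m in metrics] for name, metrics in cv_result.items()},
    index=['accuracy', 'precision', 'recall', 'f1'],
).T
print(summary.sort_values('accuracy', ascending=False))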
Example #19
def main():
    if args.crop_size:
        print('Using crops of shape ({}, {})'.format(args.crop_size, args.crop_size))
    else:
        print('Using full size images')

    all_ids = np.array(generate_ids(args.data_dirs, args.clahe))
    np.random.seed(args.seed)
    kfold = KFold(n_splits=args.n_folds, shuffle=True)

    splits = [s for s in kfold.split(all_ids)]
    folds = [int(f) for f in args.fold.split(",")]
    for fold in folds:
        encoded_alias = encode_params(args.clahe, args.preprocessing_function, args.stretch_and_mean)
        city = "all"
        if args.city:
            city = args.city.lower()
        best_model_file = '{}/{}_{}_{}.h5'.format(args.models_dir, encoded_alias, city, args.network)
        channels = 8
        if args.ohe_city:
            channels = 12
        model = make_model(args.network, (None, None, channels))

        if args.weights is None:
            print('No weights passed, training from scratch')
        else:
            print('Loading weights from {}'.format(args.weights))
            model.load_weights(args.weights, by_name=True)
        freeze_model(model, args.freeze_till_layer)

        optimizer = RMSprop(lr=args.learning_rate)
        if args.optimizer:
            if args.optimizer == 'rmsprop':
                optimizer = RMSprop(lr=args.learning_rate)
            elif args.optimizer == 'adam':
                optimizer = Adam(lr=args.learning_rate)
            elif args.optimizer == 'sgd':
                optimizer = SGD(lr=args.learning_rate, momentum=0.9, nesterov=True)

        train_ind, test_ind = splits[fold]
        train_ids = all_ids[train_ind]
        val_ids = all_ids[test_ind]
        if args.city:
            val_ids = [id for id in val_ids if args.city in id[0]]
            train_ids = [id for id in train_ids if args.city in id[0]]
        print('Training fold #{}, {} in train_ids, {} in val_ids'.format(fold, len(train_ids), len(val_ids)))
        masks_gt = get_groundtruth(args.data_dirs)
        if args.clahe:
            template = 'CLAHE-MUL-PanSharpen/MUL-PanSharpen_{id}.tif'
        else:
            template = 'MUL-PanSharpen/MUL-PanSharpen_{id}.tif'

        train_generator = MULSpacenetDataset(
            data_dirs=args.data_dirs,
            wdata_dir=args.wdata_dir,
            clahe=args.clahe,
            batch_size=args.batch_size,
            image_ids=train_ids,
            masks_dict=masks_gt,
            image_name_template=template,
            seed=args.seed,
            ohe_city=args.ohe_city,
            stretch_and_mean=args.stretch_and_mean,
            preprocessing_function=args.preprocessing_function,
            crops_per_image=args.crops_per_image,
            crop_shape=(args.crop_size, args.crop_size),
            random_transformer=RandomTransformer(horizontal_flip=True, vertical_flip=True),
        )

        val_generator = MULSpacenetDataset(
            data_dirs=args.data_dirs,
            wdata_dir=args.wdata_dir,
            clahe=args.clahe,
            batch_size=1,
            image_ids=val_ids,
            image_name_template=template,
            masks_dict=masks_gt,
            seed=args.seed,
            ohe_city=args.ohe_city,
            stretch_and_mean=args.stretch_and_mean,
            preprocessing_function=args.preprocessing_function,
            shuffle=False,
            crops_per_image=1,
            crop_shape=(1280, 1280),
            random_transformer=None
        )
        best_model = ModelCheckpoint(filepath=best_model_file, monitor='val_dice_coef_clipped',
                                     verbose=1,
                                     mode='max',
                                     save_best_only=False,
                                     save_weights_only=True)
        model.compile(loss=make_loss(args.loss_function),
                      optimizer=optimizer,
                      metrics=[dice_coef, binary_crossentropy, ceneterline_loss, dice_coef_clipped])

        def schedule_steps(epoch, steps):
            for step in steps:
                if step[1] > epoch:
                    print("Setting learning rate to {}".format(step[0]))
                    return step[0]
            print("Setting learning rate to {}".format(steps[-1][0]))
            return steps[-1][0]

        callbacks = [best_model, EarlyStopping(patience=20, verbose=1, monitor='val_dice_coef_clipped', mode='max')]

        if args.schedule is not None:
            steps = [(float(step.split(":")[0]), int(step.split(":")[1])) for step in args.schedule.split(",")]
            lrSchedule = LearningRateScheduler(lambda epoch: schedule_steps(epoch, steps))
            callbacks.insert(0, lrSchedule)

        if args.clr is not None:
            clr_params = args.clr.split(',')
            base_lr = float(clr_params[0])
            max_lr = float(clr_params[1])
            step = int(clr_params[2])
            mode = clr_params[3]
            clr = CyclicLR(base_lr=base_lr, max_lr=max_lr, step_size=step, mode=mode)
            callbacks.append(clr)

        steps_per_epoch = len(all_ids) / args.batch_size + 1
        if args.steps_per_epoch:
            steps_per_epoch = args.steps_per_epoch

        model.fit_generator(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=args.epochs,
            validation_data=val_generator,
            validation_steps=len(val_ids),
            callbacks=callbacks,
            max_queue_size=30,
            verbose=1,
            workers=args.num_workers)

        del model
        K.clear_session()
        gc.collect()
Example #20
def main():
    if args.crop_size:
        print('Using crops of shape ({}, {})'.format(args.crop_size,
                                                     args.crop_size))
    else:
        print('Using full size images')

    all_ids = np.array(generate_ids(args.data_dirs, args.clahe))
    np.random.seed(args.seed)
    kfold = KFold(n_splits=args.n_folds, shuffle=True)

    splits = [s for s in kfold.split(all_ids)]
    folds = [int(f) for f in args.fold.split(",")]
    for fold in folds:
        encoded_alias = encode_params(args.clahe, args.preprocessing_function,
                                      args.stretch_and_mean)
        city = "all"
        if args.city:
            city = args.city.lower()
        best_model_file = '{}/{}_{}_{}.h5'.format(args.models_dir,
                                                  encoded_alias, city,
                                                  args.network)
        channels = 8
        if args.ohe_city:
            channels = 12
        model = make_model(args.network, (None, None, channels))

        if args.weights is None:
            print('No weights passed, training from scratch')
        else:
            print('Loading weights from {}'.format(args.weights))
            model.load_weights(args.weights, by_name=True)
        freeze_model(model, args.freeze_till_layer)

        optimizer = RMSprop(lr=args.learning_rate)
        if args.optimizer:
            if args.optimizer == 'rmsprop':
                optimizer = RMSprop(lr=args.learning_rate)
            elif args.optimizer == 'adam':
                optimizer = Adam(lr=args.learning_rate)
            elif args.optimizer == 'sgd':
                optimizer = SGD(lr=args.learning_rate,
                                momentum=0.9,
                                nesterov=True)

        train_ind, test_ind = splits[fold]
        train_ids = all_ids[train_ind]
        val_ids = all_ids[test_ind]
        if args.city:
            val_ids = [id for id in val_ids if args.city in id[0]]
            train_ids = [id for id in train_ids if args.city in id[0]]
        print('Training fold #{}, {} in train_ids, {} in val_ids'.format(
            fold, len(train_ids), len(val_ids)))
        masks_gt = get_groundtruth(args.data_dirs)
        if args.clahe:
            template = 'CLAHE-MUL-PanSharpen/MUL-PanSharpen_{id}.tif'
        else:
            template = 'MUL-PanSharpen/MUL-PanSharpen_{id}.tif'

        train_generator = MULSpacenetDataset(
            data_dirs=args.data_dirs,
            wdata_dir=args.wdata_dir,
            clahe=args.clahe,
            batch_size=args.batch_size,
            image_ids=train_ids,
            masks_dict=masks_gt,
            image_name_template=template,
            seed=args.seed,
            ohe_city=args.ohe_city,
            stretch_and_mean=args.stretch_and_mean,
            preprocessing_function=args.preprocessing_function,
            crops_per_image=args.crops_per_image,
            crop_shape=(args.crop_size, args.crop_size),
            random_transformer=RandomTransformer(horizontal_flip=True,
                                                 vertical_flip=True),
        )

        val_generator = MULSpacenetDataset(
            data_dirs=args.data_dirs,
            wdata_dir=args.wdata_dir,
            clahe=args.clahe,
            batch_size=1,
            image_ids=val_ids,
            image_name_template=template,
            masks_dict=masks_gt,
            seed=args.seed,
            ohe_city=args.ohe_city,
            stretch_and_mean=args.stretch_and_mean,
            preprocessing_function=args.preprocessing_function,
            shuffle=False,
            crops_per_image=1,
            crop_shape=(1280, 1280),
            random_transformer=None)
        best_model = ModelCheckpoint(filepath=best_model_file,
                                     monitor='val_dice_coef_clipped',
                                     verbose=1,
                                     mode='max',
                                     save_best_only=False,
                                     save_weights_only=True)
        model.compile(loss=make_loss(args.loss_function),
                      optimizer=optimizer,
                      metrics=[
                          dice_coef, binary_crossentropy, ceneterline_loss,
                          dice_coef_clipped
                      ])

        def schedule_steps(epoch, steps):
            for step in steps:
                if step[1] > epoch:
                    print("Setting learning rate to {}".format(step[0]))
                    return step[0]
            print("Setting learning rate to {}".format(steps[-1][0]))
            return steps[-1][0]

        callbacks = [
            best_model,
            EarlyStopping(patience=20,
                          verbose=1,
                          monitor='val_dice_coef_clipped',
                          mode='max')
        ]

        if args.schedule is not None:
            steps = [(float(step.split(":")[0]), int(step.split(":")[1]))
                     for step in args.schedule.split(",")]
            lrSchedule = LearningRateScheduler(
                lambda epoch: schedule_steps(epoch, steps))
            callbacks.insert(0, lrSchedule)

        if args.clr is not None:
            clr_params = args.clr.split(',')
            base_lr = float(clr_params[0])
            max_lr = float(clr_params[1])
            step = int(clr_params[2])
            mode = clr_params[3]
            clr = CyclicLR(base_lr=base_lr,
                           max_lr=max_lr,
                           step_size=step,
                           mode=mode)
            callbacks.append(clr)

        steps_per_epoch = len(all_ids) / args.batch_size + 1
        if args.steps_per_epoch:
            steps_per_epoch = args.steps_per_epoch

        model.fit_generator(train_generator,
                            steps_per_epoch=steps_per_epoch,
                            epochs=args.epochs,
                            validation_data=val_generator,
                            validation_steps=len(val_ids),
                            callbacks=callbacks,
                            max_queue_size=30,
                            verbose=1,
                            workers=args.num_workers)

        del model
        K.clear_session()
        gc.collect()
    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)
def main():
    parser = make_parser()
    args = parser.parse_args()

    # load data
    train_x, train_y = read_input_data(args.train_h5)
    test_x, test_y = read_input_data(args.test_h5) # used as val for now

    # SpaceNet
    all_ids = np.array(generate_ids(args.data_dirs, None))
    kfold = KFold(n_splits=2, shuffle=True)  # args.n_folds
    splits = [s for s in kfold.split(all_ids)]
    folds = [int(f) for f in '0'.split(",")]
    fold = folds[0]
    train_ind, test_ind = splits[fold]
    train_ids = all_ids[train_ind]
    val_ids = all_ids[test_ind]

    train_generator = MULSpacenetDataset(
        data_dirs=args.data_dirs,
        wdata_dir=args.wdata_dir,
        image_ids=train_ids,
        crop_shape=(args.crop_size, args.crop_size),
        batch_size=args.large_batch_size,
        seed=777,
        image_name_template='PS-MS/SN3_roads_train_AOI_5_Khartoum_PS-MS_{id}.tif',
        masks_dict=get_groundtruth(args.data_dirs)
    )

    val_generator = MULSpacenetDataset(
        data_dirs=args.data_dirs,
        wdata_dir=args.wdata_dir,
        image_ids=val_ids,
        batch_size=args.test_batch_size,
        crop_shape=(args.crop_size, args.crop_size),
        seed=777,
        image_name_template='PS-MS/SN3_roads_train_AOI_5_Khartoum_PS-MS_{id}.tif',
        masks_dict=get_groundtruth(args.data_dirs),
    )

    # x in shape (channels, width, height, num_images) ?
    x, y = next(train_generator)
    train_y_shape = y.shape

    images_scale = np.max(train_x)
    if images_scale > 1:
        print('Normalizing images by a factor of {}'.format(images_scale))
        train_x = train_x / images_scale
        test_x = test_x / images_scale
    input_data = (train_x, train_y, test_x, test_y) # package for more concise argument passing

    if args.test_batch_size == 0:
        args.test_batch_size = test_y.shape[0]

    print('Data shapes:', train_x.shape, train_y.shape, test_x.shape, test_y.shape)
    if train_y.shape[0] % args.large_batch_size != 0:
        print("WARNING large batch size doesn't divide train set evenly")
    if test_y.shape[0] % args.test_batch_size != 0:
        print("WARNING batch size doesn't divide test set evenly")

    # get all_weights. Do it in 1 chunk if it fits into memory
    hf_weights = h5py.File(args.weights_h5, 'r')
    if args.stream_inputs:
        all_weights = hf_weights['all_weights'] # future work: change to streamds if you want it to be faster
    else:
        all_weights = np.array(hf_weights['all_weights'], dtype='f8')
    shapes = [literal_eval(s) for s in hf_weights.attrs['var_shapes'].decode('utf-8').split(';')]

    print(all_weights.shape)
    print(shapes)
    num_iters = min(args.max_iters, all_weights.shape[0] - 1)
    dim_sum = all_weights.shape[1]

    # set up output file
    output_name = args.output_h5
    if not output_name: # use default gradients name
        assert args.weights_h5[-8:] == '/weights'
        output_name = args.weights_h5[:-8] + '/gradients_adaptive'
        if args.max_iters < all_weights.shape[0] - 1:
            output_name += '_{}iters'.format(args.max_iters)
    print('Writing gradients to file {}'.format(output_name))
    dsets = {}
    hf_grads = h5py.File(output_name, 'w-')
    dsets['trainloss'] = hf_grads.create_dataset('trainloss', (num_iters + 1,), dtype='f4', compression='gzip')
    dsets['testloss'] = hf_grads.create_dataset('testloss', (num_iters + 1,), dtype='f4', compression='gzip')
    dsets['num_splits'] = hf_grads.create_dataset('num_splits', (num_iters,), dtype='i', compression='gzip')

    pool = ThreadPool(args.num_gpus)
    iters_to_calc = divide_with_remainder(num_iters, args.num_gpus)
    results = []
    overall_timerstart = time.time()

    for gpu in range(args.num_gpus):
        # each process writes to a different variable in the file
        dsets['grads_train_{}'.format(gpu)] = hf_grads.create_dataset(
            'grads_train_{}'.format(gpu), (len(iters_to_calc[gpu]) * args.default_num_splits + 1, dim_sum),
            maxshape=(None, dim_sum), dtype='f4', compression='gzip')
        dsets['grads_test_{}'.format(gpu)] = hf_grads.create_dataset(
            'grads_test_{}'.format(gpu), (len(iters_to_calc[gpu]) * args.default_num_splits + 1, dim_sum),
            maxshape=(None, dim_sum), dtype='f4', compression='gzip')

        if args.num_gpus > 1:
            ret = pool.apply_async(run_thread, (gpu, iters_to_calc[gpu], all_weights, shapes, train_y_shape, train_generator, val_generator, dim_sum,
                    args, dsets, hf_grads))
            results.append(ret)
        else:
            run_thread(gpu, iters_to_calc[gpu], all_weights, shapes, train_y_shape, train_generator, val_generator, dim_sum,
                args, dsets, hf_grads)

    pool.close()
    pool.join()
    print('return values: ', [res.get() for res in results])
    print('total time elapsed:', time.time() - overall_timerstart)
    hf_weights.close()
    hf_grads.close()
Example #23
    def __init__(self, n_folds, shuffle=True, **kwargs):
        self.n_folds = n_folds
        self.base_kfold = KFold(self.n_folds, shuffle=shuffle, **kwargs)