def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    agglo_mean.fit(X)
    agglo_median.fit(X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)

    assert_array_almost_equal(agglo_mean.transform(X_full_mean),
                              Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
Example #2
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
        assert_equal(len(warning_list), 1)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
Example #7
def TrainRFRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X, pd.np.ravel(y))
    reduced_X = X_reduction.transform(X)

    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)

    # fit our data
    res = mod.fit(reduced_X, pd.np.ravel(y))

    # evaluate our fit
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = metrics.mean_absolute_error(yt, yp)
    # save our model, including scalers and feature agglomerator
    with open('RFR_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)

    return r2, rmse
Example #8
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))
    feature_groups_nos = []
    for feature_group_key in feature_groups_map:
        feature_groups_nos.append(feature_groups_map[feature_group_key])

    group_labels = []
    for feature_group_no in set(feature_groups_nos):
        group_label = ""
        for feature_groups_key in feature_groups_map:
            if feature_groups_map[feature_groups_key] == feature_group_no:
                group_label = group_label + feature_groups_key + ", "
        group_labels.append(group_label[0:-2])

    voters_agglomerated = pd.DataFrame(condensed,
                                       columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(lambda x: round(x))
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
Example #9
def TestSGDRegression(df1):

    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', '0+'),
                data=df1,
                return_type='dataframe')

    # load our model, including scalers and feature agglomerator
    with open('SGD_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)

    # standardize our data
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # predict the interest rates
    yp = res.predict(std_X)

    return yp
Example #10
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Example #11
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward', pooling_func='mean',
                 random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median, max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data

        if self.model is None:
            self.n_clusters = int(self.n_clusters)

            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]

            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)

        X_new = self.model.transform(X)

        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400, default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"], default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")

        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])

        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
Example #12
def featureagglomeration(data_train, data_test, label_train, label_test, args):
    print('feature agglomeration')
    FA = FeatureAgglomeration(n_clusters=10).fit(data_train)
    transformation = FA.transform(data_test)
    agglomeration = find_highest(transformation)
    print('feature agglomeration done')
    compare_class(agglomeration, label_test)
    if args.create_mean:
        create_images_from_rows('fa', mean_image(agglomeration, data_test))
Example #13
def test_ward_agglomeration():
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == 5

    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])
def main():

    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
Example #15
def perform_feature_agglomeration(train_X, train_Y, test_X, test_Y):
    n_clusters = [32]
    fagg_model_accuracies = pd.DataFrame()
    for n_cluster in n_clusters:
        agglo = FeatureAgglomeration(connectivity=None, n_clusters=n_cluster)
        agglo.fit(train_X)
        train_X_reduced = agglo.transform(train_X)
        test_X_reduced = agglo.transform(test_X)

        svc_acc_val = perform_svc(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        rfc_acc_val = perform_rfc(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        knn_acc_val = perform_knn(train_X_reduced, train_Y, test_X_reduced,
                                  test_Y)

        lr_acc_val = perform_linear_regression(train_X_reduced, train_Y,
                                               test_X_reduced, test_Y)

        lc_acc_val = perform_linear_lasso(train_X_reduced, train_Y,
                                          test_X_reduced, test_Y)

        rr_acc_val = perform_ridge_regression(train_X_reduced, train_Y,
                                              test_X_reduced, test_Y)

        enet_acc_val = perform_elastinet_regression(train_X_reduced, train_Y,
                                                    test_X_reduced, test_Y)

        fagg_model_accuracies = fagg_model_accuracies.append([
            svc_acc_val, rfc_acc_val, knn_acc_val, lr_acc_val, lc_acc_val,
            rr_acc_val, enet_acc_val
        ])
        cols = list(fagg_model_accuracies.columns.values)
        cols = cols[-1:] + cols[:-1]
        fagg_model_accuracies = fagg_model_accuracies[cols]
        fagg_model_accuracies = fagg_model_accuracies.sort_values(
            by='r2_score')
    return fagg_model_accuracies
Example #16
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.
    This function is used with Randomized Parcellation Based Inference, so we
    need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    fit_samples_indices : array-like,
      Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix,
      Graph representing the spatial structure of the images (i.e. connections
      between voxels).

    n_parcels : int,
      Number of parcels for the parcellations.

    offset_labels : int,
      Offset for labels numbering.
      The purpose is to have different labels in all the parcellations that
      can be built by multiple calls to the current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
      Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
      Labels giving the correspondence between voxels and parcels.

    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)
    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
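
A minimal sketch of how the helper above might be called on synthetic data. The 10x10 voxel grid, the subsample indices, and all variable names are illustrative assumptions, not part of the original example.

import numpy as np
from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_extraction.image import grid_to_graph

rng = np.random.RandomState(0)
all_subjects_data = rng.randn(30, 100)            # 30 subjects, 100 "voxels"
connectivity = grid_to_graph(10, 10)              # voxels laid out on a 10x10 grid
fit_samples_indices = rng.choice(30, size=15, replace=False)

# Parcellation is fitted on the subsample, then applied to all subjects
parcelled_data, labels = _ward_fit_transform(
    all_subjects_data, fit_samples_indices, connectivity,
    n_parcels=20, offset_labels=0)

print(parcelled_data.shape)        # (30, 20): average signal per parcel
print(np.unique(labels).size)      # 20 parcel labels over the 100 voxels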
Example #18
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    with pytest.warns(None) as record:
        agglo_mean.fit(X)
    assert not [w.message for w in record]
    with pytest.warns(None) as record:
        agglo_median.fit(X)
    assert not [w.message for w in record]

    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.0])
    assert Xt_median == np.array([0.0])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
Example #19
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    fmri_masked : array_like
         A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints:
         the functional dataset that needs to be reduced
    mask_img : a nibabel image object of the mask
    mask_np : a numpy array of the mask
    output_size : integer
        The number of elements that the data should be reduced to

    """

    ## Transform nifti files to a data matrix with the NiftiMasker
    import time
    from nilearn import input_data

    datacompressiontime = time.time()
    nifti_masker = input_data.NiftiMasker(mask_img=mask_img,
                                          memory='nilearn_cache',
                                          mask_strategy='background',
                                          memory_level=1,
                                          standardize=False)

    ward = []

    # Perform Ward clustering
    from sklearn.feature_extraction import image
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    #import pdb;pdb.set_trace()
    from sklearn.cluster import FeatureAgglomeration
    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)
    #print("Ward agglomeration compressing voxels into clusters: %.2fs" % (time.time() - start))

    labels = ward.labels_

    #print ('Extracting reduced Dimension Data')
    data_reduced = ward.transform(fmri_masked)
    fmri_masked = []
    #print('Data compression took ', (time.time()- datacompressiontime), ' seconds')
    return {'data': data_reduced, 'labels': labels}
Example #20
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    # TODO @AKI update doc
    """
    Compress the masked fMRI data by Ward feature agglomeration.

    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
           A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints:
           the functional dataset that needs to be reduced
    mask_img : a nibabel image object of the mask
    mask_np : a numpy array of the mask
    compression_dim : integer
        The number of elements that the data should be reduced to

    Returns
    -------
    A dictionary with the fitted compressor, the compressed data and the
    parcel labels.

    """

    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration

    # Perform Ward clustering
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0],
                                       n_y=shape[1],
                                       n_z=shape[2],
                                       mask=mask_np)

    ward = FeatureAgglomeration(n_clusters=compression_dim,
                                connectivity=connectivity,
                                linkage='ward')

    ward.fit(fmri_masked)

    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)

    return {
        'compressor': ward,
        'compressed': data_reduced,
        'labels': labels,
    }
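
A minimal sketch of calling the data_compression helper above on synthetic data. The 4x4x4 mask, the array sizes, and the variable names are made-up assumptions for illustration; mask_img is accepted but not used by this variant of the function, so None is passed.

import numpy as np

rng = np.random.RandomState(0)
mask_np = np.ones((4, 4, 4), dtype=bool)            # 64 "voxels" on a 4x4x4 grid
fmri_masked = rng.randn(10, int(mask_np.sum()))     # 10 timepoints x 64 voxels

result = data_compression(fmri_masked, None, mask_np, compression_dim=8)
print(result['compressed'].shape)                   # (10, 8)
print(np.unique(result['labels']).size)             # 8 parcels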
Example #21
def TrainSGDRegression(df1):
    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand',
                    ['any_regressand', 'X25', 'X26', 'X29', 'X13'])

    # build our design matrices
    y, X = dmatrices(eqn, data=df1, return_type='dataframe')

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=n_clusters).fit(X)
    reduced_X = X_reduction.transform(X)
    X_scaler = StandardScaler().fit(reduced_X)
    std_X = X_scaler.transform(reduced_X)

    # y_scaler = StandardScaler().fit(y)
    # standardize our data

    # std_y = y_scaler.transform(y)

    # define our regressor
    mod = SGDRegressor(loss='epsilon_insensitive',
                       penalty='elasticnet',
                       alpha=0.0014,
                       epsilon=0.32,
                       n_iter=n_iterations)
    # fit our data
    #res = mod.fit(std_X,pd.np.ravel(std_y))
    res = mod.fit(std_X, pd.np.ravel(y))

    # evaluate our fit
    yp = res.predict(std_X)

    yp = pd.DataFrame({'predicted': yp})
    yp = yp['predicted']
    yt = y['regressand']
    r2 = metrics.r2_score(yt, yp)
    rmse = metrics.mean_absolute_error(yt, yp)

    #save our model
    with open('SGD_trained_model.pickle', 'wb') as output:
        pickle.dump(res, output, pickle.HIGHEST_PROTOCOL)

    return r2, rmse
Example #22
def run_evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="Image folder.", default="faces")
    parser.add_argument('--output',
                        help="Statistics output folder.",
                        default="stats")
    args = parser.parse_args()

    # load embeddings
    emb_1 = load_embeddings('embeddings_matthias.pkl')
    emb_2 = load_embeddings('embeddings_laia.pkl')
    emb_3 = load_embeddings('embeddings_elias.pkl')
    emb_lfw = load_embeddings('embeddings_lfw.pkl')

    if emb_1 is None or emb_2 is None:
        print "--- embeddings could not be loaded. Aborting..."
        return

    # ------------------- EVALUATION ON ORIGINAL VECTORS

    ph = PlotHandler()

    # ==== 1. PCA DIMENSION REDUCTION

    # ph.PlotVarianceContribution(emb_lfw)
    # # reduce dimensionality
    # basis, mean = ExtractSubspace(emb_lfw, 0.999)
    # # dump_to_hd("lfw_99.9_subspace.pkl", (basis, mean))
    #
    # reduced_data = ProjectOntoSubspace(emb_lfw, mean, basis)
    # ph.SetTitle("Component Variance Contribution on Subspace")
    # ph.PlotVarianceContribution(reduced_data)
    # ph.Show()

    # ==== 1. FEATURE AGGLOMERATION

    agglo = FeatureAgglomeration(n_clusters=20)
    agglo.fit(emb_lfw)
    X_reduced = agglo.transform(emb_1)

    print(np.shape(X_reduced))
Example #23
def test_random_feature_agglomeration_encoder_load():
    train_data = np.random.rand(2000, input_dim)

    from sklearn.cluster import FeatureAgglomeration
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    filename = 'feature_agglomeration_model.model'
    pickle.dump(model.fit(train_data), open(filename, 'wb'))

    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)

    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
Example #24
def TestRFRegression(df1):

    # generate the equation to use for our design matrices
    eqn = build_eqn(df1, 'regressand', ['any_regressand', 'X25', 'X26', 'X29'])

    # build our design matrices
    X = dmatrix(eqn.replace('regressand ~ ', ''),
                data=df1,
                return_type='dataframe')

    # load our model, including scalers and feature agglomerator
    with open('RFR_trained_model.pickle', 'rb') as input:
        res = pickle.load(input)

    # employ clustering to reduce our dimensionality
    X_reduction = FeatureAgglomeration(n_clusters=50).fit(X)
    reduced_X = X_reduction.transform(X)
    # define our regressor
    mod = RandomForestRegressor(n_estimators=50)

    # predict the interest rates
    yp = pd.DataFrame({'predicted': res.predict(reduced_X)})

    return yp
Example #25
    data1_X_ica = ica1.fit_transform(data1_X_train)
    data1_X_ica_test = ica1.transform(data1_X_test)
    ica2 = FastICA(n_components=90)
    data2_X_ica = ica2.fit_transform(data2_X_train)
    data2_X_ica_test = ica2.transform(data2_X_test)

    grp1 = GaussianRandomProjection(n_components=20)
    data1_X_grp = grp1.fit_transform(data1_X_train)
    data1_X_grp_test = grp1.transform(data1_X_test)
    grp2 = GaussianRandomProjection(n_components=90)
    data2_X_grp = grp2.fit_transform(data2_X_train)
    data2_X_grp_test = grp2.transform(data2_X_test)

    fa1 = FeatureAgglomeration(n_clusters=20)
    data1_X_fa = fa1.fit_transform(data1_X_train)
    data1_X_fa_test = fa1.transform(data1_X_test)
    fa2 = FeatureAgglomeration(n_clusters=90)
    data2_X_fa = fa2.fit_transform(data2_X_train)
    data2_X_fa_test = fa2.transform(data2_X_test)
    ''' clustering '''

    clusters = np.logspace(0.5,
                           2,
                           num=10,
                           endpoint=True,
                           base=10.0,
                           dtype=None)
    for i in range(0, len(clusters)):
        clusters[i] = int(clusters[i])
    print(clusters)
from sklearn.linear_model import SGDRegressor
from DataTransformations import *

df1=transform_data(sm.load('full_chain_data.pickle'))
base_parameters = {'alpha' : [.00001,.0001,.001,.01, .1] , \
                   'epsilon' : [.1, .2, .3], \
                   'penalty' : ['l2', 'elasticnet'], \
                   'loss' : ['huber', 'epsilon_insensitive']}

eqn = build_eqn(df1,'regressand', ['any_regressand','X25','X26'])
print(eqn)
y, X = dmatrices(eqn, data=df1,return_type = 'dataframe')
print('design matrices generated')

X_reduction = FeatureAgglomeration(n_clusters=100).fit(X,pd.np.ravel(y))
reduced_X = X_reduction.transform(X)

X_scaler = StandardScaler().fit(reduced_X)
std_X = X_scaler.transform(reduced_X)

y_scaler = StandardScaler().fit(y)
std_y = y_scaler.transform(y)
print (std_y.shape)
#svr = SGDRegressor(n_iter = 20, penalty = 'elasticnet', loss='epsilon_insensitive')
svr = SGDRegressor(n_iter = 30, penalty = 'elasticnet', loss= 'epsilon_insensitive', alpha = .0014, epsilon = .32)

#parameters = { 'alpha' : pd.np.arange(.001,.002,.0001), 'epsilon' : pd.np.arange(.25,.35,.01)}
parameters = { 'l1_ratio' : pd.np.arange(.1,.6,.02)}

SGD_clf = GridSearchCV(svr, parameters, verbose = True)
SGD_clf.fit(std_X, pd.np.ravel(std_y))
Example #27
fulldata = np.concatenate((X, test), axis=0)

# agg = FeatureAgglomeration(n_clusters = 2000)
# print ("fitting")
# agg.fit(fulldata)
# print "transform"
# fulldata_agg = agg.transform(fulldata)

first500 = fulldata[:,:500]
second = fulldata[:,500:]

agg = FeatureAgglomeration(n_clusters = 200)
print ("fitting")
agg.fit(first500)
first500_agg = agg.transform(first500)

agg = FeatureAgglomeration(n_clusters=1900)
print ("fitting")
agg.fit(second)
second_agg = agg.transform(second)

# new_X = fulldata[:9501,:]
# new_test = fulldata[9501:,:]

new_X = np.concatenate((first500_agg[:9501,:], second_agg[:9501,:]), axis=1)
new_test = np.concatenate((first500_agg[9501:,:], second_agg[9501:,:]), axis=1)

print("saving data...")
np.savetxt("../Data/train_agg_2100.csv", new_X, delimiter=",")
print("saving data...")
Example #28
def train_and_test(alpha,
                   predictors,
                   predictor_params,
                   x_filename,
                   y_filename,
                   n_users,
                   percTest,
                   featureset_to_use,
                   diff_weighting,
                   phi,
                   force_balanced_classes,
                   do_scaling,
                   optimise_predictors,
                   report,
                   conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")

    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()

    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    #print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X,
                                                        all_y,
                                                        test_size=percTest,
                                                        random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:,mask]
    # X_test = X_test[:,mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl',
                  'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if (force_balanced_classes):
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  #0.118)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())

        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train,
                                      predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open(
                '../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                    p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | |  |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^|  |) ||  |  \ | | *** ** *\n")
    #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):

        report.write(",".join(
            map(str, (all_X.shape[0], str(p).replace(",", ";").replace(
                "\n", ""), force_balanced_classes, diff_weighting, alpha, phi,
                      do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x,y,yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(
                str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")
        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test,
                                                     y_pred,
                                                     labels=classes,
                                                     average=None,
                                                     warn_for=('precision',
                                                               'recall',
                                                               'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            labels=classes,
            average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix],
                                                   s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred)+"\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
Example #29
    DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5, max_depth=None)

    error_rate_train_DT_1 = sum(
        DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0]
    print "error_rate_train_DT_1", error_rate_train_DT_1
    error_rate_test_DT_1 = sum(
        DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0]
    print "error_rate_test_DT_2", error_rate_test_DT_1

    for i in range(0, np.shape(data1_X_train)[1]):
        print(i)
        start_time = time.time()
        fa.set_params(n_clusters=i + 1)
        data1_X_train_fa = fa.fit_transform(data1_X_train)
        data1_X_test_fa = fa.transform(data1_X_test)

        error_rate_train_1[i] = sum(
            DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_train_fa) == data1_y_train) * 1.0 / \
                                data1_y_train.shape[0]
        print("error_rate_train_1[%f]" % i), error_rate_train_1[i]
        error_rate_test_1[i] = sum(
            DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_test_fa) == data1_y_test) * 1.0 / \
                               data1_y_test.shape[0]
        print("error_rate_test_1[%f]" % i), error_rate_test_1[i]
        print "time consumed:", time.time() - start_time

    file_2.write("FA_error_rate_train_1")
    for i in range(0, len(error_rate_train_1)):
        file_2.write(";")
        file_2.write("%1.9f" % error_rate_train_1[i])
Example #30
        #print(goodness)
        goods = goods + [goodness]
    goods = pd.DataFrame(goods)
    avg = pd.concat([avg,goods],axis=1)
print(avg)
'''
'''
fa = FeatureAgglomeration(n_clusters=7).fit(X)
newdata = fa.fit_transform(X)
newdata = pd.DataFrame(newdata)
print(X.head(10))
print(newdata.head(10))
'''

fa = FeatureAgglomeration(n_clusters=5).fit(X)
newdata = fa.transform(X)
recon = fa.inverse_transform(newdata)
recon = pd.DataFrame(recon)
print(reconError(X, recon))
print(pd.DataFrame(fa.labels_))
print(fa.n_leaves_)
print(fa.n_components)
print(pd.DataFrame(fa.children_))
'''
#Finds the K that maximizes AR score
goods  = []
for i in range(2,20):
    labels = KMeans(n_clusters=i).fit(newdata).labels_
    labels_true = Y.tolist()
    goodness = metrics.adjusted_rand_score(labels_true,labels)
    goods.append([i,goodness])
Example #31
"""
class sklearn.cluster.FeatureAgglomeration(n_clusters = 2,
										   affinity = "euclidean",
										   memory = None,
										   connectivity = None,
										   compute_full_tree = "auto",
										   linkage = "ward",
										   pooling_func = <function mean>)
"""

# ========================================================================
# data
# ========================================================================
digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
print(X.shape)

# ========================================================================
# Dimensionality reduction
# ========================================================================
agglo = FeatureAgglomeration(n_clusters=32)
agglo.fit(X)
print(agglo.labels_)
print(agglo.n_leaves_)
print(agglo.n_components_)
print(agglo.children_)

X_reduced = agglo.transform(X)
print(X_reduced.shape)
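
Not part of the original snippet: as a quick check, inverse_transform can map the 32 pooled values back to the 64-pixel space, so the agglomerated approximation can be compared with the original digits. The variable names below are illustrative.

X_restored = agglo.inverse_transform(X_reduced)
print(X_restored.shape)                   # back to the original (n_samples, 64)
images_restored = np.reshape(X_restored, images.shape)
print(images_restored.shape)              # (1797, 8, 8)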
for component in range(1, len(X_train[0])+1):
    grp = GaussianRandomProjection(n_components=component, random_state=1)
    X_train_reduced = grp.fit_transform(X_train)
    X_test_reduced = grp.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)  
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))
if dataset_name=='spam':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Spam dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))
elif dataset_name=='letter':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Letter Recognition dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))

#FA
train_scores=[]
test_scores=[]
for component in range(1, len(X_train[0])+1):
    fa = FeatureAgglomeration(n_clusters=component)
    X_train_reduced = fa.fit_transform(X_train)
    X_test_reduced = fa.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)  
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))
if dataset_name=='spam':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Spam dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1))
elif dataset_name=='letter':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Letter Recognition dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1))
def main():

    # Parameters
    data_directory = '../data/generated-data-r-10-n-8-2/'
    features_path = '../data/features-generated-data-r-10-n-8-2'
    booking_file = '../data/booking.csv'
    users_file = '../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
    false_objects_indexes = [8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)

    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores = []
    cv_results_file = './results/bf_real.csv'

    #ratings_matrix = ratings_matrix[:30, :30]
    #selection = BasicFactorization(show_selection_results=False, selection_algorithm='random')
    #selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix,
    #                    users=users_matrix, cv_results_file=cv_results_file, images_indexes=images_indexes,
    #                    true_objects_indexes=true_objects_indexes, false_objects_indexes=false_objects_indexes,
    #                    paths=name_vector, z_score=True)
    #score, score_rmse = selection.evaluate(evaluation_metric='auc')
    #scores.append(score)

    #exit()

    # K Nearest Neighbors
    #cv_results_file = './results/cv-generated-data-nr-2-n-02-l-100-knn.csv'
    scores_auc = []
    scores_rmse = []
    for i in range(1):
        cv_results_file = './results/xxp1-cv-generated-data-r-10-n-8-2-random-' + str(
            i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='random')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = './scores/v-generated-data-r-10-n-8-2-random-fa-rmse.csv'
    save_scores(scores_rmse, results_file)

    exit()

    for i in range(10):
        print()

    for _ in range(0):
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='random')
        # selection.transform(ids=img_ids_vector, features=features, ratings=ratings_vector, users_ratings=ratings_matrix, users=users_matrix, cv_results_file=cv_results_file)
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector,
                            z_score=True)
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores.append(score_auc)

    for i in range(10):
        print()

    for _ in range(10):
        selection = BasicFactorization(show_selection_results=False,
                                       selection_algorithm='random')
        selection.transform(ids=img_ids_vector,
                            features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix,
                            users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector)
        score = selection.evaluate(evaluation_metric='auc')
        scores.append(score)

    exit()

    # Parameters
    #data_directory = '../data/experience-6/'
    #features_path = '../data/features-experience-6'
    data_directory = '../data/generated-data-r-2-n-8-2/'
    features_path = '../data/features-generated-data-r-2-n-8-2'
    booking_file = '../data/booking.csv'
    users_file = '../data/user.csv'
    cv_results_file = 'results/cv-generated-data-r-2-n-8-2-x.csv'
    true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
    false_objects_indexes = [8, 9]

    #file_to_delete = data_directory + '.DS_Store'
    #os.remove(file_to_delete)

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]
    rating_thresholds = [1, 2]
    #rating_thresholds = []

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory,
        booking_file,
        users_file,
        rating_thresholds,
        binary=True)

    features = get_features(features_path, name_vector)

    cv_results_file = './results/cv-generated-data-r-2-n-8-2-knn-y.csv'

    selection = ObjectSelection(show_selection_results=False,
                                selection_algorithm='random')
    selection.transform(ids=img_ids_vector,
                        features=features,
                        ratings=ratings_vector,
                        users_ratings=ratings_matrix,
                        users=users_matrix,
                        cv_results_file=cv_results_file,
                        images_indexes=images_indexes,
                        true_objects_indexes=true_objects_indexes,
                        false_objects_indexes=false_objects_indexes,
                        paths=name_vector,
                        use_user_data=True)
    selection.evaluate(evaluation_metric='auc')

    exit()

    selection = BasicFactorizationNmf(show_selection_results=True,
                                      selection_algorithm='random')
    selection.transform(ids=img_ids_vector,
                        features=features,
                        ratings=ratings_vector,
                        users_ratings=ratings_matrix,
                        users=users_matrix,
                        cv_results_file=cv_results_file,
                        images_indexes=images_indexes,
                        true_objects_indexes=true_objects_indexes,
                        false_objects_indexes=false_objects_indexes,
                        paths=name_vector)
    selection.evaluate(evaluation_metric='auc')
Example #34
first_plot = plot_roi(labels_img,
                      mean_func_img,
                      title="Ward parcellation",
                      display_mode='xz')
# labels_img is a Nifti1Image object; it can be saved to a file with the
# following code:
labels_img.to_filename('parcellation.nii')

# Display the original data
plot_epi(nifti_masker.inverse_transform(fmri_masked[0]),
         cut_coords=first_plot.cut_coords,
         title='Original (%i voxels)' % fmri_masked.shape[1],
         display_mode='xz')

# A reduced dataset can be created by taking the parcel-level average:
# Note that, like many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension.
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed_img = nifti_masker.inverse_transform(fmri_compressed[0])

plot_epi(compressed_img,
         cut_coords=first_plot.cut_coords,
         title='Compressed representation (2000 parcels)',
         display_mode='xz')

plt.show()
Example #35
                (clases_historial_alim == clase_target_inicial[0]).mean(), 2)
            print(
                '\n\nPercentage of days with meals within the nutritional target class using brand-name foods:',
                porcentaje_hist_clase_target, '%')

        else:

            #############################################################################################
            ##   START OF THE AI ALGORITHM FOR CHOOSING FOODS BASED ON THE RECOMMENDED INTAKE   ##
            #############################################################################################

            feature_agglom = FeatureAgglomeration(n_clusters=cluster_v[-1])

            feature_agglom.fit(nut_data)

            features_reduced = feature_agglom.transform(nut_data)

            aux_features_max = np.max(features_reduced, axis=0)
            aux_features_min = np.min(features_reduced, axis=0)

            target_nut_norm = ((feature_agglom.transform(
                np.expand_dims(target_nut / masa_inicial_comidas_g * 100,
                               axis=0)) - aux_features_min) /
                               (aux_features_max - aux_features_min))[0]

            pendiente_aprendizaje_v = []
            recompensa_m = []

            for N_alimentos in N_alimentos_v:

                # Create the folder to save the figures and data for each set of foods separately
# Second, we illustrate the effect that the clustering has on the
# signal. We show the original data, and the approximation provided by
# the clustering by averaging the signal on each parcel.
#
# As you can see below, this approximation is very good, although there
# are only 2000 parcels, instead of the original 60000 voxels

# Display the original data
plot_epi(nifti_masker.inverse_transform(fmri_masked[0]),
         cut_coords=cut_coords,
         title='Original (%i voxels)' % fmri_masked.shape[1],
         vmax=fmri_masked.max(), vmin=fmri_masked.min(),
         display_mode='xz')

# A reduced dataset can be created by taking the parcel-level average:
# Note that, like many objects in scikit-learn, the ward object exposes
# a transform method that modifies input features. Here it reduces their
# dimension.
fmri_reduced = ward.transform(fmri_masked)

# Display the corresponding data compressed using the parcellation
fmri_compressed = ward.inverse_transform(fmri_reduced)
compressed_img = nifti_masker.inverse_transform(fmri_compressed[0])

plot_epi(compressed_img, cut_coords=cut_coords,
         title='Compressed representation (2000 parcels)',
         vmax=fmri_masked.max(), vmin=fmri_masked.min(),
         display_mode='xz')

show()