Example 1
def train_test_from_dataset(dataset,
                            test_size=0.2,
                            batch_size=64,
                            wanted_views=None):

    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    label_map = lambda l: int(label_encoder.transform([l])[0])
    collate_fn = PersistenceDiagramProviderCollate(dataset,
                                                   label_map=label_map,
                                                   wanted_views=wanted_views)

    sp = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train_i, test_i = list(sp.split([0] * len(sample_labels),
                                    sample_labels))[0]

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_i.tolist()))

    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_i.tolist()))

    return data_train, data_test
Example 2
def test_label_encoder_negative_ints():
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    with pytest.raises(ValueError):
        le.transform([0, 6])
Example 3
def test_label_encoder(values, classes, unknown):
    # Test LabelEncoder's transform, fit_transform and
    # inverse_transform methods
    le = LabelEncoder()
    le.fit(values)
    assert_array_equal(le.classes_, classes)
    assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
    assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
    le = LabelEncoder()
    ret = le.fit_transform(values)
    assert_array_equal(ret, [1, 0, 2, 0, 2])

    with pytest.raises(ValueError, match="unseen labels"):
        le.transform(unknown)
Example 5
def train_test_from_dataset(dataset, batch_size):
    sample_labels = list(dataset.sample_labels)
    label_encoder = LabelEncoder().fit(sample_labels)
    sample_labels = label_encoder.transform(sample_labels)

    def label_remapper(label):
        return int(label_encoder.transform([label])[0])

    label_map = label_remapper

    collate_fn = PersistenceDiagramProviderCollate(dataset,
                                                   label_map=label_map)

    train_ids = np.array([
        label_map(image_id) for image_id in dataset.sample_labels
        if training_data_labels[image_id]
    ])
    test_ids = np.array([
        label_map(image_id) for image_id in dataset.sample_labels
        if not training_data_labels[image_id]
    ])

    data_train = DataLoader(dataset,
                            batch_size=batch_size,
                            collate_fn=collate_fn,
                            shuffle=False,
                            sampler=SubsetRandomSampler(train_ids.tolist()))

    data_test = DataLoader(dataset,
                           batch_size=batch_size,
                           collate_fn=collate_fn,
                           shuffle=False,
                           sampler=SubsetRandomSampler(test_ids.tolist()))

    return data_train, data_test
Example 6
    def load_dataset(self):
        with open(self.file_name) as f:
            dataset = arff.load(f)

            if self.label_attribute is None:
                self.label_attribute = dataset["attributes"][-1][0]

            data = list(numpy.asarray(dataset["data"]).transpose())
            labels = None

            row = 0
            for attribute_name, attribute_type in dataset["attributes"]:
                if attribute_name == self.label_attribute:
                    # Labels found!
                    labels = data.pop(row)
                    continue
                # Nominal attribute
                if isinstance(attribute_type, list):
                    # Convert None to '?' for the next check and to make label_binarize work
                    for j in range(len(data[row])):
                        if data[row][j] is None:
                            data[row][j] = "?"
                    if numpy.all(data[row] == "?"):
                        # If no data is present, just remove the row
                        data.pop(row)
                        continue
                    if self.binarize:
                        data[row] = numpy.asarray(label_binarize(
                            data[row], attribute_type),
                                                  dtype=numpy.float64)
                    else:
                        encoder = LabelEncoder()
                        encoder.classes_ = attribute_type
                        if "?" not in encoder.classes_:
                            encoder.classes_.insert(0, "?")
                        data[row] = encoder.transform(data[row]).reshape(
                            (len(data[row]), 1)).astype(numpy.float64)
                else:
                    # Numeric attributes: check for nan values
                    data[row] = data[row].astype(numpy.float64)
                    nans = numpy.isnan(data[row])
                    if numpy.all(nans):
                        # If everything is nan, remove the feature
                        data.pop(row)
                        continue
                    if numpy.any(nans):
                        mean = data[row][numpy.invert(
                            nans)].sum() / numpy.invert(nans).sum()
                        data[row][nans] = mean
                    # Reshape to do hstack later
                    data[row] = data[row].reshape((len(data[row]), 1))
                # Go to next row only if we have NOT removed the current one
                row += 1

            instances = numpy.hstack(tuple(data))
            useless_indices = numpy.where(instances.var(axis=0) == 0)
            instances = numpy.delete(instances, useless_indices, axis=1)

            return instances, labels
Example 7
def test_label_encoder():
    """Test LabelEncoder's transform and inverse_transform methods"""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
Example 8
def test_label_encoder_empty_array():
    le = LabelEncoder()
    le.fit(np.array(["1", "2", "1", "2", "2"]))
    # test empty transform
    transformed = le.transform([])
    assert_array_equal(np.array([]), transformed)
    # test empty inverse transform
    inverse_transformed = le.inverse_transform([])
    assert_array_equal(np.array([]), inverse_transformed)
Example 9
def test_label_encoder_negative_ints():
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
Example 10
def test_label_encoder_empty_array(values):
    le = LabelEncoder()
    le.fit(values)
    # test empty transform
    transformed = le.transform([])
    assert_array_equal(np.array([]), transformed)
    # test empty inverse transform
    inverse_transformed = le.inverse_transform([])
    assert_array_equal(np.array([]), inverse_transformed)
Example 11
def test_label_encoder():
    """Test LabelEncoder's transform and inverse_transform methods"""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
Example 12
def test_label_encoder_string_labels():
    """Test LabelEncoder's transform and inverse_transform methods with
    non-numeric labels"""
    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), [2, 2, 1])
    assert_array_equal(le.inverse_transform([2, 2, 1]),
                       ["tokyo", "tokyo", "paris"])
    assert_raises(ValueError, le.transform, ["london"])
Example 13
    def load_dataset(self):
        with open(self.file_name) as f:
            dataset = arff.load(f)

            if self.label_attribute is None:
                self.label_attribute = dataset["attributes"][-1][0]

            data = list(numpy.asarray(dataset["data"]).transpose())
            labels = None

            row = 0
            for attribute_name, attribute_type in dataset["attributes"]:
                if attribute_name == self.label_attribute:
                    # Labels found!
                    labels = data.pop(row)
                    continue
                # Nominal attribute
                if isinstance(attribute_type, list):
                    # Convert None to '?' for the next check and to make label_binarize work
                    for j in range(len(data[row])):
                        if data[row][j] is None:
                            data[row][j] = "?"
                    if numpy.all(data[row] == "?"):
                        # If no data is present, just remove the row
                        data.pop(row)
                        continue
                    if self.binarize:
                        data[row] = numpy.asarray(label_binarize(data[row], attribute_type), dtype=numpy.float64)
                    else:
                        encoder = LabelEncoder()
                        encoder.classes_ = attribute_type
                        if "?" not in encoder.classes_:
                            encoder.classes_.insert(0, "?")
                        data[row] = encoder.transform(data[row]).reshape((len(data[row]), 1)).astype(numpy.float64)
                else:
                    # Numeric attributes: check for nan values
                    data[row] = data[row].astype(numpy.float64)
                    nans = numpy.isnan(data[row])
                    if numpy.all(nans):
                        # If everything is nan, remove the feature
                        data.pop(row)
                        continue
                    if numpy.any(nans):
                        mean = data[row][numpy.invert(nans)].sum() / numpy.invert(nans).sum()
                        data[row][nans] = mean
                    # Reshape to do hstack later
                    data[row] = data[row].reshape((len(data[row]), 1))
                # Go to next row only if we have NOT removed the current one
                row += 1

            instances = numpy.hstack(tuple(data))
            useless_indices = numpy.where(instances.var(axis=0) == 0)
            instances = numpy.delete(instances, useless_indices, axis=1)

            return instances, labels
Example 14
def test_label_encoder_string_labels():
    """Test LabelEncoder's transform and inverse_transform methods with
    non-numeric labels"""
    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]),
                       [2, 2, 1])
    assert_array_equal(le.inverse_transform([2, 2, 1]),
                       ["tokyo", "tokyo", "paris"])
    assert_raises(ValueError, le.transform, ["london"])
Example 15
def preprocess(data):
    for column in data:
        if data.dtypes[column] == object:
            data[column].fillna("Não mensurado", inplace=True)
            encoder = LabelEncoder()
            encoder.fit(data[column].tolist())
            data[column] = encoder.transform(data[column])
        elif data.dtypes[column] == float:
            data[column].fillna(0, inplace=True)
        elif data.dtypes[column] == int:
            data[column].fillna(0, inplace=True)
    return data
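To make the dtype handling in the preprocess function above concrete, here is a small, hedged demonstration on a made-up DataFrame (column names and values are invented for the example; preprocess is assumed to be defined as shown):

import pandas as pd

# Made-up frame with one object, one float and one int column.
df = pd.DataFrame({
    "city": ["Rio", None, "Lisboa"],   # object column: NaN -> "Não mensurado", then label-encoded
    "temp": [30.5, None, 18.0],        # float column: NaN -> 0
    "count": [1, 2, 3],                # int column: no missing values, left unchanged
})
print(preprocess(df))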
Example 16
def test_label_encoder_errors():
    # Check that invalid arguments yield ValueError
    le = LabelEncoder()
    with pytest.raises(ValueError):
        le.transform([])
    with pytest.raises(ValueError):
        le.inverse_transform([])

    # Fail on unseen labels
    le = LabelEncoder()
    le.fit([1, 2, 3, -1, 1])
    msg = "contains previously unseen labels"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2])
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform([-2, -3, -4])

    # Fail on inverse_transform("")
    msg = "bad input shape ()"
    with pytest.raises(ValueError, match=msg):
        le.inverse_transform("")
Example 17
def test_label_encoder():
    # Test LabelEncoder's transform and inverse_transform methods
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])

    le.fit(["apple", "orange"])
    msg = "bad input shape"
    assert_raise_message(ValueError, msg, le.transform, "apple")
Example 18
def test_label_encoder():
    # Test LabelEncoder's transform and inverse_transform methods
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])

    le.fit(["apple", "orange"])
    msg = "bad input shape"
    assert_raise_message(ValueError, msg, le.transform, "apple")
Example 19
class LabelEncoderImpl():
    def __init__(self):
        self._hyperparams = {}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example 20
def _conform_targets(targets):
    """
    Conform targets to [0, n_targets-1].

    Parameters
    ----------
    targets : array (n_targets, )

    Returns
    -------
    targets_conformed : array (n_targets, )
        targets are between 0 and n_targets-1
    label_encoder : LabelEncoder
        fit on targets, used to invert back using
        label_encoder.inverse_transform
    """
    le = LabelEncoder()
    le.fit(targets)
    return le.transform(targets), le
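A short round trip illustrating the contract described in the docstring above; the target labels are made up for the example and _conform_targets is assumed to be in scope:

import numpy as np

targets = np.array(["cat", "dog", "cat", "bird"])
conformed, le = _conform_targets(targets)
print(conformed)                        # [1 2 1 0] -- every value in 0..n_targets-1
print(le.inverse_transform(conformed))  # ['cat' 'dog' 'cat' 'bird']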
Example 21
def test_label_encoder_str_bad_shape(dtype):
    le = LabelEncoder()
    le.fit(np.array(["apple", "orange"], dtype=dtype))
    msg = "bad input shape"
    with pytest.raises(ValueError, match=msg):
        le.transform("apple")
Example 22
def r_precision(S: np.ndarray, y: np.ndarray, metric: str = 'distance',
                average: str = 'weighted', return_y_pred: int = 0,
                verbose: int = 0, n_jobs: int = 1) -> float:
    """ Calculate R-Precision (recall at R-th position).

    Parameters
    ----------
    S : ndarray or CSR matrix
        Distance (similarity) matrix

    y : ndarray
        Target (ground truth) labels

    metric : 'distance' or 'similarity', optional, default: 'similarity'
        Whether `S` is a distance or similarity matrix.

    average : 'weighted', 'macro' or None, optional, default: 'weighted'
        Ignored. Weighted and macro precisions are returned.

    return_y_pred : int, optional, default: 0
        If > 0, return the labels of the `return_y_pred` nearest neighbors

    verbose : int, optional, default: 0
        Increasing level of output.

    n_jobs : int, optional, default: 1
        Number of parallel processes to use.

    Returns
    -------
    r_precision : dictionary with the following keys:
        macro : float
            Macro R-Precision.

        weighted : float
            Weighted R-Precision.

        per_item : ndarray
            R-Precision for each object.

        relevant_items : ndarray
            Relevant items per class.

        y_true : ndarray
            Target labels (req. for weighting).

        y_pred : ndarray
            Labels of some k-nearest neighbors
    """
    io.check_distance_matrix_shape(S)
    io.check_distance_matrix_shape_fits_labels(S, y)
    io.check_valid_metric_parameter(metric)
    log = ConsoleLogging()
    n, _ = S.shape
    S_is_sparse = issparse(S)
    if metric != 'similarity' or not S_is_sparse:
        raise NotImplementedError("Only sparse similarity matrices so far.")

    # Map labels to 0..n(labels)-1
    le = LabelEncoder()
    # Add int.min for misclassifications
    incorr_orig = np.array([np.nan]).astype(int)
    le.fit(np.append(y, incorr_orig))
    y = le.transform(y)
    incorrect = le.transform(incorr_orig)
    # Number of relevant items, i.e. number of each label
    relevant_items = np.bincount(y) - 1 # one less for self class
    # R-Precision for each item
    r_prec = np.zeros(n, dtype=float)
    
    # Classify each point in test set
    if verbose:
        log.message("Creating shared memory data.")
    n_random_pred = mp.Value(ctypes.c_int)
    n_random_pred.value = 0
    if verbose and log:
        log.message("Spawning processes for prediction.")
    y_pred = np.zeros((n, return_y_pred), dtype=float)
    kwargs = {'y_pred' : return_y_pred,
              'incorrect' : incorrect}
    with mp.Pool(processes=n_jobs, 
                 initializer=_load_shared_csr, 
                 initargs=(S, y, n_random_pred, relevant_items)) as pool:
        for i, r in enumerate(
            pool.imap(
                func=partial(_r_prec_worker, **kwargs),
                iterable=range(n), 
                chunksize=int(1e2))):
            if verbose and ((i+1)%int(1e7 / 10**verbose) == 0 or i == n-1):
                log.message("Classification: {} of {} on {}.".format(
                            i+1, n, mp.current_process().name), flush=True)
            try:
                r_prec[i] = r[0]
                y_pred[i, :] = r[1]
            except:
                r_prec[i] = r
            if i == n-1:
                pass
    pool.join()

    if verbose and log:
        log.message("Retrieving nearest neighbors.")
    # Work-around for new scikit-learn requirement of 1D arrays for LabelEncoder
    y_pred = np.asarray([le.inverse_transform(col) for col in y_pred.T.astype(int)]).T
    if verbose and log:
        log.message("Finishing.")
    if n_random_pred.value:
        log.warning(("{} queries were classified randomly, because all "
                     "distances were non-finite numbers or there were no other "
                     "objects in the same class.").format(n_random_pred.value))
    return_dict = {'macro' : r_prec.mean(),
                   'weighted' : np.average(r_prec, weights=relevant_items[y]),
                   'per_item' : r_prec,
                   'relevant_items' : relevant_items,
                   'y_true' : y,
                   'y_pred' : y_pred}
    return return_dict
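A hedged sketch of calling r_precision as documented above: the similarity matrix and labels below are synthetic, and the hub-toolbox helpers referenced inside the function (io, ConsoleLogging, _load_shared_csr, _r_prec_worker) are assumed to be importable from the same module.

import numpy as np
from scipy.sparse import csr_matrix

# Synthetic symmetric similarity matrix (higher = more similar) for four objects,
# two classes with two members each, so every query has exactly one relevant item.
S = csr_matrix(np.array([[1.0, 0.9, 0.1, 0.2],
                         [0.9, 1.0, 0.2, 0.1],
                         [0.1, 0.2, 1.0, 0.8],
                         [0.2, 0.1, 0.8, 1.0]]))
y = np.array([0, 0, 1, 1])

scores = r_precision(S, y, metric='similarity', n_jobs=1)
print(scores['macro'], scores['weighted'])  # keys as listed in the Returns section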
Example 23
def main():
    print('\033[1m' + 'Loading all the datasets...' + '\033[0m')
    arffs_dic = obtain_arffs('./datasetsSelected/')

    # Extract a specific dataset
    dataset_name = 'sick'  #sick # nursery
    dataset = arffs_dic[dataset_name]

    # ------------------------------------------------------------------------------------ Compute indices for each fold
    # Use fold 0 of that particular dataset to find the train and test indices for each fold
    ref_data = np.concatenate((dataset[0][0], dataset[0][1]), axis=0)
    df_aux = pd.DataFrame(ref_data)
    df_aux = df_aux.fillna('nonna').values
    ref_data_dic = {}
    for i in range(df_aux.shape[0]):
        ref_data_dic[str(df_aux[i, :])] = i

    trn_tst_dic = trn_tst_idxs(ref_data_dic, dataset)

    # --------------------------------------------------------------------------------- Reading parameters from keyboard
    C, kernel, decision_function = read_keyboard()

    # ------------------------------------------------------------------------------------------------------- Preprocess
    df1 = pd.DataFrame(ref_data)
    groundtruth_labels = df1[df1.columns[
        len(df1.columns) - 1]].values  # original labels in a numpy array
    df1 = df1.drop(df1.columns[len(df1.columns) - 1], 1)
    if dataset_name == 'sick':
        df1 = df1.drop(
            'TBG', 1
        )  # This column only contains NaNs so does not add any value to the clustering

    data1 = df1.values  # original data in a numpy array without labels
    load = Preprocess()

    # ---------------------------------------------------------------------------------------- Encode groundtruth labels
    le = LabelEncoder()
    le.fit(np.unique(groundtruth_labels))
    groundtruth_labels = le.transform(groundtruth_labels)

    data_x = load.preprocess_method(data1)
    # -------------------------------------------------------------------------------------------- Supervised classifier
    # Compute accuracy for each fold
    accuracies = []
    fold_number = 0
    start_time = time.time()
    for trn_idxs, tst_idxs in trn_tst_dic.values():
        fold_number = fold_number + 1
        print('Computing accuracy for fold number ' + str(fold_number))
        trn_data = data_x[trn_idxs]
        trn_labels = groundtruth_labels[trn_idxs]
        tst_data = data_x[tst_idxs]
        tst_labels = groundtruth_labels[tst_idxs]

        svecm = SVM_Algorithm(C, kernel, decision_function)
        acc = svecm.algorithm(trn_data, trn_labels, tst_data, tst_labels)
        accuracies.append(acc)

    mean_accuracies = str(round(np.mean(accuracies), 4))
    std_accuracies = str(round(np.std(accuracies), 3))
    print('\n\033[1m' +
          'The mean accuracy of classification in the test set is: ' +
          mean_accuracies + ' ± ' + std_accuracies + '\033[0m')
    print('\033[1mRunning time for the 10 folds: %s seconds\033[0m' %
          round(time.time() - start_time, 4))
Example 24
class ColumnEnsembleClassifier(BaseClassifier):
    """Applies estimators to columns of an array or pandas DataFrame.

        This estimator allows different columns or column subsets of the input
        to be fitted by separate estimators, and the predictions produced by
        each estimator are ensembled to form a single output.

        Parameters
        ----------
        estimators : list of tuples
            List of (name, estimator, column(s)) tuples specifying the
            estimator objects to be applied to subsets of the data.

            name : string
                Like in Pipeline and FeatureUnion, this allows the estimator and
                its parameters to be set using ``set_params`` and searched in grid
                search.
            estimator : estimator or 'drop'
                The estimator must support `fit` and `predict_proba`. The
                special-cased string 'drop' can be used to drop the column(s).
            column(s) : string or int, array-like of string or int, slice, \
                boolean mask array or callable


        remainder : 'drop' or estimator, default 'drop'
            By default, only the columns specified in `estimators` are used,
            and the non-specified columns are dropped.
            By setting ``remainder`` to an estimator, the remaining
            non-specified columns are handled by the ``remainder`` estimator,
            which must support `fit` and `predict_proba`.

    """
    def __init__(self, estimators, remainder='drop', verbose=False):
        self.estimators = estimators
        self.remainder = remainder
        self.verbose = verbose

    @property
    def _estimators(self):
        return [(name, estim) for name, estim, _ in self.estimators]

    @_estimators.setter
    def _estimators(self, value):
        self.estimators = [(name, estim, col)
                           for ((name, estim),
                                (_, _, col)) in zip(value, self.estimators)]

    # from metaestimators.py
    def _get_params(self, attr, deep=True):
        out = super().get_params(deep=deep)
        if not deep:
            return out
        estimators = getattr(self, attr)
        out.update(estimators)
        for name, estimator in estimators:
            if hasattr(estimator, 'get_params'):
                for key, value in estimator.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
        return out

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params('_estimators', deep=deep)

    # from metaestimators.py
    def _set_params(self, attr, **params):
        # Ensure strict ordering of parameter setting:
        # 1. All steps
        if attr in params:
            setattr(self, attr, params.pop(attr))
        # 2. Step replacement
        items = getattr(self, attr)
        names = []
        if items:
            names, _ = zip(*items)
        for name in list(params.keys()):
            if '__' not in name and name in names:
                self._replace_estimator(attr, name, params.pop(name))
        # 3. Step parameters and other initialisation arguments
        super().set_params(**params)
        return self

    # from metaestimators.py
    def _replace_estimator(self, attr, name, new_val):
        # assumes `name` is a valid estimator name
        new_estimators = list(getattr(self, attr))
        for i, (estimator_name, _) in enumerate(new_estimators):
            if estimator_name == name:
                new_estimators[i] = (name, new_val)
                break
        setattr(self, attr, new_estimators)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``.

        Returns
        -------
        self
        """
        self._set_params('_estimators', **kwargs)
        return self

    def _validate_estimators(self):
        if not self.estimators:
            return

        names, estimators, _ = zip(*self.estimators)

        self._validate_names(names)

        # validate estimators
        for t in estimators:
            if t == 'drop':
                continue
            if not (hasattr(t, "fit") or hasattr(t, "predict_proba")):
                raise TypeError(
                    "All estimators should implement fit and predict proba"
                    "or can be 'drop' "
                    "specifiers. '%s' (type %s) doesn't." % (t, type(t)))

    def _validate_names(self, names):
        if len(set(names)) != len(names):
            raise ValueError('Names provided are not unique: '
                             '{0!r}'.format(list(names)))
        invalid_names = set(names).intersection(self.get_params(deep=False))
        if invalid_names:
            raise ValueError('Estimator names conflict with constructor '
                             'arguments: {0!r}'.format(sorted(invalid_names)))
        invalid_names = [name for name in names if '__' in name]
        if invalid_names:
            raise ValueError('Estimator names must not contain __: got '
                             '{0!r}'.format(invalid_names))

    # This checks whether the column input was a slice object or a tuple.
    def _validate_column_callables(self, X):
        """
        Converts callable column specifications.
        """
        columns = []
        for _, _, column in self.estimators:
            if callable(column):
                column = column(X)
            columns.append(column)
        self._columns = columns

    def _validate_remainder(self, X):
        """
        Validates ``remainder`` and defines ``_remainder`` targeting
        the remaining columns.
        """
        is_estimator = (hasattr(self.remainder, "fit")
                        or hasattr(self.remainder, "predict_proba"))
        if self.remainder != 'drop' and not is_estimator:
            raise ValueError(
                "The remainder keyword needs to be 'drop', '%s' was passed instead"
                % self.remainder)

        n_columns = X.shape[1]
        cols = []
        for columns in self._columns:
            cols.extend(_get_column_indices(X, columns))
        remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None

        self._remainder = ('remainder', self.remainder, remaining_idx)

    def _iter(self, replace_strings=False):
        """
        Generate (name, trans, column, weight) tuples.

        If fitted=True, use the fitted transformers, else use the
        user specified transformers updated with converted column names
        and potentially appended with transformer for remainder.

        """

        # interleave the validated column specifiers
        estimators = [(name, estims, column)
                      for (name, estims,
                           _), column in zip(self.estimators, self._columns)]
        # add transformer tuple for remainder
        if self._remainder[2] is not None:
            estimators = chain(estimators, [self._remainder])

        for name, trans, column in estimators:
            if replace_strings:
                # skip in case of 'drop'
                if trans == 'drop':
                    continue
                elif _is_empty_column_selection(column):
                    continue

            yield (name, trans, column)

    def fit(self, X, y, input_checks=True):
        # the data passed in could be an array of dataframes?
        """Fit all estimators, fit the data

        Parameters
        ----------
        X : array-like or DataFrame of shape [n_samples, n_dimensions, n_length]
            Input data, of which specified subsets are used to fit the
            transformers.

        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.

        """

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        # X = _check_X(X)
        self._validate_estimators()
        self._validate_column_callables(X)
        self._validate_remainder(X)

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        transformed_y = self.le_.transform(y)

        for name, estim, column in self._iter(replace_strings=True):
            estim.fit(_get_column(X, column), transformed_y)

        return self

    def _collect_probas(self, X):
        return np.asarray([
            estim.predict_proba(_get_column(X, column))
            for (name, estim, column) in self._iter(replace_strings=True)
        ])

    # TODO: check if it is fitted
    def predict_proba(self, X, input_checks=True):
        """Predict class probabilities for X in 'soft' voting """
        avg = np.average(self._collect_probas(X), axis=0)
        return avg

    def _predict(self, X):
        """Collect results from clf.predict calls. """
        return np.asarray([
            estim.predict_proba(_get_column(X, column))
            for (name, estim, column) in self._iter(replace_strings=True)
        ])

    def predict(self, X, input_checks=True):
        maj = np.argmax(self.predict_proba(X), axis=1)
        return self.le_.inverse_transform(maj)
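A minimal usage sketch for the (name, estimator, column) tuples described in the docstring above; SomeColumnClassifier, X_train, y_train and X_test are placeholders rather than part of the class, and any estimator exposing fit and predict_proba would do:

clf = ColumnEnsembleClassifier(
    estimators=[
        ("dim0", SomeColumnClassifier(), [0]),  # placeholder estimator fitted on column 0 only
        ("dim1", SomeColumnClassifier(), [1]),  # placeholder estimator fitted on column 1 only
    ],
    remainder="drop",                           # columns not listed above are ignored
)
clf.fit(X_train, y_train)      # y is label-encoded internally with LabelEncoder
y_pred = clf.predict(X_test)   # argmax of averaged predict_proba, mapped back via inverse_transform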
Example 25
def preprocess_classes(classes):
    encoder = LabelEncoder()
    encoder.fit(classes)
    return encoder.transform(classes)
Example 26
def main():
    print('\033[1m' + 'Loading all the datasets...' + '\033[0m')
    arffs_dic = obtain_arffs('./datasets/')

    # Extract a specific dataset
    dataset_name = 'breast-w'  # possible datasets ('hypothyroid', 'breast-w', 'waveform')
    dat1 = arffs_dic[dataset_name]
    df1 = pd.DataFrame(dat1[0])  # original data in pandas dataframe
    groundtruth_labels = df1[df1.columns[
        len(df1.columns) - 1]].values  # original labels in a numpy array
    df1 = df1.drop(df1.columns[len(df1.columns) - 1], 1)
    if dataset_name == 'hypothyroid':
        df1 = df1.drop(
            'TBG', 1
        )  # This column only contains NaNs so does not add any value to the clustering
    data1 = df1.values  # original data in a numpy array without labels
    load = Preprocess()
    data_x = load.preprocess_method(data1)
    data_x = data_x.astype(np.float64)
    le = LabelEncoder()
    le.fit(np.unique(groundtruth_labels))
    groundtruth_labels = le.transform(groundtruth_labels)

    num_clusters = len(
        np.unique(groundtruth_labels))  # Number of different labels

    # -------------------------------------------------------------------------------Compute covariance and eigenvectors
    original_mean = np.mean(data_x, axis=0)

    cov_m = compute_covariance(data_x, original_mean)
    eig_vals, eig_vect = np.linalg.eig(cov_m)

    idxsort = eig_vals.argsort()[::-1]
    eig_vals = eig_vals[idxsort].real
    eig_vect = eig_vect[:, idxsort].real

    # ---------------------------------------------------------------------Decide the number of features we want to keep
    prop_variance = 0.9
    k = proportion_of_variance(eig_vals, prop_variance)
    print('\nThe value of K selected to obtain a proportion of variance = ' +
          str(prop_variance) + ' is: ' + str(k) + '\n')

    eig_vals_red = eig_vals[:k]
    eig_vect_red = eig_vect[:, :k]  # Eigenvectors are in columns (8xk)

    # ---------------------------------------------------------------------------------Reduce dimensionality of the data
    # A1) Using our implementation of PCA
    transf_data_x = np.dot((eig_vect_red.T), (data_x - original_mean).T).T

    # B1) Using the PCA implementation of sklearn
    pca = PCA(n_components=k)
    transf_data_x_sklearn = pca.fit_transform(data_x)

    # C1) Using the incremental PCA implementation of sklearn
    incrementalpca = IncrementalPCA(n_components=k)
    transf_data_x_sklearn2 = incrementalpca.fit_transform(data_x)

    # --------------------------------------------------------------------------------------------------Reconstruct data
    # A2) Reconstruct data with our method
    reconstruct_data_x = np.dot(eig_vect_red, transf_data_x.T)
    reconstruct_data_x = reconstruct_data_x.T + original_mean

    # B2) Reconstruct data with PCA sklearn
    reconstruct_data_x1 = np.dot(pca.components_.T, transf_data_x_sklearn.T)
    reconstruct_data_x1 = reconstruct_data_x1.T + original_mean

    # C2) Reconstruct data with incremental PCA sklearn
    reconstruct_data_x2 = np.dot(incrementalpca.components_.T,
                                 transf_data_x_sklearn2.T)
    reconstruct_data_x2 = reconstruct_data_x2.T + original_mean

    # ----------------------------------------------------------------Error between original data and reconstructed data
    # A3) Error between original data and reconstruct data
    error = reconstruct_data_x - data_x
    total_error = (np.sum(abs(error)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error, 2)) + '%' + '\033[0m' +
        ' [using our implementation of PCA]')

    # B3) Error between original data and reconstruct data 1
    error1 = reconstruct_data_x1 - data_x
    total_error1 = (np.sum(abs(error1)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error1, 2)) + '%' + '\033[0m' +
        ' [using pca.fit_transform of Sklearn]')

    # C3) Error between original data and reconstruct data 2
    error2 = reconstruct_data_x2 - data_x
    total_error2 = (np.sum(abs(error2)) / np.sum(abs(data_x))) * 100
    print(
        'The relative error after reconstructing the original matrix with K = '
        + str(k) + ' is ' + '\033[1m' + '\033['
        '94m' + str(round(total_error2, 2)) + '%' + '\033[0m' +
        ' [using incrementalpca.fit_transform of Sklearn]')

    # ------------------------------------------------------------------------------Kmeans with dimensionality reduction
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print('K-MEANS APPLIED TO THE ORIGINAL DATA')
    tester_kmeans(data_x, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING OUR IMPLEMENTATION OF PCA'
    )
    labels = tester_kmeans(transf_data_x, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING pca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )
    print(
        'K-MEANS APPLIED TO THE TRANSFORMED DATA USING incrementalpca.fit_transform OF SKLEARN'
    )
    tester_kmeans(transf_data_x_sklearn2, groundtruth_labels)
    print(
        '\n---------------------------------------------------------------------------------------------------------'
    )

    # -----------------------------------------------------------------------------------------------------Scatter plots
    ploting_boolean = False
    plot_scatters = False  # only change to True for a dataset with few features (like breast-w)

    if ploting_boolean:
        # Plot eigenvector
        plt.plot(eig_vals, 'ro-', linewidth=2, markersize=6)
        plt.title('Magnitude of the eigenvalues')
        plt.show()

        if plot_scatters:
            # Plottings: scatter plots
            # Original data with groundtruth labels
            ploting_v(data_x, num_clusters, groundtruth_labels,
                      'original data with groundtruth labels')
            # Transformed data with our implementation of PCA and with groundtruth labels
            ploting_v(transf_data_x, num_clusters, groundtruth_labels,
                      'transformed data (our PCA) with groundtruth '
                      'labels')
            # Transformed data with pca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v1) '
                'with groundtruth labels')
            # Transformed data with incrementalpca.fit_transform and with groundtruth labels
            ploting_v(
                transf_data_x_sklearn2, num_clusters, groundtruth_labels,
                'transformed data (Sklearn PCA v2) '
                'with groundtruth labels')

        # ------------------------------------------------------------------------------------------------------3D plots
        # Plottings: 3D plots
        # Original data without labels
        ploting_v3d(data_x, 1, np.zeros(len(groundtruth_labels)),
                    'original data without labels')
        # Original data with groundtruth labels
        ploting_v3d(data_x, num_clusters, groundtruth_labels,
                    'original data with groundtruth labels')
        # Reconstructed data without labels
        ploting_v3d(reconstruct_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'reconstructed data without labels')
        # Transformed data with our implementation of PCA and without labels
        ploting_v3d(transf_data_x, 1, np.zeros(len(groundtruth_labels)),
                    'transformed data without labels')
        # Transformed data with our implementation of PCA and with groundtruth_labels
        ploting_v3d(transf_data_x, num_clusters, groundtruth_labels,
                    'transformed data with groundtruth labels')
        # Transformed data with our implementation of PCA and with the labels obtained with our K-means
        ploting_v3d(transf_data_x, num_clusters, labels,
                    'transformed data with labels from our K-means')
        # Plot of the correlation matrix of the dataset
        plot_corr_matrix(data_x, legend=False)