Example 1
0
    def SelectLabeled(self, labeled_data_x, labeled_data_y, unlabeled_data_x):
        """Select confidently predicted unlabeled samples for self-labeling.

        Fits a transductive conformal classifier on the labeled pool and
        splits ``unlabeled_data_x`` into samples whose prediction passes
        both the confidence and credibility thresholds (returned with their
        predicted labels) and the remaining, still-unlabeled samples.

        Parameters
        ----------
        labeled_data_x, labeled_data_y : array-like
            Samples labeled in previous iterations; appended to the initial
            labeled pool before fitting.
        unlabeled_data_x : numpy array
            Samples to predict and possibly promote to labeled.

        Returns
        -------
        tuple of (selected_x, selected_y, remaining_unlabeled_x)
        """
        # Append the newly labeled data to the initial labeled pool.
        if len(labeled_data_x) > 0:
            labeled_x = np.concatenate((self.init_labeled_data_x, labeled_data_x))
            labeled_y = np.concatenate((self.init_labeled_data_y, labeled_data_y))
        else:
            labeled_x = self.init_labeled_data_x
            labeled_y = self.init_labeled_data_y

        # Transductive conformal classifier over a decision tree; each row
        # of s is (predicted label, confidence, credibility).
        model = ClassifierAdapter(
            DecisionTreeClassifier(random_state=config.random_state,
                                   min_samples_leaf=config.min_samples_leaf))
        nc = ClassifierNc(model, MarginErrFunc())
        model_tcp = TcpClassifier(nc, smoothing=True)
        model_tcp.fit(labeled_x, labeled_y)
        s = model_tcp.predict_conf(unlabeled_data_x)

        # Promote samples whose confidence AND credibility both exceed the
        # configured thresholds.
        labeled_ind = [
            i for i, a in enumerate(s)
            if a[1] > config.confidence and a[2] > config.credibility
        ]
        # BUGFIX: take the exact complement of labeled_ind. The previous
        # strict '<' test dropped samples whose score equals a threshold
        # from BOTH sets, silently losing data.
        unlabeled_ind = [
            i for i, a in enumerate(s)
            if not (a[1] > config.confidence and a[2] > config.credibility)
        ]

        # Column 0 of s holds the predicted labels (the previous flat
        # np.take(s.T, ...) picked the same values, but obscurely).
        labeled_unlabeled_x = np.take(unlabeled_data_x, labeled_ind, axis=0)
        labeled_unlabeled_y = np.take(s[:, 0], labeled_ind)
        unlabeled_data_x = np.take(unlabeled_data_x, unlabeled_ind, axis=0)

        return labeled_unlabeled_x, labeled_unlabeled_y, unlabeled_data_x
Example 2
0
    def test_tcp_classification_svm(self):
        """Smoke-test transductive conformal classification with an SVM."""
        # ---------------------------------------------------------------------
        # Load the data and shuffle it into two equal halves.
        # ---------------------------------------------------------------------
        iris = load_iris()

        shuffled = np.random.permutation(iris.target.size)
        half = int(shuffled.size / 2)
        train_idx, test_idx = shuffled[:half], shuffled[half:]

        # ---------------------------------------------------------------------
        # Build and fit the conformal classifier on the training half.
        # ---------------------------------------------------------------------
        classifier = TcpClassifier(
            ClassifierNc(ClassifierAdapter(SVC(probability=True)),
                         MarginErrFunc()))
        classifier.fit(iris.data[train_idx, :], iris.target[train_idx])

        # ---------------------------------------------------------------------
        # Predict at significance 0.1 and print regions next to the truth.
        # ---------------------------------------------------------------------
        region = classifier.predict(iris.data[test_idx, :], significance=0.1)
        columns = np.array(["c0", "c1", "c2", "Truth"])
        rows = np.vstack([region.T, iris.target[test_idx]]).T
        frame = pd.DataFrame(np.vstack([columns, rows]))
        print(frame)
Example 3
0
    def fit(self, X, Y, lengths=None, init_prob=None, tran_prob=None):
        """Fit the model on observables X and their hidden sequences Y.

        Parameters
        ----------
        X : numpy array (n_samples, n_features)
            Individual observations.
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of the sequences contained in X and Y; their sum should
            be n_samples. If None, X and Y are treated as one sequence.
        init_prob : dict (Default: None)
            Maps state i to the probability that the hidden process starts
            in state i. Estimated from the data when None.
        tran_prob : dict (Default: None)
            Maps state i to a dict that maps state j to the probability of
            transitioning from i to j. Estimated from the data when None.
        """
        self.train_x = X
        self.train_y = Y
        if lengths is None:
            # Treat everything as a single sequence.
            lengths = [len(Y)]

        # Underlying transductive conformal predictor.
        self.cp = TcpClassifier(self.ncm, smoothing=self.smooth)

        # Estimate initial/transition probabilities from Y unless the
        # caller supplied them.
        self.init_prob = init_prob or self._estimate_initial_prob(Y, lengths)
        self.tran_prob = tran_prob or self._estimate_transition_prob(Y, lengths)
Example 4
0
# BUGFIX: np, pd and SVC were used below without ever being imported,
# so this script crashed with NameError on the first use.
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.svm import SVC

from nonconformist.base import ClassifierAdapter
from nonconformist.cp import TcpClassifier
from nonconformist.nc import ClassifierNc, MarginErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_iris()

# Shuffle the sample indices and split them into two equal halves.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 2)]
test = idx[int(idx.size / 2):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
tcp = TcpClassifier(ClassifierNc(ClassifierAdapter(SVC(probability=True)),
                                 MarginErrFunc()))
tcp.fit(data.data[train, :], data.target[train])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
# Prediction regions at significance 0.1, printed next to the true class.
prediction = tcp.predict(data.data[test, :], significance=0.1)
header = np.array(['c0','c1','c2','Truth'])
table = np.vstack([prediction.T, data.target[test]]).T
df = pd.DataFrame(np.vstack([header, table]))
print(df)
Example 5
0
import sys
sys.path.append('/Users/staffan/git/peptid_studie/experiments/src') # Nonconformist

# BUGFIX: load_iris, np and SVC were used below without ever being
# imported, so this script crashed with NameError on the first use.
import numpy as np

from sklearn.datasets import load_iris
from sklearn.svm import SVC

from nonconformist.cp import TcpClassifier
from nonconformist.nc import NcFactory


iris = load_iris()

idx = np.random.permutation(iris.target.size)

# Divide the data into training set and test set
idx_train, idx_test = idx[:100], idx[100:]

model = SVC(probability=True)	# Create the underlying model
nc = NcFactory.create_nc(model)	# Create a default nonconformity function
tcp = TcpClassifier(nc)			# Create a transductive conformal classifier

# Fit the TCP using the proper training set
tcp.fit(iris.data[idx_train, :], iris.target[idx_train])

# Produce predictions for the test set
predictions = tcp.predict(iris.data[idx_test, :])

# Stack the true targets as the first column next to the predictions
# and export everything to CSV.
targets = np.array(iris.target[idx_test], copy=True)
targets.shape = (len(targets),1)
output = np.hstack((targets, predictions))

np.savetxt('resources/multiclass.csv', output, delimiter=',')
Example 6
0
# BUGFIX: np, pd, load_iris, SVC and the nonconformist classes were used
# below without ever being imported, so this script crashed with NameError.
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.svm import SVC

from nonconformist.base import ClassifierAdapter
from nonconformist.cp import TcpClassifier
from nonconformist.evaluation import class_mean_errors
from nonconformist.nc import ClassifierNc, MarginErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_iris()

# Shuffle the sample indices and split them into two equal halves.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 2)]
test = idx[int(idx.size / 2):]

# -----------------------------------------------------------------------------
# Train and calibrate TCP
# -----------------------------------------------------------------------------
tcp = TcpClassifier(
    ClassifierNc(ClassifierAdapter(SVC(probability=True, gamma='scale')),
                 MarginErrFunc()))

tcp.fit(data.data[train, :], data.target[train])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
# Prediction regions at significance 0.1, printed next to the true class.
prediction = tcp.predict(data.data[test, :], significance=0.1)
header = np.array(['c0', 'c1', 'c2', 'Truth'])
table = np.vstack([prediction.T, data.target[test]]).T
df = pd.DataFrame(np.vstack([header, table]))
print('TCP')
print('---')
print(df)
Example 7
0
class CPHMM(BaseEstimator):
    """Conformal-prediction hidden Markov model (CP-HMM).

    Combines a transductive conformal predictor over individual
    observations with frequentist estimates of the HMM's initial and
    transition probabilities, and produces prediction regions (sets of
    candidate hidden-state sequences) with a validity guarantee.
    """

    def __init__(self, ncm, n_states, smooth=True):
        """Initialise a CP-HMM model.

        Parameters
        ----------
        ncm : nonconformist.BaseScorer
            Nonconformity measure to use.
        n_states : int
            Number of hidden states.
        smooth : bool
            If True, smooth CP is used, which achieves exact validity.
            Otherwise, standard CP is used, guaranteeing error smaller or
            equal to the significance level.
        """
        self.ncm = ncm
        self.n_states = n_states
        self.smooth = smooth

    def fit(self, X, Y, lengths=None, init_prob=None, tran_prob=None):
        """Fits the model on observables X and respective hidden sequences Y.

        Parameters
        ----------
        X : numpy array (n_samples, n_features)
            Individual observations.
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of sequences in X and Y. If None, X and Y are assumed to
            be a single sequence. The sum of lengths should be n_samples.
        init_prob : dict (Default: None)
            The item corresponding to the i-th key is the
            probability of the hidden process to start in the
            i-th state.
            If default (=None), it is estimated from data.
        tran_prob : dict (Default: None)
            The item corresponding to the i-th key of the dictionary
            is a dictionary itself, which, for the j-th key,
            indicates the probability of transitioning from the
            i-th to the j-th state.
            If default (=None), it is estimated from data.
        """
        self.train_x = X
        self.train_y = Y
        if lengths is None:
            # Treat X and Y as one single sequence.
            lengths = [len(Y)]

        # Underlying CP model; it is actually fitted to the training data
        # later, in _hidden_candidates().
        self.cp = TcpClassifier(self.ncm, smoothing=self.smooth)

        # Initial and transition probabilities: estimated from Y unless
        # supplied by the caller.
        if not init_prob:
            init_prob = self._estimate_initial_prob(Y, lengths)

        if not tran_prob:
            tran_prob = self._estimate_transition_prob(Y, lengths)

        self.init_prob = init_prob
        self.tran_prob = tran_prob

    def predict(self, x, e):
        """Return a CP-HMM prediction region.

        Uses CP-HMM to output a prediction region (i.e.: a set of candidate
        hidden sequences) for the observed sequence x.
        NOTE: If any of the elements of the sequence has an empty
        prediction set, then no predictions are returned.

        Parameters
        ----------
        x : list
            Observed sequence.
        e : float in [0,1]
            Significance level.
        """
        # Bonferroni-style correction: split the significance level evenly
        # over the len(x) per-element predictions so that the region for
        # the whole sequence is valid at level e.
        e /= float(len(x))
        # Candidate hidden states for each element of the sequence.
        candidates = self._hidden_candidates(x, e)

        # All paths through the candidate sets, most likely first.
        paths = self._generate_paths(candidates, self.tran_prob,
                                     self.init_prob)

        return paths

    def _generate_paths(self, candidates, trans_prob, init_prob):
        """Generate and score paths.

        Accepts a list of list of candidate. Each list of
        candidate contains potential true hidden states to
        compose a path.
        The function produces all possible paths, and scores them
        w.r.t. the transition and initial probabilities.
        It returns the paths in a list, sorted by scores:
        from the most likely to the least likely.

        Parameters
        ----------
        candidates : list of list
            The i-th list it contains represents a set of state
            candidates for the i-th element of the sequence.
        trans_prob : dictionary
            The keys of this dictionary are tuples in the form
            (i, j). The element (i, j) is associated with the
            transition  probability from state i to state j.
        init_prob : dictionary
            The keys are numbers i (as many as the states).
            init_prob[i] contains the probability of a sequence
            starting in state i.
        """
        # Cartesian product: one path per combination of per-element
        # candidates. NOTE: this grows exponentially with sequence length.
        paths = list(itertools.product(*candidates))
        scores = []
        for p in paths:
            p = list(map(int, p))  # So we can use them as indexes
            # Score = P(start in p[0]) * product of transition probabilities.
            s = init_prob[p[0]]
            for i in range(len(p) - 1):
                s *= trans_prob[(p[i], p[i + 1])]
            scores.append(s)

        # Sort by score, descending (ties fall back to comparing the paths
        # themselves), and drop the scores.
        paths_sorted = [x[1] for x in sorted(zip(scores, paths), reverse=True)]

        return paths_sorted

    def _hidden_candidates(self, x, e):
        """Uses CP-HMM to predict, for each element of the observed sequence, a
        list of candidate states.  Thanks to CP's validity guarantee, the true
        hidden states is within the list of candidates with probability 1-e.

        Parameters
        ----------
        x : list
            Observed sequence.
        e : float in [0,1]
            Significance level.
        """
        # Flatten sequences, "train" CP.
        # NOTE(review): flatten().reshape(-1, 1) assumes each observation is
        # a single scalar feature — confirm for multivariate observables.
        X = self.train_x.flatten().reshape(-1, 1)
        Y = self.train_y.flatten()
        self.cp.fit(X, Y)
        # For each element of the observed sequence x
        # determine a set of candidate states (boolean region mask mapped
        # back to the class labels).
        y_candidates = []
        for i in range(len(x)):
            candidates_bool = self.cp.predict(x[i].reshape(-1, 1), e)[0]
            candidates = self.cp.classes[candidates_bool]
            y_candidates.append(candidates)

        return y_candidates

    def _estimate_initial_prob(self, Y, lengths):
        """Returns an frequentist estimate of the initial probabilities over
        the observed hidden states.  Assumes that the hidden states are
        specified by sequential numbers 0, 1, ...  numbers).

        Parameters
        ----------
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of sequences in X and Y. If None, X and Y are assumed to
            be a single sequence. The sum of lengths should be n_samples.
        """
        if not len(Y):
            # BUGFIX: was `return []`; callers index the result by state,
            # so return an empty dict consistent with the non-empty case.
            return {}

        ip = np.array([.0] * self.n_states)
        # iter_from_X_lengths yields the (start, end) bounds of each
        # sequence; Y[i] is that sequence's first hidden state.
        for i, j in iter_from_X_lengths(Y, lengths):
            ip[Y[i]] += 1

        ip /= sum(ip)
        # To dictionary: {state: probability}.
        ip = dict(list(zip(list(range(len(ip))), ip)))

        return ip

    def _estimate_transition_prob(self, Y, lengths):
        """Returns an frequentist estimate of the transition probabilities over
        the observed hidden states.  Assumes that the hidden states are
        specified by sequential numbers 0, 1, ... .

        Parameters
        ----------
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of sequences in X and Y. If None, X and Y are assumed to
            be a single sequence. The sum of lengths should be n_samples.
        """
        if not len(Y):
            # BUGFIX: was `return np.empty()`, which raises TypeError
            # (np.empty requires a shape) and has the wrong type anyway;
            # return an empty dict consistent with the non-empty case.
            return {}

        # Count transitions within each sequence (never across sequence
        # boundaries, hence the j - 1 upper bound).
        tp = np.zeros((self.n_states, self.n_states))
        for i, j in iter_from_X_lengths(Y, lengths):
            for k in range(i, j - 1):
                tp[Y[k], Y[k + 1]] += 1

        # Missing values, normalise
        # NOTE: here is made the assumption that states are integers
        # 0, 1, ..., self.n_states-1.
        for y in range(self.n_states):
            if sum(tp[y, :]) == 0:
                # Unseen source state: assign a uniform distribution.
                tp[y, :] = 1.0
            tp[y, :] /= sum(tp[y, :])

        # To dictionary keyed by (from_state, to_state) tuples.
        tran_prob = {}
        for i in range(len(tp)):
            for j in range(len(tp[0])):
                tran_prob[(i, j)] = tp[i][j]

        return tran_prob