def get_fit_model(score_list, label_list):
    p_train = np.array(score_list)
    y_train = np.array(label_list)

    ir = IR()
    ir.fit( p_train, y_train )
    return ir
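A minimal usage sketch for the helper above, assuming `IR` is `sklearn.isotonic.IsotonicRegression` imported under that alias; the scores and labels are illustrative only:

import numpy as np
from sklearn.isotonic import IsotonicRegression as IR

# Hypothetical calibration data (illustrative only)
scores = [0.10, 0.35, 0.40, 0.80]
labels = [0, 1, 0, 1]
model = get_fit_model(scores, labels)
print(model.transform([0.2, 0.5, 0.7]))  # monotone calibrated values within the score range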
Example #2
class HTMLTime(object):
    """
    >>> htmlTime = HTMLTime(pathToIDX)
    >>> t = htmlTime(frameNumber)
    """

    def __init__(self, idx):
        super(HTMLTime, self).__init__()
        self.idx = idx

        # load .idx file using pandas
        df = read_table(
            self.idx, sep=r'\s+',
            names=['frame_number', 'frame_type', 'bytes', 'seconds']
        )
        x = np.array(df['frame_number'], dtype=float)
        y = np.array(df['seconds'], dtype=float)

        # train isotonic regression
        self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
        self.ir.fit(x, y)

        # frame number support
        self.xmin = np.min(x)
        self.xmax = np.max(x)

    def __call__(self, frameNumber):

        return self.ir.transform([min(self.xmax,
                                      max(self.xmin, frameNumber)
                                      )])[0]
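A hedged usage sketch for HTMLTime (not from the original source); the '.idx' path is a hypothetical example, and numpy, pandas' read_table and sklearn's IsotonicRegression are assumed to be imported:

# Hypothetical usage: map a frame number to a timestamp in seconds
htmlTime = HTMLTime('video.idx')   # 'video.idx' is an assumed example path
t = htmlTime(1500)                 # seconds for frame 1500, clipped to the indexed frame range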
	def main(self):
		x_field = self.fields_by_key('x')[0]
		y_field = self.fields_by_key('y')[0]	
		x = np.array(self.slice_data(x_field,int))
		y = np.array(self.slice_data(y_field,int))
		n = len(x)
		render = StringIO.StringIO()
		
		###############################################################################
		# Fit IsotonicRegression and LinearRegression models

		ir = IsotonicRegression()

		y_ = ir.fit_transform(x, y)

		lr = LinearRegression()
		lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

		###############################################################################
		# plot result

		segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
		lc = LineCollection(segments, zorder=0)
		lc.set_array(np.ones(len(y)))
		lc.set_linewidths(0.5 * np.ones(n))

		fig = plt.figure()
		plt.plot(x, y, 'r.', markersize=12)
		plt.plot(x, y_, 'g.-', markersize=12)
		plt.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
		plt.gca().add_collection(lc)
		plt.legend(('Data', 'Isotonic Fit', 'Linear Fit'), loc='lower right')
		plt.title('Isotonic regression')
		plt.savefig(render,format='png')
		return render
def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform and fit_transform
    against the "secondary" ties method and "pituitary" data from R
    "isotone" package, as detailed in: J. de Leeuw, K. Hornik, P. Mair,
    Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
     the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
              22.22222, 22.22222, 22.22222, 24.25, 24.25]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
Example #5
def train_classifier_with_calib(classifier, data, use_all_data=False, normalize=False):
    X_train = data.X_train
    y_train = data.y_train
    X_cv = data.X_cv
    y_cv = data.y_cv
    if normalize:
        X_train, X_cv = normalize_data(X_train, X_cv)
    if not use_all_data:
        ir = IR()
        score, S = train(classifier, X_train, y_train, X_cv, y_cv, data.y_classes)
        predictions_proba = classifier.predict_proba(X_cv)
        proba = predictions_proba[:, 1]
        ir.fit_transform(proba, y_cv)
        print(proba)
        print(ir)
        return {
            'classifier': classifier,
            'score': score,
            'S_auc': S,
            'IR':ir,
            'prange':[np.amin(proba),np.amax(proba)]
        }
    else:
        train_all_data(classifier, X_train, y_train, X_cv, y_cv)
        return {
            'classifier': classifier
        }
def test_isotonic_duplicate_min_entry():
    x = [0, 0, 1]
    y = [0, 0, 1]

    ir = IsotonicRegression(increasing=True, out_of_bounds="clip")
    ir.fit(x, y)
    all_predictions_finite = np.all(np.isfinite(ir.predict(x)))
    assert_true(all_predictions_finite)
def predict_probs(model, train_class, train_features, test_features, normalize_probs=None):
    """
    Fit a given binary classification model to training sample features
    and return predicted probabilities for the positive class for
    the training and test samples.
    """
    model.fit(train_features, train_class)
    train_prob, test_prob = [model.predict_proba(f)[:, 1] for f in (train_features, test_features)]
    if normalize_probs == "ROCSlope":
        # calibrate probabilities based on the estimated local slope
        # of the ROC curve
        chunk_size = 10  # number of instances for slope estimation
        n_train_pos = 301  # total number of positive (preictal) instances
        n_train_neg = 3766  # total negative (interictal)
        n_chunk_tot = 4000.0 / float(chunk_size)  # estimated total in test data
        # sort training data classes by predicted probability
        sort_order = train_prob.argsort()
        p_sorted = train_prob[sort_order]
        c_sorted = train_class[sort_order]
        ix = np.array(range(len(train_prob)))
        # loop over chunks
        for i_ch in range(1 + (len(train_prob) - 1) // chunk_size):
            p_chunk, c_chunk = [
                x[np.where((ix >= i_ch * chunk_size) & (ix < (i_ch + 1) * chunk_size))[0]] for x in (p_sorted, c_sorted)
            ]
            pmin = np.min(p_chunk)
            pmax = np.max(p_chunk)
            # compute TPR/FPR (relative to the entire training set)
            tpr = np.sum(c_chunk) / float(n_train_pos)
            fpr = np.sum(1 - c_chunk) / float(n_train_neg)
            # compute probability transformation for this chunk
            qc = (2.0 / np.pi) * np.arctan(tpr / (fpr + 1.0e-3 / float(n_train_neg)))
            qmin = np.max((0.0, qc - 0.5 / float(n_chunk_tot)))
            qmax = np.min((1.0, qc + 0.5 / float(n_chunk_tot)))
            # transform probabilities
            tr_p_ch = np.where((train_prob > pmin) & (train_prob <= pmax))[0]
            train_prob[tr_p_ch] = qmin + (train_prob[tr_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)
            te_p_ch = np.where((test_prob > pmin) & (test_prob <= pmax))[0]
            test_prob[te_p_ch] = qmin + (test_prob[te_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)
    elif normalize_probs == "LogShift":
        # shift probabilities in log(p/(1-p)) so that a fraction f_pre
        # of the samples has probability > 0.5, where f_pre is the
        # fraction of preictal samples in the training data
        f_pre = len(np.where(train_class)[0]) / float(len(train_class))
        train_th, test_th = [sorted(p)[int((1.0 - f_pre) * len(p))] for p in (train_prob, test_prob)]
        train_prob, test_prob = [
            (1.0 - pth) * p / (pth + p - 2.0 * pth * p)
            for (pth, p) in zip((train_th, test_th), (train_prob, test_prob))
        ]
    elif normalize_probs == "IsoReg":
        # fit an isotonic regression model to training probabilities
        # and use the model to transform all probabilities
        prob_model = IsotonicRegression(out_of_bounds="clip")
        prob_model.fit(train_prob, train_class)
        train_prob, test_prob = [prob_model.transform(p) for p in (train_prob, test_prob)]
    elif normalize_probs is not None:
        sys.exit("Invalid value of normalize_probs:", str(normalize_probs))
    return (train_prob, test_prob)
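A hedged usage sketch for predict_probs with the "IsoReg" option; the synthetic data and the LogisticRegression base model are assumptions made here for illustration:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
train_X = rng.randn(200, 5)
train_y = (train_X[:, 0] + 0.5 * rng.randn(200) > 0).astype(int)
test_X = rng.randn(50, 5)

# Fit the model, then calibrate both probability sets with isotonic regression
train_p, test_p = predict_probs(LogisticRegression(), train_y, train_X, test_X,
                                normalize_probs="IsoReg")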
def test_isotonic_min_max_boundaries():
    # check if min value is used correctly
    ir = IsotonicRegression(y_min=2, y_max=4)
    n = 6
    x = np.arange(n)
    y = np.arange(n)
    y_test = [2, 2, 2, 3, 4, 4]
    y_result = np.round(ir.fit_transform(x, y))
    assert_array_equal(y_result, y_test)
Example #9
    def _minCllr(self, targetScoreValues, nonTargetScoreValues, ):
        """
            Computes the 'minimum cost of log likelihood ratio' measure as given in IDIAP's bob calibration.py
            We don't however use pavx here, as used in many other implementations, but sklearn's isotonic regression,
            which is equivalent and frees us from linking to c++ code.
        """
        # First, sort both scores.
        neg = sorted(nonTargetScoreValues)
        pos = sorted(targetScoreValues)
        N = len(neg)
        P = len(pos)
        I = N + P
        # Now, iterate through both score sets and add a 0 for negative and 1 for positive scores.
        n, p = 0, 0
        idealSequence = np.zeros(I)
        neg_indices = [0] * N
        pos_indices = [0] * P
        for i in range(I):
            if n == N or (p < P and neg[n] > pos[p]):  # guard p < P so pos[p] is never indexed out of range
                pos_indices[p] = i
                p += 1
                idealSequence[i] = 1
            else:
                neg_indices[n] = i
                n += 1

        # Run the pool adjacent violators method on the ideal LLR scores.
        # pavx implements isotonic regression. Python's sklearn contains code to do just that.
        ir = IsotonicRegression()
        # Calculate the isotonic regression.
        popt = ir.fit_transform(np.arange(len(idealSequence)), idealSequence)

        # disable runtime warnings for a short time since log(0) will raise a warning.
        old_warn_setup = np.seterr(divide='ignore')
        # ... compute logs.

        # Lets assume the prior odds on a target score is the ratio #target scores / #non target scores.
        log_prior_odds = math.log(float(P) / float(N))

        posterior_log_odds = np.log(popt) - np.log(1.0 - popt)

        # ... activate old warnings.
        np.seterr(**old_warn_setup)

        llrs = posterior_log_odds - log_prior_odds

        # Unmix positive and negative scores.
        new_neg = np.zeros(N)
        for n in range(N):
            new_neg[n] = llrs[neg_indices[n]]
        new_pos = np.zeros(P)
        for p in range(P):
            new_pos[p] = llrs[pos_indices[p]]

        # Compute cllr of these new 'optimal' LLR scores.
        minCllr = self._cllr(new_pos, new_neg)
        return minCllr
Example #10
 def calibrate_row(row):
     calibrator = IsotonicRegression(y_min=0, y_max=1)
     x = lab[~np.isnan(lab[row])][row].values
     y = lab[~np.isnan(lab[row])]['labels'].values
     calibrator.fit(x, y)
     lab[row] = calibrator.predict(lab[row].values)
     amb[row] = calibrator.predict(amb[row].values)
     unl[row] = calibrator.predict(unl[row].values)
     scr[row] = calibrator.predict(scr[row].values)
def test_isotonic_sample_weight():
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    expected_y = [1, 13.95, 13.95, 13.95, 13.95, 13.95, 24]
    received_y = ir.fit_transform(x, y, sample_weight=sample_weight)

    assert_array_equal(expected_y, received_y)
 def sklearn_isotonic_regression_multi(self, y, blocks):
     ir = IsotonicRegression()
     n = len(y)
     x = np.arange(n)
     z = np.zeros(n)
     z[:blocks[0]] = y[:blocks[0]]
     for start, end in zip(blocks, np.append(blocks[1:], [n])):
         z[start:end] = ir.fit_transform(x[start:end], y[start:end])
     return z
 def test_proj_PAV(self):
     n = 10
     x = np.arange(n)
     rs = check_random_state(0)
     for i in range(10):
         y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         truth = ir.fit_transform(x, y)
         self.assertTrue(np.linalg.norm(proj_PAV(y) - truth) < 1e-8)
Example #14
def test_fast_predict():
    # test that the faster prediction change doesn't
    # affect out-of-sample predictions:
    # https://github.com/scikit-learn/scikit-learn/pull/6206
    rng = np.random.RandomState(123)
    n_samples = 10 ** 3
    # X values over the -10,10 range
    X_train = 20.0 * rng.rand(n_samples) - 10
    y_train = np.less(rng.rand(n_samples),
                      expit(X_train)).astype('int64').astype('float64')

    weights = rng.rand(n_samples)
    # we also want to test that everything still works when some weights are 0
    weights[rng.rand(n_samples) < 0.1] = 0

    slow_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")
    fast_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds="clip")

    # Build interpolation function with ALL input data, not just the
    # non-redundant subset. The following 2 lines are taken from the
    # .fit() method, without removing unnecessary points
    X_train_fit, y_train_fit = slow_model._build_y(X_train, y_train,
                                                   sample_weight=weights,
                                                   trim_duplicates=False)
    slow_model._build_f(X_train_fit, y_train_fit)

    # fit with just the necessary data
    fast_model.fit(X_train, y_train, sample_weight=weights)

    X_test = 20.0 * rng.rand(n_samples) - 10
    y_pred_slow = slow_model.predict(X_test)
    y_pred_fast = fast_model.predict(X_test)

    assert_array_equal(y_pred_slow, y_pred_fast)
def test_isotonic_regression_pickle():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="clip")
    ir.fit(x, y)

    ir_ser = pickle.dumps(ir, pickle.HIGHEST_PROTOCOL)
    ir2 = pickle.loads(ir_ser)
    np.testing.assert_array_equal(ir.predict(x), ir2.predict(x))
def test_isotonic_regression_oob_raise():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="raise")
    ir.fit(x, y)

    # Check that an exception is thrown
    assert_raises(ValueError, ir.predict, [min(x) - 10, max(x) + 10])
Example #17
def test_isotonic_regression_oob_bad():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="xyz")
    ir.fit(x, y)

    # Make sure that we throw an error for bad out_of_bounds value
    assert_raises(ValueError, ir.predict, [min(x)-10, max(x)+10])
def test_isotonic_regression_oob_bad_after():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="raise")

    # Make sure that we throw an error for bad out_of_bounds value in transform
    ir.fit(x, y)
    ir.out_of_bounds = "xyz"
    assert_raises(ValueError, ir.transform, x)
def test_isotonic_regression_oob_nan():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="nan")
    ir.fit(x, y)

    # Predict from  training and test x and check that we have two NaNs.
    y1 = ir.predict([min(x) - 10, max(x) + 10])
    assert_equal(sum(np.isnan(y1)), 2)
def test_permutation_invariance():
    # check that fit is permutation invariant.
    # regression test of missing sorting of sample-weights
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
    y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
    y_transformed_s = ir.fit(x_s, y_s, sample_weight=sample_weight_s).transform(x)

    assert_array_equal(y_transformed, y_transformed_s)
 def test_isotonic_regression(self):
     self.setUp()
     times = []
     rs = check_random_state(0)
     for n in [int(1e1), int(1e2), int(1e3), int(1e4)]:
         x = np.arange(n)
         y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
         ir = IsotonicRegression()
         start_time = time.time()
         y1 = ir.fit_transform(x, y)
         times.append(time.time() - start_time)
     print('test isotonic_regression')
     print(times)
def test_isotonic_sample_weight_parameter_default_value():
    # check if default value of sample_weight parameter is one
    ir = IsotonicRegression()
    # random test data
    rng = np.random.RandomState(42)
    n = 100
    x = np.arange(n)
    y = rng.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))
    # check if value is correctly used
    weights = np.ones(n)
    y_set_value = ir.fit_transform(x, y, sample_weight=weights)
    y_default_value = ir.fit_transform(x, y)

    assert_array_equal(y_set_value, y_default_value)
def test_isotonic_regression_oob_clip():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing='auto', out_of_bounds="clip")
    ir.fit(x, y)

    # Predict from  training and test x and check that min/max match.
    y1 = ir.predict([min(x) - 10, max(x) + 10])
    y2 = ir.predict(x)
    assert_equal(max(y1), max(y2))
    assert_equal(min(y1), min(y2))
Example #24
def sklearn_pav(y_true, y_score):
    """
    Binary PAV (pool-adjacent-violators) algorithm for solving isotonic regression.
    NOTE: sklearn's isotonic regression is used
    y_true: 1D array
    y_score: 1D array
    """
    id_permute = np.argsort(y_score)
    y_sort = y_true[id_permute]
    p_sort = np.sort(y_score)

    ir = IsotonicRegression()
    p_calibrated = ir.fit_transform(p_sort, y_sort)
    return y_sort, p_calibrated
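A small usage sketch for sklearn_pav; the toy scores and labels below are made up for illustration:

import numpy as np

y_true = np.array([0, 0, 1, 0, 1, 1])
y_score = np.array([0.20, 0.30, 0.40, 0.45, 0.70, 0.90])
y_sorted, p_calibrated = sklearn_pav(y_true, y_score)
# p_calibrated is non-decreasing in the sorted scores and acts as calibrated probabilities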
Example #25
def ensure_monotone_increasing(arr_, fromright=True, fromleft=True, newmode=True):
    r"""
    Args:
        arr_ (ndarray):

    Returns:
        ndarray: arr

    CommandLine:
        python -m vtool.math --test-ensure_monotone_increasing --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from vtool.math import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> size_ = 100
        >>> domain = np.arange(size_)
        >>> offset = ut.get_argval('--offset', type_=float, default=2.3)
        >>> arr_ = np.sin(np.pi * (domain / 100) - offset) + (rng.rand(len(domain)) - .5) * .1
        >>> arr = ensure_monotone_increasing(arr_, fromleft=False, fromright=True)
        >>> result = str(arr)
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> pt.plot2(domain, arr_, 'r-', fnum=1, pnum=(2, 1, 1), title='before', equal_aspect=False)
        >>> pt.plot2(domain, arr, 'r-', fnum=1, pnum=(2, 1, 2), title='after monotonization (increasing)', equal_aspect=False)
        >>> ut.show_if_requested()
    """
    if newmode:
        from sklearn.isotonic import IsotonicRegression
        ir = IsotonicRegression()
        arr = ir.fit_transform(np.arange(len(arr_)), arr_)
    else:
        arr = arr_.copy()
        size = len(arr)
        # Ensure increasing from right
        if fromright:
            for lx in range(1, size):
                rx = (size - lx - 1)
                if arr[rx] > arr[rx + 1]:
                    arr[rx] = arr[rx + 1]
        if fromleft:
            # ensure increasing from left
            for lx in range(0, size - 1):
                if arr[lx] > arr[lx + 1]:
                    arr[lx + 1] = arr[lx]
    return arr
def test_isotonic_regression_auto_increasing():
    # Set y and x for decreasing
    y = np.array([5, 6.1, 6, 7, 10, 9, 10])
    x = np.arange(len(y))

    # Create model and fit_transform
    ir = IsotonicRegression(increasing='auto')
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        y_ = ir.fit_transform(x, y)
        # work-around for pearson divide warnings in scipy <= 0.17.0
        assert_true(all(["invalid value encountered in "
                         in str(warn.message) for warn in w]))

    # Check that relationship increases
    is_increasing = y_[0] < y_[-1]
    assert_true(is_increasing)
Example #27
class LLRIsotonicRegression(LLR):
    """Log-likelihood ratio estimation by isotonic regression"""

    def __init__(self, equal_priors=False):
        super(LLRIsotonicRegression, self).__init__()
        self.equal_priors = equal_priors

    def fit(self, X, Y):

        self.prior = self._get_prior(X, Y)

        scores, ratios = self._get_scores_ratios(X, Y)

        y_min = np.min(ratios)
        y_max = np.max(ratios)
        self.ir = IsotonicRegression(y_min=y_min, y_max=y_max)
        self.ir.fit(scores, ratios)

        return self

    def toLogLikelihoodRatio(self, scores):
        """Get log-likelihood ratio given scores

        Parameters
        ----------
        scores : numpy array
            Test scores

        Returns
        -------
        llr : numpy array
            Log-likelihood ratio array with same shape as input `scores`
        """
        x_min = np.min(self.ir.X_)
        x_max = np.max(self.ir.X_)

        oob_min = np.where(scores < x_min)
        oob_max = np.where(scores > x_max)
        ok = np.where((scores >= x_min) * (scores <= x_max))

        calibrated = np.zeros(scores.shape)
        calibrated[ok] = self.ir.transform(scores[ok])
        calibrated[oob_min] = self.ir.y_min
        calibrated[oob_max] = self.ir.y_max
        return calibrated
def compare_PAVA_implementations():
    trials = 10
    rs = check_random_state(0)
    times = []
    dimensions = [int(1e1), int(1e2), int(1e3), int(1e4), int(1e5), int(1e6)]
    #dimensions = [int(1e6)]

    for n in dimensions:
        print('dimensionality', n)
        x = np.arange(n)
        for trial in range(trials):

            y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + np.arange(n))

            # scikit-learn PAVA
            if n <= int(1e5):
            #if n <= int(1e6):
                ir = IsotonicRegression()
                y_copy = np.copy(y)
                start_time = time.time()
                ir.fit_transform(x, y_copy)
                time1 = time.time() - start_time
            else: time1 = -1.

            # in-place PAVA
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c_2(y_copy, 0, n)
            time2 = time.time() - start_time

            # in-place PAVA++
            y_copy = np.copy(y)
            start_time = time.time()
            isotonic_regression_c(y_copy, 0, n)
            time3 = time.time() - start_time

            times.append([time1, time2, time3])

    index = []
    for n in ['1e1','1e2','1e3','1e4','1e5','1e6']: index += [n]*trials
    #for n in ['1e6']: index += [n]*trials  
    df = pd.DataFrame(times, index=index, columns=['sklearn', 'PAVA+', 'PAVA++'])
    print(df)
    df.to_pickle('results/PAVA_comparison_5.pkl')
Example #29
def test_isotonic_dtype():
    y = [2, 1, 4, 3, 5]
    weights = np.array([.9, .9, .9, .9, .9], dtype=np.float64)
    reg = IsotonicRegression()

    for dtype in (np.int32, np.int64, np.float32, np.float64):
        for sample_weight in (None, weights.astype(np.float32), weights):
            y_np = np.array(y, dtype=dtype)
            expected_dtype = \
                check_array(y_np, dtype=[np.float64, np.float32],
                            ensure_2d=False).dtype

            res = isotonic_regression(y_np, sample_weight=sample_weight)
            assert_equal(res.dtype, expected_dtype)

            X = np.arange(len(y)).astype(dtype)
            reg.fit(X, y_np, sample_weight=sample_weight)
            res = reg.predict(X)
            assert_equal(res.dtype, expected_dtype)
def plot():

    results = []
    for f in glob('umau_lengths*npz'):
        d = np.load(f)
        l = d['lengths']
        l = l[~np.isnan(l)]
        l = l[np.isfinite(l)]
        l = l[l>0]
        results.append([d['mu'], l.mean()])
    for f in glob('miller/lengths*npz'):
        d = np.load(f)
        if d['mu'] not in [r[0] for r in results]:
            l = d['lengths']
            l = l[np.isfinite(l)]
            l = l[~np.isnan(l)]
            l = l[l>0]
            results.append([d['mu'], l.mean()])
        else:
            idx = [r[0] for r in results].index(d['mu'])
            l = d['lengths']
            l = l[np.isfinite(l)]
            l = l[~np.isnan(l)]
            l = l[l>0]
            results[idx][1] = 0.5 * (results[idx][1] + l.mean())
    results = sorted(results)
    results = np.array(results).T
    muvals, mean_length = results
    f = plt.figure()
    f.clf()
    ax = f.gca()
    iso = IsotonicRegression(increasing=False)
    mean_length_iso = iso.fit_transform(np.arange(mean_length.shape[0]), mean_length)    
    ax.plot(muvals, mean_length, 'k', linewidth=2, label='UMAU')
    ax.plot([muvals.min(), muvals.max()], [2*ndist.ppf(0.975)]*2, c='red', label='Sample splitting', linewidth=2)
    ax.plot([muvals.min(), muvals.max()], [np.sqrt(2)*ndist.ppf(0.975)]*2, 'k--')
    ax.set_xlabel(r'$\mu$', fontsize=20)
    ax.set_ylabel(r'E(|CI($\mu$)|)', fontsize=20)
    ax.legend(loc='lower right')
    ax.set_ylim([0,4])
    ax.set_xlim([-2,9])
    f.savefig('figure_b_umau.pdf')
def test_isotonic_regression_reversed():
    y = np.array([10, 9, 10, 7, 6, 6.1, 5])
    y_ = IsotonicRegression(increasing=False).fit_transform(
        np.arange(len(y)), y)
    assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
forward = lambda X, thetas: simulate_posterior_predictive(X, thetas, noise=0.5)

# Construct the calibration dataset
predicted_quantiles, empirical_quantiles = make_cal_dataset(
    y[:, np.newaxis], x, coefs, forward)
# -

plt.scatter(predicted_quantiles, empirical_quantiles)
plt.plot([0, 1], [0, 1], color='tab:grey', linestyle='--')
plt.xlabel('Predicted Cumulative Distribution')
plt.ylabel('Empirical Cumulative Distribution')
plt.title('Calibration Dataset')

# +
# Train isotonic regression in reverse mode
ir = IsotonicRegression(out_of_bounds='clip')
ir.fit(empirical_quantiles, predicted_quantiles)

# Find the values of calibrated quantiles
calibrated_quantiles = ir.predict([0.025, 0.5, 0.975])

# +
# Plot the posterior predictive
low, mid, high = np.percentile(posterior_predictive, [2.5, 50, 97.5], axis=1)
plt.fill_between(x_test, low, high, alpha=0.2, label='95% Predictive Interval')
plt.plot(x_test, mid, color='tab:red', label='Predicted Median')

low, mid, high = np.quantile(posterior_predictive,
                             calibrated_quantiles,
                             axis=1)
plt.fill_between(x_test, low, high, alpha=0.2, label='95% Calibrated Interval')
import numpy as np
import matplotlib.pyplot as plt

from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

print("Generating Data.")
n = 100  # number of data points
x = np.arange(n)  # x values
random_seed = check_random_state(0)
y = random_seed.randint(-50, 50,
                        size=(n, )) + 50. * np.log1p(np.arange(n))  # y values

# Fit IsotonicRegression models
print("Fitting model.")
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)

# Plot result
print("Displaying result.")
fig = plt.figure()
plt.plot(x, y, 'r.', markersize=12)
plt.plot(x, y_, 'b.-', markersize=12)
plt.legend(('Data', 'Isotonic Fit'), loc='upper left')
plt.title('Isotonic regression')
plt.show()
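A short follow-on sketch (not part of the original snippet): querying the fitted curve at new points. The out_of_bounds='clip' option is an assumption added here so that x values outside the training range are clamped instead of producing NaN:

# Hypothetical follow-on: evaluate the isotonic fit at new x values
ir_clip = IsotonicRegression(out_of_bounds='clip')
y_clip = ir_clip.fit_transform(x, y)
print(ir_clip.predict([-10.0, 25.5, 150.0]))  # out-of-range inputs are clipped to the boundary fits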
Example #35
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

main = pd.read_csv(
    '/Users/Theo/Google Drive/College/Senior Thesis/Materials Science/data/isotonic/hasam_g.csv',
    sep=',',
    names=['Time', 'G'])
mainx_data = main.Time[1:60]
mainx_target = main.G[1:60]

###############################################################################
# Fit Isotonic Regression model
###############################################################################

ir = IsotonicRegression()
lr = LinearRegression()

y_ = ir.fit_transform(mainx_data, mainx_target)
predictions = ir.predict([10])
print(predictions)
print(ir.score(mainx_data, mainx_target))
#print("RSS: %.2f"
#      % np.mean((ir.predict(mainx_target) - mainy_target) ** 2))

###############################################################################
# Plot result
###############################################################################

fig = plt.figure()
plt.plot(mainx_data, mainx_target, 'r.', markersize=12)
Example #36
 def __init__(self, add_one=False):
     self.add_one = add_one
     self._ir = IsotonicRegression()

test_cases = [
    (VotingClassifier([('logistic', LogisticRegression()),
                       ('earth',
                        Pipeline([('earth', Earth()),
                                  ('logistic', LogisticRegression())]))],
                      'hard',
                      weights=[1.01, 1.01]), ['predict'],
     create_weird_classification_problem_1()),
    (GradientBoostingClassifier(max_depth=10,
                                n_estimators=10), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (LogisticRegression(), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (IsotonicRegression(out_of_bounds='clip'), ['predict'],
     create_isotonic_regression_problem_1()),
    (Earth(), ['predict', 'transform'], create_regression_problem_1()),
    (Earth(allow_missing=True), ['predict', 'transform'],
     create_regression_problem_with_missingness_1()),
    (ElasticNet(), ['predict'], create_regression_problem_1()),
    (ElasticNetCV(), ['predict'], create_regression_problem_1()),
    (LassoCV(), ['predict'], create_regression_problem_1()),
    (Ridge(), ['predict'], create_regression_problem_1()),
    (RidgeCV(), ['predict'], create_regression_problem_1()),
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={
Example #38
diabetes_X_train = []
diabetes_y_train = []
diabetes_X_test = []
diabetes_y_test = []
f = open('Datatrain.csv')
for row in csv.reader(f):
    diabetes_X_train.append(float(row[3]))
    diabetes_y_train.append(float(row[4]))
f.close()

f = open('Datatest.csv')
for row in csv.reader(f):
    diabetes_X_test.append(float(row[3]))
    diabetes_y_test.append(float(row[4]))
f.close()


ir = IsotonicRegression()
y_ = ir.fit_transform(diabetes_X_train, diabetes_y_train)
#lr = LinearRegression()
#lr.fit(diabetes_X_train, diabetes_y_train)  # x needs to be 2d for LinearRegression

segments = [[[i, diabetes_y_train[i]], [i, y_[i]]] for i in range(len(diabetes_X_train))]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(diabetes_y_train)))
lc.set_linewidths(0.5 * np.ones(len(diabetes_X_train)))

fig = plt.figure()
#plt.plot(diabetes_X_train, diabetes_y_train, 'r.', markersize=12,color='green')
plt.plot(diabetes_X_test, diabetes_y_test, 'r.', markersize=12,color='black')
#plt.plot(diabetes_X_train, y_, 'g.-', markersize=12,color='yellow')
plt.plot(diabetes_X_test, ir.predict(diabetes_X_test), 'b-',color='red')
#plt.gca().add_collection(lc)
def interpolation_estimate(Z,
                           Z_constraint,
                           lower=0.5,
                           upper=4,
                           npts=30,
                           ndraw=5000,
                           burnin=1000,
                           estimator='truncated'):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraint`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $\|Z\|^2_2$ 
    by Monte Carlo for a range of `npts` values starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant) which is interpolated and solved
    for an argument such that the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    ndraw : int
        Number of Gibbs steps to use for estimating
        each expectation.

    burnin : int
        How many Gibbs steps to use for burning in.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * Uses `rpy2` and `fdrtool` library to compute the GCM.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower * initial, upper * initial, npts)
    Evalues = []

    n = Z.shape[0]
    L, V, U, S = quadratic_bounds(Z, np.identity(n), Z_constraint)

    if estimator == 'truncated':

        def _estimator(S, Z, Z_constraint):
            L, V, U, _ = quadratic_bounds(Z, np.identity(n), Z_constraint)
            num = mpquad(
                lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 +
                                (n - 1) * mplog(
                                    (x + L) / S) + 2 * mplog(x + L)),
                [0, U - L])
            den = mpquad(
                lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 +
                                (n - 1) * mplog((x + L) / S)), [0, U - L])
            print(num / den, V**2, S, (L, U))
            return num / den
    elif estimator == 'simulate':

        state = Z.copy()
        rpy.r.assign('state', state)

        def _estimator(S, state, Z_constraint):
            Z_constraint.covariance = S**2 * np.identity(Z.shape[0])
            e, v, _state = expected_norm_squared(state,
                                                 Z_constraint,
                                                 ndraw=ndraw,
                                                 burnin=burnin)
            state[:] = _state
            return e

    state = Z.copy()
    for S in Svalues:
        Evalues.append(_estimator(S, state, Z_constraint))
    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso
    #     rpy.r.assign('S', Svalues)
    #     rpy.r.assign('E', np.array(Evalues))
    #     rpy.r('''
    #     library(fdrtool);
    #     G = gcmlcm(S, E, 'gcm');
    #     Sgcm = G$x.knots;
    #     Egcm = G$y.knots;
    #     ''')
    #     Sgcm = np.asarray(rpy.r('Sgcm'))
    #     Egcm = np.asarray(rpy.r('Egcm'))
    #     interpolant = interp1d(Sgcm, Egcm - (Z**2).sum())

    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except:
        raise ValueError(
            '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)'''
            % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant
def calculate_probability_distribution(tree , instances , index , cal_method =None):

	if cal_method == None :
		return tree.distribution_for_instance(instances.get_instance(index))

	elif cal_method == 'Platt' :

		p_train = np.zeros(shape=(instances.num_instances,1))
		y_train = np.zeros(shape=(instances.num_instances,1))

		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    p_train[i] = [ (dist[1] - 0.5)*2.0 ]
		    y_train[i] = [instance.get_value(instance.class_index)]

		# print("p_train ====>>>" , p_train)
		# print("y_train ====>>>" , y_train)

		dist = (tree.distribution_for_instance(instances.get_instance(index))[1]-0.5)*2.0
		tmp = np.zeros(shape=(1,1))
		tmp[0] = [dist]

		print(np.sum(y_train))
		if np.sum(y_train) in [len(y_train),0]:
			print("all one class")
			for ins in instances : 
				print("ins ===> " , ins)
			return tree.distribution_for_instance(instances.get_instance(index))

		else :

			warnings.filterwarnings("ignore", category=FutureWarning)
			lr = LR(solver='lbfgs')                                                      
			lr.fit( p_train , np.ravel(y_train,order='C') )

			return lr.predict_proba( tmp.reshape(1, -1))[0]


	elif cal_method == 'Isotonic' :

		p_train = np.zeros(shape=(instances.num_instances,1))
		y_train = np.zeros(shape=(instances.num_instances,1))

		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    p_train[i] = [ dist[1] ]
		    y_train[i] = [instance.get_value(instance.class_index)]


		dist = tree.distribution_for_instance(instances.get_instance(index))[1]
		tmp = np.zeros(shape=(1,1))
		tmp[0] = [dist]

		print(np.sum(y_train))
		if np.sum(y_train) in [len(y_train),0]:
			print("all one class")
			for ins in instances : 
				print("ins ===> " , ins)
			return tree.distribution_for_instance(instances.get_instance(index))

		else :

			ir = IR( out_of_bounds = 'clip' )
			ir.fit(np.ravel(p_train,order='C')  , np.ravel(y_train,order='C'))

			p = ir.transform( np.ravel(tmp,order='C'))[0]
			return [p,1-p]
			
	# elif cal_method == 'ProbabilityCalibrationTree' :
	# 	pass


	elif cal_method == 'ICP' :


		pass
	elif cal_method == 'Venn1' :
		calibrPts = []
		
		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    score = dist[0] if  dist[1] < dist[0] else dist[1]
		    calibrPts.append( ( (score) , instance.get_value(instance.class_index) ) ) 
		    

		dist = (tree.distribution_for_instance(instances.get_instance(index)))
		score = dist[0] if dist[1] < dist[0] else dist[1]
		tmp = [score]

		p0,p1=VennABERS.ScoresToMultiProbs(calibrPts,tmp)
		print("Vennnnnn =========>>>>>>>>>>>>  ", p0, "  , ",p1)
		return [p0,p1]
		pass
Example #41
class IsotonicCalibrator(BaseEstimator, RegressorMixin):
    """Probability calibration with isotonic regression.

    Note
    ----
    This class backports and extends `sklearn.isotonic.IsotonicRegression`.
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 interpolation=False):
        """Constructor.

        Parameters
        ----------
        * `y_min` [optional]:
            If not `None`, set the lowest value of the fit to `y_min`.

        * `y_max` [optional]:
            If not `None`, set the highest value of the fit to `y_max`.

        * `increasing` [boolean or string, default=`True`]:
            If boolean, whether or not to fit the isotonic regression with `y`
            increasing or decreasing.
            The string value `"auto"` determines whether `y` should increase or
            decrease based on the Spearman correlation estimate's sign.

        * `interpolation` [boolean, default=`False`]:
            Whether linear interpolation is enabled or not.
        """
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.interpolation = interpolation

    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights. If set to None, all weights will be set to 1.

        Returns
        -------
        * `self` [object]:
            `self`.

        Notes
        -----
        `T` is stored for future use, as `predict` needs T to interpolate
        new input data.
        """
        # Check input
        T = column_or_1d(T)

        # Fit isotonic regression
        self.ir_ = IsotonicRegression(y_min=self.y_min,
                                      y_max=self.y_max,
                                      increasing=self.increasing,
                                      out_of_bounds="clip")
        self.ir_.fit(T, y, sample_weight=sample_weight)

        # Interpolators
        if self.interpolation:
            p = self.ir_.transform(T)

            change_mask1 = (p - np.roll(p, 1)) > 0
            change_mask2 = np.roll(change_mask1, -1)
            change_mask1[0] = True
            change_mask1[-1] = True
            change_mask2[0] = True
            change_mask2[-1] = True

            self.interp1_ = interp1d(T[change_mask1], p[change_mask1],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
            self.interp2_ = interp1d(T[change_mask2], p[change_mask2],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def predict(self, T):
        """Calibrate data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Data to calibrate.

        Returns
        -------
        * `Tt` [array, shape=(n_samples,)]:
            Calibrated data.
        """
        if self.interpolation:
            T = column_or_1d(T)
            return 0.5 * (self.interp1_(T) + self.interp2_(T))

        else:
            return self.ir_.transform(T)
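A hedged usage sketch for IsotonicCalibrator; the synthetic scores and labels are assumptions, and the imports the class relies on (numpy, scipy's interp1d, sklearn's column_or_1d and IsotonicRegression) are assumed to be in scope:

import numpy as np

rng = np.random.RandomState(0)
scores = np.sort(rng.rand(200))                     # raw classifier scores in [0, 1]
labels = (rng.rand(200) < scores).astype(float)     # noisy binary labels, monotone in the score

cal = IsotonicCalibrator(y_min=0., y_max=1., interpolation=True)
cal.fit(scores, labels)
calibrated = cal.predict(scores)                    # calibrated probabilities in [0, 1]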
Example #42
def _smacof_single(dissimilarities1,
                   dissimilarities2,
                   p,
                   weights1=None,
                   weights2=None,
                   metric=True,
                   n_components=2,
                   init1=None,
                   init2=None,
                   max_iter=300,
                   verbose=0,
                   eps=1e-3,
                   random_state1=None,
                   random_state2=None):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    dissimilarities : ndarray, shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : boolean, optional, default: True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray, shape (n_samples, n_components), optional, default: None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress.
    """
    dissimilarities1 = check_symmetric(dissimilarities1, raise_exception=True)
    dissimilarities2 = check_symmetric(dissimilarities2, raise_exception=True)

    if dissimilarities1.shape != dissimilarities2.shape:
        print("Error. Distance matrices have different shapes.")
        sys.exit("Error. Distance matrices have different shapes.")

    n_samples = dissimilarities1.shape[0]

    X1, sim_flat1, sim_flat_w1 = initialize(dissimilarities1, random_state1,
                                            init1, n_samples, n_components)
    X2, sim_flat2, sim_flat_w2 = initialize(dissimilarities2, random_state2,
                                            init2, n_samples, n_components)

    #Default: equal weights
    if weights1 is None:
        weights1 = np.ones((n_samples, n_samples))
    if weights2 is None:
        weights2 = np.ones(n_samples)

    # Disparity-specific weights (V in Borg)
    V1 = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        diagonal = 0
        for j in range(n_samples):
            V1[i, j] = -weights1[i, j]
            diagonal += weights1[i, j]
        V1[i, i] = diagonal

    # Locus-specific weights
    V2 = np.zeros((n_samples, n_samples))
    for i, weight in enumerate(weights2):
        V2[i, i] = weight * p * n_samples

    inv_V = moore_penrose(V1 + V2)

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis1 = euclidean_distances(X1)
        dis2 = euclidean_distances(X2)

        if metric:
            disparities1 = dissimilarities1
            disparities2 = dissimilarities2
        else:
            disparities1 = nonmetric_disparities1(dis1, sim_flat1, n_samples)
            disparities2 = nonmetric_disparities2(dis2, sim_flat2, n_samples)

        # Compute stress
        stress = ((dis1.ravel() - disparities1.ravel())**2).sum() + (
            (dis2.ravel() - disparities2.ravel())**2
        ).sum() + n_samples * p * ssd(
            X1, X2
        )  #multiply by n_samples to make ssd term comparable in magnitude to embedding error terms

        # Update X1 using the Guttman transform
        X1 = guttman(X1, X2, disparities1, inv_V, V2, dis1)

        # Update X2 using the Guttman transform
        X2 = guttman(X2, X1, disparities2, inv_V, V2, dis2)

        # Test stress
        dis1 = np.sqrt((X1**2).sum(axis=1)).sum()
        dis2 = np.sqrt((X2**2).sum(axis=1)).sum()
        dis = np.mean((dis1, dis2))
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if np.abs(old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis

    return X1, X2, stress, it + 1
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n, )) + 50.0 * np.log1p(np.arange(n))

# %%
# Fit IsotonicRegression and LinearRegression models:

ir = IsotonicRegression(out_of_bounds="clip")
y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

# %%
# Plot results:

segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(np.full(n, 0.5))

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 6))
Example #44
class _CalibratedClassifier:
    """Probability calibration with isotonic regression or sigmoid.

    It assumes that base_estimator has already been fit, and trains the
    calibration on the input set of the fit function. Note that this class
    should not be used as an estimator directly. Use CalibratedClassifierCV
    with cv="prefit" instead.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output decision function needs to be calibrated
        to offer more accurate predict_proba outputs. No default value since
        it has to be an already fitted estimator.

    method : 'sigmoid' | 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method or 'isotonic' which is a
        non-parametric approach based on isotonic regression.

    classes : array-like, shape (n_classes,), optional
            Contains unique classes used to fit the base estimator.
            if None, then classes is extracted from the given target values
            in fit().

    See also
    --------
    CalibratedClassifierCV

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
           Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """
    def __init__(self, base_estimator, method='isotonic', classes=None):
        self.base_estimator = base_estimator
        self.method = method
        self.classes = classes

    def _preproc(self, X):
        n_classes = len(self.classes_)
        probabilities = self.base_estimator.predict_proba(X)[:, 1]
        idx_pos_class = self.label_encoder_.\
            transform(self.base_estimator.classes_)

        return probabilities, idx_pos_class

    def fit(self, X, y):
        """Calibrate the fitted model

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Predictions from the base_estimator

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        self.label_encoder_ = LabelEncoder()
        if self.classes is None:
            self.label_encoder_.fit(y)
        else:
            self.label_encoder_.fit(self.classes)

        self.classes_ = self.label_encoder_.classes_
        self.calibrator_ = IsotonicRegression(out_of_bounds='clip')
        self.calibrator_.fit(X, y)

        return self

    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas. Can be exact zeros.
        """
        n_classes = len(self.classes_)
        proba = np.zeros((X.shape[0], n_classes))

        probabilities, idx_pos_class = self._preproc(X)

        proba[:, 1] = self.calibrator_.predict(probabilities)

        # Normalize the probabilities
        if n_classes == 2:
            proba[:, 0] = 1. - proba[:, 1]
        else:
            proba /= np.sum(proba, axis=1)[:, np.newaxis]

        # XXX : for some reason all probas can be 0
        proba[np.isnan(proba)] = 1. / n_classes

        # Deal with cases where the predicted probability minimally exceeds 1.0
        proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0

        return proba
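A hedged usage sketch for _CalibratedClassifier; the LogisticRegression base estimator and the synthetic data are assumptions, and the module-level imports the class relies on (numpy, LabelEncoder, IsotonicRegression) are assumed to be present:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(300, 4)
y = (X[:, 0] + 0.3 * rng.randn(300) > 0).astype(int)

base = LogisticRegression().fit(X, y)
base_scores = base.predict_proba(X)[:, 1]                      # 1-D positive-class scores, as fit() expects
clf = _CalibratedClassifier(base, method='isotonic').fit(base_scores, y)
proba = clf.predict_proba(X)                                   # (300, 2) isotonic-calibrated probabilities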
def truncated_estimate(Z, Z_constraint, lower=0.5, upper=2, npts=15):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraints`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $\|Z\|^2_2$ 
    by Monte Carlo for a range of `npts` values starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant) which is interpolated and solved
    for an argument such that the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * Uses `rpy2` and `fdrtool` library to compute the GCM.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower * initial, upper * initial, npts)
    Evalues = []

    # use truncated chi to estimate integral
    # with scipy.integrate.quad
    n = Z.shape[0]
    operator = np.identity(n)
    L, V, U, S = quadratic_bounds(Z, operator, Z_constraint)

    for S in Svalues:
        num = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n + 1) * np.log(x)),
                   L, U)
        den = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n - 1) * np.log(x)),
                   L, U)
        Evalues.append(num[0] / den[0])
        print(num, den)

    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso

    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except:
        raise ValueError(
            '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)'''
            % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant

    print(L, V, U, S)
Example #46
    def known_iso(self, axis=1, unknowns=0):
        # performs isotonic regression ONLY for known data values
        # and ONLY on rows/columns where there are non-increasing points,
        # row-wise (axis = 0) or column-wise (axis = 1)
        # unknowns should be 0 or None

        tonic = copy.deepcopy(self.array)  # returns a new isotonic matrix
        known_dict = self.known_for_iso(axis, unknowns)
        if axis == 1:
            increase_dict, non_increase_percent = self.is_col_inc()
        else:
            increase_dict = self.is_row_inc()

        # increase_dict (from is_row_inc() or is_col_inc()) flags the rows/columns that are not increasing
        if axis == 1:
            for i in range(len(tonic[0])):
                try:
                    # if i is a key in increase dict then this column needs regression
                    # else just pass to the next column
                    tester = increase_dict[i]

                    X = known_dict[i]

                    if X != []:
                        initial_vals = [tonic[j][i] for j in X]

                        # Use the initial values to fit the model and then predict what the decreasing ones should be
                        iso = IsotonicRegression(out_of_bounds='clip').fit(
                            X, initial_vals)
                        predictions = iso.predict(range(len(tonic)))

                        # put everything back:
                        for row in range(len(predictions)):
                            tonic[row][i] = predictions[row]
                except:
                    pass

        else:
            # same thing but with rows
            for i in range(len(tonic)):
                try:
                    tester = increase_dict[i]
                    X = known_dict[i]

                    if X != []:

                        initial_vals = [tonic[i][j] for j in X]

                        # Use the initial values to fit the model and then predict what the decreasing ones should be
                        iso = IsotonicRegression(out_of_bounds='clip').fit(
                            X, initial_vals)
                        predictions = iso.predict(range(len(tonic[i])))

                        # put everything back:
                        tonic[i] = predictions

                except:
                    pass

        newframe = pd.DataFrame(tonic)
        newframe.columns = self.dataframe.columns
        newframe.index = self.dataframe.index

        if unknowns == 0:
            # Isotonic outputs NaN values, replace them with zeros
            newframe = newframe.fillna(0)

        return mat_opr(newframe)
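
The core pattern used above, in isolation: fit the isotonic model on the known indices only, then predict across the full index range, with out_of_bounds='clip' handling indices outside the known support. A small self-contained sketch with a hypothetical column in which zeros mark unknown entries:

import numpy as np
from sklearn.isotonic import IsotonicRegression

# hypothetical column with unknown (zero) entries at indices 1 and 4
column = np.array([1.0, 0.0, 3.0, 2.5, 0.0, 4.0])
known_idx = [i for i, v in enumerate(column) if v != 0]
known_vals = column[known_idx]

# fit only on the known entries, then predict over every row index
iso = IsotonicRegression(out_of_bounds='clip').fit(known_idx, known_vals)
smoothed = iso.predict(range(len(column)))
print(smoothed)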
Example #47
0
def test_isotonic_copy_before_fit():
    # https://github.com/scikit-learn/scikit-learn/issues/6628
    ir = IsotonicRegression()
    copy.copy(ir)
Example #48
0
    def __init__(self):
        self.clf = IsotonicRegression(y_min=0.0,
                                      y_max=1.0,
                                      out_of_bounds='clip')
Example #49
0
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist, verbose):
    if verbose:
        print("\nFit a univariate spline to the probability means\n"),
        print(
            "------------------------------------------------------------------------------------\n"
        ),

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done with scikit-learn's IsotonicRegression in antitonic mode
    ### (increasing=False), which enforces monotonically decreasing
    ### probabilities with increasing genomic distance

    min_x, max_x = min(x), max(x)
    tempList = sorted([dis for dis in mainDic])
    splineX = []
    ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
    ### Therefore everything will be within the range where the spline is defined
    for i in tempList:
        if min_x <= i <= max_x:
            splineX.append(i)

    splineY = ius(splineX)

    ir = IsotonicRegression(increasing=False)
    rNewSplineY = ir.fit_transform(splineX, splineY)

    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])

    ### Now newSplineY holds the monotonic contact probabilities
    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")

    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data

    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    # NOW write the calculated pvalues and corrected pvalues in a file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    discardCount = 0

    if verbose:
        print("lower bound on mid-range distances  " + repr(min_dist) +
              ", upper bound on mid-range distances  " + repr(max_dist) +
              "\n"),

    with gzip.open(infilename, 'r') as infile:
        with gzip.open(
                '{}.res{}.significances.txt.gz'.format(outfilename,
                                                       resolution),
                'w') as outfile:
            outfile.write(
                "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
            )

            for line in infile:
                chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
                mid1, mid2, contactCount = int(mid1), int(mid2), int(
                    contactCount)
                distance = mid2 - mid1

                bias1 = 1.0
                bias2 = 1.0
                # assumes there is no bias to begin with
                # if the biasDic is not null sets the real bias values
                if len(biasDic) > 0:
                    if chr1 in biasDic and mid1 in biasDic[chr1]:
                        bias1 = biasDic[chr1][mid1]
                    if chr2 in biasDic and mid2 in biasDic[chr2]:
                        bias2 = biasDic[chr2][mid2]

                if min_dist <= distance <= max_dist:
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(distance, min_x), max_x)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    # scale the prior probability by the product of the two locus biases
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(contactCount - 1,
                                       observedIntraInRangeSum, prior_p)

                    if p_val <= 1:
                        outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            chr1, mid1, chr2, mid2, contactCount, p_val, -1))

    return splineX, newSplineY, residual
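
The antitonic post-processing step in isolation: a minimal sketch, assuming synthetic distance/probability data, showing how IsotonicRegression(increasing=False) flattens any upward wiggles of the spline into a non-increasing curve.

import numpy as np
from scipy.interpolate import UnivariateSpline
from sklearn.isotonic import IsotonicRegression

# synthetic contact probabilities that roughly decay with genomic distance
rng = np.random.default_rng(1)
x = np.linspace(1e4, 1e6, 50)
y = 1.0 / x + rng.normal(0, 1e-6, size=x.shape)

ius = UnivariateSpline(x, y, s=min(y) ** 2)   # the spline may wiggle upward locally
splineY = ius(x)

# force a monotonically non-increasing curve over the same support
ir = IsotonicRegression(increasing=False)
monotonicY = ir.fit_transform(x, splineY)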
Example #50
0
        'severe_wind' : 'wnd_probs_>40_prob_max'
    }

iterator = itertools.product(time_set, target_set,)

for combo in iterator:
    time, target = combo
    print(f'Loading {time} data...')
    fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_matched_to_{target}_0km_dataset.pkl')
    data = io.load_dataframe(fname=fname,
                         target_vars=['matched_to_tornado_0km', 'matched_to_severe_hail_0km','matched_to_severe_wind_0km' ],
                         vars_to_drop=target_vars
                         )
    
    examples = data['examples']
    baseline_probs = baseline_var[target]
    forecast_probabilities = examples[baseline_probs]
    target_values = data[f'matched_to_{target}_0km'] 

    iso_reg = IsotonicRegression(out_of_bounds='clip') 

    iso_reg.fit(forecast_probabilities, target_values)

    save_fname = f'calibration_model_wofs_{time}_{target}_{baseline_probs}.joblib'

    joblib.dump(iso_reg, join(config.ML_MODEL_SAVE_PATH, save_fname))
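
For reference, the fit/apply cycle of such a calibration model in miniature, with hypothetical forecast probabilities and binary outcomes standing in for the real training data:

import numpy as np
from sklearn.isotonic import IsotonicRegression

forecast_probabilities = np.array([0.05, 0.10, 0.20, 0.40, 0.60, 0.80, 0.95])
target_values = np.array([0, 0, 0, 1, 0, 1, 1])

iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(forecast_probabilities, target_values)

# calibrated probabilities for new forecasts, clipped to the training range
print(iso_reg.predict(np.array([0.15, 0.50, 0.99])))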

 


Example #51
0
class Forecaster(nn.Module):
    def __init__(self, args):
        super(Forecaster, self).__init__()
        self.args = args

    def eval_all(self, bx, by):
        br = torch.rand(bx.shape[0], 1, device=bx.device)
        mean, stddev = self.forward(bx=bx, br=br)
        cdf = 0.5 * (1.0 + torch.erf((by - mean) / stddev / math.sqrt(2)))

        loss_cdf = torch.abs(cdf - br).mean()

        eps = 1e-5
        loss_cdf_kl = cdf * (torch.log(cdf + eps) - torch.log(br + eps)) + \
                      (1 - cdf) * (torch.log(1 - cdf + eps) - torch.log(1 - br + eps))
        loss_cdf_kl = loss_cdf_kl.mean()

        loss_stddev = stddev.mean()

        # loss_l2 = ((by - mean) ** 2).mean()

        # Log likelihood of by under the predicted Gaussian distribution
        loss_nll = torch.log(stddev) + math.log(2 * math.pi) / 2.0 + ((
            (by - mean) / stddev)**2 / 2.0)
        loss_nll = loss_nll.mean()

        return cdf, loss_cdf * (
            1 - self.args.klcoeff
        ) + loss_cdf_kl * self.args.klcoeff, loss_stddev, loss_nll

    def eval_in_batch(self, bx, by, batch_size):
        pass

    def recalibrate(self, bx, by):
        with torch.no_grad():
            cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(float)

        cdf = np.sort(cdf)
        lin = np.linspace(0, 1, int(cdf.shape[0]))

        # Insert an extra 0 and 1 to ensure the range is always [0, 1], and trim CDF for numerical stability
        cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6)
        cdf = np.insert(np.insert(cdf, -1, 1), 0, 0)
        lin = np.insert(np.insert(lin, -1, 1), 0, 0)

        self.iso_transform = IsotonicRegression()
        self.iso_transform.fit_transform(cdf, lin)

    def apply_recalibrate(self, cdf):
        if self.iso_transform is not None:
            # If input tensor output tensor
            # If input numpy array output numpy array
            is_torch = False
            if isinstance(cdf, type(torch.zeros(1))):
                device = cdf.get_device()
                cdf = cdf.cpu().numpy()
                is_torch = True

            original_shape = cdf.shape
            new_cdf = np.reshape(self.iso_transform.transform(cdf.flatten()),
                                 original_shape)
            if is_torch:
                new_cdf = torch.from_numpy(new_cdf).to(device)
            return new_cdf
        else:
            return cdf
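
A standalone sketch of the recalibration step above, with hypothetical validation-set CDF values: the sorted CDF values are mapped onto uniform quantiles by an isotonic fit, which is what apply_recalibrate then uses on new predictions.

import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
# hypothetical model CDF values evaluated on a validation set
cdf = np.sort(rng.beta(2.0, 1.0, size=200))
lin = np.linspace(0, 1, cdf.shape[0])

# keep values away from exact 0/1, then pin the mapping's range to [0, 1]
cdf = np.clip(cdf, 1e-6, 1.0 - 1e-6)
cdf = np.insert(np.insert(cdf, -1, 1), 0, 0)
lin = np.insert(np.insert(lin, -1, 1), 0, 0)

iso_transform = IsotonicRegression()
iso_transform.fit(cdf, lin)

# recalibrated CDF values for new predictions
print(iso_transform.predict(np.array([0.1, 0.5, 0.9])))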
Example #52
0
def _smacof_with_anchors_single(config,
                                similarities,
                                metric=True,
                                n_components=2,
                                init=None,
                                max_iter=300,
                                verbose=0,
                                eps=1e-3,
                                random_state=None):
    """
	Computes multidimensional scaling using SMACOF algorithm
	Parameters
	----------
	config : Config object
		configuration object for anchor-tag deployment parameters
	similarities: symmetric ndarray, shape [n * n]
		similarities between the points
	metric: boolean, optional, default: True
		compute metric or nonmetric SMACOF algorithm
	n_components: int, optional, default: 2
		number of dimensions in which to immerse the similarities;
		overwritten if an initial array is provided.
	init: {None or ndarray}, optional
		if None, randomly chooses the initial configuration
		if ndarray, initialize the SMACOF algorithm with this array
	max_iter: int, optional, default: 300
		Maximum number of iterations of the SMACOF algorithm for a single run
	verbose: int, optional, default: 0
		level of verbosity
	eps: float, optional, default: 1e-3
		relative tolerance w.r.t. stress to declare convergence
	random_state: integer or numpy.RandomState, optional
		The generator used to initialize the centers. If an integer is
		given, it fixes the seed. Defaults to the global numpy random
		number generator.
	Returns
	-------
	X: ndarray (n_samples, n_components), float
			   coordinates of the n_samples points in a n_components-space
	stress_: float
		The final value of the stress (sum of squared distance of the
		disparities and the distances for all constrained points)
	n_iter : int
		Number of iterations run
	last_positions: ndarray [X1,...,Xn]
		An array of computed Xs.
	"""
    NO_OF_TAGS, NO_OF_ANCHORS = config.no_of_tags, config.no_of_anchors
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
        # uncomment the following if weight matrix W is not hollow
        #X[:-2] = Xa
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()

    # setup weight matrix
    weights = np.ones((n_samples, n_samples))
    if getattr(config, 'missingdata', None):
        weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = 0

    diag = np.arange(n_samples)
    weights[diag, diag] = 0

    last_n_configs = []
    Xa = config.anchors
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum())

        # Compute stress
        stress = (weights.ravel() *
                  (dis.ravel() - disparities.ravel())**2).sum() / 2
        #stress = ((dis[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel() - disparities[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel()) ** 2).sum()

        # Update X using the Guttman transform
        dis[dis == 0] = 1e5
        ratio = weights * disparities / dis
        B = -ratio
        B[diag, diag] = 0
        B[diag, diag] = -B.sum(axis=1)

        # Apply update to only tag configuration since anchor config is already known

        V = -weights
        V[diag, diag] += weights.sum(axis=1)
        # V_inv = np.linalg.pinv(V)
        V12 = V[-NO_OF_TAGS:, :-NO_OF_TAGS]
        B11 = B[-NO_OF_TAGS:, -NO_OF_TAGS:]
        Zu = X[-NO_OF_TAGS:]
        B12 = B[-NO_OF_TAGS:, :-NO_OF_TAGS]
        V11_inv = np.linalg.inv(V[-NO_OF_TAGS:, -NO_OF_TAGS:])
        Xu = V11_inv.dot(B11.dot(Zu) + (B12 - V12).dot(Xa))

        # merge known anchors config with new tags config
        X = np.concatenate((Xa, Xu))
        last_n_configs.append(X)

        #X = (1/n_samples)*B.dot(X)

        #dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        dis = (weights * dis**2).sum() / 2
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis
    return X, stress, it + 1, np.array(last_n_configs)
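
The non-metric branch in isolation: a reduced sketch, with a hypothetical random configuration and dissimilarity matrix, of how the disparities are obtained by isotonic regression of the current distances on the dissimilarities and then rescaled to the usual SMACOF normalization.

import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.default_rng(0)
n_samples, n_components = 6, 2
similarities = euclidean_distances(rng.random((n_samples, 2)))  # hypothetical dissimilarities
X = rng.random((n_samples, n_components))                       # current configuration

sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
nonzero = sim_flat != 0

dis_flat = euclidean_distances(X).ravel()

# monotone regression of distances on dissimilarities gives the disparities
disparities_flat = IsotonicRegression().fit_transform(sim_flat[nonzero], dis_flat[nonzero])
disparities = dis_flat.copy()
disparities[nonzero] = disparities_flat
disparities = disparities.reshape((n_samples, n_samples))
disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) / (disparities ** 2).sum())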
Example #53
0
linear_regression.fit(visual_vector, conceptual_vector)
predictions = linear_regression.predict(visual_vector)
r2_linear = r2_score(conceptual_vector, predictions)
print("R² linear visual to conceptual:", r2_linear)

# compute least squares regression for R² metric: conceptual to visual
linear_regression = LinearRegression()
linear_regression.fit(conceptual_vector, visual_vector)
predictions = linear_regression.predict(conceptual_vector)
r2_linear = r2_score(visual_vector, predictions)
print("R² linear conceptual to visual:", r2_linear)

# compute isotonic regression for R² metric: visual to conceptual
x = np.reshape(visual_dissimilarities, (-1))
y = np.reshape(conceptual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic visual to conceptual:", r2_isotonic)

# compute isotonic regression for R² metric: conceptual to visual
x = np.reshape(conceptual_dissimilarities, (-1))
y = np.reshape(visual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic conceptual to visual:", r2_isotonic)

if args.plot:
    # create scatter plot if user want us to
    fig, ax = plt.subplots(figsize=(12, 12))
Example #54
0
from sklearn.isotonic import IsotonicRegression
import matplotlib.pyplot as plt

# the reported preference orderings
x = list(range(1, 7))
# the estimated preference orderings according to the additive model (16.1) and
# the metric solution (Table 16.6) in MVA
y = [0.84, 2.84, 3.16, 3.34, 5.66, 5.16]

gp = IsotonicRegression()
y_gp = gp.fit_transform(x, y)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y_gp, c="k")
ax.scatter(x, y, c="r")
for i in range(0, len(y)):
    ax.text(x[i] - 0.05, y_gp[i] + 0.1, "car" + str(i + 1), fontsize=14)
plt.xlabel("revealed rankings", fontsize=14)
plt.ylabel("estimated rankings", fontsize=14)
plt.title("Car rankings", fontsize=16)
plt.show()
Example #55
0
def find_regions(t,
                 x,
                 minimum_datapoints=10,
                 mu_factor=0.7,
                 low_density_factor=0.01):
    """Finds clean regions of gradual growth between jump events.

    Args:
        t (1D numpy.array): clean time points
        x (1D numpy.array): cleaned log-OD series (same shape as `t`)
        minimum_datapoints (int): regions must have at least this many data points
        mu_factor (float): the linear fit is tempered by this factor before isotonic regression
            Low values (0.0..0.5) can result in missing jump events.
            High values (0.8..1.0) can result in oversegmentation, i.e. false jump events
        low_density_factor (float): a gap in the data is declared wherever the spacing
            between consecutive time points exceeds avg_dt / low_density_factor

    Returns:
        numpy.array: list of start indexes for the regions
        numpy.array: list of end indexes (inclusive) for the regions
    """
    # find gaps in the data
    avg_dt = (t[-1] - t[0]) / (len(t) - 1)
    gap_start_indexes = np.where(np.diff(t) > avg_dt / low_density_factor)[0]

    # build initial set of regions from these gaps
    s_raw = [0]
    e_raw = []
    for gap_idx in gap_start_indexes:
        e_raw.append(gap_idx)
        s_raw.append(gap_idx + 1)
    e_raw.append(len(t) - 1)
    regions_to_investigate = list(zip(s_raw, e_raw))

    s = []
    e = []
    while len(regions_to_investigate) > 0:

        # pick a new region
        start_idx, end_idx = regions_to_investigate.pop()

        # check that there are at least a minimum number of datapoints
        if end_idx - start_idx + 1 < minimum_datapoints:
            continue

        # find optimal drift
        t_region = t[start_idx:end_idx + 1]
        x_region = x[start_idx:end_idx + 1]
        mu_min = LinearRegression(fit_intercept=True) \
                 .fit(t_region.reshape([-1, 1]),
                     x_region) \
                 .coef_

        # fit monotonic function
        x_drifting = x_region - t_region * mu_min * mu_factor
        iso_reg = IsotonicRegression(increasing=False) \
                  .fit(t_region, x_drifting)
        x_segmented = iso_reg.predict(t_region)

        # find jumps
        jump_indexes = np.where(np.diff(x_segmented) < 0)[0] + start_idx
        if len(jump_indexes) > 0:
            # if found, add the sub-regions to the list of new regions
            start_indexes = [start_idx]
            end_indexes = []
            for jump_idx in jump_indexes:
                end_indexes.append(jump_idx)
                start_indexes.append(jump_idx + 1)
            end_indexes.append(end_idx)
            for start_idx, end_idx in zip(start_indexes, end_indexes):
                regions_to_investigate.append((start_idx, end_idx))
        else:
            # if no subregions are found, add regions to final set
            s.append(start_idx)
            e.append(end_idx)

    s.sort()
    e.sort()
    return np.array(s), np.array(e)
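
A hedged usage sketch, assuming find_regions and its sklearn imports are in scope: a synthetic log-OD series with one downward jump should be split into two clean regions.

import numpy as np

# synthetic time series: steady growth with a single downward jump at t = 5
t = np.linspace(0, 10, 200)
x = 0.1 * t - 0.5 * (t > 5)

starts, ends = find_regions(t, x, minimum_datapoints=10, mu_factor=0.7)
for s_idx, e_idx in zip(starts, ends):
    print('region from t=%.2f to t=%.2f' % (t[s_idx], t[e_idx]))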
Example #56
0
            # Test
            img_test, truth_mask_test, predicted_mask_test = img, mask, model.predict(
                img)
            test_batch_size = tf.shape(img).numpy()[0]
        else:
            logging.warning("Skipping some data!!!")
            break

    # Flatten the base model predictions and true values
    predicted_mask_train_arr = flatten_tensor(predicted_mask_train)
    truth_mask_train_arr = flatten_tensor(truth_mask_train)

    predicted_mask_test_arr = flatten_tensor(predicted_mask_test)
    truth_mask_test_arr = flatten_tensor(truth_mask_test)

    iso_regression = IsotonicRegression(out_of_bounds='clip')
    iso_regression.fit(predicted_mask_train_arr, truth_mask_train_arr)
    p_calibrated = iso_regression.predict(predicted_mask_test_arr)
    calibration_model_fn = os.path.join(FLAGS.output, 'calibration.weights')
    dump(iso_regression, calibration_model_fn)

    plot_calibration_curve(truth_mask_test_arr,
                           p_calibrated,
                           output=FLAGS.output)

    # Convert 1-d ndarray to tensor of 3 channel
    p_calibrated = np.reshape(
        p_calibrated, [test_batch_size, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNEL])

    metrics = calculate_metrics(truth_mask_test, predicted_mask_test,
                                p_calibrated)
Example #57
0
def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic,outliersline,outliersdist,observedIntraInRangeSum, possibleIntraInRangeCount, possibleInterAllCount, observedIntraAllSum, observedInterAllSum, resolution, passNo):
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n"),
        log.write("------------------------------------------------------------------------------------\n"),
   
    splineX = None
    newSplineY = None
    residual = None 
    FDRx = None
    FDRy = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x,y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1,len(x)):
            if x[i]<=x[i-1]:
                print("ERROR in spline fitting. Distances do not decrease across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)
        
        # maximum residual allowed for spline is set to min(y)^2
        splineError=min(y)*min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX=max(x)
        tempMinX=min(x)
        tempList=sorted([dis for dis in mainDic])
        splineX=[]
        ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
        ### Therefore everything will be within the range where the spline is defined
        for i in tempList:
            if tempMinX<=i<=tempMaxX:
                splineX.append(i)
        splineY=ius(splineX)
        #print(splineY)
        #print(yerr)


        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX,splineY)
        #print(newSplineY)
        residual =sum([i*i for i in (y - ius(x))])

        if visual==True:
            xi = np.linspace(min(x),max(x),5*len(x))
            yi = ius(xi)

            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2,1,1)
            plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2) 
        
            #plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
            #plt.xlabel('Genomic distance (kb)',fontsize='large')
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
            plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2,1,2)

            plt.loglog(splineX,newSplineY,'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename+'.png')
            

    # NOW write the calculated pvalues and corrected pvalues in a file
    infile = gzip.open(infilename, 'rt')
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    discardCount=0
    p_vals=[]
    q_vals=[]
    biasl=[]
    biasr=[]
    for line in infile:
        ch1,mid1,ch2,mid2,contactCount=line.rstrip().split()
        contactCount = float(contactCount)
        interxn=myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
        interxn.setCount(contactCount)
        mid1 = int(mid1); mid2 = int(mid2)
        interactionType = interxn.getType(distLowThres,distUpThres)
        bias1=1.0; bias2=1.0;  # assumes there is no bias to begin with
        # if the biasDic is not null sets the real bias values
        if biasDic:
            if ch1 in biasDic and mid1 in biasDic[ch1]:
                bias1=biasDic[ch1][mid1]
            if ch2 in biasDic and mid2 in biasDic[ch2]:
                bias2=biasDic[ch2][mid2]
        biasl.append(bias1)
        biasr.append(bias2)
        if (bias1<0 or bias2<0) and interactionType !='inter':
            prior_p=1.0
            p_val=1.0
            discardCount+=1
        elif interactionType=='intraInRange' and not interOnly:
            distToLookUp=max(interxn.getDistance(),min(x))
            distToLookUp=min(distToLookUp,max(x))
            i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
            prior_p=newSplineY[i]*(bias1*bias2) 
            p_val=scsp.bdtrc(interxn.getCount()-1,observedIntraInRangeSum,prior_p)
            intraInRangeCount +=1
        elif interactionType =='intraShort' and not interOnly:
            prior_p=1.0
            p_val=1.0
            intraVeryProximalCount += 1
        elif interactionType =='intraLong' and not interOnly:
            prior_p=1.0
            #p_val=scsp.bdtrc(interxn.getCount()-1, observedIntraAllSum,prior_p) ##RUNBY
            p_val=1.0
            intraOutOfRangeCount += 1
        else:
            if allReg or interOnly:
                prior_p=interChrProb*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedInterAllSum,prior_p)
                interCount += 1
            else:
                p_val=1.0
                #p_vals.append(p_val)
        p_vals.append(p_val)
    infile.close()

    outlierThres = 0
    # Do the BH FDR correction
    if allReg:
        outlierThres=1.0/(possibleIntraInRangeCount+possibleInterAllCount)
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0/possibleInterAllCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0/possibleIntraInRangeCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    #now we write the values back to the file
    infile =gzip.open(infilename, 'rt')
    if resolution:
        outfile =gzip.open(outfilename+'.res'+str(resolution)+'.significances.txt.gz', 'wt')
    else:
        outfile =gzip.open(outfilename+'.significances.txt.gz', 'wt')
    print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
    outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
    count=0
    for line in infile:
        words=line.rstrip().split()
        chr1=words[0]
        midPoint1=int(words[1])
        chr2=words[2]
        midPoint2=int(words[3])
        interactionCount=float(words[4])
        p_val=p_vals[count]
        q_val=q_vals[count]
        bias1=biasl[count]
        bias2=biasr[count]
        
        if (allReg or interOnly) and chr1!=chr2:
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
        if (allReg or not interOnly) and chr1==chr2:
            interactionDistance = abs(midPoint1-midPoint2)
            if myUtils.in_range_check(interactionDistance,distLowThres, distUpThres):
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
        
        if p_val<outlierThres:
            outliersline.add(count)
            outliersdist.add(abs(midPoint1-midPoint2))
        count+=1
    outfile.close()
    infile.close()
    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR=0.0
    maxFDR=0.05
    increment=0.001
    FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,outfilename+".qplot")
        
    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n"),
        log.write("\n"),
        log.write("\n"),

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy] # from fit_Spline
Example #58
0
def train_models(orig_vector_prediction_matrix,
                 orig_scalar_prediction_matrix,
                 vector_target_matrix,
                 scalar_target_matrix,
                 separate_by_height=True):
    """Trains isotonic-regression models.

    E = number of examples
    H = number of heights
    T_v = number of vector target variables
    T_s = number of scalar target variables

    :param orig_vector_prediction_matrix: numpy array (E x H x T_v) of predicted
        values for vector target variables.
    :param orig_scalar_prediction_matrix: numpy array (E x T_s) of predicted
        values for scalar target variables.
    :param vector_target_matrix: numpy array (E x H x T_v) of actual values
        for vector target variables.
    :param scalar_target_matrix: numpy array (E x T_s) of actual values for
        scalar target variables.
    :param separate_by_height: Boolean flag.  If True, will train one model for
        each pair of target variable and height.  If False, will train one model
        for each target variable (channel).
    :return: scalar_model_objects: List (length T_s) of models (instances of
        `sklearn.isotonic.IsotonicRegression`) for scalar target variables.
    :return: vector_model_object_matrix: numpy array of models
        (instances of `sklearn.isotonic.IsotonicRegression`) for vector target
        variables.  If `separate_by_height == True`, this array is H x T_v.
        If `separate_by_height == False`, this array is 1 x T_v.
    """

    # Check input args.
    num_examples = None
    num_heights = 0
    num_vector_targets = 0
    num_scalar_targets = 0

    have_vectors = (orig_vector_prediction_matrix is not None
                    or vector_target_matrix is not None)

    if have_vectors:
        error_checking.assert_is_numpy_array(orig_vector_prediction_matrix,
                                             num_dimensions=3)
        error_checking.assert_is_numpy_array_without_nan(
            orig_vector_prediction_matrix)

        error_checking.assert_is_numpy_array(
            vector_target_matrix,
            exact_dimensions=numpy.array(orig_vector_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(vector_target_matrix)

        num_examples = vector_target_matrix.shape[0]
        num_heights = vector_target_matrix.shape[1]
        num_vector_targets = vector_target_matrix.shape[2]

    have_scalars = (orig_scalar_prediction_matrix is not None
                    or scalar_target_matrix is not None)

    if have_scalars:
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             num_dimensions=2)

        if num_examples is None:
            num_examples = orig_scalar_prediction_matrix.shape[0]

        expected_dim = numpy.array(
            [num_examples, orig_scalar_prediction_matrix.shape[1]], dtype=int)
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             exact_dimensions=expected_dim)
        error_checking.assert_is_numpy_array_without_nan(
            orig_scalar_prediction_matrix)

        error_checking.assert_is_numpy_array(
            scalar_target_matrix,
            exact_dimensions=numpy.array(orig_scalar_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(scalar_target_matrix)

        num_scalar_targets = scalar_target_matrix.shape[1]

    error_checking.assert_is_boolean(separate_by_height)

    # Do actual stuff.
    scalar_model_objects = [None] * num_scalar_targets
    num_modeling_heights = num_heights if separate_by_height else 1
    vector_model_object_matrix = numpy.full(
        (num_modeling_heights, num_vector_targets), '', dtype=object)

    for k in range(num_scalar_targets):
        print(
            ('Training isotonic-regression model for {0:d}th of {1:d} scalar '
             'target variables...').format(k + 1, num_scalar_targets))

        scalar_model_objects[k] = IsotonicRegression(increasing=True,
                                                     out_of_bounds='clip')
        scalar_model_objects[k].fit(X=orig_scalar_prediction_matrix[:, k],
                                    y=scalar_target_matrix[:, k])

    if num_scalar_targets > 0:
        print('\n')

    for k in range(num_vector_targets):
        for j in range(num_modeling_heights):
            print((
                'Training isotonic-regression model for {0:d}th of {1:d} vector'
                ' target variables at {2:d}th of {3:d} modeling heights...'
            ).format(k + 1, num_vector_targets, j + 1, num_modeling_heights))

            vector_model_object_matrix[j, k] = IsotonicRegression(
                increasing=True, out_of_bounds='clip')

            if separate_by_height:
                vector_model_object_matrix[j, k].fit(
                    X=orig_vector_prediction_matrix[:, j, k],
                    y=vector_target_matrix[:, j, k])
            else:
                vector_model_object_matrix[j, k].fit(
                    X=numpy.ravel(orig_vector_prediction_matrix[..., k]),
                    y=numpy.ravel(vector_target_matrix[..., k]))

        if k != num_vector_targets - 1:
            print('\n')

    return scalar_model_objects, vector_model_object_matrix
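
A hedged sketch of applying per-target isotonic models of the kind returned above to new predictions; the training data is hypothetical and the column-wise apply loop is illustrative rather than part of the library:

import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
num_scalar_targets = 3

# hypothetical: one trained isotonic model per scalar target variable
scalar_model_objects = []
for k in range(num_scalar_targets):
    preds = rng.random(500)
    targets = preds + rng.normal(0, 0.05, 500)
    model = IsotonicRegression(increasing=True, out_of_bounds='clip')
    model.fit(X=preds, y=targets)
    scalar_model_objects.append(model)

# apply each model to its own column of new predictions
new_predictions = rng.random((10, num_scalar_targets))
calibrated = np.column_stack(
    [scalar_model_objects[k].predict(new_predictions[:, k])
     for k in range(num_scalar_targets)])
print(calibrated.shape)   # (10, 3)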
Example #59
0
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
Example #60
0
    def iso(self, axis=1, unk='No'):
        # performs isotonic regression row-wise (axis = 0) or column-wise (axis = 1)
        tonic = copy.deepcopy(self.array)  # returns a new isotonic matrix

        # either use a value for unknowns or just do isotonic with all present values
        if unk == 0 or unk is None:
            known_dict = self.known_for_iso(axis, unk)
        else:
            known_dict = None

        # known_dict (from known_for_iso()) gives the indices of known values per
        # row/column; when it is None, isotonic regression uses every value
        if axis == 1:
            if known_dict is None:
                for i in range(len(tonic[0])):
                    initial_vals = [tonic[j][i] for j in range(len(tonic))]
                    X = list(range(len(initial_vals)))

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    for row in range(len(predictions)):
                        tonic[row][i] = predictions[row]

            else:
                for i in range(len(tonic[0])):
                    X = known_dict[i]
                    initial_vals = [tonic[j][i] for j in X]

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    for row in range(len(predictions)):
                        tonic[row][i] = predictions[row]

        else:
            if known_dict is None:
                for i in range(len(tonic)):
                    initial_vals = [tonic[i][j] for j in range(len(tonic[0]))]
                    X = list(range(len(initial_vals)))

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic[i])))

                    # put everything back:
                    tonic[i] = predictions

            else:
                for i in range(len(tonic)):
                    X = known_dict[i]
                    initial_vals = [tonic[i][j] for j in X]

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic[i])))

                    # put everything back:
                    tonic[i] = predictions

        newframe = pd.DataFrame(tonic)
        newframe.columns = self.dataframe.columns
        newframe.index = self.dataframe.index
        return mat_opr(newframe)