Example No. 1
def fit(self, A, Y, weights, fit_init=None, refit=False, increasing=True):
    # Fit an isotonic regression model on (A, Y) with the given sample weights.
    model = IsotonicRegression(increasing=increasing, out_of_bounds="clip",
                               y_min=0.0, y_max=1.0)
    model.fit(X=A, y=Y, sample_weight=weights)
    self.model_obj = model
    return 0
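# --- Illustrative sketch (not from the original source): how a wrapper like the
# --- fit() above is typically used; the standalone names below are assumptions.
import numpy as np
from sklearn.isotonic import IsotonicRegression

A = np.array([0.1, 0.35, 0.4, 0.8])
Y = np.array([0.0, 0.0, 1.0, 1.0])
w = np.ones_like(Y)

model = IsotonicRegression(increasing=True, out_of_bounds="clip", y_min=0.0, y_max=1.0)
model.fit(X=A, y=Y, sample_weight=w)
print(model.predict([-1.0, 0.5, 2.0]))  # out-of-range inputs are clipped into [0, 1]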
Example No. 2
import copy

from sklearn.isotonic import IsotonicRegression

def test_isotonic_copy_before_fit():
    # https://github.com/scikit-learn/scikit-learn/issues/6628
    ir = IsotonicRegression()
    copy.copy(ir)
Example No. 3
import numpy as np
from sklearn.isotonic import IsotonicRegression

ir = IsotonicRegression()


class TimeRetrieval:
    """
    !! ONLY deal with small datasets, because the trainning step is slow
    X: data features such as polar angle
    Y: known time stamps 
    """
    def __init__(self, train_x=None, train_y=None):
        self.train_x = train_x  # {idx: feature(polar angle)}
        self.train_y = train_y  # {idx: stamps (mean,std)}
        self.fit_x = None
        self.fit_y = None

    def train(self,
              train_x=None,
              train_y=None,
              train_alg="fitting_1",
              **kwargs):
        """
        use data points to get labels in X axis
        """
        if train_x is None: train_x = self.train_x.copy()
        if train_y is None: train_y = self.train_y.copy()

        return getattr(self, train_alg)(train_x=train_x,
                                        train_y=train_y,
                                        **kwargs)
Example No. 4
    def known_iso(self, axis=1, unknowns=0):
        # performs isotonic regression ONLY for known data values
        # and ONLY on columns where there are non-increasing points
        # row-wise (axis = 0) or column-wise (axis = 1)
        # unknowns should be 0 or None

        tonic = copy.deepcopy(self.array)  # returns a new isotonic matrix
        known_dict = self.known_for_iso(axis, unknowns)
        if axis == 1:
            increase_dict, non_increase_percent = self.is_col_inc()
        else:
            increase_dict = self.is_row_inc()

        # that dict tells us where things aren't increasing (from is_row_inc() or is_col_inc())
        if axis == 1:
            for i in range(len(tonic[0])):
                try:
                    # if i is a key in increase dict then this column needs regression
                    # else just pass to the next column
                    tester = increase_dict[i]

                    X = known_dict[i]

                    if X != []:
                        initial_vals = [tonic[j][i] for j in X]

                        # Use the initial values to fit the model and then predict what the decreasing ones should be
                        iso = IsotonicRegression(out_of_bounds='clip').fit(
                            X, initial_vals)
                        predictions = iso.predict(range(len(tonic)))

                        # put everything back:
                        for row in range(len(predictions)):
                            tonic[row][i] = predictions[row]
                except:
                    pass

        else:
            # same thing but with rows
            for i in range(len(tonic)):
                try:
                    tester = increase_dict[i]
                    X = known_dict[i]

                    if X != []:

                        initial_vals = [tonic[i][j] for j in X]

                        # Use the initial values to fit the model and then predict what the decreasing ones should be
                        iso = IsotonicRegression(out_of_bounds='clip').fit(
                            X, initial_vals)
                        predictions = iso.predict(range(len(tonic[i])))

                        # put everything back:
                        tonic[i] = predictions

                except:
                    pass

        newframe = pd.DataFrame(tonic)
        newframe.columns = self.dataframe.columns
        newframe.index = self.dataframe.index

        if unknowns == 0:
            # Isotonic outputs NaN values, replace them with zeros
            newframe = newframe.fillna(0)

        return mat_opr(newframe)
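# --- Illustrative sketch (not from the original source): the core step of known_iso(),
# --- shown on a single column where the "known" entries are the non-NaN rows.
import numpy as np
from sklearn.isotonic import IsotonicRegression

col = np.array([0.1, np.nan, 0.3, 0.25, np.nan, 0.6])
known_rows = np.flatnonzero(~np.isnan(col))           # row indices with known values

iso = IsotonicRegression(out_of_bounds='clip').fit(known_rows, col[known_rows])
print(iso.predict(np.arange(len(col))))               # monotone column, gaps filled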
Example No. 5
forward = lambda X, thetas: simulate_posterior_predictive(X, thetas, noise=0.5)

# Construct the calibration dataset
predicted_quantiles, empirical_quantiles = make_cal_dataset(
    y[:, np.newaxis], x, coefs, forward)
# -
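# --- make_cal_dataset() is not shown in this example. A hypothetical sketch of what
# --- such a helper typically computes (Kuleshov-style calibration pairs) follows;
# --- every name and shape here is an assumption, not the original implementation.
import numpy as np

def make_cal_dataset_sketch(y, x, thetas, forward):
    samples = forward(x, thetas)                        # assumed shape: (n_points, n_draws)
    predicted = (samples <= y).mean(axis=1)             # predicted CDF evaluated at each y_i
    empirical = np.array([(predicted <= p).mean() for p in predicted])
    return predicted, empirical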

plt.scatter(predicted_quantiles, empirical_quantiles)
plt.plot([0, 1], [0, 1], color='tab:grey', linestyle='--')
plt.xlabel('Predicted Cumulative Distribution')
plt.ylabel('Empirical Cumulative Distribution')
plt.title('Calibration Dataset')

# +
# Train isotonic regression in reverse mode
ir = IsotonicRegression(out_of_bounds='clip')
ir.fit(empirical_quantiles, predicted_quantiles)

# Find the values of calibrated quantiles
calibrated_quantiles = ir.predict([0.025, 0.5, 0.975])

# +
# Plot the posterior predictive
low, mid, high = np.percentile(posterior_predictive, [2.5, 50, 97.5], axis=1)
plt.fill_between(x_test, low, high, alpha=0.2, label='95% Predictive Interval')
plt.plot(x_test, mid, color='tab:red', label='Predicted Median')

low, mid, high = np.quantile(posterior_predictive,
                             calibrated_quantiles,
                             axis=1)
plt.fill_between(x_test, low, high, alpha=0.2, label='95% Calibrated Interval')
Example No. 6
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n, )) + 50.0 * np.log1p(np.arange(n))

# %%
# Fit IsotonicRegression and LinearRegression models:

ir = IsotonicRegression(out_of_bounds="clip")
y_ = ir.fit_transform(x, y)

lr = LinearRegression()
lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

# %%
# Plot results:

segments = [[[i, y[i]], [i, y_[i]]] for i in range(n)]
lc = LineCollection(segments, zorder=0)
lc.set_array(np.ones(len(y)))
lc.set_linewidths(np.full(n, 0.5))

fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 6))
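# --- The snippet above stops right after the figure is created. A minimal continuation,
# --- purely as a sketch of how the two fits might be drawn (styling choices are assumptions):
ax0.plot(x, y, "C0.", markersize=8, label="data")
ax0.plot(x, y_, "C1.-", markersize=8, label="isotonic fit")
ax0.plot(x, lr.predict(x[:, np.newaxis]), "C2-", label="linear fit")
ax0.add_collection(lc)                 # segments joining each point to its isotonic value
ax0.legend(loc="lower right")
ax0.set_title("Isotonic vs. linear regression")

ax1.plot(x, ir.predict(x), "C1-")      # the fitted monotone function on the training grid
ax1.set_title("Prediction function")
plt.show()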
Example No. 7
def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic,outliersline,outliersdist,observedIntraInRangeSum, possibleIntraInRangeCount, possibleInterAllCount, observedIntraAllSum, observedInterAllSum, resolution, passNo):
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n"),
        log.write("------------------------------------------------------------------------------------\n"),
   
    splineX = None
    newSplineY = None
    residual = None 
    FDRx = None
    FDRy = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x,y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1,len(x)):
            if x[i]<=x[i-1]:
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)
        
        # maximum residual allowed for spline is set to min(y)^2
        splineError=min(y)*min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX=max(x)
        tempMinX=min(x)
        tempList=sorted([dis for dis in mainDic])
        splineX=[]
        ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
        ### Therefore everything will be within the range where the spline is defined
        for i in tempList:
            if tempMinX<=i<=tempMaxX:
                splineX.append(i)
        splineY=ius(splineX)
        #print(splineY)
        #print(yerr)


        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX,splineY)
        #print(newSplineY)
        residual =sum([i*i for i in (y - ius(x))])

        if visual==True:
            xi = np.linspace(min(x),max(x),5*len(x))
            yi = ius(xi)

            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2,1,1)
            plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2) 
        
            #plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
            #plt.xlabel('Genomic distance (kb)',fontsize='large')
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
            plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2,1,2)

            plt.loglog(splineX,newSplineY,'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename+'.png')
            

    # NOW write the calculated pvalues and corrected pvalues in a file
    infile = gzip.open(infilename, 'rt')
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    discardCount=0
    p_vals=[]
    q_vals=[]
    biasl=[]
    biasr=[]
    for line in infile:
        ch1,mid1,ch2,mid2,contactCount=line.rstrip().split()
        contactCount = float(contactCount)
        interxn=myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
        interxn.setCount(contactCount)
        mid1 = int(mid1); mid2 = int(mid2)
        interactionType = interxn.getType(distLowThres,distUpThres)
        bias1=1.0; bias2=1.0;  # assumes there is no bias to begin with
        # if the biasDic is not null sets the real bias values
        if biasDic:
            if ch1 in biasDic and mid1 in biasDic[ch1]:
                bias1=biasDic[ch1][mid1]
            if ch2 in biasDic and mid2 in biasDic[ch2]:
                bias2=biasDic[ch2][mid2]
        biasl.append(bias1)
        biasr.append(bias2)
        if (bias1<0 or bias2<0) and interactionType !='inter':
            prior_p=1.0
            p_val=1.0
            discardCount+=1
        elif interactionType=='intraInRange' and not interOnly:
            distToLookUp=max(interxn.getDistance(),min(x))
            distToLookUp=min(distToLookUp,max(x))
            i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
            prior_p=newSplineY[i]*(bias1*bias2) 
            p_val=scsp.bdtrc(interxn.getCount()-1,observedIntraInRangeSum,prior_p)
            intraInRangeCount +=1
        elif interactionType =='intraShort' and not interOnly:
            prior_p=1.0
            p_val=1.0
            intraVeryProximalCount += 1
        elif interactionType =='intraLong' and not interOnly:
            prior_p=1.0
            #p_val=scsp.bdtrc(interxn.getCount()-1, observedIntraAllSum,prior_p) ##RUNBY
            p_val=1.0
            intraOutOfRangeCount += 1
        else:
            if allReg or interOnly:
                prior_p=interChrProb*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedInterAllSum,prior_p)
                interCount += 1
            else:
                p_val=1.0
                #p_vals.append(p_val)
        p_vals.append(p_val)
    infile.close()

    outlierThres = 0
    # Do the BH FDR correction
    if allReg:
        outlierThres=1.0/(possibleIntraInRangeCount+possibleInterAllCount)
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0/possibleInterAllCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0/possibleIntraInRangeCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    #now we write the values back to the file
    infile =gzip.open(infilename, 'rt')
    if resolution:
        outfile =gzip.open(outfilename+'.res'+str(resolution)+'.significances.txt.gz', 'wt')
    else:
        outfile =gzip.open(outfilename+'.significances.txt.gz', 'wt')
    print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
    outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
    count=0
    for line in infile:
        words=line.rstrip().split()
        chr1=words[0]
        midPoint1=int(words[1])
        chr2=words[2]
        midPoint2=int(words[3])
        interactionCount=float(words[4])
        p_val=p_vals[count]
        q_val=q_vals[count]
        bias1=biasl[count]
        bias2=biasr[count]
        
        if (allReg or interOnly) and chr1!=chr2:
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
        if (allReg or not interOnly) and chr1==chr2:
            interactionDistance = abs(midPoint1-midPoint2)
            if myUtils.in_range_check(interactionDistance,distLowThres, distUpThres):
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
        
        if p_val<outlierThres:
            outliersline.add(count)
            outliersdist.add(abs(midPoint1-midPoint2))
        count+=1
    outfile.close()
    infile.close()
    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR=0.0
    maxFDR=0.05
    increment=0.001
    FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,outfilename+".qplot")
        
    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n"),
        log.write("\n"),
        log.write("\n"),

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy] # from fit_Spline
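# --- Illustrative sketch (not from the original source): the isotonic step inside
# --- fit_Spline(), i.e. forcing the spline estimate to be non-increasing with distance.
import numpy as np
from scipy.interpolate import UnivariateSpline
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
dist = np.linspace(1, 100, 50)                        # genomic distance (arbitrary units)
prob = 1.0 / dist + rng.normal(0, 0.005, dist.size)   # noisy, roughly decreasing signal

spline = UnivariateSpline(dist, prob, s=min(prob) ** 2)
splineY = spline(dist)

iso = IsotonicRegression(increasing=False)            # probability must not rise with distance
newSplineY = iso.fit_transform(dist, splineY)
print(newSplineY[:5])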
Example No. 8
if "Auto" in datasets:
	build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
	build_auto(TransformedTargetRegressor(LinearRegression(), func = numpy.log, inverse_func = numpy.exp), "TransformedLinearRegressionAuto")

def build_auto_isotonic(regressor, auto_isotonic_X, name):
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
	pipeline.fit(auto_isotonic_X, auto_y)
	pipeline.verify(auto_isotonic_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_isotonic_X), columns = ["mpg"])
	store_csv(mpg, name)

if "Auto" in datasets:
	build_auto_isotonic(IsotonicRegression(increasing = True, out_of_bounds = "nan"), auto_X["acceleration"], "IsotonicRegressionIncrAuto")
	build_auto_isotonic(IsotonicRegression(increasing = False, y_min = 12, y_max = 36, out_of_bounds = "clip"), auto_X["weight"], "IsotonicRegressionDecrAuto")

auto_train_mask = numpy.random.choice([False, True], size = (392,), p = [0.5, 0.5])
auto_test_mask = ~auto_train_mask

def build_auto_opt(regressor, name, fit_params = {}, **pmml_options):
	pipeline = PMMLPipeline([
		("regressor", regressor)
	])
	pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13))
	store_pkl(pipeline, name)
Example No. 9
def run_train_all_sklearn(file, fp_name, cv=5, verbose=0, seed=1):

    np.random.seed(seed)
    c = defaultdict(list)

    for k in ProgIter([
            'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
            'css_ri', 'name'
    ],
                      verbose=verbose,
                      total=5):
        v = file[k]

        if k != 'name':
            temp = dict(
            )  # for results storage. Assuming that "name" comes last

            if 'drug_row_col' in v.columns:
                v.drop(columns=['drug_row_col'], inplace=True)

            cat_cols = ['cell_line_name']
            categories = [
                v[column].unique() for column in v[cat_cols]
            ]  # manually find all available categories for one-hot

            # pipelines
            encode = Pipeline(steps=[('one-hot-encode',
                                      OneHotEncoder(categories=categories))])
            processor = ColumnTransformer(transformers=[
                ('cat_encoding', encode, cat_cols), ('dropping', 'drop', [k])
            ],
                                          remainder='passthrough')

            catbst = ColumnTransformer(transformers=[('dropping', 'drop', [k])
                                                     ],
                                       remainder='passthrough')

            # regressions
            lr = make_pipeline(processor, linear_model.LinearRegression())
            ridge = make_pipeline(processor, linear_model.Ridge())
            lasso = make_pipeline(processor, linear_model.Lasso())
            elastic = make_pipeline(processor, linear_model.ElasticNet())
            lassolars = make_pipeline(processor, linear_model.LassoLars())
            b_ridge = make_pipeline(processor, linear_model.BayesianRidge())
            kernel = DotProduct() + WhiteKernel()
            gpr = make_pipeline(processor,
                                GaussianProcessRegressor(kernel=kernel))
            linSVR = make_pipeline(processor, LinearSVR())
            hist_gbr = make_pipeline(
                processor,
                HistGradientBoostingRegressor(warm_start=True, max_depth=6))
            rfr = make_pipeline(
                processor,
                RandomForestRegressor(warm_start=True, max_depth=6, n_jobs=3))
            # Note: IsotonicRegression accepts a single feature only, so this
            # pipeline assumes the processed matrix reduces to one column.
            iso = make_pipeline(processor,
                                IsotonicRegression(increasing='auto'))
            xgb = make_pipeline(
                processor, XGBRegressor(tree_method='gpu_hist', max_depth=6))
            cbt = make_pipeline(
                catbst,
                CatBoostRegressor(task_type='GPU',
                                  depth=6,
                                  cat_features=np.array([0]),
                                  verbose=False))

            mls = [
                cbt, rfr, gpr, hist_gbr, lr, ridge, lasso, elastic, lassolars,
                b_ridge, gpr, linSVR, iso
            ]
            mls_names = [
                "cbt", "rfr", "gpr", "hist_gbr", "lr", "ridge", "lasso",
                "elastic", "lassolars", "b_ridge", "gpr", "linSVR", "iso"
            ]

            # results
            start = time.time()
            for MODEL, name in zip(mls, mls_names):
                print(f'\n{name}')
                if 'cbt' == name:
                    n_jobs = 1
                else:
                    n_jobs = cv
                cv_dict = cross_validate(
                    MODEL,
                    v,
                    v[k],
                    cv=cv,
                    scoring={
                        "pearsonr": pearson,
                        "rmse": rmse
                    },
                    return_train_score=False,
                    verbose=verbose,
                    n_jobs=n_jobs,
                )
                temp[name] = {
                    'test_pearsonr': np.nanmean(cv_dict['test_pearsonr']),
                    'test_rmse': abs(np.nanmean(cv_dict['test_rmse']))
                }
                print(temp[name])
            print(f'{k} took {int(time.time()-start)/60} mins')

            c[k] = temp
        else:
            nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_13models_5foldCV_{time.ctime()}.pickle'
            with open(nm, 'wb') as file:
                pickle.dump(c, file)
            print(f'saving complete to {nm}')
    return c
Example No. 10
 def __init__(self):
     self.ir = IsotonicRegression(out_of_bounds="clip")
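# --- Only the constructor is shown above. A minimal sketch of how such a wrapper is
# --- usually completed; the class and method names below are assumptions.
import numpy as np
from sklearn.isotonic import IsotonicRegression

class IsotonicCalibratorSketch:
    def __init__(self):
        self.ir = IsotonicRegression(out_of_bounds="clip")

    def fit(self, scores, labels):
        self.ir.fit(scores, labels)
        return self

    def predict(self, scores):
        return self.ir.predict(scores)

cal = IsotonicCalibratorSketch().fit(np.array([0.1, 0.4, 0.6, 0.9]), np.array([0, 0, 1, 1]))
print(cal.predict([0.05, 0.5, 0.95]))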
Example No. 11
# ===============================================

SGDClf = linear_model.SGDClassifier(loss='modified_huber',penalty='l1')

LogicReg = linear_model.LogisticRegression(penalty='l1', C=1.0, n_jobs=4)

RidgeReg = linear_model.Ridge(alpha=1.0)

KernelRidge = KernelRidge(alpha=1.0, kernel="linear", gamma=None)

RANSACReg = linear_model.RANSACRegressor(linear_model.LinearRegression())

BayesReg = linear_model.BayesianRidge(n_iter=300,alpha_1=1.e-6,alpha_2=1.e-6,
                                      lambda_1=1.e-6, lambda_2=1.e-6)

IsotonicReg = IsotonicRegression(y_min=None, y_max=None, increasing=True,
                                 out_of_bounds='nan')

linear_SVC = svm.SVC(C=1.0, kernel='linear', decision_function_shape='ovr')

def createCLF(model):
    model_str = str(model)
    paras = model.get_params()
    if 'SVC' in model_str:
        return svm.SVC(C=paras['C'], kernel='linear', decision_function_shape='ovr')
    if 'Logistic' in model_str:
        C = paras['C']
        penalty = paras['penalty']
        return linear_model.LogisticRegression(C=C, penalty=penalty)


def lseErr(X, y, leafType):
Example No. 12
def calibration_comparison(base_estimator,
                           n_samples,
                           weights=None,
                           n_bins=10,
                           detail=False):

    X, y = make_classification(n_samples=3 * n_samples,
                               n_features=6,
                               random_state=42,
                               weights=weights)
    base_estimator_dict = {
        "MultinomialNB": MultinomialNB(),
        "GaussianNB": GaussianNB(),
        "SVC": LinearSVC()
    }

    if (base_estimator == "MultinomialNB"):
        X -= X.min()
    # Train data: train binary model.
    X_train, y_train = X[:n_samples], y[:n_samples]
    print("Positive Rate: {x}".format(x=y_train.mean()))
    # calibrate data.
    X_calib, y_calib = X[n_samples:2 * n_samples], y[n_samples:2 * n_samples]
    # test data.
    X_test, y_test = X[2 * n_samples:], y[2 * n_samples:]
    # train the base estimator
    clf = base_estimator_dict[base_estimator].fit(X_train, y_train)

    if (base_estimator == "SVC"):
        # y_calib_score: training in the calibration model.
        y_calib_score = clf.decision_function(X_calib)
        y_calib_score = (y_calib_score - y_calib_score.min()) /\
                        (y_calib_score.max() - y_calib_score.min())
        # y_test_score: evaluation in the calibration model.
        y_test_score = clf.decision_function(X_test)
        y_test_score = (y_test_score - y_test_score.min()) /\
                       (y_test_score.max() - y_test_score.min())
    else:
        # y_calib_score: training in the calibration model.
        y_calib_score = clf.predict_proba(X_calib)
        y_calib_score = np.array([score[1] for score in y_calib_score])

        # y_test_score: evaluation in the calibration model.
        y_test_score = clf.predict_proba(X_test)
        y_test_score = np.array([score[1] for score in y_test_score])

    calibrate_model_dict = {
        "mimic": _MimicCalibration(threshold_pos=5, record_history=False),
        "isotonic": IsotonicRegression(y_min=0.0,
                                       y_max=1.0,
                                       out_of_bounds='clip'),
        # "platt": LogisticRegression()
    }

    result = {}
    result[base_estimator] = {}
    for cal_name, cal_object in calibrate_model_dict.items():
        # import pdb; pdb.set_trace()
        print(cal_name)
        cal_object.fit(copy(y_calib_score), copy(y_calib))
        if cal_name in ["mimic", "isotonic"]:
            y_output_score = cal_object.predict(copy(y_test_score))
        else:
            raise "Please specify probability prediction function."

        frac_pos, predicted_value = calibration_curve(y_test,
                                                      y_output_score,
                                                      n_bins=n_bins)
        b_score = brier_score_loss(y_test, y_output_score, pos_label=1)
        # precsion = precision_score(y_test, y_output_score)
        # recall = recall_score(y_test, y_output_score)
        # f1 = f1_score(y_test, y_output_score)

        result[base_estimator][cal_name] = {
            "calibration_curve": [frac_pos, predicted_value],
            # "eval_score" : [b_score, precsion, recall, f1]
            "eval_score": [b_score]
        }

        if (detail):
            result[base_estimator][cal_name]["detail"] = {
                "y_test": y_test,
                "y_test_calibrate_score": y_output_score
            }

    return result
Example No. 13
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train,
                                                y_train,
                                                test_size=0.3,
                                                random_state=42)
bins = 15

### fit naive bayes classifier
clf = GaussianNB()
clf.fit(X_train, y_train)
clf_proba = clf.predict_proba(X_cv)[:, 1]

### fit isotonic regression
isotonic_regression = IsotonicRegression(y_min=0, y_max=1, increasing=True)
isotonic_regression.fit(clf_proba, y_cv)

### predict probabilities
uncalibrated_prob = clf.predict_proba(X_test)[:, 1]
calibrated_prob = isotonic_regression.predict(uncalibrated_prob)

### compute score
logloss = log_loss(y_test, uncalibrated_prob)
roc = roc_auc_score(y_test, uncalibrated_prob)
calibrated_loss = log_loss(y_test, calibrated_prob)
calibrated_roc = roc_auc_score(y_test, calibrated_prob)

### plot calibration curve
plt.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
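# --- Continuation sketch (not from the original source): add the uncalibrated and
# --- calibrated reliability curves next to the reference diagonal drawn above.
from sklearn.calibration import calibration_curve

frac_raw, mean_raw = calibration_curve(y_test, uncalibrated_prob, n_bins=bins)
frac_cal, mean_cal = calibration_curve(y_test, calibrated_prob, n_bins=bins)

plt.plot(mean_raw, frac_raw, 's-', label='GaussianNB (log loss=%.3f)' % logloss)
plt.plot(mean_cal, frac_cal, 'o-', label='+ isotonic (log loss=%.3f)' % calibrated_loss)
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.legend(loc='upper left')
plt.show()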
Example No. 14
def _smacof_single_p(similarities,
                     n_uq,
                     metric=True,
                     n_components=2,
                     init=None,
                     max_iter=300,
                     verbose=0,
                     eps=1e-3,
                     random_state=None):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    similarities: symmetric ndarray, shape [n * n]
        similarities between the points

    metric: boolean, optional, default: True
        compute metric or nonmetric SMACOF algorithm

    n_components: int, optional, default: 2
        number of dimension in which to immerse the similarities
        overwritten if initial array is provided.

    init: {None or ndarray}, optional
        if None, randomly chooses the initial configuration
        if ndarray, initialize the SMACOF algorithm with this array

    max_iter: int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run

    verbose: int, optional, default: 0
        level of verbosity

    eps: float, optional, default: 1e-3
        relative tolerance w.r.t stress to declare convergence

    random_state: integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X: ndarray (n_samples, n_components), float
               coordinates of the n_samples points in a n_components-space

    stress_: float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points)

    n_iter : int
        Number of iterations run.

    """
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    # ipdb.set_trace()

    W = np.ones((n_samples, n_samples))
    W[:n_uq, :n_uq] = 0.0
    W[n_uq:, n_uq:] = 0.0
    # W[np.arange(len(W)), np.arange(len(W))] = 0.0

    V = -W
    V[np.arange(len(V)), np.arange(len(V))] = W.sum(axis=1)
    e = np.ones((n_samples, 1))

    Vp = np.linalg.inv(V +
                       np.dot(e, e.T) / n_samples) - np.dot(e, e.T) / n_samples
    # Vp = np.linalg.pinv(V)

    # sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat = similarities.ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            # dis_flat = dis.ravel()
            # # similarities with 0 are considered as missing values
            # dis_flat_w = dis_flat[sim_flat != 0]

            # # Compute the disparities using a monotonic regression
            # disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            # disparities = dis_flat.copy()
            # disparities[sim_flat != 0] = disparities_flat
            # disparities = disparities.reshape((n_samples, n_samples))
            # disparities *= np.sqrt((n_samples * (n_samples - 1) / 2) /
            #                        (disparities ** 2).sum())

            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum())
            disparities[similarities == 0] = 0

        # Compute stress
        # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        _stress = (W.ravel() *
                   ((dis.ravel() - disparities.ravel())**2)).sum() / 2

        # Update X using the Guttman transform
        # dis[dis == 0] = 1e-5
        # ratio = disparities / dis
        # B = - ratio
        # B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        # X = 1. / n_samples * np.dot(B, X)
        # print (1. / n_samples * np.dot(B, X))[:5].T

        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        _B = -W * ratio
        _B[np.arange(len(_B)), np.arange(len(_B))] += (W * ratio).sum(axis=1)

        X = np.dot(Vp, np.dot(_B, X))
        # print X[:5].T

        dis = np.sqrt((X**2).sum(axis=1)).sum()

        if verbose >= 2:
            print('it: %d, stress %s' % (it, _stress))
        if old_stress is not None:
            if (old_stress - _stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, _stress))
                break
        old_stress = _stress / dis

    return X, _stress, it + 1
Example No. 15
def _smacof_with_anchors_single(config,
                                similarities,
                                metric=True,
                                n_components=2,
                                init=None,
                                max_iter=300,
                                verbose=0,
                                eps=1e-3,
                                random_state=None):
    """
	Computes multidimensional scaling using SMACOF algorithm
	Parameters
	----------
	config : Config object
		configuration object for anchor-tag deployment parameters
	similarities: symmetric ndarray, shape [n * n]
		similarities between the points
	metric: boolean, optional, default: True
		compute metric or nonmetric SMACOF algorithm
	n_components: int, optional, default: 2
		number of dimension in which to immerse the similarities
		overwritten if initial array is provided.
	init: {None or ndarray}, optional
		if None, randomly chooses the initial configuration
		if ndarray, initialize the SMACOF algorithm with this array
	max_iter: int, optional, default: 300
		Maximum number of iterations of the SMACOF algorithm for a single run
	verbose: int, optional, default: 0
		level of verbosity
	eps: float, optional, default: 1e-3
		relative tolerance w.r.t stress to declare convergence
	random_state: integer or numpy.RandomState, optional
		The generator used to initialize the centers. If an integer is
		given, it fixes the seed. Defaults to the global numpy random
		number generator.
	Returns
	-------
	X: ndarray (n_samples, n_components), float
			   coordinates of the n_samples points in a n_components-space
	stress_: float
		The final value of the stress (sum of squared distance of the
		disparities and the distances for all constrained points)
	n_iter : int
		Number of iterations run
	last_positions: ndarray [X1,...,Xn]
		An array of computed Xs.
	"""
    NO_OF_TAGS, NO_OF_ANCHORS = config.no_of_tags, config.no_of_anchors
    similarities = check_symmetric(similarities, raise_exception=True)

    n_samples = similarities.shape[0]
    random_state = check_random_state(random_state)

    sim_flat = ((1 - np.tri(n_samples)) * similarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]

    if init is None:
        # Randomly choose initial configuration
        X = random_state.rand(n_samples * n_components)
        X = X.reshape((n_samples, n_components))
        # uncomment the following if weight matrix W is not hollow
        #X[:-2] = Xa
    else:
        # overrides the parameter p
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError("init matrix should be of shape (%d, %d)" %
                             (n_samples, n_components))
        X = init

    old_stress = None
    ir = IsotonicRegression()

    # setup weight matrix
    weights = np.ones((n_samples, n_samples))
    if getattr(config, 'missingdata', None):
        weights[-NO_OF_TAGS:, -NO_OF_TAGS:] = 0

    diag = np.arange(n_samples)
    weights[diag, diag] = 0

    last_n_configs = []
    Xa = config.anchors
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            disparities = similarities
        else:
            dis_flat = dis.ravel()
            # similarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum())

        # Compute stress
        stress = (weights.ravel() *
                  (dis.ravel() - disparities.ravel())**2).sum() / 2
        #stress = ((dis[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel() - disparities[:-NO_OF_TAGS, -NO_OF_TAGS:].ravel()) ** 2).sum()

        # Update X using the Guttman transform
        dis[dis == 0] = 1e5
        ratio = weights * disparities / dis
        B = -ratio
        B[diag, diag] = 0
        B[diag, diag] = -B.sum(axis=1)

        # Apply update to only tag configuration since anchor config is already known

        V = -weights
        V[diag, diag] += weights.sum(axis=1)
        # V_inv = np.linalg.pinv(V)
        V12 = V[-NO_OF_TAGS:, :-NO_OF_TAGS]
        B11 = B[-NO_OF_TAGS:, -NO_OF_TAGS:]
        Zu = X[-NO_OF_TAGS:]
        B12 = B[-NO_OF_TAGS:, :-NO_OF_TAGS]
        V11_inv = np.linalg.inv(V[-NO_OF_TAGS:, -NO_OF_TAGS:])
        Xu = V11_inv.dot(B11.dot(Zu) + (B12 - V12).dot(Xa))

        # merge known anchors config with new tags config
        X = np.concatenate((Xa, Xu))
        last_n_configs.append(X)

        #X = (1/n_samples)*B.dot(X)

        #dis = np.sqrt((X ** 2).sum(axis=1)).sum()
        dis = (weights * dis**2).sum() / 2
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis
    return X, stress, it + 1, np.array(last_n_configs)
Example No. 16
linear_regression.fit(visual_vector, conceptual_vector)
predictions = linear_regression.predict(visual_vector)
r2_linear = r2_score(conceptual_vector, predictions)
print("R² linear visual to conceptual:", r2_linear)

# compute least squares regression for R² metric: conceptual to visual
linear_regression = LinearRegression()
linear_regression.fit(conceptual_vector, visual_vector)
predictions = linear_regression.predict(conceptual_vector)
r2_linear = r2_score(visual_vector, predictions)
print("R² linear conceptual to visual:", r2_linear)

# compute isotonic regression for R² metric: visual to conceptual
x = np.reshape(visual_dissimilarities, (-1))
y = np.reshape(conceptual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic visual to conceptual:", r2_isotonic)

# compute isotonic regression for R² metric: conceptual to visual
x = np.reshape(conceptual_dissimilarities, (-1))
y = np.reshape(visual_dissimilarities, (-1))
isotonic_regression = IsotonicRegression()
predictions = isotonic_regression.fit_transform(x, y)
r2_isotonic = r2_score(y, predictions)
print("R² isotonic conceptual to visual:", r2_isotonic)

if args.plot:
    # create scatter plot if user want us to
    fig, ax = plt.subplots(figsize=(12, 12))
Example No. 17

test_cases = [
    (VotingClassifier([('logistic', LogisticRegression()),
                       ('earth',
                        Pipeline([('earth', Earth()),
                                  ('logistic', LogisticRegression())]))],
                      'hard',
                      weights=[1.01, 1.01]), ['predict'],
     create_weird_classification_problem_1()),
    (GradientBoostingClassifier(max_depth=10,
                                n_estimators=10), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (LogisticRegression(), ['predict_proba', 'predict'],
     create_weird_classification_problem_1()),
    (IsotonicRegression(out_of_bounds='clip'), ['predict'],
     create_isotonic_regression_problem_1()),
    (Earth(), ['predict', 'transform'], create_regression_problem_1()),
    (Earth(allow_missing=True), ['predict', 'transform'],
     create_regression_problem_with_missingness_1()),
    (ElasticNet(), ['predict'], create_regression_problem_1()),
    (ElasticNetCV(), ['predict'], create_regression_problem_1()),
    (LassoCV(), ['predict'], create_regression_problem_1()),
    (Ridge(), ['predict'], create_regression_problem_1()),
    (RidgeCV(), ['predict'], create_regression_problem_1()),
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={
Example No. 18
 def __init__(self):
     self.clf = IsotonicRegression(y_min=0.0,
                                   y_max=1.0,
                                   out_of_bounds='clip')
Example No. 19
def _smacof_single(dissimilarities1,
                   dissimilarities2,
                   p,
                   weights1=None,
                   weights2=None,
                   metric=True,
                   n_components=2,
                   init1=None,
                   init2=None,
                   max_iter=300,
                   verbose=0,
                   eps=1e-3,
                   random_state1=None,
                   random_state2=None):
    """
    Computes multidimensional scaling using SMACOF algorithm

    Parameters
    ----------
    dissimilarities : ndarray, shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : boolean, optional, default: True
        Compute metric or nonmetric SMACOF algorithm.

    n_components : int, optional, default: 2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray, shape (n_samples, n_components), optional, default: None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, optional, default: 0
        Level of verbosity.

    eps : float, optional, default: 1e-3
        Relative tolerance with respect to stress at which to declare
        convergence.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    X : ndarray, shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).

    n_iter : int
        The number of iterations corresponding to the best stress.
    """
    dissimilarities1 = check_symmetric(dissimilarities1, raise_exception=True)
    dissimilarities2 = check_symmetric(dissimilarities2, raise_exception=True)

    if dissimilarities1.shape != dissimilarities2.shape:
        print("Error. Distance matrices have different shapes.")
        sys.exit("Error. Distance matrices have different shapes.")

    n_samples = dissimilarities1.shape[0]

    X1, sim_flat1, sim_flat_w1 = initialize(dissimilarities1, random_state1,
                                            init1, n_samples, n_components)
    X2, sim_flat2, sim_flat_w2 = initialize(dissimilarities2, random_state2,
                                            init2, n_samples, n_components)

    #Default: equal weights
    if weights1 is None:
        weights1 = np.ones((n_samples, n_samples))
    if weights2 is None:
        weights2 = np.ones(n_samples)

    # Disparity-specific weights (V in Borg)
    V1 = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        diagonal = 0
        for j in range(n_samples):
            V1[i, j] = -weights1[i, j]
            diagonal += weights1[i, j]
        V1[i, i] = diagonal

    # Locus-specific weights
    V2 = np.zeros((n_samples, n_samples))
    for i, weight in enumerate(weights2):
        V2[i, i] = weight * p * n_samples

    inv_V = moore_penrose(V1 + V2)

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis1 = euclidean_distances(X1)
        dis2 = euclidean_distances(X2)

        if metric:
            disparities1 = dissimilarities1
            disparities2 = dissimilarities2
        else:
            disparities1 = nonmetric_disparities1(dis1, sim_flat1, n_samples)
            disparities2 = nonmetric_disparities2(dis2, sim_flat2, n_samples)

        # Compute stress
        stress = ((dis1.ravel() - disparities1.ravel())**2).sum() + (
            (dis2.ravel() - disparities2.ravel())**2
        ).sum() + n_samples * p * ssd(
            X1, X2
        )  #multiply by n_samples to make ssd term comparable in magnitude to embedding error terms

        # Update X1 using the Guttman transform
        X1 = guttman(X1, X2, disparities1, inv_V, V2, dis1)

        # Update X2 using the Guttman transform
        X2 = guttman(X2, X1, disparities2, inv_V, V2, dis2)

        # Test stress
        dis1 = np.sqrt((X1**2).sum(axis=1)).sum()
        dis2 = np.sqrt((X2**2).sum(axis=1)).sum()
        dis = np.mean((dis1, dis2))
        if verbose >= 2:
            print('it: %d, stress %s' % (it, stress))
        if old_stress is not None:
            if np.abs(old_stress - stress / dis) < eps:
                if verbose:
                    print('breaking at iteration %d with stress %s' %
                          (it, stress))
                break
        old_stress = stress / dis

    return X1, X2, stress, it + 1
Example No. 20
def interpolation_estimate(Z,
                           Z_constraint,
                           lower=0.5,
                           upper=4,
                           npts=30,
                           ndraw=5000,
                           burnin=1000,
                           estimator='truncated'):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraint`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $\|Z\|^2_2$ 
    by Monte Carlo for a range of `npts` values starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant) which is interpolated and solved
    for an argument such that the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    ndraw : int
        Number of Gibbs steps to use for estimating
        each expectation.

    burnin : int
        How many Gibbs steps to use for burning in.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * Uses `rpy2` and `fdrtool` library to compute the GCM.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower * initial, upper * initial, npts)
    Evalues = []

    n = Z.shape[0]
    L, V, U, S = quadratic_bounds(Z, np.identity(n), Z_constraint)

    if estimator == 'truncated':

        def _estimator(S, Z, Z_constraint):
            L, V, U, _ = quadratic_bounds(Z, np.identity(n), Z_constraint)
            num = mpquad(
                lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 +
                                (n - 1) * mplog(
                                    (x + L) / S) + 2 * mplog(x + L)),
                [0, U - L])
            den = mpquad(
                lambda x: mpexp(-x**2 / (2 * S**2) - L * x / S**2 +
                                (n - 1) * mplog((x + L) / S)), [0, U - L])
            print(num / den, V**2, S, (L, U))
            return num / den
    elif estimator == 'simulate':

        state = Z.copy()
        rpy.r.assign('state', state)

        def _estimator(S, state, Z_constraint):
            Z_constraint.covariance = S**2 * np.identity(Z.shape[0])
            e, v, _state = expected_norm_squared(state,
                                                 Z_constraint,
                                                 ndraw=ndraw,
                                                 burnin=burnin)
            state[:] = _state
            return e

    state = Z.copy()
    for S in Svalues:
        Evalues.append(_estimator(S, state, Z_constraint))
    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso
    #     rpy.r.assign('S', Svalues)
    #     rpy.r.assign('E', np.array(Evalues))
    #     rpy.r('''
    #     library(fdrtool);
    #     G = gcmlcm(S, E, 'gcm');
    #     Sgcm = G$x.knots;
    #     Egcm = G$y.knots;
    #     ''')
    #     Sgcm = np.asarray(rpy.r('Sgcm'))
    #     Egcm = np.asarray(rpy.r('Egcm'))
    #     interpolant = interp1d(Sgcm, Egcm - (Z**2).sum())

    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except:
        raise ValueError(
            '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)'''
            % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant
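# --- Illustrative sketch (not from the original source): the isotonize-interpolate-bisect
# --- pattern used at the end of interpolation_estimate(), shown on synthetic data.
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import bisect
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(1)
Svalues = np.linspace(0.5, 4.0, 30)
Evalues = Svalues ** 2 + rng.normal(0, 0.3, Svalues.size)   # noisy, increasing expectations
observed = 4.0                                              # plays the role of (Z**2).sum()

Eiso = IsotonicRegression().fit_transform(Svalues, Evalues) # enforce monotonicity first
interpolant = interp1d(Svalues, Eiso - observed)
sigma_hat = bisect(interpolant, Svalues.min(), Svalues.max())
print(sigma_hat)                                            # close to sqrt(observed) = 2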
Example No. 21
    def iso(self, axis=1, unk='No'):
        # performs isotonic regression row-wise (axis = 0) or column-wise (axis = 1)
        tonic = copy.deepcopy(self.array)  # returns a new isotonic matrix

        # either use a value for unknowns or just do isotonic with all present values
        if unk == 0 or unk is None:
            known_dict = self.known_for_iso(axis, unk)
        else:
            known_dict = None

        # that dict tells us where things aren't increasing (from is_row_inc() or is_col_inc())
        if axis == 1:
            if known_dict is None:
                for i in range(len(tonic[0])):
                    initial_vals = [tonic[j][i] for j in range(len(tonic))]
                    X = list(range(len(initial_vals)))

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    for row in range(len(predictions)):
                        tonic[row][i] = predictions[row]

            else:
                for i in range(len(tonic[0])):
                    X = known_dict[i]
                    initial_vals = [tonic[j][i] for j in X]

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    for row in range(len(predictions)):
                        tonic[row][i] = predictions[row]

        else:
            if known_dict is None:
                for i in range(len(tonic)):
                    initial_vals = [tonic[i][j] for j in range(len(tonic[0]))]
                    X = list(range(len(initial_vals)))

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    tonic[i] = predictions

            else:
                for i in range(len(tonic)):
                    X = known_dict[i]
                    initial_vals = [tonic[i][j] for j in X]

                    # Use the initial values to fit the model and then predict what the decreasing ones should be
                    iso = IsotonicRegression(out_of_bounds='clip').fit(
                        X, initial_vals)
                    predictions = iso.predict(range(len(tonic)))

                    # put everything back:
                    tonic[i] = predictions

        newframe = pd.DataFrame(tonic)
        newframe.columns = self.dataframe.columns
        newframe.index = self.dataframe.index
        return mat_opr(newframe)
Example No. 22
def truncated_estimate(Z, Z_constraint, lower=0.5, upper=2, npts=15):
    """
    Estimate the parameter $\sigma$ in $Z \sim N(0, \sigma^2 I) | Z \in C$
    where $C$ is the convex set encoded by `Z_constraints`

    .. math::

       C = \left\{z: Az+b \geq 0 \right\}

    with $(A,b)$ being `(Z_constraints.inequality, 
    Z_constraints.inequality_offset)`.

    The algorithm proceeds by estimating $\|Z\|^2_2$ 
    by Monte Carlo for a range of `npts` values starting from
    `lower*np.linalg.norm(Z)/np.sqrt(n)` to
    `upper*np.linalg.norm(Z)/np.sqrt(n)` with `n=Z.shape[0]`.

    These values are then used to compute the GCM
    (Greatest Convex Minorant) which is interpolated and solved
    for an argument such that the expected value matches the observed
    value `(Z**2).sum()`.

    Parameters
    ----------

    Z : `np.float`
        Observed data to be used to estimate $\sigma$. Should be in
        the cone specified by `Z_constraints`.

    Z_constraint : `constraints`
        Constraints under which we observe $Z$.

    lower : float
        Multiple of naive estimate to use as lower endpoint.

    upper : float
        Multiple of naive estimate to use as upper endpoint.

    npts : int
        Number of points in interpolation grid.

    Returns
    -------

    sigma_hat : float
        The root of the interpolant derived from GCM values.

    interpolant : `interp1d`
        The interpolant, to be used for plotting or other 
        diagnostics.

    WARNING
    -------

    * It is assumed that `Z_constraints.equality` is `None`.
    
    * Uses `rpy2` and `fdrtool` library to compute the GCM.

    """

    initial = np.linalg.norm(Z) / np.sqrt(Z.shape[0])

    Svalues = np.linspace(lower * initial, upper * initial, npts)
    Evalues = []

    # use truncated chi to estimate integral
    # with scipy.integrate.quad
    n = Z.shape[0]
    operator = np.identity(n)
    L, V, U, S = quadratic_bounds(Z, operator, Z_constraint)

    for S in Svalues:
        num = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n + 1) * np.log(x)),
                   L, U)
        den = quad(lambda x: np.exp(-x**2 / (2 * S**2) + (n - 1) * np.log(x)),
                   L, U)
        Evalues.append(num[0] / den[0])
        print(num, den)

    ir = IsotonicRegression()
    if DEBUG:
        print(Svalues, Evalues)
    Eiso = ir.fit_transform(Svalues, Evalues)
    Sinterp, Einterp = Svalues, Eiso

    interpolant = interp1d(Sinterp, Einterp - (Z**2).sum())
    try:
        sigma_hat = bisect(interpolant, Sinterp.min(), Sinterp.max())
    except:
        raise ValueError(
            '''Bisection failed -- check (lower, upper). Observed = %0.1e, Range = (%0.1e,%0.1e)'''
            % ((Z**2).sum(), Einterp.min(), Einterp.max()))
    return sigma_hat, interpolant

    print(L, V, U, S)
Example No. 23
def find_regions(t,
                 x,
                 minimum_datapoints=10,
                 mu_factor=0.7,
                 low_density_factor=0.01):
    """Finds clean regions of gradual growth between jump events.

    Args:
        t (1D numpy.array): clean time points
        x (1D numpy.array): cleaned log-OD series (same shape as `t`)
        minimum_datapoints (int): regions must have at least this many data points
        mu_factor (float): the fitted linear drift is scaled by this factor before isotonic regression.
            Low values (0.0..0.5) can result in missed jump events.
            High values (0.8..1.0) can result in oversegmentation, i.e. false jump events.
        low_density_factor (float): time gaps larger than the average sampling
            interval divided by this factor are treated as breaks between regions.

    Returns:
        numpy.array: list of start indexes for the regions
        numpy.array: list of end indexes (inclusive) for the regions
    """
    # find gaps in the data
    avg_dt = (t[-1] - t[0]) / (len(t) - 1)
    gap_start_indexes = np.where(np.diff(t) > avg_dt / low_density_factor)[0]

    # build initial set of regions from these gaps
    s_raw = [0]
    e_raw = []
    for gap_idx in gap_start_indexes:
        e_raw.append(gap_idx)
        s_raw.append(gap_idx + 1)
    e_raw.append(len(t) - 1)
    regions_to_investigate = list(zip(s_raw, e_raw))

    s = []
    e = []
    while len(regions_to_investigate) > 0:

        # pick a new region
        start_idx, end_idx = regions_to_investigate.pop()

        # check that there are at least a minimum number of datapoints
        if end_idx - start_idx + 1 < minimum_datapoints:
            continue

        # find optimal drift
        t_region = t[start_idx:end_idx + 1]
        x_region = x[start_idx:end_idx + 1]
        mu_min = LinearRegression(fit_intercept=True) \
                 .fit(t_region.reshape([-1, 1]),
                     x_region) \
                 .coef_

        # fit monotonic function
        x_drifting = x_region - t_region * mu_min * mu_factor
        iso_reg = IsotonicRegression(increasing=False) \
                  .fit(t_region, x_drifting)
        x_segmented = iso_reg.predict(t_region)

        # find jumps
        jump_indexes = np.where(np.diff(x_segmented) < 0)[0] + start_idx
        if len(jump_indexes) > 0:
            # if found, add the sub-regions to the list of new regions
            start_indexes = [start_idx]
            end_indexes = []
            for jump_idx in jump_indexes:
                end_indexes.append(jump_idx)
                start_indexes.append(jump_idx + 1)
            end_indexes.append(end_idx)
            for start_idx, end_idx in zip(start_indexes, end_indexes):
                regions_to_investigate.append((start_idx, end_idx))
        else:
            # if no subregions are found, add regions to final set
            s.append(start_idx)
            e.append(end_idx)

    s.sort()
    e.sort()
    return np.array(s), np.array(e)
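
# A rough, self-contained illustration of the detection step used in
# find_regions above (synthetic data, not from the original project): detrend a
# log-OD segment with a tempered linear fit, project it onto a non-increasing
# function, and flag the indices where that projection drops.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression

t = np.linspace(0.0, 10.0, 50)
x = 0.2 * t                      # gradual growth
x[25:] -= 1.5                    # a downward jump event between indices 24 and 25

mu_min = LinearRegression().fit(t.reshape(-1, 1), x).coef_[0]
x_drifting = x - t * mu_min * 0.7                        # mu_factor = 0.7
x_segmented = IsotonicRegression(increasing=False).fit(t, x_drifting).predict(t)

jump_indexes = np.where(np.diff(x_segmented) < 0)[0]
print(jump_indexes)              # expected to contain the index of the drop (24)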
Ejemplo n.º 24
0
def train_models(orig_vector_prediction_matrix,
                 orig_scalar_prediction_matrix,
                 vector_target_matrix,
                 scalar_target_matrix,
                 separate_by_height=True):
    """Trains isotonic-regression models.

    E = number of examples
    H = number of heights
    T_v = number of vector target variables
    T_s = number of scalar target variables

    :param orig_vector_prediction_matrix: numpy array (E x H x T_v) of predicted
        values for vector target variables.
    :param orig_scalar_prediction_matrix: numpy array (E x T_s) of predicted
        values for scalar target variables.
    :param vector_target_matrix: numpy array (E x H x T_v) of actual values
        for vector target variables.
    :param scalar_target_matrix: numpy array (E x T_s) of actual values for
        scalar target variables.
    :param separate_by_height: Boolean flag.  If True, will train one model for
        each pair of target variable and height.  If False, will train one model
        for each target variable (channel), pooling data over heights.
    :return: scalar_model_objects: List (length T_s) of models (instances of
        `sklearn.isotonic.IsotonicRegression`) for scalar target variables.
    :return: vector_model_object_matrix: numpy array of models (instances of
        `sklearn.isotonic.IsotonicRegression`) for vector target variables.
        If `separate_by_height == True`, this array is H x T_v.
        If `separate_by_height == False`, this array is 1 x T_v.
    """

    # Check input args.
    num_examples = None
    num_heights = 0
    num_vector_targets = 0
    num_scalar_targets = 0

    have_vectors = (orig_vector_prediction_matrix is not None
                    or vector_target_matrix is not None)

    if have_vectors:
        error_checking.assert_is_numpy_array(orig_vector_prediction_matrix,
                                             num_dimensions=3)
        error_checking.assert_is_numpy_array_without_nan(
            orig_vector_prediction_matrix)

        error_checking.assert_is_numpy_array(
            vector_target_matrix,
            exact_dimensions=numpy.array(orig_vector_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(vector_target_matrix)

        num_examples = vector_target_matrix.shape[0]
        num_heights = vector_target_matrix.shape[1]
        num_vector_targets = vector_target_matrix.shape[2]

    have_scalars = (orig_scalar_prediction_matrix is not None
                    or scalar_target_matrix is not None)

    if have_scalars:
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             num_dimensions=2)

        if num_examples is None:
            num_examples = orig_scalar_prediction_matrix.shape[0]

        expected_dim = numpy.array(
            [num_examples, orig_scalar_prediction_matrix.shape[1]], dtype=int)
        error_checking.assert_is_numpy_array(orig_scalar_prediction_matrix,
                                             exact_dimensions=expected_dim)
        error_checking.assert_is_numpy_array_without_nan(
            orig_scalar_prediction_matrix)

        error_checking.assert_is_numpy_array(
            scalar_target_matrix,
            exact_dimensions=numpy.array(orig_scalar_prediction_matrix.shape,
                                         dtype=int))
        error_checking.assert_is_numpy_array_without_nan(scalar_target_matrix)

        num_scalar_targets = scalar_target_matrix.shape[1]

    error_checking.assert_is_boolean(separate_by_height)

    # Train the models.
    scalar_model_objects = [None] * num_scalar_targets
    num_modeling_heights = num_heights if separate_by_height else 1
    vector_model_object_matrix = numpy.full(
        (num_modeling_heights, num_vector_targets), '', dtype=object)

    for k in range(num_scalar_targets):
        print(
            ('Training isotonic-regression model for {0:d}th of {1:d} scalar '
             'target variables...').format(k + 1, num_scalar_targets))

        scalar_model_objects[k] = IsotonicRegression(increasing=True,
                                                     out_of_bounds='clip')
        scalar_model_objects[k].fit(X=orig_scalar_prediction_matrix[:, k],
                                    y=scalar_target_matrix[:, k])

    if num_scalar_targets > 0:
        print('\n')

    for k in range(num_vector_targets):
        for j in range(num_modeling_heights):
            print((
                'Training isotonic-regression model for {0:d}th of {1:d} vector'
                ' target variables at {2:d}th of {3:d} modeling heights...'
            ).format(k + 1, num_vector_targets, j + 1, num_modeling_heights))

            vector_model_object_matrix[j, k] = IsotonicRegression(
                increasing=True, out_of_bounds='clip')

            if separate_by_height:
                vector_model_object_matrix[j, k].fit(
                    X=orig_vector_prediction_matrix[:, j, k],
                    y=vector_target_matrix[:, j, k])
            else:
                vector_model_object_matrix[j, k].fit(
                    X=numpy.ravel(orig_vector_prediction_matrix[..., k]),
                    y=numpy.ravel(vector_target_matrix[..., k]))

        if k != num_vector_targets - 1:
            print('\n')

    return scalar_model_objects, vector_model_object_matrix
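
# A rough, self-contained illustration of the separate_by_height switch in
# train_models (synthetic shapes only; no error_checking dependency, and this is
# not the original project's apply function). With separate_by_height=False, one
# calibration model per vector channel is fit on all heights at once by
# ravelling the (E x H) slice.
import numpy as np
from sklearn.isotonic import IsotonicRegression

E, H, T_v = 200, 4, 2
rng = np.random.default_rng(0)
prediction_matrix = rng.uniform(0., 1., size=(E, H, T_v))
target_matrix = prediction_matrix ** 2 + rng.normal(0., 0.05, size=(E, H, T_v))

pooled_models = [
    IsotonicRegression(increasing=True, out_of_bounds='clip').fit(
        X=np.ravel(prediction_matrix[..., k]), y=np.ravel(target_matrix[..., k]))
    for k in range(T_v)
]
calibrated = pooled_models[0].predict(np.ravel(prediction_matrix[..., 0]))
print(calibrated.shape)   # (E * H,) = (800,)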
Ejemplo n.º 25
0
def test_isotonic_regression_reversed():
    y = np.array([10, 9, 10, 7, 6, 6.1, 5])
    y_ = IsotonicRegression(increasing=False).fit_transform(
        np.arange(len(y)), y)
    assert_array_equal(np.ones(y_[:-1].shape), ((y_[:-1] - y_[1:]) >= 0))
Ejemplo n.º 26
0
    def __init__(self, add_one=False):
        self.add_one = add_one
        self._ir = IsotonicRegression()
Ejemplo n.º 27
0
def distance_law(matrix,
                 detectable_bins=None,
                 max_dist=None,
                 smooth=True,
                 fun=np.nanmean):
    """
    Computes genomic distance law by averaging over each diagonal in the upper
    triangle matrix. If a list of detectable bins is provided, pixels in
    missing bins will be excluded from the averages. A maximum distance can be
    specified to define how many diagonals should be computed.

    Parameters
    ----------
    matrix: scipy.sparse.csr_matrix
        the input matrix to compute distance law from.
    detectable_bins : numpy.ndarray of ints
        An array of detectable bins indices to consider when computing
        distance law.
    max_dist : int
        Maximum distance from diagonal, in number of bins in which to compute
        distance law
    smooth : bool
        Whether to use isotonic regression to smooth the distance law.
    fun : callable
        A function to apply on each diagonal. Defaults to `np.nanmean`.

    Returns
    -------
    dist: np.ndarray
        the output genomic distance law.

    Example
    -------
        >>> m = np.ones((3,3))
        >>> m += np.array([1,2,3])
        >>> m
        array([[2., 3., 4.],
               [2., 3., 4.],
               [2., 3., 4.]])
        >>> distance_law(csr_matrix(m))
        array([3. , 3.5, 4. ])

    """
    mat_n = matrix.shape[0]
    if max_dist is None:
        max_dist = mat_n
    n_diags = min(mat_n, max_dist + 1)
    dist = np.zeros(mat_n)
    if detectable_bins is None:
        detectable_bins = np.array(range(mat_n))

    for diag in range(n_diags):
        # Find detectable bins which fall in this diagonal
        detect_mask = np.zeros(mat_n, dtype=bool)
        detect_mask[detectable_bins] = 1
        # Find bins which are detectable in the diagonal (intersection of
        # horizontal and vertical masks)
        detect_mask_h = detect_mask[:(mat_n - diag)]
        detect_mask_v = detect_mask[mat_n - (mat_n - diag):]
        detect_mask_diag = detect_mask_h & detect_mask_v
        detect_diag = matrix.diagonal(diag)[detect_mask_diag]
        dist[diag] = fun(detect_diag[detect_diag > 0])
    # Smooth the curve using isotonic regression: Find closest approximation
    # with the condition that point n+1 cannot be higher than point n.
    # (i.e. contacts can only decrease when increasing distance)
    if smooth and mat_n > 2:
        ir = IsotonicRegression(increasing=False)
        dist[~np.isfinite(dist)] = 0
        dist = ir.fit_transform(range(len(dist)), dist)

    return dist
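
# A tiny, self-contained sketch of the smoothing step at the end of
# distance_law (synthetic values, not part of the original module): contact
# frequency is assumed to decay with genomic distance, so the raw curve is
# projected onto the closest non-increasing one.
import numpy as np
from sklearn.isotonic import IsotonicRegression

raw = np.array([10.0, 7.0, 7.5, 4.0, 4.2, 2.0])   # noisy, not quite decreasing
smoothed = IsotonicRegression(increasing=False).fit_transform(np.arange(len(raw)), raw)
print(smoothed)   # non-increasing approximation of the raw curve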
Ejemplo n.º 28
0
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist, verbose):
    if verbose:
        print("\nFit a univariate spline to the probability means\n")
        print(
            "------------------------------------------------------------------------------------\n"
        )

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### NOW I DO THIS BY CALLING A SKLEARN ISOTONIC REGRESSION
    ### This does the isotonic regression with increasing=False (antitonic) to make sure
    ### I get monotonically decreasing probabilities with increasing genomic distance

    min_x, max_x = min(x), max(x)
    tempList = sorted([dis for dis in mainDic])
    splineX = []
    ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
    ### Therefore everything will be within the range where the spline is defined
    for i in tempList:
        if min_x <= i <= max_x:
            splineX.append(i)

    splineY = ius(splineX)

    ir = IsotonicRegression(increasing=False)
    rNewSplineY = ir.fit_transform(splineX, splineY)

    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])

    ### Now newSplineY holds the monotonic contact probabilities
    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")

    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data

    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s.res%s.png\n" % (outfilename, resolution))

    # NOW write the calculated pvalues and corrected pvalues in a file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    discardCount = 0

    if verbose:
        print("lower bound on mid-range distances  " + repr(min_dist) +
              ", upper bound on mid-range distances  " + repr(max_dist) +
              "\n")

    with gzip.open(infilename, 'rt') as infile:
        with gzip.open(
                '{}.res{}.significances.txt.gz'.format(outfilename,
                                                       resolution),
                'wt') as outfile:
            outfile.write(
                "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
            )

            for line in infile:
                chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
                mid1, mid2, contactCount = int(mid1), int(mid2), int(
                    contactCount)
                distance = mid2 - mid1

                bias1 = 1.0
                bias2 = 1.0
                # assumes there is no bias to begin with
                # if the biasDic is not null sets the real bias values
                if len(biasDic) > 0:
                    if chr1 in biasDic and mid1 in biasDic[chr1]:
                        bias1 = biasDic[chr1][mid1]
                    if chr2 in biasDic and mid2 in biasDic[chr2]:
                        bias2 = biasDic[chr2][mid2]

                if min_dist <= distance <= max_dist:
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(distance, min_x), max_x)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)  # multiply in the bias terms
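                    # observedIntraInRangeSum below is a module-level total
                    # defined elsewhere in the original script, not in this excerpt.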
                    p_val = scsp.bdtrc(contactCount - 1,
                                       observedIntraInRangeSum, prior_p)

                    if p_val <= 1:
                        outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            chr1, mid1, chr2, mid2, contactCount, p_val, -1))

    return splineX, newSplineY, residual
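
# A minimal, self-contained sketch of the spline-then-antitonic post-processing
# used in fit_spline (synthetic decaying probabilities; not the original data or
# pipeline): fit a univariate spline, then force the fitted curve to be
# non-increasing with isotonic regression.
import numpy as np
from scipy.interpolate import UnivariateSpline
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
x = np.linspace(1.0, 100.0, 40)
y = 1.0 / x + rng.normal(0.0, 0.002, size=x.size)     # noisy decaying "probabilities"

spline = UnivariateSpline(x, y, s=min(y) ** 2)         # same smoothing budget as above
spline_y = spline(x)
monotone_y = IsotonicRegression(increasing=False).fit_transform(x, spline_y)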
Ejemplo n.º 29
0
def fit_curve(x, y):
    curve = IsotonicRegression(increasing=False)
    curve.fit(x, y)
    return curve
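
# Hypothetical usage of fit_curve above, assuming numpy and the
# IsotonicRegression import from the surrounding snippets are available.
import numpy as np

x = np.arange(10, dtype=float)
y = 10.0 - x + np.random.default_rng(0).normal(0.0, 0.5, size=10)
curve = fit_curve(x, y)
print(curve.predict([2.5, 7.5]))   # non-increasing predictions at new points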
Ejemplo n.º 30
0
    def compute(self) -> float:
        """
        This method allows to compute multiple stress functions:
            * Kruskal stress https://www.researchgate.net/publication/24061688_Nonmetric_multidimensional_scaling_A_numerical_method
            * S stress http://gifi.stat.ucla.edu/janspubs/1977/articles/takane_young_deleeuw_A_77.pdf
            * Sammon stress http://ieeexplore.ieee.org/document/1671271/?reload=true
            * Quadratic Loss
        Source: https://github.com/flowersteam/Unsupervised_Goal_Space_Learning/blob/master/src/embqual.py.
        :return: Stress measure.
        """

        # We retrieve dimensions of the data
        n, m = self._low_dimensional_data.shape

        #  We compute distance matrices in both spaces
        if self._use_geodesic_distances:
            k: int = 2
            is_connex: bool = False

            while is_connex is False:
                knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k)
                knn.fit(self._low_dimensional_data)
                M = knn.kneighbors_graph(self._low_dimensional_data,
                                         mode='distance')
                graph = networkx.from_scipy_sparse_matrix(M)
                is_connex = networkx.is_connected(graph)
                k += 1
            # all_pairs_dijkstra_path_length returns a generator of
            # (node, {node: distance}) pairs in networkx >= 2, so build a dense
            # geodesic distance matrix indexed by node id (0..n-1).
            s_path_lengths = dict(
                networkx.all_pairs_dijkstra_path_length(graph, weight='weight'))
            s_all_distances = np.array(
                [[s_path_lengths[i][j] for j in range(n)] for i in range(n)])
            s_all_distances = (s_all_distances + s_all_distances.T) / 2
            s_uni_distances = scipy.spatial.distance.squareform(
                s_all_distances, checks=False)
            s_all_distances = s_all_distances.ravel()

        else:
            s_uni_distances = scipy.spatial.distance.pdist(
                self._low_dimensional_data)
            s_all_distances = scipy.spatial.distance.squareform(
                s_uni_distances).ravel()
        l_uni_distances = scipy.spatial.distance.pdist(self._target_data)
        l_all_distances = scipy.spatial.distance.squareform(
            l_uni_distances).ravel()

        # We set up the measure dict
        measures = dict()

        # 1. Quadratic Loss
        # measures['quadratic_loss'] = numpy.square(s_uni_distances - l_uni_distances).sum()

        # 2. Sammon stress
        # measures['sammon_stress'] = (1 / s_uni_distances.sum()) * (
        #     numpy.square(s_uni_distances - l_uni_distances) / s_uni_distances
        # ).sum()

        # 3. S stress
        # measures['s_stress'] = numpy.sqrt((1 / n) * (
        #     numpy.square(
        #         (numpy.square(s_uni_distances) - numpy.square(l_uni_distances)).sum()
        #     ) / numpy.power(s_uni_distances, 4)
        # )).sum()

        # 4. Kruskal stress
        # We reorder the distances under the order of distances in latent space
        s_all_distances: np.ndarray = s_all_distances[
            l_all_distances.argsort()]
        l_all_distances: np.ndarray = l_all_distances[
            l_all_distances.argsort()]
        # We perform the isotonic regression
        iso: IsotonicRegression = IsotonicRegression()
        s_iso_distances: np.ndarray = iso.fit_transform(
            s_all_distances, l_all_distances)
        # We compute the kruskal stress.
        measures['kruskal_stress'] = np.sqrt(
            np.square(s_iso_distances - l_all_distances).sum() /
            np.square(l_all_distances).sum())

        # Return Kruskal's stress here. Is it the best choice among Sammon, S,
        # Kruskal, and quadratic loss?
        return measures['kruskal_stress']
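
# A compact, self-contained restatement of the Kruskal-stress computation above
# on synthetic data (condensed pdist distances instead of ravelled squareform,
# purely for brevity; the projection data and names below are made up).
import numpy as np
from scipy.spatial.distance import pdist
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(1)
target_data = rng.normal(size=(30, 5))          # "latent"/target space
low_dimensional_data = target_data[:, :2]       # a crude 2-D embedding

s_dist = pdist(low_dimensional_data)            # distances in the embedding
l_dist = pdist(target_data)                     # distances in the target space

order = l_dist.argsort()
s_dist, l_dist = s_dist[order], l_dist[order]
s_iso = IsotonicRegression().fit_transform(s_dist, l_dist)

kruskal_stress = np.sqrt(np.square(s_iso - l_dist).sum() / np.square(l_dist).sum())
print(kruskal_stress)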