def test_predict_proba(self):
     clf = SklearnClassifier(estimator=GaussianProcessClassifier(),
                             missing_label='nan')
     self.assertRaises(NotFittedError, clf.predict_proba, X=self.X)
     clf.fit(X=self.X, y=self.y1)
     P = clf.predict_proba(X=self.X)
     est = GaussianProcessClassifier().fit(X=np.zeros((3, 1)),
                                           y=['tokyo', 'paris', 'tokyo'])
     P_exp = est.predict_proba(X=self.X)
     np.testing.assert_array_equal(P_exp, P)
     np.testing.assert_array_equal(clf.classes_, est.classes_)
     clf.fit(X=self.X, y=self.y2)
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always")
         P = clf.predict_proba(X=self.X)
         self.assertEqual(len(w), 1)
     P_exp = np.ones((len(self.X), 1))
     np.testing.assert_array_equal(P_exp, P)
     clf = SklearnClassifier(estimator=GaussianProcessClassifier(),
                             classes=['ny', 'paris', 'tokyo'],
                             missing_label='nan')
     clf.fit(X=self.X, y=self.y_nan)
     P = clf.predict_proba(X=self.X)
     P_exp = np.ones((len(self.X), 3)) / 3
     np.testing.assert_array_equal(P_exp, P)
     clf.fit(X=self.X, y=self.y1)
     P = clf.predict_proba(X=self.X)
     P_exp = np.zeros((len(self.X), 3))
     P_exp[:, 1:] = est.predict_proba(X=self.X)
     np.testing.assert_array_equal(P_exp, P)
Example #2
def gpc_sklearn(ax, x, y, kernel, optimizer="fmin_l_bfgs_b"):
    """
    Implemented with GaussianProcessClassifier in sklearn.gaussian_process.
    The implementation is based on Algorithms 3.1, 3.2, and 5.1 of GPML.
    The Laplace approximation is used to approximate the non-Gaussian posterior by a Gaussian.
    The implementation is restricted to using the logistic link function.

    INPUT:
        ax: an Axes object
        x: (N, ) np.array
        y: (N, ) np.array
        kernel: sklearn.gaussian_process.kernels object. Used to initialize GaussianProcessClassifier
        optimizer:
            string or callable.
            Can either be one of the internally supported optimizers for optimizing the kernel's parameters,
            specified by a string, or an externally defined optimizer passed as a callable.
            If a callable is passed, it must have the signature expected by GaussianProcessClassifier.
            If None is passed, the kernel's parameters are kept fixed.

    OUTPUT:
        ax: the Axes object with the predicted probability curve and ECE/ACC annotation added
    """
    # Fit a GaussianProcessClassifier (the LinearRegression comparison below is commented out)
    gpc = GaussianProcessClassifier(kernel=kernel, optimizer=optimizer)
    gpc.fit(x[:, np.newaxis], y)
    print("\nLearned kernel: %s" % gpc.kernel_)
    y_ = gpc.predict_proba(x[:, np.newaxis])[:, 1]

    xs = np.linspace(np.min(x), np.max(x), 1000)
    ys = gpc.predict_proba(xs[:, np.newaxis])[:, 1]

    # lr = LinearRegression()
    # lr.fit(x[:, np.newaxis], y)  # x needs to be 2d for LinearRegression

    # Plot
    # ax.plot(x, y, 'r.', markersize=12, alpha = 0.2)
    ax.plot(xs, ys, markersize=12, alpha=0.2)

    # ax.plot(x, lr.predict(x[:, np.newaxis]), 'b-')
    # ax.set_xlim(-0.1, 1.1)
    # ax.set_ylim(-0.1, 1.1)

    # compute ece and acc after calibration
    ece = EceEval(np.array([1 - y_, y_]).T, y, num_bins=100)
    y_predict = y_ > 0.5
    acc = (y_predict == y).mean()

    ax.text(0.05,
            0.8,
            'ECE=%.4f\nACC=%.4f' % (ece, acc),
            size=14,
            ha='left',
            va='center',
            bbox={
                'facecolor': 'green',
                'alpha': 0.5,
                'pad': 4
            })

    return ax
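A hedged usage sketch for gpc_sklearn, assuming synthetic one-dimensional binary data and an isotropic RBF kernel; EceEval (used internally for the ECE annotation) must already be defined in the surrounding script, and the names below are illustrative only.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

# synthetic data: the positive class becomes more likely as x grows
rng = np.random.RandomState(0)
x_demo = np.sort(rng.uniform(0.0, 1.0, size=200))
y_demo = (rng.uniform(size=200) < x_demo).astype(int)

fig, ax_demo = plt.subplots()
gpc_sklearn(ax_demo, x_demo, y_demo, kernel=ConstantKernel(1.0) * RBF(length_scale=0.2))
plt.show()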
def GPAL(X,
         Y,
         train_ind,
         candidate_ind,
         test_ind,
         sample='En',
         kernel='rbf',
         Niter=500,
         eta=10):
    ourRes = []
    train_index = train_ind.copy()
    test_index = test_ind.copy()
    candidate_index = candidate_ind.copy()
    varRes = []
    enRes = []
    for i in range(Niter):
        print(i)
        if (kernel == 'linear'):
            dotkernel = DotProduct(sigma_0=1)
            model = GPC(kernel=dotkernel)
        else:
            model = GPC()
        model.fit(X[train_index], Y[train_index])
        ourRes.append(model.score(X[test_index, :], Y[test_index]))
        print(ourRes[-1])
        if (sample == 'rand'):
            sampleIndex = np.random.randint(len(candidate_index))
        elif (sample == 'En'):
            proba = model.predict_proba(X[candidate_index, :])
            en = sp.stats.entropy(proba.T)
            sampleScore = en
            sampleIndex = np.argmax(sampleScore)
        elif (sample == 'var'):
            model.predict_proba(X[candidate_index, :])
            meanVar = np.zeros(len(candidate_index))
            for tem in model.base_estimator_.estimators_:
                meanVar = meanVar + tem.var
            sampleIndex = np.argmax(meanVar)
        elif (sample == 'varEN'):
            proba = model.predict_proba(X[candidate_index, :])
            en = sp.stats.entropy(proba.T)
            meanVar = np.zeros(len(candidate_index))
            enRes.append(np.mean(en))

            for tem in model.base_estimator_.estimators_:
                meanVar = meanVar + tem.var
            sampleIndex = np.argmax(meanVar / len(np.unique(Y)) * eta + en)
            varRes.append(np.mean(meanVar))
            print('max var %f----selected var %f-----selected en %f ' %
                  (np.max(meanVar), meanVar[sampleIndex], en[sampleIndex]))
        sampleIndex = candidate_index[sampleIndex]
        train_index = train_index + [sampleIndex]
        candidate_index = [
            x for x in candidate_index if x not in [sampleIndex]
        ]
    return [ourRes, varRes, enRes]
Example #4
def test_multi_class_n_jobs(kernel):
    # Test that multi-class GPC produces identical results with n_jobs>1.
    gpc = GaussianProcessClassifier(kernel=kernel)
    gpc.fit(X, y_mc)

    gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2)
    gpc_2.fit(X, y_mc)

    y_prob = gpc.predict_proba(X2)
    y_prob_2 = gpc_2.predict_proba(X2)
    assert_almost_equal(y_prob, y_prob_2)
Example #5
def test_multi_class_n_jobs(kernel):
    # Test that multi-class GPC produces identical results with n_jobs>1.
    gpc = GaussianProcessClassifier(kernel=kernel)
    gpc.fit(X, y_mc)

    gpc_2 = GaussianProcessClassifier(kernel=kernel, n_jobs=2)
    gpc_2.fit(X, y_mc)

    y_prob = gpc.predict_proba(X2)
    y_prob_2 = gpc_2.predict_proba(X2)
    assert_almost_equal(y_prob, y_prob_2)
def calculate_t(dataset_no):
    print("Starting to find orignal_labels ans for dataset no", dataset_no)
    X, y = load_datasets(dataset_no)
    rows, col = X.shape
    kernel = 1.0 * RBF(1.0)
    ROW = int(Training_percent * rows)
    ROW = 800
    print("Starting $")
    gpc = GaussianProcessClassifier(kernel=kernel,
                                    random_state=0).fit(X[:ROW, :], y[:ROW])
    print("Successfully trained ", dataset_no)
    print("Starting predicting data for full length")
    orignal_probability = gpc.predict_proba(X[:ROW, :])
    print("Orignal_probability array calculated for dataset_no", dataset_no)

    mrl = [None for _ in range(5)]

    current_pos = int(STARTING_FRACTION * col)

    while 1:
        gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(
            X[:ROW, :current_pos], y[:ROW])
        new_probability = gpc.predict_proba(X[:ROW, :current_pos])

        print("Probabilities calculated for current value of f = ",
              current_pos)

        for i in range(5):
            value_mrl = mrl[i]
            if not (value_mrl):
                if i == 4:
                    temporary = 16
                else:
                    temporary = i + 1
                if (check_probabilities_for_f(orignal_probability,
                                              new_probability, alpha, y,
                                              temporary)):
                    mrl[i] = current_pos
                    print("F for label", temporary, " is ", current_pos)
                    # print("Saving model")
                    # s = 'label_id' + str(i+1) + "component" + str(dataset_no)
                    # filename = 'models/' + s + '.sav'
                    # pickle.dump(gpc, open(filename, 'wb'))

        all_completed = 1
        for value_mrl in mrl:
            if not (value_mrl):
                all_completed = 0
        if all_completed:
            break

        current_pos = current_pos + 5
    return mrl
Example #7
class myGPBinary(myModel):
    def make(self , make_params  ):
        self.model = GaussianProcessClassifier(**make_params )
        return self

    def fit(self , xtrain , ytrain , xtest =None, ytest =None , fit_params = {} ):
        self.model.fit(xtrain , ytrain  , **fit_params)

    def predict(self, xs, threshold=0.5):
        # The original returned the bound method itself; threshold the
        # positive-class probability instead so `threshold` is actually used.
        return (self.model.predict_proba(xs)[:, 1] >= threshold).astype(int)
                    
    def predict_proba(self, xs):
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1,-1))
        else:
            return self.model.predict_proba(xs)[:,1]
Example #8
def GPC(train, target, test):
  kernel = 1.0 * RBF(1.0)
  gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
  gpc.fit(train, target)
  #print("Score:",gpc.score(train, target))
  prediction = gpc.predict_proba(test)[:, 1]
  return prediction
Example #9
def get_new_labels_entropy(evaluated_set_X,
                           evaluated_set_y,
                           unevaluated_X,
                           number_of_new_labels,
                           _KRIGING=0):
    """ Get a set of parameter combinations according to their predicted label entropy
    
    
    
    """
    if _KRIGING:
        clf = GaussianProcessClassifier()
        clf.fit(evaluated_set_X,
                calibration_condition(evaluated_set_y, calibration_threshold))
    else:
        clf = fit_entropy_classifier(evaluated_set_X, evaluated_set_y,
                                     surrogate_model,
                                     surrogate_parameter_space)

    y_hat_probability = clf.predict_proba(unevaluated_X)
    # build a list so np.array gets the values (map() alone is lazy in Python 3)
    y_hat_entropy = np.array([entropy(p) for p in y_hat_probability])
    y_hat_entropy /= y_hat_entropy.sum()
    unevaluated_X_size = unevaluated_X.shape[0]

    selections = np.random.choice(a=unevaluated_X_size,
                                  size=number_of_new_labels,
                                  replace=False,
                                  p=y_hat_entropy)
    return selections
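A hedged usage sketch of the entropy-weighted selection above, exercising the _KRIGING branch on synthetic arrays; calibration_condition and calibration_threshold are module-level names in the original script, so the stand-ins defined here are placeholders only.

import numpy as np

calibration_threshold = 0.5                      # placeholder for the module global

def calibration_condition(y_values, threshold):  # placeholder for the module helper
    return (y_values > threshold).astype(int)

rng = np.random.RandomState(1)
evaluated_X = rng.rand(40, 3)       # parameter combinations already evaluated
evaluated_y = rng.rand(40)          # their (continuous) outcomes
candidate_X = rng.rand(300, 3)      # unevaluated candidate pool

picked = get_new_labels_entropy(evaluated_X, evaluated_y, candidate_X,
                                number_of_new_labels=10, _KRIGING=1)
print(picked)                       # row indices into candidate_X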
Example #10
def test_predict_consistent():
    """ Check binary predict decision has also predicted probability above 0.5.
    """
    for kernel in kernels:
        gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
        assert_array_equal(gpc.predict(X),
                           gpc.predict_proba(X)[:, 1] >= 0.5)
Example #11
def test_predict_consistent_structured():
    # Check binary predict decision has also predicted probability above 0.5.
    X = ["A", "AB", "B"]
    y = np.array([True, False, True])
    kernel = MiniSeqKernel(baseline_similarity_bounds="fixed")
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
Example #12
def activity_3_3():
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = np.array(iris.target, dtype=int)

    h = .02

    # create a mesh on which to draw the plot
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    kernel = 1.0 * kernels.RBF([1.0])
    gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

    Z = gpc_rbf_isotropic.predict_proba(np.c_[xx.ravel(), yy.ravel()])

    # put the result into colours
    Z = Z.reshape((xx.shape[0], xx.shape[1], 3))
    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")

    # Plot
    plt.scatter(X[:, 0],
                X[:, 1],
                c=np.array(["r", "g", "b"])[y],
                edgecolors=(0, 0, 0))
    plt.xlabel('Sepal length')
    plt.ylabel('Sepal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.show()
Example #13
def train_on_pool(choice_function, X, y, pool_idcs, train_idcs, test_idcs,
                  name):
    Xtest, ytest = X[test_idcs], y[test_idcs]
    accuracies, balances, n_points = list(), list(), list()
    train_idcs, pool_idcs = copy(train_idcs), copy(pool_idcs)

    gp = GaussianProcessClassifier(n_restarts_optimizer=25,
                                   kernel=Matern(),
                                   n_jobs=-1,
                                   random_state=42)

    #Add initial points

    while pool_idcs:
        Xtrain, ytrain = X[train_idcs], y[train_idcs]
        gp.fit(Xtrain, ytrain)

        preds = gp.predict(Xtest)

        accuracies.append(accuracy_score(ytest, preds))
        n_points.append(len(train_idcs))

        train_classes = np.unique(y[train_idcs], return_counts=True)[1]
        balances.append(max(train_classes) / sum(train_classes))
        print(
            f"{len(train_idcs)}: {name}: {accuracies[-1]:.3}, class balance: {balances[-1]:.3}"
        )

        y_pool_p = gp.predict_proba(X[pool_idcs])
        chosen_idx = choice_function(y_pool_p)

        train_idcs.append(pool_idcs.pop(chosen_idx))

    return n_points, accuracies, balances
Example #14
class GaussianProcess_(ProbabilisticModel):

    """GaussianProcess Classifier
    """

    def __init__(self, *args, **kwargs):
        self.model = GaussianProcessClassifier(*args, **kwargs)
        self.name = "gpc"        

    def train(self, dataset, *args, **kwargs):
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)
    def predict_proba(self, feature, *args, **kwargs):
        return self.model.predict_proba(feature, *args, **kwargs)
    
    def feature_importances_(self):
        LOGGER.warn("GPC model does not support feature_importance")
        return None
    
    def get_params(self):
        return self.model.get_params()
Example #15
def build_classifier_gp(data, labels, **kwargs):
    linear_kernel = Sum(k1=Product(k1=DotProduct(sigma_0=0, sigma_0_bounds='fixed'), k2=ConstantKernel()),
                        k2=ConstantKernel())
    gp_clf = GaussianProcessClassifier(kernel=linear_kernel)
    gp_clf.fit(data, labels)
    id_pos_class = gp_clf.classes_ == labels.max()
    return gp_clf, gp_clf.predict_proba(data)[:, id_pos_class]
Example #16
def test_multi_class(kernel):
    # Test GPC for multi-class classification problems.
    gpc = GaussianProcessClassifier(kernel=kernel)
    gpc.fit(X, y_mc)

    y_prob = gpc.predict_proba(X2)
    assert_almost_equal(y_prob.sum(1), 1)

    y_pred = gpc.predict(X2)
    assert_array_equal(np.argmax(y_prob, 1), y_pred)
Example #17
def test_multi_class(kernel):
    # Test GPC for multi-class classification problems.
    gpc = GaussianProcessClassifier(kernel=kernel)
    gpc.fit(X, y_mc)

    y_prob = gpc.predict_proba(X2)
    assert_almost_equal(y_prob.sum(1), 1)

    y_pred = gpc.predict(X2)
    assert_array_equal(np.argmax(y_prob, 1), y_pred)
Example #18
def evaluate_gp(y,x,y_test,x_test):
    from sklearn.gaussian_process.kernels import RBF, ConstantKernel,WhiteKernel 
    from sklearn.gaussian_process import GaussianProcessClassifier
    np.random.seed(200)
    kernel = ConstantKernel() * RBF() + WhiteKernel()
    start = time.time()
    gp = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10).fit(x, y)
    logp = np.log(gp.predict_proba(x_test))
    end = time.time()
    print('Gp took {}s'.format(end - start))
    
    test_loglik = np.mean(y_test.reshape(-1)*logp[:,1] + (1-y_test.reshape(-1))*logp[:,0])
    return(test_loglik)
Example #19
def GPRTraining(XEstimate, XValidate, Parameters, class_labels):
    kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-05, 100000.0))
    #clf = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=1)
    clf = GaussianProcessClassifier(kernel=RBF(length_scale=1.0),
                                    optimizer=None,
                                    multi_class='one_vs_one',
                                    n_jobs=1)

    print(clf.fit(XEstimate, class_labels))
    Yvalidate = clf.predict(XValidate)
    EstParameters = clf.get_params()
    print(clf.predict_proba(XValidate))
    return {"Yvalidate": Yvalidate, "EstParameters": EstParameters, "clf": clf}
Example #20
def calculate_mrl(X, Y, alpha, initial_f):
    '''
    Calculates the mrl for a component
    '''
    kernel = 1.0 * RBF(1.0)
    gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X, Y)
    original_prob = gpc.predict_proba(X)
    print("Probabilities for the complete MTS calculated")

    mrl = [None for _ in range(LABELS)]

    current_f = initial_f
    while True:
        gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(
            X[:, :current_f], Y)
        new_prob = gpc.predict_proba(X[:, :current_f])
        print("Probabilities calculated when value of F is %s" %
              (str(current_f)))

        for i, mrl_value in enumerate(mrl):
            if not (mrl_value):
                if check_probabilities(original_prob, new_prob, alpha, Y,
                                       i + 1):
                    mrl[i] = current_f
                    print("F for label %s is %s" %
                          (str(i + 1), str(current_f)))

        all_values_are_found = True
        for mrl_value in mrl:
            if not (mrl_value):
                all_values_are_found = False

        if (all_values_are_found):
            break

        current_f += 1

    return mrl
Example #21
def run_gaussian_clf(df, config):
  df = df[0:100]
  start = time.time()
  X = df.drop(columns={"flux_list", "wavelength", "objid", "ra", "dec", "class", "spectral_lines"})
  y = df["class"]

  X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=42)

  kernel = 1.0 * RBF(1.0)  # config['kernel_val']
  model = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X_train, y_train)

  y_pred_test = model.predict(X_test)
  accuracy_test = acc(y_test, y_pred_test)
  end = time.time()
  tt = end - start
  print("Accuracy of trained model on test set: %.2f%%" % (accuracy_test * 100.0))
  # print(y_pred_test)
  print("time :", tt)

  model.predict_proba(X_test)
  df_result_GC = pd.DataFrame(model.predict_proba(X_test))
  df_result_GC_rename = df_result_GC.rename(columns={0: "GALAXY", 1: "QSO", 2: "STAR"})
  df_result_GC_rename["predict"] = y_pred_test
  df_result_GC_rename["actual"] = y_test
def getBestParametersRBF(X, Y):
    nMag = 10
    nScale = 10
    nValidation = 2
    magnitudes = np.logspace(-2, 4, nMag)  #10^-2 to 10^4
    scales = np.logspace(-1, 2, nScale)  #10^-1 to 10^2
    nlpd = np.zeros((nMag, nScale))

    for i in range(nMag):
        for j in range(nScale):
            sumProb = 0.0
            numProb = 0

            for n_k in range(nValidation):
                gp = GaussianProcessClassifier(kernel=magnitudes[i] *
                                               RBF(scales[j]))
                xTrain, yTrain, xTest, yTest = splitTrainingTestingData(
                    X, Y, nValidation, n_k)
                gp.fit(xTrain, yTrain)
                #get prediction probabilities
                probs = gp.predict_proba(xTest)
                sumProb += sum(-np.log(probs[k][(1 + yTest[k]) // 2])
                               for k in range(probs.shape[0]))
                numProb += probs.shape[0]

            #calculate negative log predictive density
            nlpd[i, j] = sumProb / numProb

    id_x, id_y = np.unravel_index(nlpd.argmin(),
                                  nlpd.shape)  #index of minimum value of NLPD
    #3D plot of NLPD
    X, Y = np.meshgrid(magnitudes, scales)
    nlpd = np.transpose(nlpd)
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) was removed in newer Matplotlib
    surf = ax.plot_surface(np.log10(X),
                           np.log10(Y),
                           nlpd,
                           cmap=cm.coolwarm,
                           linewidth=0,
                           antialiased=False)
    fig.colorbar(surf, shrink=0.5, aspect=10)
    ax.set_xlabel("Magnitude")
    ax.set_ylabel("Length-scale")
    ax.set_zlabel("Negative Log Predictive Density")
    plt.show()

    return scales[id_y], magnitudes[id_x]
    def common_test_gpc(self, dtype=np.float32, n_classes=2):

        gp = GaussianProcessClassifier()
        gp, X = self.fit_classification_model(gp, n_classes=n_classes)

        # return_cov=False, return_std=False
        if dtype == np.float32:
            cls = FloatTensorType
        else:
            cls = DoubleTensorType
        model_onnx = to_onnx(gp,
                             initial_types=[('X', cls([None, None]))],
                             target_opset=TARGET_OPSET,
                             options={
                                 GaussianProcessClassifier: {
                                     'zipmap': False,
                                     'optim': 'cdist'
                                 }
                             })
        self.assertTrue(model_onnx is not None)

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except OrtFail:
            if not hasattr(self, 'path'):
                return
            suffix = 'Double' if dtype == np.float64 else 'Float'
            # Operator Solve is missing
            model_onnx = change_onnx_domain(
                model_onnx, {'Solve': ('Solve%s' % suffix, 'ai.onnx.contrib')})
            so = SessionOptions()
            so.register_custom_ops_library(self.path)
            sess = InferenceSession(model_onnx.SerializeToString(), so)

            res = sess.run(None, {'X': X.astype(dtype)})
            assert_almost_equal(res[0].ravel(), gp.predict(X).ravel())
            assert_almost_equal(res[1], gp.predict_proba(X), decimal=3)
            return

        dt = 32 if dtype == np.float32 else 64
        dump_data_and_model(X.astype(dtype),
                            gp,
                            model_onnx,
                            verbose=False,
                            basename="SklearnGaussianProcessRBFT%d%d" %
                            (n_classes, dt))
class GaussianProcessClassifierImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)
Example #25
 def model_GPC(self):
     kf, X, y = self.data_KF()
     avg = 0
     for train_index, test_index in kf.split(X):
         X_train, X_test = X[train_index], X[test_index]
         y_train, y_test = y[train_index], y[test_index]
         kernel = 1.0 * RBF(1.0)
         # fit on the training fold only; the original fit on the full X, y
         gpc = GaussianProcessClassifier(kernel=kernel,
                                         random_state=0).fit(X_train, y_train)
         # compare hard predictions with the fold labels (probability rows
         # cannot be compared to labels directly)
         result = gpc.predict(X_test)
         count0 = 0
         for index in range(len(result)):
             if result[index] == y_test[index]:
                 count0 += 1
         avg += count0 / len(result)
         # print(result)
     return avg / self.nsp
Example #26
    def OnlineGPC(self,X,y,shift=1):
        """

        :param X: global input
        :param y: ground truth
        :return: alert = 1 means OK, alert = 0 means an alert is raised

        """
        T = X.shape[0]
        alert = np.zeros(T-shift)
        clf = GaussianProcessClassifier()

        for t in np.arange(shift,T):
            x_t = X[:t,:]
            y_t = y[:t]
            clf.fit(x_t,y_t)
            score = clf.predict_proba(X[t:t+1, :])  # keep the row 2-D for predict_proba
            alert[t-shift] = (score[0][int(y[t]-1)] >= 0.5)

        return alert
def apply_gaussian_classifier(feature, col_required, array_to_predict):
    Main_X, Main_Y = load_datasets(feature)
    print("Starting Gausian")
    gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(
        Main_X[:ROWS, :col_required], Main_Y[:ROWS])
    print("Successfully Trained :)")
    orignal_probability = gpc.predict_proba(array_to_predict)
    maxa = max(orignal_probability[0])
    for j in range(5):
        if (orignal_probability[0][j] == maxa):
            index = j
    if (index == 0):
        return 1
    elif (index == 1):
        return 2
    elif (index == 2):
        return 3
    elif (index == 3):
        return 4
    else:
        return 16
Example #28
class GaussianProcess(BaseEstimator):

    """Implement a Gaussian Process Classifier. GP is by definition a
    Bayesian model, so uncertainty on the prediction is easy acquired.

    Attributes:
        model (TYPE): Description

    """

    def __init__(self, name='GP'):

        super().__init__(name)
        self.model = None

    def fit(self, x, y, **kwargs):
        """Train a Gaussian Process Classifier.

        Args:
            x (np.array): design matrix (input data)
            y (np.array): labels
            **kwargs: additional parameters
        """
        # Specify Gaussian Processes with fixed and optimized hyperparameters
        self.model = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                               multi_class='one_vs_rest')

        self.model.fit(x, y)

    def predict(self, x):
        """Perform prediction using GP.

        Args:
            x (np.array): input data

        Returns:
            np.array: class probabilities for each sample x_i
        """
        return self.model.predict_proba(x)
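As the class docstring notes, the probabilistic output makes uncertainty easy to quantify. A minimal hedged sketch, assuming the surrounding BaseEstimator base class accepts the default name argument: it converts the class probabilities returned by predict into a per-sample predictive entropy.

from scipy.stats import entropy
from sklearn.datasets import make_classification

# synthetic data for illustration only
X_demo, y_demo = make_classification(n_samples=80, n_features=4, random_state=0)

gp_wrapper = GaussianProcess()        # the wrapper class defined above
gp_wrapper.fit(X_demo, y_demo)
proba = gp_wrapper.predict(X_demo)    # predict() returns class probabilities here
per_sample_uncertainty = entropy(proba.T)   # higher entropy = less certain sample
print(per_sample_uncertainty[:5])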
Example #29
class GaussianProcess(AbstractModel):
    def __init__(self, optimised=True):
        self.create_model(optimised)

    def create_model(self, optimised):
        self.model = GaussianProcessClassifier()

    def fit_model(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def predict(self, x_test):
        y_pred = self.model.predict(x_test)
        return y_pred

    def get_model(self):
        return self.model

    def predict_proba(self, x_test):
        y_pred = self.model.predict_proba(x_test)
        return y_pred

    def print(self):
        pass
def trainPredict(subjectid, makeplot=False):
    print("testing participant " + subjectid)
    # Load training data from the file matlab generates
    traindata = np.genfromtxt('csvdata/' + subjectid +
                              '_sim.csv', delimiter=',',
                              missing_values=['NaN', 'nan'],
                              filling_values=None)
    # Clean + downsample this data
    trainx, trainy = cleandata(traindata, downsamplefactor=20)

    # Train a Gaussian Process
    anisokern = kernels.RBF()  # default kernel
    gp = GaussianProcessClassifier(kernel=anisokern)  # Initialize the GPC
    gp.fit(trainx, trainy)  # train this class on the data
    trainx = trainy = None  # Discard all training data to preserve memory

    # load test data
    testdata = np.genfromtxt('csvdata/' + subjectid +
                             '_rival.csv', delimiter=',',
                             missing_values=['NaN', 'nan'],
                             filling_values=None)
    testx, testy = cleandata(testdata, downsamplefactor=4)  # clean data

    testdata = None  # clear from memory
    # work out percentage in percept for each data point:
    percentages, nextpercept = assign_percentage(testy)

    # get a prediction for all points in the test data:
    predicty = gp.predict(testx)
    proby = gp.predict_proba(testx)

    if makeplot:
        summaryplot(subjectid, testx, testy, predicty, proby, gp)

    # Summarise prediction by reported percept
    meanprediction = {'mean' + percept:
                      proby[testy == value, 1].mean()
                      for percept, value in perceptindices.items()}
    predictiondev = {'stdev' + percept:
                     proby[testy == value, 1].std()
                     for percept, value in perceptindices.items()}
    predictionaccuracy = {'acc' + percept:
                          (predicty[testy == value] ==
                           testy[testy == value]).mean()
                          for percept, value in perceptindices.items()}
    # Summarise prediction by percentage in percept
    predictioncourse = {'timecourse' + percept + str(cutoff):
                        proby[(testy == value) &
                              (percentages < cutoff) &
                              (percentages > cutoff - 0.1), 1].mean()
                        for percept, value in perceptindices.items()
                        for cutoff in np.linspace(0.1, 1, 10)}

    # Summarise mixed percept time courses by the next percept
    nextcourse = {'nextcourse' + percept + str(cutoff):
                  proby[(testy == 0) &
                        (percentages < cutoff) &
                        (percentages > cutoff - 0.1) &
                        (nextpercept == perceptindices[percept]), 1].mean()
                  for percept in ['highfreq', 'lowfreq']
                  for cutoff in np.linspace(0.1, 1, 10)}

    afterdominant = {'after' + percept + "_" + after + "_" + str(cutoff):
                     proby[(testy == perceptindices[percept]) &
                           (percentages < cutoff) &
                           (percentages > cutoff - 0.1) &
                           (nextpercept == perceptindices[after]), 1].mean()
                     for percept, after in [('highfreq', 'mixed'),
                                            ('highfreq', 'lowfreq'),
                                            ('lowfreq', 'mixed'),
                                            ('lowfreq', 'highfreq')]
                     for cutoff in np.linspace(0.1, 1, 10)}

    # Only return the summarised data
    return meanprediction, predictiondev, predictionaccuracy, \
        predictioncourse, nextcourse, afterdominant
Example #31
def test_predict_consistent(kernel):
    # Check binary predict decision has also predicted probability above 0.5.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
Example #32

xx, yy = np.meshgrid(np.linspace(-3, 3, 50),
                     np.linspace(-3, 3, 50))
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# fit the model
plt.figure(figsize=(10, 5))
kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
for i, kernel in enumerate(kernels):
    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)

    # plot the decision function for each datapoint on the grid
    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
    Z = Z.reshape(xx.shape)

    plt.subplot(1, 2, i + 1)
    image = plt.imshow(Z, interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
    # draw the decision boundary at p = 0.5
    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2,
                           linestyles='--')
    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    plt.xticks(())
    plt.yticks(())
    plt.axis([-3, 3, -3, 3])
    plt.colorbar(image)
    plt.title("%s\n Log-Marginal-Likelihood:%.3f"
              % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)))
Example #33
logpmf1 = predict_copula_classification(copula_classification_obj, x_plot_grid)
pmf1 = jnp.exp(logpmf1)
jnp.save('plot_files/ccopula_moon_pmf', pmf1)

#Predictive Resample
B = 1000
T = 5000
logpmf_ytest_samp, logpmf_yn_samp, y_samp, x_samp, pdiff = predictive_resample_classification(
    copula_classification_obj, y, x, x_plot_grid, B, T)

jnp.save('plot_files/ccopula_moon_logpmf_ytest_pr', logpmf_ytest_samp)
jnp.save('plot_files/ccopula_moon_logpmf_yn_pr', logpmf_yn_samp)

#Convergence
T = 10000  #T = 10000, seed = 50 for i = 30
seed = 200
_, _, _, _, pdiff = predictive_resample_classification(
    copula_classification_obj, y, x, x_test[0:1], 1, T, seed=seed)
jnp.save('plot_files/ccopula_moon_pdiff', pdiff)

#Gaussian Process
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
from sklearn.gaussian_process import GaussianProcessClassifier

kernel = ConstantKernel() * RBF() + WhiteKernel()
gp = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=10).fit(
    x, y.reshape(-1, ))
p_pred = gp.predict_proba(
    np.array([x_meshgrid[0].ravel(), x_meshgrid[1].ravel()]).transpose())
jnp.save('plot_files/gp_moon_pred', p_pred)
train_set, test_set = train_test_split(parts_labeled, random_state=42)
# get X and Y values
X_train, X_test = [s[['corr_scaled','mass_scaled']].values for s in (train_set, test_set)]
y_train, y_test = [s['manual_label'].values for s in (train_set, test_set)]

#clf_scaler_path = '../output/pipeline/GPClassification/GPCclfRBF.p'
#with open(clf_scaler_path, 'rb') as f:
#    clf = pickle.load(f)
#    scaler = pickle.load(f)

# train a gaussian process classifier with RBF kernel (Default)
clf = GaussianProcessClassifier(1.0 * RBF(1.0),  random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
plot2dDecisionFunc(clf, X_train, y_train, save=save_dir+'prob_surfaceGPC.pdf')
clf.score(X_test, y_test)
labels_pred = clf.predict_proba(X_test)[:,1]

# compute f1 score: harmonic mean between precision and recall
# see https://en.wikipedia.org/wiki/F1_score
prob_f1 = pd.DataFrame()
prob_f1['prob_thresh'] = np.linspace(0.1, 1, 90, endpoint=False)
f1score = np.array([metrics.precision_recall_fscore_support(y_test, labels_pred > thresh)[2]
                    for thresh in prob_f1['prob_thresh']])
prob_f1['f1score_False']= f1score[:,0]
prob_f1['f1score_True']= f1score[:,1]
prob_f1.to_csv(save_dir+'prob_f1score.csv', index=False)

fig, ax = plt.subplots()
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_False, color='r')
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_True, color='b')
ax.set(ylabel='F1 score', xlabel='Prob. threshold')
Example #35
    # training loop over the number of epochs
    batch_size = 5
    batches = int(len(X_) / batch_size)

    for epoch in range(training_epochs):
        losses = 0
        dkl_losses = 0
        accs = 0
        for j in range(batches):
            idx = np.random.randint(X_.shape[0], size=batch_size)
            X_b = X_[idx]
            Y_b = Y_[idx]

            # get the GPC predictions... and slice only the positive class probabilities
            Y_g = gpc.predict_proba(X_b)[:, 1].reshape((-1, 1))

            # train the network, note the dictionary of inputs and labels
            sess.run(train_step, feed_dict={x: X_b, y: Y_b, y_g: Y_g})
            # feedforwad the same data and labels, but grab the accuracy and loss as outputs
            acc, l, soft_max_a, l_2 = sess.run([accuracy, loss, a, loss_2],
                                               feed_dict={
                                                   x: X_b,
                                                   y: Y_b,
                                                   y_g: Y_g
                                               })

            losses = losses + np.sum(l)
            accs = accs + np.sum(acc)
            dkl_losses = dkl_losses + np.sum(l_2)
        print("Epoch %.8d " % epoch, "avg train loss over", batches,
Example #36
class Population(object):
    """Popultation object containing every function and
    data structures  to run for the pipeline"""

    types = {
        'eas': 'east asian',
        'nfe': 'non-finnish european',
        'sas': 'south asian',
        'afr': 'african',
        'amr': 'mixed american',
        'nan': 'unknown',
        'fin': 'finnish'
    }

    dataset = {
        'labels_miniproj.txt':
        'https://www.dropbox.com/s/dmgchsjklm1jvgk/\
    acb_ii_mini_project_labels.txt?dl=1',
        'data_miniproj.vcf.bgz':
        'https://www.dropbox.com/s/iq8c81awi31067c/\
    acb_ii_mini_project.vcf.bgz?dl=1'
    }

    maxallelefreq = None
    callrate = None
    outfile = None
    labeled = None
    nbcomp = None
    valofinterest = None
    clf = None
    train_pha = None
    test_pha = None
    train_gt = None
    test_gt = None
    train_red = None
    pred_red = None
    rec = None
    classifier = None
    tofind = None
    found = None

    def __init__(self):
        super(Population, self).__init__()
        os.system('mkdir data')

    def __getitem__(self, key):
        """
        you can get an item as with a dictionary
        """
        print self.types[str(
            list(self.all.loc[self.all['sample_id'] == key]['ancestry'])[0])]

    def __len__(self):
        return (self.labeled.shape[0],
                self.tofind.shape[0]) if self.labeled is not None else 0

    def __iter__(self, labeled=True):
        """
        you can get iterate as with a dictionary
        """
        return iter(list(self.labeled['sample_id']))

    def keys(self):
        """
        you can get the keys as with a dictionary
        """
        return list(self.all['sample_id']).__iter__()

    def values(self):
        """
        you can get the values as with a dictionary
        """
        return list(self.all['ancestry']).__iter__()

# Utilities

    def _get_training_data(self, inp, percentage=0.3):
        X_train, X_test, y_train, y_test = train_test_split(
            inp,
            self.labeled['ancestry'],
            test_size=percentage,
            random_state=0)
        return X_train, X_test, y_train, y_test


# Functions

    def load_dataset(self, filename=None, url=None):
        """
        load the data from Dropbox in case the user doesn't have it already
        you can load your own dataset from anywhere

        Params:
        ------
        filename: str, the file name
        url: str, the url
        """
        if filename is None:
            for key, val in self.dataset.iteritems():
                if not os.path.exists('data/' + key):
                    print "downloading " + key + " with urllib"
                    f = urlopen(val)
                    data = f.read()
                    with open('data/' + key, "wb") as code:
                        code.write(data)
                else:
                    print "file is already there"
        else:
            if not os.path.exists(filename):
                print "downloading " + filename + " with urllib"
                f = urlopen(url)
                data = f.read()
                with open(filename, "wb") as code:
                    code.write(data)
            else:
                print "file is already there"

    def filter_variants(self,
                        name="data/data_miniproj.vcf.bgz",
                        out="out",
                        onlypruning=False,
                        minmissing=30000,
                        maxr2=0.01,
                        callrate=0.8,
                        maxallelefreq=0.01):
        """
        Successful, clean PCA on human genetic data will require
        filtering data to high-quality variants that are linkage disequilibrium (LD)-pruned.
        In general, we like to run PCA on high-callrate, bi-allelic,
        common (allele frequency > 0.01) variants that are pruned to r^2<0.1;

        but you are welcome to run PCA on whichever set of variants you find work best for you.
        min missing r2

        """
        if not onlypruning:
            print "assuming you have mawk, vcftools, cat, cut installed"
            self.maxallelefreq = maxallelefreq
            self.callrate = callrate
            self.outfile = out
            filt = "vcftools --gzvcf '" + name + "' --recode --out data/lowDPbefore"
            filt += " --maf " + str(maxallelefreq)
            filt += ' --min-alleles 2 --max-alleles 2'
            filt += ' --max-missing ' + str(callrate)
            print "applying first filter"
            os.system(filt)
            print "applying second filter"
            os.system(
                'vcftools --vcf "data/lowDPbefore.recode.vcf" --missing-indv --out data/out'
            )
            print "finding if too much missing individuals and recoding the file"
            os.system(
                "mawk '$4 > 30000' data/out.imiss | cut -f1 > data/lowDP.indv")
            os.system(
                "vcftools --vcf 'data/lowDPbefore.recode.vcf' --recode --remove data/lowDP.indv\
             --out data/filtered2")
            print "removing garbage.."
            os.system('rm data/lowDP*')

        vcf_reader = vcf.Reader(open('data/filtered2.recode.vcf', 'r'))
        os.system('mkdir data/chunks')
        print "dividing the input file.."
        for i in vcf_reader.contigs.keys():
            i = str(i)
            if len(i) < 3:
                os.system(
                    "vcftools  --vcf  data/filtered2.recode.vcf  --chr " + i +
                    " --recode --recode-INFO-all --out  data/chunks/VCF_ch" +
                    i)
        print "running the ld prunning in parallel (might still take time (avg is 60mn)"
        for i in vcf_reader.contigs.keys():
            i = str(i)
            if len(i) < 3:
                os.system(
                    "vcftools --vcf data/chunks/VCF_ch" + i +
                    ".recode.vcf --min-r2 0.1 --geno-r2 --out data/chunks/filtVCF_ch"
                    + i + " &")
        start = time.time()
        while (True):
            nbjob = 0
            for p in psutil.process_iter():
                try:
                    if str(p.name()) == 'vcftools':
                        nbjob += 1
                except (psutil.AccessDenied, psutil.ZombieProcess):
                    pass
                except psutil.NoSuchProcess:
                    continue
            if nbjob == 0:
                break
            else:
                print "there is still " + str(nbjob) + " jobs \r",
        end = time.time()
        print "it took " + str(end - start) + " seconds"
        print "concatenating every file"
        os.system('rm data/*.log')
        os.system('cat data/chunks/filtVCF_ch* > data/all_VCF.geno.lg')
        print "now prunning..."
        os.system(
            'vcftools --vcf data/filtered2.recode.vcf --exclude-positions \
            data/all_VCF.geno.lg --recode --out data/' + out)

    def extract_unlabeled(self, filename=None):
        filename = filename if filename is not None else "data/labels_miniproj.txt"
        labels = pd.read_csv(filename, sep='\t')
        indices = labels['ancestry'].isna()
        self.tofind = labels[indices]
        self.labeled = labels[indices == False]
        self.all = pd.concat([self.labeled, self.tofind])

    def load_from_vcf(self,
                      filename=None,
                      printinfo=True,
                      maxval=1000,
                      keep_prev=False):
        """
        reads, from the filtered vcf file, the samples whose names are given in df

        Params:
        ------
        df : dataframe - a dataframe with a sample_id index containing the names
        of the different samples to extract from the file

        filename : str - the name of the file

        printinfo : flag - show the information about the vcf file being read

        Returns:
        -------
        gt: np.array [nbofindividuals,nbofrecords] - 0-1-2 values stating if the genotype has
        the ALTval in 0-1-2 of its chromosomes.

        pha: np.array [nbofindividuals,nbofrecords] - bool stating if this variant is phased or not

        rec: dict[chromvalue:list[POS,REFval,ALTval]] - a dictionary of meta information about the
        records being read

        """
        filename = filename if filename is not None else 'data/' + self.outfile + '.recode.vcf'
        vcf_reader = vcf.Reader(open(filename, 'r'))
        if printinfo:
            print "having " + str(len(vcf_reader.contigs)) + " chromosomes"
            size = 0
            for key, val in vcf_reader.contigs.iteritems():
                size += val.length
            print "meta :"
            print vcf_reader.metadata
            print "genomesize : "
            print size
        label_names = list(self.labeled['sample_id'])
        test_names = list(self.tofind['sample_id'])
        if not keep_prev:
            self.train_gt = np.empty((0, len(label_names)), int)
            self.train_pha = np.empty((0, len(label_names)), bool)
            self.test_gt = np.empty((0, len(test_names)), int)
            self.test_pha = np.empty((0, len(test_names)), bool)
            self.rec = {}
        else:
            self.test_pha = self.test_pha.T
            self.test_gt = self.test_gt.T
            self.train_pha = self.train_pha.T
            self.train_gt = self.train_gt.T
        chrom = -1
        j = 0
        numa = 0
        count = 0
        for record in vcf_reader:
            if keep_prev:
                for key, val in self.rec.iteritems():
                    for key, val2 in val.iteritems():
                        vcf_reader.next()
                        numa += 1
                keep_prev = False
            count = numa + j
            print str(count) + " doing chrom : " + str(
                record.CHROM) + ', at pos : ' + str(record.POS) + "\r",
            if record.CHROM != chrom:
                chrom = record.CHROM
                if record.CHROM not in self.rec:
                    self.rec.update({chrom: {}})
            self.rec[chrom].update(
                {record.ID: [count, record.POS, record.REF, record.ALT]})
            train_gt = np.zeros(len(label_names))
            train_pha = np.zeros(len(label_names))
            for i, name in enumerate(label_names):
                train_gt[i] = record.genotype(name).gt_type if record.genotype(
                    name).gt_type is not None else 0
                train_pha[i] = record.genotype(name).phased if record.genotype(
                    name).phased is not None else 0

            test_gt = np.zeros(len(test_names))
            test_pha = np.zeros(len(test_names))
            for i, name in enumerate(test_names):
                test_gt[i] = record.genotype(name).gt_type if record.genotype(
                    name).gt_type is not None else 0
                test_pha[i] = record.genotype(name).phased if record.genotype(
                    name).phased is not None else 0
            self.train_gt = np.vstack((self.train_gt, train_gt))
            self.train_pha = np.vstack((self.train_pha, train_pha))
            self.test_gt = np.vstack((self.test_gt, test_gt))
            self.test_pha = np.vstack((self.test_pha, test_pha))
            j += 1
            if j > maxval - 1:
                break
        # """
        # we are using numpy, more efficient
        # we order by individuals x records
        self.test_pha = self.test_pha.T
        self.test_gt = self.test_gt.T
        self.train_pha = self.train_pha.T
        self.train_gt = self.train_gt.T
        print ' '  # to jump a line
        print "PHASE nonzero " + str(np.count_nonzero(self.train_pha))
        print "SNPs nonzero " + str(np.count_nonzero(self.train_gt))
        for key, val in self.types.iteritems():
            print "you have " + str(self.labeled.loc[self.labeled['ancestry'] == key].shape[0])\
                + " " + str(val) + " in your labeled set"

    def par_load_from_vcf(self, filename, printinfo=True):
        """
        the parallel version of load_from_vcf; should be way faster

        same inputs but reduced choice for now
        """
        filename = filename if filename is not None else 'data/' + self.outfile + '.recode.vcf'
        vcf_reader = vcf.Reader(open(filename, 'r'))
        print "dividing the input file.."

        files = []
        for i in vcf_reader.contigs.keys():
            i = str(i)
            if len(i) < 3:
                files.append(i)
                os.system(
                    "vcftools  --vcf  " + filename + " --chr " + i +
                    " --recode --recode-INFO-all --out  data/chunks/inpar_ch" +
                    i)
        label_names = list(self.labeled['sample_id'])
        test_names = list(self.tofind['sample_id'])
        self.rec = {}
        self.train_gt = np.empty((0, len(label_names)), int)
        self.train_pha = np.empty((0, len(label_names)), bool)
        self.test_gt = np.empty((0, len(test_names)), int)
        self.test_pha = np.empty((0, len(test_names)), bool)
        if printinfo:
            print "having " + str(len(vcf_reader.contigs)) + " chromosomes"
            size = 0
            for key, val in vcf_reader.contigs.iteritems():
                size += val.length
            print vcf_reader.metadata
            print size
        vals = Parallel(n_jobs=-1)(
            delayed(_inpar)(file, label_names, test_names) for file in files)
        for i, val in enumerate(vals):
            if len(val[1]) != 0:
                # whether or not it is equal to zero, we consider it the same for all the others
                self.train_gt = np.vstack((self.train_gt, convertlist(val[1])))
                self.train_pha = np.vstack(
                    (self.train_pha, convertlist(val[2], type=np.bool)))
                self.test_gt = np.vstack((self.test_gt, convertlist(val[3])))
                self.test_pha = np.vstack(
                    (self.test_pha, convertlist(val[4], type=np.bool)))
            self.rec.update({files[i]: val[0]})
        self.test_pha = self.test_pha.T
        self.test_gt = self.test_gt.T
        self.train_pha = self.train_pha.T
        self.train_gt = self.train_gt.T
        print "PHASE nonzero " + str(np.count_nonzero(self.train_pha))
        print "SNPs nonzero " + str(np.count_nonzero(self.train_gt))
        for key, val in self.types.iteritems():
            print "you have " + str(self.labeled.loc[self.labeled['ancestry'] == key].shape[0])\
                + " " + str(val) + " in your labeled set"
        os.system("rm *.log")
        os.system("rm data/*.log")
        os.system("rm data/chunks/*.log")

    def reduce_features(self,
                        inp=None,
                        topred=None,
                        reducer='pca',
                        n_components=500,
                        val='gt',
                        retrain=True):
        """
        will use a dimensionality reduction algorithm to reduce the number of features of the dataset

        you can pass it your own inputs or use the ones that are stored in the object

        Params:
        ------
        inp: np.array[values,features],the input array you have and want to reduce and will train on
        topred: np.array[values,features], the input array you have and want to reduce and predict
        reducer: str, the reducer algorithm to use (pca,)
        n_components : int, the final number of features in your reduced dataset
        val : str (gt|pha), to see if there is any predictability using phasing..
        retrain: flag, set to false if you have already trained the PCA and don't want to restart
        (especially important if you want to compare two different datasets)

        Outs:
        ----
        nbcomp: saves the number of components
        valofinterest: the value of interest (val)
        train_red, pred_red: and the reduced train and pred arrays
        """
        self.nbcomp = n_components
        self.valofinterest = val
        if topred is None and inp is None:
            topred = self.test_gt if val == 'gt' else self.test_pha
        if inp is None:
            inp = self.train_gt if val == 'gt' else self.train_pha
        toreduce = np.vstack((inp, topred)) if topred is not None else inp
        if reducer == 'pca':
            redu = PCA(n_components=n_components)
        if reducer == 'kpca':
            redu = KernelPCA(n_components=n_components, kernel='linear')
        if reducer == 'spca':
            redu = SparsePCA(n_components=n_components,
                             alpha=1,
                             ridge_alpha=0.01,
                             max_iter=1000,
                             method='lars',
                             n_jobs=-1)
        if reducer == 'lda':
            redu = TruncatedSVD(n_components=n_components,
                                algorithm='randomized',
                                n_iter=5)
        red = redu.fit_transform(toreduce) if retrain else redu.fit(toreduce)
        self.train_red = red[:inp.shape[0]]
        self.pred_red = red[inp.shape[0]:]

    def train_classifier(self,
                         inp=None,
                         labels=None,
                         classifier='knn',
                         train=True,
                         test='CV',
                         scoring='accuracy',
                         percentage=0.3,
                         proba=True,
                         iter=100):
        """
        will use a classification algorithm and train it on the training
        set using the labels and predict its accuracy

        you can pass it your own inputs and labels (be careful to reduce their features beforehand)
        or use the ones that are stored in the object

        Params:
        ------
        inp: np.array[values,features], the input array you will train on
        labels: list of values, the input array you have and want to reduce and predict
        classifier: str, the classification algorithm to use (adaboost **, knn ***, svm ***, gaussian ***** )
        test: str, the test algorithm to use (reg,CV)
        scoring: string, the scoring to use (not all of them work for this type of classification)
        percentage: float, the percentage of your data that should be used for testing
        for the regular testing algorithm
        proba: flag, to say if you want the algorithm to compute the probability of each class
        (only used for the svm)
        iter: int, number of iterations (max_iter_predict) for the Gaussian process classifier

        Returns:
        ------
        score, float, the final score the classifier had

        Outs
        ----

        clf: will save the classifier
        classifier: and its name
        """
        if inp is None:
            inp = self.train_red
        if labels is None:
            labels = self.labeled['ancestry']
        self.classifier = classifier
        if classifier == 'adaboost':
            self.clf = AdaBoostClassifier(n_estimators=int(self.nbcomp * 0.7))
        elif classifier == 'knn':
            self.clf = NearestCentroid()
        elif classifier == 'svm':
            self.clf = SVC(C=1.0,
                           kernel='rbf',
                           degree=3,
                           gamma='auto',
                           coef0=0.0,
                           shrinking=True,
                           probability=proba,
                           tol=0.001,
                           cache_size=400,
                           class_weight=None,
                           verbose=False,
                           max_iter=-1)
        elif classifier == 'gaussian':
            self.clf = GCP(max_iter_predict=iter)
        else:
            print "unknown classifier"
        if test == 'CV':
            scores = cross_val_score(self.clf,
                                     inp,
                                     labels,
                                     scoring=scoring,
                                     cv=3,
                                     n_jobs=1)
            print "cv scores : " + str(scores)
            score = np.mean(scores)
        elif test == 'reg':
            X_train, X_test, y_train, y_test = self._get_training_data(
                inp, percentage=percentage)
            self.clf.fit(X_train, y_train)
            y_pred = self.clf.predict(X_test)
            score = accuracy_score(y_test, y_pred)
        self.clf.fit(inp, labels) if train else None
        print "the total score is of " + str(score)
        return score

    def predict_labels(self, inp=None, minval=0.75):
        """
        give it an input (that has already been passed through the trained PCA)
        and it gives you the labels

        Params:
        ------
        inp: np.array[values,features], the input array you will train on ( optional)

        Returns:
        -------
        found : list, of found values (saved in the class)
        """
        if self.clf is not None:
            if self.classifier == 'svm':
                founde = []
                print("not checking that you are using svm")
                self.found = self.clf.predict_proba(
                    inp) if inp is not None else self.clf.predict_proba(
                        self.pred_red)
                for x, i in enumerate(self.tofind['sample_id']):
                    print('----------------------')
                    print(str(i) + ' has:')
                    y = 0
                    a = ''
                    keya = 'U'
                    for key in self.types.keys():
                        if key != 'nan':
                            if self.found[x][y] > minval:
                                print "####",
                                a = self.found[x][y]
                                keya = key

                            print(str(self.found[x][y]) +
                                  " probability of being " + self.types[key])
                            y += 1
                    founde.append([keya, a])
                self.found = founde
            else:
                self.found = self.clf.predict(
                    inp) if inp is not None else self.clf.predict(
                        self.pred_red)
            return self.found

    def compute_features_nb(self,
                            classifier='knn',
                            reducer='pca',
                            vmin=50,
                            vmax=1000,
                            step=10,
                            k=5):
        """
        computes the number of features that is the best with a simple gready search
        does not count as training

        Params:
        ------
        classifier: string : name of the classifier for which you want to the best number of
        features
        vmin : int minimal value
        vmax :
        step :

        Returns:
        -------
        a plt plot
        scores : np.array the ordered list of best scores
        vals: list the corresponding ordered values

        """
        vals = range(vmin, vmax, step)
        scores = np.zeros(len(vals))
        for i, val in enumerate(vals):
            self.reduce_features(n_components=val, reducer=reducer)
            self.selectfeatures(auto=True, k=k)
            score = self.train_classifier(classifier=classifier, train=False)
            scores[i] = score
        plt.plot(vals, scores)
        ind = np.argsort(scores)
        scores[:] = scores[ind]
        vals = [vals[i] for i in ind]
        return scores, vals

    def savedata(self, name):
        """
        saves the PC values in a gzip file and the labels in a json file

        Params:
        ------
        name: str, name of the files in which to save
        """
        filename1 = "data/save/" + name + ".json"
        filename2 = "data/save/" + name + ".gz"
        print "writing in " + name
        d = {}
        for i, val in enumerate(list(self.tofind['sample_id'])):
            d.update({val: self.found[i]})
        data = json.dumps(d, indent=4, separators=(',', ': '))
        dirname = os.path.dirname(filename1)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(filename1, 'w') as f:
            f.write(data)
        np.savetxt(filename2, np.vstack((self.train_red, self.pred_red)))
        print "it worked !"

    # not working with the SNP IDs of Han et al.
    def selectSNPs(self,
                   names=[[11, 'rs232045'], [11, 'rs12786973'],
                          [11, 'rs7946015'], [11, 'rs4756778'],
                          [11, 'rs7931276'], [11, 'rs4823557'],
                          [11, 'rs10832001'], [5, 'rs35397'],
                          [11, 'rs11604470'], [11, 'rs10831841'],
                          [1, 'rs2296224'], [11, 'rs12286898'],
                          [11, 'rs1869084'], [11, 'rs4491181'],
                          [11, 'rs1604797'], [11, 'rs7931276'],
                          [11, 'rs11826168'], [11, 'rs477036'],
                          [11, 'rs7940199'], [11, 'rs4429025'],
                          [11, 'rs6483747'], [15, 'rs199138']]):
        """
        Will select a subset of snps from the list given (before dim reducing)

        Params:
        ------
        names: list[int chromnb,str ID], list containing the chromosome number and the id
        of the snps
        """

        newtraingt = np.zeros((self.train_gt.shape[0], len(names)))
        newtestgt = np.zeros((self.test_gt.shape[0], len(names)))
        newtestpha = np.zeros((self.test_pha.shape[0], len(names)))
        newtrainpha = np.zeros((self.train_pha.shape[0], len(names)))
        for i, name in enumerate(names):
            val = self.rec[str(name[0])][name[1]][0]
            # selecting the positional value of the SNP in the matrix from
            # the chrom and ID of the snps
            newtraingt.T[:][i] = self.train_gt.T[:][val]
            newtestgt.T[:][i] = self.test_gt.T[:][val]
            newtestpha.T[:][i] = self.test_pha.T[:][val]
            newtrainpha.T[:][i] = self.train_pha.T[:][val]
        self.train_gt = newtraingt
        self.test_gt = newtestgt
        self.test_pha = newtestpha
        self.train_pha = newtrainpha

    def selectfeatures(self,
                       inp=None,
                       out=None,
                       auto=False,
                       features=None,
                       k=7):
        """
        will select a subset of features from the list or automatically according to
        an ANOVA F-value
        """
        if not auto and not isinstance(features, list):
            raise TypeError("need the feature numbers as a list")
        inp = inp if inp is not None else self.train_red
        out = out if out is not None else self.labeled['ancestry']
        if auto:
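            # `selector` below is assumed to be a SelectKBest-style ANOVA
            # selector (e.g. sklearn.feature_selection.SelectKBest with
            # f_classif), presumably imported or aliased elsewhere in the file.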
            sel = selector(k=k)
            self.train_red = sel.fit_transform(inp, out)
            self.pred_red = self.pred_red.T[:][sel.get_support(True)].T
        else:
            self.train_red = self.train_red.T[:][features].T
            self.pred_red = self.pred_red.T[:][features].T

    def plotPC(self, interactive=False, pc=[0, 1], foundplot=True, tsne=False):
        """
        will plot the features that have been extracted by the reducer algorithm
        it has nice features such as a color for each label and an interactive
        plot to zoom and analyze each different individual

        Params:
        -----
        interactive: flag, whether to use bokeh for an interactive plot
        foundplot: flag, whether to add the predicted labels
        pc: list of size 2, the two PCs to analyse
        tsne: flag, use t-SNE or plot only the two selected PCs

        Returns:
        -------
        p: object, the plot object
        """
        colormap = {
            'eas': "#3498db",
            'nfe': "#2ecc71",
            'sas': "#9b59b6",
            'afr': '#34495e',
            'amr': '#f1c40f',
            'nan': '#000000',
            'fin': "#f39c12",
            'U': '#7f8c8d'
        }
        if self.found is not None:
            found = self.found if self.classifier != 'svm' else [
                i[0] for i in self.found
            ]
        else:
            found = list(self.tofind['ancestry'])  # just nans
        colorslab = [colormap[x] for x in list(self.labeled['ancestry'])]
        colorsnot = [colormap[str(x)] for x in found] if foundplot else None
        labels = list(self.labeled['ancestry'])
        if foundplot:
            labels.extend(found)
        if tsne:
            reduced = TSNE(
                n_components=2,
                perplexity=30.0,
                verbose=1,
                learning_rate=200.0,
                n_iter=1000).fit_transform(
                    np.vstack((
                        self.train_red,
                        self.pred_red)) if foundplot else self.train_red)
        else:
            tot = np.vstack((self.train_red,
                             self.pred_red)) if foundplot else self.train_red
            reduced = np.empty((tot.shape[0], 2))
            reduced.T[:][0] = tot.T[:][pc[0]]
            reduced.T[:][1] = tot.T[:][pc[1]]
        if foundplot:
            colorslab.extend(colorsnot)
        if interactive:
            names = list(self.labeled['sample_id'])
            if foundplot:
                names.extend(list(self.tofind['sample_id']))
            print("if you are on a notebook you should write "
                  "'from bokeh.io import output_notebook'")
            source = ColumnDataSource(
                data=dict(x=reduced[:, 0],
                          y=reduced[:, 1],
                          label=[
                              names[i] + "origin :" + self.types[x]
                              for i, x in enumerate(labels)
                          ],
                          color=colorslab))
            output_notebook()
            hover = HoverTool(tooltips=[
                ("label", "@label"),
            ])
            p = figure(title="T-sne plot of the PC values",
                       tools=[
                           hover,
                           BoxZoomTool(),
                           WheelZoomTool(),
                           SaveTool(),
                           ResetTool()
                       ])
            p.circle(x='x', y='y', source=source, color='color')

            show(p)
            output_file(self.classifier + "plot.html")
            save(p)
            return p
        else:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.scatter(reduced[:, 0], reduced[:, 1], c=colorslab)
            plt.show()
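# A hedged, standalone sketch of the feature-selection + scoring workflow the
# class above implements (assuming its `selector` is a SelectKBest-style ANOVA
# selector); the data and names (X_demo, y_demo, clf_demo) are illustrative
# only and not part of the original code:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=200, n_features=50,
                                     n_informative=5, random_state=0)
# keep the 7 features with the highest ANOVA F-value
X_sel = SelectKBest(f_classif, k=7).fit_transform(X_demo, y_demo)
clf_demo = GaussianProcessClassifier(max_iter_predict=100)
print("cv scores:", cross_val_score(clf_demo, X_sel, y_demo,
                                    scoring='accuracy', cv=3))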
y = np.array(g(X) > 0, dtype=int)

# Instantiate and fit Gaussian process classification model
kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2
gp = GaussianProcessClassifier(kernel=kernel)
gp.fit(X, y)
print("Learned kernel: %s " % gp.kernel_)

# Evaluate real function and the predicted probability
res = 50
x1, x2 = np.meshgrid(np.linspace(- lim, lim, res),
                     np.linspace(- lim, lim, res))
xx = np.vstack([x1.reshape(x1.size), x2.reshape(x2.size)]).T

y_true = g(xx)
y_prob = gp.predict_proba(xx)[:, 1]
y_true = y_true.reshape((res, res))
y_prob = y_prob.reshape((res, res))

# Plot the probabilistic classification iso-values
fig = plt.figure(1)
ax = fig.gca()
ax.axes.set_aspect('equal')
plt.xticks([])
plt.yticks([])
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8,
                 extent=(-lim, lim, -lim, lim))

gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f" % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f" % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print(
    "Accuracy: %.3f (initial) %.3f (optimized)"
    % (
        accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
        accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])),
    )
)
print(
    "Log-loss: %.3f (initial) %.3f (optimized)"
    % (
        log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
        log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]),
    )
)


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c="k", label="Train data")
plt.scatter(X[train_size:, 0], y[train_size:], c="g", label="Test data")
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], "r", label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], "b", label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
plt.ylabel("Class 1 probability")
plt.xlim(0, 5)
Beispiel #39
0
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))


# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
def plot(df, options):

    UNIQ_GROUPS = df.group.unique()
    UNIQ_GROUPS.sort()

    sns.set_style("white")
    grppal = sns.color_palette("Set2", len(UNIQ_GROUPS))

    print('# UNIQ GROUPS', UNIQ_GROUPS)

    cent_stats = df.groupby(
        ['position', 'group', 'side']).apply(stats_per_group)
    cent_stats.reset_index(inplace=True)

    import time
    from sklearn import preprocessing
    from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
    from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ExpSineSquared, ConstantKernel, RBF


    ctlDF = cent_stats[ cent_stats['group'] == 0 ]

    TNRightDF = cent_stats[ cent_stats['group'] != 0]
    TNRightDF = TNRightDF[TNRightDF['side'] == 'right']

    dataDf = pd.concat([ctlDF, TNRightDF], ignore_index=True)
    print(dataDf)

    yDf = dataDf['group'] == 0
    yDf = yDf.astype(int)
    y = yDf.values
    print(y)
    print(y.shape)

    XDf = dataDf[['position', 'values']]
    X = XDf.values
    X = preprocessing.scale(X)
    print(X)
    print(X.shape)
    

    # kernel = ConstantKernel() + Matern(length_scale=mean, nu=3 / 2) + \
    # WhiteKernel(noise_level=1e-10)
    
    kernel = 1**2 * Matern(length_scale=1, nu=1.5) + \
        WhiteKernel(noise_level=0.1)

    figure = plt.figure(figsize=(10, 6))


    stime = time.time()
    gp = GaussianProcessClassifier(kernel)
    gp.fit(X, y)

    print(gp.kernel_)
    print(gp.log_marginal_likelihood())

    print("Time for GPC fitting: %.3f" % (time.time() - stime))


    # create a mesh to plot in
    h = 0.1
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                        np.arange(y_min, y_max, h))

    plt.figure(figsize=(10, 5))
    
    # Plot the predicted probabilities. For that, we will assign a color to
    # each point in the mesh [x_min, x_max]x[y_min, y_max].

    Z = gp.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:,1]
    print(Z)
    print(Z.shape)
    # Put the result into a color plot
    Z = Z.reshape((xx.shape[0], xx.shape[1]))
    print(Z.shape)
    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g"])[y])
    plt.xlabel('position')
    plt.ylabel('normalized val')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title("%s, LML: %.3f" %
            ("TN vs. Control", gp.log_marginal_likelihood(gp.kernel_.theta)))

    plt.tight_layout()


    if options.title:
        plt.suptitle(options.title)

    if options.output:
        plt.savefig(options.output, dpi=150)

    if options.is_show:
        plt.show()
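# A small, self-contained sketch of the core of plot() above (Matern + white
# noise kernel, GPC fit, class-1 probability surface over a mesh); the toy
# data and names (X_toy, y_toy, gpc_toy) are illustrative only:
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern, WhiteKernel

rng = np.random.RandomState(0)
X_toy = rng.randn(60, 2)
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 0).astype(int)

kernel_toy = 1.0 ** 2 * Matern(length_scale=1.0, nu=1.5) + WhiteKernel(noise_level=0.1)
gpc_toy = GaussianProcessClassifier(kernel_toy).fit(X_toy, y_toy)
print(gpc_toy.kernel_)
print(gpc_toy.log_marginal_likelihood(gpc_toy.kernel_.theta))

xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
proba = gpc_toy.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
print(proba.shape)  # (50, 50) grid of class-1 probabilities, ready for plt.imshow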