Ejemplo n.º 1
0
def run():
    """Fit an iONMF model, report training accuracy and plot the matrices.

    The model is fitted on everything returned by ``load_data()``. Class
    membership is then predicted from the two expression matrices only
    and compared with the labels stored under ``Class_0`` .. ``Class_2``.
    Finally each expression matrix is shown next to the product
    ``model.coef_.dot(model.basis_[key])``.

    The function prints the accuracy and opens a matplotlib window; it
    returns nothing.
    """
    class_keys = ("Class_0", "Class_1", "Class_2")
    expression_keys = ("Pos_diff_expr", "Neg_diff_expr")

    datadict = load_data()
    model = iONMF(rank=5, max_iter=100, alpha=0.0)

    # Fit all training data
    model.fit(datadict)

    # Make predictions about class on all training data
    # using only expression data ...
    testdict = dict()
    for key in expression_keys:
        testdict[key] = datadict[key]
    rdict = model.predict(testdict)

    # ... and calculate training error.
    # The true label of a sample is the index of the class whose indicator
    # entry is non-zero.
    true_y = np.zeros((len(datadict[class_keys[0]]), 1))
    for label, key in enumerate(class_keys):
        true_y[np.where(datadict[key])] = label

    # The predicted label is the class with the largest predicted value.
    predictions = np.array([
        np.argmax([rdict[key][i] for key in class_keys])
        for i in range(len(true_y))
    ])

    acc = np.sum(predictions == true_y.ravel()) / float(len(true_y))
    # Single pre-formatted argument: prints the same text under the
    # Python 2 print statement and the Python 3 print function. The two
    # spaces reproduce the original output (trailing space + separator).
    print("Training accuracy:  %s" % acc)

    # Plot matrices: original on the left, model approximation on the right
    n_rows = len(testdict)
    plt.figure(figsize=(12, 12))
    for row, key in enumerate(testdict):
        plt.subplot(n_rows, 2, 2 * row + 1)
        plt.title(key)
        plt.imshow(datadict[key])
        plt.subplot(n_rows, 2, 2 * row + 2)
        plt.title(key + " (approx.)")
        plt.imshow(model.coef_.dot(model.basis_[key]))
    plt.show()
Ejemplo n.º 2
0
def run():
    """Train iONMF on the loaded data, print training accuracy, plot results.

    Steps, in order:

    1. Load the data dictionary with ``load_data()`` and fit the model on it.
    2. Predict the ``Class_*`` entries from the expression matrices
       (``Pos_diff_expr`` and ``Neg_diff_expr``) alone.
    3. Compare the arg-max prediction with the stored class indicators and
       print the fraction of matching samples.
    4. Show every expression matrix beside ``coef_ . basis_[key]``.

    Nothing is returned; the function prints and opens a plot window.
    """
    datadict = load_data()
    model = iONMF(rank=5, max_iter=100, alpha=0.0)

    # Fit all training data
    model.fit(datadict)

    # Make predictions about class on all training data
    # using only expression data ...
    testdict = dict()
    testdict["Pos_diff_expr"] = datadict["Pos_diff_expr"]
    testdict["Neg_diff_expr"] = datadict["Neg_diff_expr"]
    rdict = model.predict(testdict)

    # ... and calculate training error
    classes = ["Class_0", "Class_1", "Class_2"]
    n_samples = len(datadict["Class_0"])
    true_y = np.zeros((n_samples, 1))
    for class_index, class_key in enumerate(classes):
        # Samples flagged (non-zero) in this indicator get this class index.
        true_y[np.where(datadict[class_key])] = class_index

    predicted = []
    for i in range(len(true_y)):
        scores = [rdict[class_key][i] for class_key in classes]
        predicted.append(np.argmax(scores))
    predictions = np.array(predicted)

    n_correct = np.sum(predictions == true_y.ravel())
    acc = n_correct / float(len(true_y))
    # Formatting into one string keeps the output identical whether print
    # is the Python 2 statement or the Python 3 function; the double space
    # matches what the original statement emitted.
    print("Training accuracy:  %s" % acc)

    # Plot matrices
    plt.figure(figsize=(12, 12))
    total_rows = len(testdict)
    for ki, ky in enumerate(testdict):
        left_cell = 2 * ki + 1
        right_cell = left_cell + 1

        plt.subplot(total_rows, 2, left_cell)
        plt.title(ky)
        plt.imshow(datadict[ky])

        plt.subplot(total_rows, 2, right_cell)
        plt.title(ky + " (approx.)")
        plt.imshow(model.coef_.dot(model.basis_[ky]))
    plt.show()
Ejemplo n.º 3
0
def run():
    """Fit iONMF on one CLIP protein sample, report test AUC and plot modules.

    The protein folder name is taken from ``sys.argv[1]`` (an ``IndexError``
    is raised if no argument was given). Training data and column labels
    are read from ``training_sample_0``, the model is fitted, and the ``Y``
    entry of ``test_sample_0`` is predicted from the remaining test
    matrices and scored with ``roc_auc_score``.

    One row of three panels is drawn per module ``k``:

    * column 0 -- row ``k`` of ``basis_["X_RG"]``, one line per distinct
      column label in ``training_labels["X_RG"]``;
    * column 1 -- row ``k`` of ``basis_["X_RNA"]``;
    * column 2 -- mean of column ``k`` of ``coef_`` over negative (x=0) and
      positive (x=1) training rows, with the standard error of that mean
      as an upward error bar.

    Nothing is returned; the function prints the AUC and opens a plot.
    """
    # Select example protein folder from the dataset
    protein = sys.argv[1]

    # Load training data and column labels
    training_path = "../datasets/clip/%s/5000/training_sample_0" % protein
    training_data = load_data(training_path, go=False, kmer=False)
    training_labels = load_labels(training_path, go=False, kmer=False)
    model = iONMF(rank=5, max_iter=100, alpha=10.0)

    # Fit all training data
    model.fit(training_data)

    # Make predictions about class on the test data;
    # delete class from dictionary so that it has to be predicted
    test_data = load_data("../datasets/clip/%s/5000/test_sample_0" % protein,
                          go=False, kmer=False)
    true_y = test_data["Y"].copy()
    del test_data["Y"]
    results = model.predict(test_data)

    # Evaluate prediction on holdout test set
    predictions = results["Y"]
    auc = roc_auc_score(true_y, predictions)
    # One pre-formatted argument prints identically under Python 2 and 3;
    # the double space reproduces the original statement's output.
    print("Test AUC:  %s" % auc)

    # Draw low-dimensional components for Region types (H_RG)
    # and RNA structure (H_RNA)
    # with mean values in coefficient matrix W for positive (+) and
    # negative (-) positions
    _fig, axes = plt.subplots(model.rank, 3, sharex='col',
                              figsize=(15, 8))
    H_RNA = model.basis_["X_RNA"]
    H_RG = model.basis_["X_RG"]

    # Column indices of every region type. They do not depend on the
    # module, so compute them once. A list comprehension is used instead
    # of map() because np.where needs a real sequence, not an iterator.
    region_labels = training_labels["X_RG"]
    labelset = sorted(set(region_labels))
    label_columns = [
        (label, np.where([entry == label for entry in region_labels])[0])
        for label in labelset
    ]

    positives = training_data["Y"].nonzero()[0]
    negatives = (training_data["Y"] == 0).nonzero()[0]

    def _mean_and_sem(rows, k):
        """Return mean and standard error of coef_[:, k] over ``rows``."""
        column = model.coef_[rows, :][:, k]
        return column.mean(), column.std() / np.sqrt(len(rows))

    for k in range(model.rank):

        # Values in the coefficient (W) matrix
        w_positives, e_positives = _mean_and_sem(positives, k)
        w_negatives, e_negatives = _mean_and_sem(negatives, k)

        # Each bar carries the error of its OWN group (the original code
        # had the two error terms swapped). yerr is [lower, upper]: only
        # an upward whisker is drawn.
        axes[k, 2].bar([0], [w_negatives], yerr=[(0,), (e_negatives,)],
                       color="blue", align="center")
        axes[k, 2].bar([1], [w_positives], yerr=[(0,), (e_positives,)],
                       color="green", align="center")

        # Plot RNA structure
        axes[k, 1].plot(H_RNA[k, :].ravel(),)

        # Plot region types
        for label, indices in label_columns:
            axes[k, 0].plot(H_RG[k, indices].ravel(), label=label)
        axes[k, 0].set_ylabel("Module %d" % k)

    # Titles and legend go on the top row, tick labels on the bottom row.
    j = model.rank - 1
    axes[0, 0].legend(bbox_to_anchor=(0., 1.04, 1., .102), loc=3,
                      ncol=3, mode="expand", borderaxespad=0.)
    axes[0, 1].set_title("Double-stranded RNA")
    axes[0, 2].set_title("Mean values in the coefficient matrix (W)")
    for col in (0, 1):
        axes[j, col].set_xticks(np.linspace(0, H_RNA.shape[1], 5))
        axes[j, col].set_xticklabels([-50, -25, 0, 25, 50])
        axes[j, col].set_xlabel("Position relative to cross-link site")
    axes[j, 2].set_xticks([0, 1])
    axes[j, 2].set_xticklabels(["-", "+"])

    plt.show()