Example #1
import numpy as np

import logistic  # project-local helper module used throughout these examples


def test_end_to_end_logistic_regression():
    # Two well-separated 2-D clusters: positives labeled 1, negatives 0.
    pos, neg = logistic.generate_well_separable(100, 0.50)

    #graph_pos_neg(pos, neg)

    X = logistic.vstack([pos, neg])
    y = logistic.hstack([np.array([1] * len(pos)),
                         np.array([0] * len(neg))])
    data = logistic.generate_random_points(100, 
                                           center=np.array([2,2]), 
                                           scale=np.array([5,5]))

    #theta = logistic.logistic_gradient_descent(X, y)

    thetaC = logistic.fast_logistic_gradient_descent(X, y)
    theta = thetaC
    #assert np.allclose(theta, thetaC)

    labels = logistic.label_data(data, theta, binarize=True)
    assert len([l for l in labels if l == 0]) > 10
    assert len([l for l in labels if l == 1]) > 10
    labels = logistic.label_data(data, thetaC, binarize=True)
    assert len([l for l in labels if l == 0]) > 10
    assert len([l for l in labels if l == 1]) > 10

    small_data = np.array([[-1, -1], [11, 11]])
    labels2 = logistic.label_data(small_data, theta, binarize=True)
    assert np.allclose([0, 1], labels2)
    assert not np.allclose([1, 1], labels2)
    labels2 = logistic.label_data(small_data, thetaC, binarize=True)
    assert np.allclose([0, 1], labels2)
    assert not np.allclose([1, 1], labels2)
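The assertions above only hold if logistic.label_data thresholds the logistic score at 0.5. A minimal sketch of that behavior, assuming theta is a plain weight vector applied to the raw coordinates (no intercept term; the project's actual helper may differ):

import numpy as np

def label_data_sketch(points, theta, binarize=True):
    # Logistic score P(y=1 | x) = sigmoid(x . theta), thresholded at 0.5.
    p = 1.0 / (1.0 + np.exp(-points @ theta))
    return (p > 0.5).astype(int) if binarize else p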
Example #2
import sklearn.decomposition
import sklearn.preprocessing


def normalize_pu_nonnegative_data(pos_sample, unlabeled, v_p, v_u):
    """Same as above, but works for non-negative data
    (NMF in place of PCA, since NMF requires non-negative input).
    """
    d = logistic.vstack([pos_sample, unlabeled])

    # decorrelator = sklearn.decomposition.PCA(whiten=False)
    decorrelator = sklearn.decomposition.NMF()
    #decorrelator.fit(d)

    # Scaler is the long-removed name; StandardScaler is the current one.
    transformer = sklearn.preprocessing.StandardScaler()
    transformer.fit(d)

    #fixer = lambda a: transformer.transform(decorrelator.transform(a))
    fixer = lambda a: transformer.transform(a)

    return ((fixer(pos_sample), fixer(unlabeled), fixer(v_p), fixer(v_u)),
            (decorrelator, transformer, fixer))
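The returned fixer is a closure over the fitted scaler, so the normalization learned on the training stack can be replayed on validation or test batches. A self-contained sketch of the active code path using plain scikit-learn (the helper name and toy arrays are illustrative, not from the project):

import numpy as np
from sklearn.preprocessing import StandardScaler

def make_scaler_fixer(pos_sample, unlabeled):
    # Fit a scaler on the stacked training data and return a closure
    # that applies the identical transform to any new array.
    d = np.vstack([pos_sample, unlabeled])
    scaler = StandardScaler().fit(d)
    return lambda a: scaler.transform(a)

fixer = make_scaler_fixer(np.array([[1.0, 2.0], [2.0, 3.0]]),
                          np.array([[0.5, 1.0], [3.0, 0.0]]))
print(fixer(np.array([[1.5, 1.5]])))  # scaled with the fitted mean/std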
Example #3
    numpy.save(os.path.join(folder, 'data.neg.swissprot.npy'), neg.todense())
    numpy.save(os.path.join(folder, 'data.test_pos.swissprot.npy'),
               test_pos.todense())

    print('read data...')

    table = []
    for cp in [1.0, 0.5, 0.1, 0.7, 0.6, 0.4, 0.3, 0.2, 0.9, 0.8]:
        # split out the validation set separately
        split = lambda a: logistic.sample_split(a, int(0.8 * a.shape[0]))
        half_pos, v_pos = split(pos)
        half_neg, v_neg = split(neg)
        half_test_pos, v_test_pos = split(test_pos)

        # figure out the subset to sample (c)
        u = logistic.vstack([half_neg, half_test_pos])
        pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

        # create validation set the same way
        u = logistic.vstack([v_neg, v_test_pos])
        v_p, v_u = logistic.sample_positive(cp, v_pos, u)

        print('set up data...')

        data = (pos_sample, unlabeled, v_p, v_u)
        #data, fixers = normalize_pu_nonnegative_data(*data)
        print('not-normalized...')
        #print('normalized...')
        _, estimators = logistic.calculate_estimators(*data, max_iter=100)

        t = (
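The key step in this example is the positive-unlabeled (PU) construction: a fraction cp of the positives is kept as the labeled sample, and the remainder is hidden inside the unlabeled pool. A minimal sketch of what logistic.sample_positive presumably does, with the signature inferred from the call sites above (an assumption, not the project's code):

import numpy as np

def sample_positive_sketch(c, pos, unlabeled):
    # Assumed semantics: keep a random fraction c of `pos` as labeled
    # positives and fold the remaining positives into the unlabeled pool.
    idx = np.random.permutation(len(pos))
    n_labeled = int(c * len(pos))
    labeled, hidden = pos[idx[:n_labeled]], pos[idx[n_labeled:]]
    return labeled, np.vstack([unlabeled, hidden])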
Example #4
def add_x2_y2(a):
    """Accept an (N, 2) array and append two more columns:
    the first column squared and the second column squared.
    """
    return logistic.vstack([a.T, a[:,0]**2, a[:,1]**2]).T
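This quadratic feature map lets a linear model such as logistic regression fit circular or elliptical decision boundaries. A quick check of the expansion with plain NumPy, assuming logistic.vstack behaves like np.vstack:

import numpy as np

a = np.array([[1.0, 2.0],
              [3.0, 4.0]])
expanded = np.vstack([a.T, a[:, 0]**2, a[:, 1]**2]).T
print(expanded)
# [[ 1.  2.  1.  4.]
#  [ 3.  4.  9. 16.]]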
Example #5
    pos, neg, test_pos = (np.load(os.path.join(folder, "data.%s.swissprot.npy" % d)) for d in npy_filenames)

    print "read data..."

    # set up data

    table = []
    for cp in [1.0, 0.5, 0.1, 0.7, 0.6, 0.4, 0.3, 0.2, 0.9, 0.8]:
        # split out the validation set separately
        split_half = lambda a: logistic.sample_split(a, len(a) // 2)  # integer division for a row count
        half_pos, v_pos = split_half(pos)
        half_neg, v_neg = split_half(neg)
        half_test_pos, v_test_pos = split_half(test_pos)

        # figure out the subset to sample (c)
        u = logistic.vstack([half_neg, half_test_pos])
        pos_sample, unlabeled = logistic.sample_positive(cp, half_pos, u)

        # create validation set the same way
        u = logistic.vstack([v_neg, v_test_pos])
        v_p, v_u = logistic.sample_positive(cp, v_pos, u)

        print "set up data..."

        _, estimators = logistic.calculate_estimators(pos_sample, unlabeled, v_p, v_u)

        t = (
            cp,
            len(half_pos),
            len(half_neg),
            len(half_test_pos),
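Both variants lean on logistic.sample_split to carve out a validation set. A plausible minimal version, assuming it returns a random subset of the requested size plus the remainder (again inferred from the call sites, not taken from the project):

import numpy as np

def sample_split_sketch(a, n):
    # Assumed semantics: randomly split `a` into n rows and the rest.
    idx = np.random.permutation(len(a))
    return a[idx[:n]], a[idx[n:]]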