Example #1
0
def test_sample_weight():
    """Check that ``sample_weight`` steers the fit and matches the pruning trace.

    Two groups share one predictor but follow different targets (``|x|``
    versus ``|x - 5|``).  Members of ``group`` get weight 1/101, everyone
    else gets weight 1.
    """
    n_samples = 1000
    group = numpy.random.binomial(1, .5, size=n_samples) == 1
    others = numpy.logical_not(group)
    sample_weight = 1 / (group * 100 + 1.0)

    x = numpy.random.uniform(-10, 10, size=n_samples)
    y = numpy.where(group, numpy.abs(x - 5), numpy.abs(x))
    y += numpy.random.normal(0, 1, size=n_samples)

    model = Earth().fit(x[:, numpy.newaxis], y, sample_weight=sample_weight)

    # Check that the model fits better for the more heavily weighted group
    score_light = model.score(x[group], y[group])
    score_heavy = model.score(x[others], y[others])
    assert_true(score_light < score_heavy)

    # Make sure that the score function gives the same answer as the trace
    trace = model.pruning_trace()
    selected = model.pruning_trace().get_selected()
    rsq_from_trace = trace.rsq(selected)
    weighted_score = model.score(x, y, sample_weight=sample_weight)
    assert_almost_equal(weighted_score, rsq_from_trace)
Example #2
0
def test_sample_weight():
    """Verify weighted fitting favours the heavy group and agrees with the trace.

    Samples where ``group`` is true follow ``|x - 5|`` and carry weight
    1/101; the remaining samples follow ``|x|`` and carry weight 1.
    """
    size = 1000
    group = numpy.random.binomial(1, .5, size=size) == 1
    not_group = numpy.logical_not(group)
    sample_weight = 1 / (group * 100 + 1.0)

    x = numpy.random.uniform(-10, 10, size=size)
    y = numpy.where(group, numpy.abs(x - 5), numpy.abs(x))
    noise = numpy.random.normal(0, 1, size=size)
    y += noise

    column = x[:, numpy.newaxis]
    model = Earth().fit(column, y, sample_weight=sample_weight)

    # Check that the model fits better for the more heavily weighted group
    low_weight_score = model.score(x[group], y[group])
    high_weight_score = model.score(x[not_group], y[not_group])
    assert_true(low_weight_score < high_weight_score)

    # Make sure that the score function gives the same answer as the trace
    pruning = model.pruning_trace()
    chosen = model.pruning_trace().get_selected()
    expected_rsq = pruning.rsq(chosen)
    assert_almost_equal(
        model.score(x, y, sample_weight=sample_weight), expected_rsq)
Example #3
0
def test_missing_data():
    """Fit with randomly blanked entries and compare the score to a stored value."""
    model = Earth(allow_missing=True, **default_params)

    # Blank each entry independently with probability 0.05.
    missing_mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_missing = X.copy()
    X_missing[missing_mask] = None

    model.fit(X_missing, y)
    score_text = str(model.score(X_missing, y))

    # The reference score is stored next to this test module.  To refresh
    # it, temporarily write ``score_text`` to ``target_path``.
    target_path = os.path.join(os.path.dirname(__file__),
                               'earth_regress_missing_data.txt')
    with open(target_path, 'r') as target_file:
        stored_text = target_file.read()

    difference = float(score_text) - float(stored_text)
    assert_true(abs(difference) < .03)
Example #4
0
def test_missing_data():
    """Regression test: score on data with missing entries stays near the stored one."""
    estimator = Earth(allow_missing=True, **default_params)

    # Each cell is marked missing independently with probability 0.05.
    mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_with_gaps = X.copy()
    X_with_gaps[mask] = None

    estimator.fit(X_with_gaps, y)
    current = str(estimator.score(X_with_gaps, y))

    # Stored reference value; to regenerate it, temporarily write
    # ``current`` to ``reference_path``.
    here = os.path.dirname(__file__)
    reference_path = os.path.join(here, 'earth_regress_missing_data.txt')
    with open(reference_path, 'r') as reference_file:
        previous = reference_file.read()

    gap = abs(float(current) - float(previous))
    assert_true(gap < .03)
Example #5
0
def test_missing_data():
    """Seeded regression test for fitting with missing entries.

    The score is compared (tolerance 0.03) with the value stored in
    ``earth_regress_missing_data.txt``.  When ``regenerate_target_files``
    is truthy, that file is rewritten first.
    """
    numpy.random.seed(0)
    model = Earth(allow_missing=True, **default_params)

    # Blank each entry independently with probability 0.05.
    missing_mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_missing = X.copy()
    X_missing[missing_mask] = None

    model.fit(X_missing, y)
    score_text = str(model.score(X_missing, y))

    target_path = os.path.join(os.path.dirname(__file__),
                               'earth_regress_missing_data.txt')
    if regenerate_target_files:
        with open(target_path, 'w') as target_file:
            target_file.write(score_text)
    with open(target_path, 'r') as target_file:
        stored_text = target_file.read()

    got, expected = float(score_text), float(stored_text)
    try:
        assert_true(abs(got - expected) < .03)
    except AssertionError:
        # Show both numbers before letting the failure propagate.
        print('Got %f, %f' % (got, expected))
        raise
Example #6
0
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction

    Fits an Earth (MARS) model that predicts the input metric from each
    gRNA's specificity/Hamming features.  When the hold-out RMSE is at most
    1.0, the model prediction is subtracted from the metric and the result
    is written to ``outdir`` together with the pickled model.

    :param df: pandas dataframe with first column as gRNA and second column as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as gRNA and value as
        Hamming metrics; each value must iterate as five entries
        (specificity, h0, h1, h2, h3) to line up with the columns built below
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: 'T' when the model was trained and the corrected output written,
        'F' when training failed or the hold-out RMSE exceeded 1.0

    """
    # MARS compatible rows: gRNA, Hamming metrics..., metric
    df_mars_lst = []
    for record in np.asarray(df):
        grna, metric = record[0], record[1]
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            # TypeError covers values such as None that float() rejects
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        # keep the try body down to the one lookup that can raise KeyError
        try:
            hamming_metrics = hamming_string_dict[grna]
        except KeyError:
            sys.stdout.write('\n%s not found in selected library: passing\n' %
                             grna)
            continue
        df_mars_lst.append([grna] + list(hamming_metrics) + [metric])

    df = pd.DataFrame(df_mars_lst,
                      columns=[
                          'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3',
                          'original_value'
                      ])

    # exclude infinite specificity non-target gRNAs; copy so that the columns
    # added further down are written to an independent frame rather than to
    # a slice of the unfiltered one (avoids pandas' SettingWithCopy hazard)
    df = df[df['h0'] != 0].copy()

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(df_confounders,
                                                        df['original_value'],
                                                        test_size=0.10,
                                                        random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. Exiting CSC Novo\n'
        )
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n' %
            (train_x, train_y))
        return 'F'

    # Print the model
    print(model.trace())
    print(model.summary())
    print(model.summary_feature_importances())

    # Predict on the held-out data
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s' %
                      (model.trace(), model.summary(),
                       model.summary_feature_importances(), rms1))

    # `<=` (rather than a negated `>`) keeps a NaN RMSE on the failure path
    if rms1 <= 1.0:

        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out; the context manager closes the file even if
        # dumping the model raises
        with open('%s/csc_output_%s_earth_model.pl' % (outdir, filename),
                  'wb') as model_file:
            pl.dump(model, model_file)

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return 'T'

    else:
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        return 'F'
# Visualise the distribution of the training target.
# NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases;
# consider ``sns.histplot(y_train, kde=True)`` when upgrading — confirm the
# installed seaborn version first.
ax = sns.distplot(y_train)
plt.show()

# MARS solution.  A 2nd degree formula is necessary to see interactions;
# the penalty and alpha values keep the model simple.
model = Earth(
    max_degree=2,
    penalty=1.0,
    minspan_alpha=0.01,
    endspan_alpha=0.01,
    endspan=5,
)

model.fit(X_train, y_train)
# Report the score on the training data instead of discarding it.
print(model.score(X_train, y_train))

y_pred = model.predict(X_test)
# Inverse log transform the results.
# NOTE(review): assumes the target was transformed with np.log upstream; if
# np.log1p was used, np.expm1 is the matching inverse — confirm.
y_pred = np.exp(y_pred)

print(model)
print(model.summary())
print(y_pred)

pd.DataFrame({
    'Id': test_ID,
    "SalePrice": y_pred
}).to_csv('Predictions1.csv', index=False)