def test_sample_weight():
    """Check that sample weights steer the fit and are honoured by ``score``.

    Half of the rows (on average) are placed in a down-weighted group whose
    target follows ``|x - 5|`` instead of ``|x|``; unit-variance Gaussian
    noise is added to every target. The fitted model must score better on
    the heavily weighted rows, and the weighted score must agree with the
    R^2 recorded in the pruning trace for the selected model.
    """
    n_samples = 1000
    in_group = numpy.random.binomial(1, .5, size=n_samples) == 1
    out_group = numpy.logical_not(in_group)

    # Rows in the group get weight 1/101, all other rows get weight 1.
    weights = 1 / (in_group * 100 + 1.0)

    x = numpy.random.uniform(-10, 10, size=n_samples)
    y = numpy.abs(x)
    y[in_group] = numpy.abs(x[in_group] - 5)
    y += numpy.random.normal(0, 1, size=n_samples)

    model = Earth().fit(x[:, numpy.newaxis], y, sample_weight=weights)

    # Check that the model fits better for the more heavily weighted group
    light_score = model.score(x[in_group], y[in_group])
    heavy_score = model.score(x[out_group], y[out_group])
    assert_true(light_score < heavy_score)

    # Make sure that the score function gives the same answer as the trace
    trace = model.pruning_trace()
    rsq_trace = trace.rsq(trace.get_selected())
    weighted_score = model.score(x, y, sample_weight=weights)
    assert_almost_equal(weighted_score, rsq_trace)
def test_missing_data():
    """Regression-test ``Earth`` scoring on data with missing entries.

    Roughly 5% of the cells of ``X`` (defined outside this function) are
    blanked out, a model is fitted with ``allow_missing=True``, and its score
    on the same data must be within 0.03 of the value stored in
    ``earth_regress_missing_data.txt`` next to this file.
    """
    model = Earth(allow_missing=True, **default_params)

    # Each cell is independently marked missing with probability 0.05.
    mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_missing = X.copy()
    X_missing[mask] = None

    model.fit(X_missing, y)
    observed = str(model.score(X_missing, y))

    here = os.path.dirname(__file__)
    target_path = os.path.join(here, 'earth_regress_missing_data.txt')
    # To regenerate the reference value, temporarily write ``observed`` to
    # ``target_path`` before the read below.
    with open(target_path, 'r') as target:
        expected = target.read()

    gap = abs(float(observed) - float(expected))
    assert_true(gap < .03)
def test_missing_data():
    """Regression-test ``Earth`` scoring on data with missing entries.

    The random generator is seeded so the missingness pattern is repeatable.
    Roughly 5% of the cells of ``X`` (defined outside this function) are
    blanked out, a model is fitted with ``allow_missing=True``, and its score
    on the same data must be within 0.03 of the value stored in
    ``earth_regress_missing_data.txt`` next to this file. When
    ``regenerate_target_files`` is truthy the stored value is rewritten first.
    On failure both values are printed before the assertion is re-raised.
    """
    numpy.random.seed(0)
    model = Earth(allow_missing=True, **default_params)

    # Each cell is independently marked missing with probability 0.05.
    mask = numpy.random.binomial(1, .05, X.shape).astype(bool)
    X_missing = X.copy()
    X_missing[mask] = None

    model.fit(X_missing, y)
    observed = str(model.score(X_missing, y))

    here = os.path.dirname(__file__)
    target_path = os.path.join(here, 'earth_regress_missing_data.txt')
    if regenerate_target_files:
        with open(target_path, 'w') as target:
            target.write(observed)
    with open(target_path, 'r') as target:
        expected = target.read()

    try:
        gap = abs(float(observed) - float(expected))
        assert_true(gap < .03)
    except AssertionError:
        # Show the two scores so a drift is easy to diagnose from the log.
        print('Got %f, %f' % (float(observed), float(expected)))
        raise
def csc(df, hamming_string_dict, outdir, filename):
    """CRISPR Specificity Correction.

    Joins every gRNA's metric with its Hamming metrics, fits an Earth (MARS)
    model of the metric on those confounders using a 90/10 train/test split,
    prints and writes the model diagnostics, and, when the hold-out RMSE is
    at most 1.0, writes the corrected values and the pickled model.

    Files written to ``outdir``:

    * ``csc_model_metrics_<filename>.txt`` -- whenever the model was fitted
    * ``csc_output_<filename>_earth_patched.csv`` -- only when RMSE <= 1.0
    * ``csc_output_<filename>_earth_model.pl`` -- only when RMSE <= 1.0

    :param df: pandas dataframe with first column as gRNA and second column
        as logFC/metric
    :param hamming_string_dict: CSC onboard dictionary object with key as
        gRNA and value as Hamming metrics (five values per gRNA, in the order
        specificity, h0, h1, h2, h3)
    :param outdir: absolute filepath to output directory
    :param filename: name of input file to be used as part of output filename
    :return: ``'T'`` when the adjustment was computed and written, ``'F'``
        when the model could not be trained or its RMSE exceeds 1.0
    """
    # MARS compatible rows: gRNA, Hamming metrics, original metric
    df_mars_lst = []
    for row in np.asarray(df):
        grna, metric = row[0], row[1]
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            # TypeError covers values such as None, which are just as
            # "not float compatible" as unparsable strings.
            sys.stdout.write(
                'WARNING: encountered %s which is not float compatible, skipping\n'
                % metric)
            continue
        try:
            hamming_metrics = hamming_string_dict[grna]
        except KeyError:
            sys.stdout.write(
                '\n%s not found in selected library: passing\n' % grna)
            continue
        df_mars_lst.append([grna] + list(hamming_metrics) + [metric])

    df = pd.DataFrame(df_mars_lst, columns=[
        'gRNA', 'specificity', 'h0', 'h1', 'h2', 'h3', 'original_value'
    ])

    # exclude infinite specificity non-target gRNAs; copy so that the columns
    # added further down do not trigger pandas' SettingWithCopyWarning
    df = df[df['h0'] != 0].copy()

    # isolate pertinent confounder variables
    df_confounders = df[['specificity', 'h0', 'h1', 'h2', 'h3']]

    # training and testing data
    train_x, test_x, train_y, test_y = train_test_split(
        df_confounders, df['original_value'], test_size=0.10, random_state=1)

    # Fit an Earth model
    model = Earth(feature_importance_type='gcv')
    try:
        model.fit(train_x, train_y)
    except ValueError:
        sys.stdout.write(
            '\nValue Error encountered. Model unable to be trained. '
            'Exiting CSC Novo\n')
        model_processed = 'F'
        sys.stdout.write(
            'training input x data\n %s\ntraining input y data\n %s\n'
            % (train_x, train_y))
        return model_processed

    # Print the model (diagnostics are reused for the metrics file below)
    trace = model.trace()
    summary = model.summary()
    feature_importances = model.summary_feature_importances()
    print(trace)
    print(summary)
    print(feature_importances)

    # Predict on the hold-out set
    y_hat = model.predict(test_x)

    # calculating RMSE values
    rms1 = sqrt(mean_squared_error(test_y, y_hat))
    print('\n\nRMSE on Predictions\n\n')
    print(rms1)

    # calculating R^2 for training
    print('\n\nR^2 on Training Data\n\n')
    print(model.score(train_x, train_y))

    # calculating R^2 for testing
    print('\n\nR^2 on Testing Data\n\n')
    print(model.score(test_x, test_y))

    # write out model metrics
    with open('%s/csc_model_metrics_%s.txt' % (outdir, filename),
              'w') as outfile:
        outfile.write('%s\n%s\n%s\nRMSE on Predictions\n%s'
                      % (trace, summary, feature_importances, rms1))

    if rms1 <= 1.0:
        # model processed
        model_processed = 'T'

        # full data prediction
        df['earth_adjustment'] = model.predict(df_confounders)

        # CSC correction
        df['earth_corrected'] = df['original_value'] - df['earth_adjustment']

        # main write out
        df.to_csv('%s/csc_output_%s_earth_patched.csv' % (outdir, filename))

        # pickle write out; the context manager closes the file even if
        # dumping the model fails
        with open('%s/csc_output_%s_earth_model.pl' % (outdir, filename),
                  'wb') as model_file:
            pl.dump(model, model_file)

        sys.stdout.write('\nCSC adjustment complete\n')
        sys.stdout.write('\nCSC output files written to %s\n' % outdir)
        return model_processed
    else:
        # Also reached when rms1 is NaN, because the comparison above is
        # then False.
        sys.stdout.write(
            '\nCSC adjustment not computed as model residual mean squared error exceeds 1.0\n'
        )
        model_processed = 'F'
        return model_processed
# Visualise the distribution of the training target.
ax = sns.distplot(y_train)
plt.show()

# MARS solution.
# A 2nd-degree model is necessary to capture interactions between features;
# the penalty and the span/alpha values keep the model simple.
model = Earth(
    max_degree=2,
    penalty=1.0,
    minspan_alpha=0.01,
    endspan_alpha=0.01,
    endspan=5,
)
model.fit(X_train, y_train)

# Report the fit on the training data instead of discarding the result.
train_score = model.score(X_train, y_train)
print(train_score)

# Predict on the test features and undo the log transform of the target.
# NOTE(review): np.exp assumes the target was transformed with np.log;
# if np.log1p was used upstream, np.expm1 is the matching inverse -- confirm.
y_pred = model.predict(X_test)
y_pred = np.exp(y_pred)

print(model)
print(model.summary())
print(y_pred)

# Write the submission file: one predicted SalePrice per test Id.
pd.DataFrame({
    'Id': test_ID,
    "SalePrice": y_pred
}).to_csv('Predictions1.csv', index=False)