import subprocess

import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn.utils import shuffle

# Helpers such as linearRegression, retrieve_crime_count, generate_corina_features,
# generate_transition_SocialLag and generate_geographical_SpatialLag_ca are
# defined elsewhere in this project.


def permutation_Test_LR(Y, f):
    """Leave-one-out linear regression on feature matrix f.

    Returns the mean absolute error (MAE) and the mean relative error
    (MRE = MAE / mean(Y)).
    """
    Y = Y.reshape((len(Y),))
    loo = cross_validation.LeaveOneOut(len(Y))
    errors = []
    for train_idx, test_idx in loo:
        f_train, f_test = f[train_idx], f[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        r = linearRegression(f_train, Y_train)
        y = r.predict(f_test)
        errors.append(np.abs(Y_test - y))
    mae = np.mean(errors)
    mre = mae / Y.mean()
    return mae, mre
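# A minimal usage sketch (an assumption about the intended use, not part of the
# original code): permute one feature column and compare the leave-one-out error
# against the unpermuted baseline, so the error increase reflects how much that
# feature contributes. The helper name below is hypothetical.
def _example_permutation_importance(Y, f, col=0):
    mae_base, _ = permutation_Test_LR(Y, f)
    f_perm = f.copy()
    np.random.shuffle(f_perm[:, col])  # permute one feature column in place
    mae_perm, _ = permutation_Test_LR(Y, f_perm)
    return mae_perm - mae_base  # positive value: the column carried signal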
def LR_training_python(lrf, Y, verboseoutput):
    """Leave-one-out linear regression on feature matrix lrf.

    Returns the mean absolute error, the standard deviation of the absolute
    errors, and the mean relative error (MAE / mean(Y)).
    """
    Y = Y.reshape((len(Y),))
    loo = cross_validation.LeaveOneOut(len(Y))
    errors2 = []
    for train_idx, test_idx in loo:
        f_train, f_test = lrf[train_idx], lrf[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        # Skip folds whose training features contain NaN or infinite values.
        if not np.any(np.isnan(f_train)) and np.all(np.isfinite(f_train)):
            r2 = linearRegression(f_train, Y_train)
            y2 = r2.predict(f_test)
            errors2.append(np.abs(Y_test - y2))
            if verboseoutput:
                print Y_test[0], y2[0]
        else:
            print 'nan or infinite'
    mae2 = np.mean(errors2)
    var2 = np.sqrt(np.var(errors2))  # standard deviation of the absolute errors
    mre2 = mae2 / Y.mean()
    return mae2, var2, mre2
def tenFoldCV_onChicagoCrimeData(features=['corina'], CVmethod='10Fold', P=10,
                                 NUM_ITER=20, SHUFFLE=True):
    """
    Use data from different years (2003-2013) to cross-validate the NB
    (negative binomial) regression model against a linear regression baseline.
    """
    YEARS = range(2003, 2014)
    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        popul = c[1][:, 0].reshape((77, 1))
        # The crime count is normalized by the total population into a crime
        # rate, i.e. the crime count per 10 thousand residents.
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000
        W2 = generate_geographical_SpatialLag_ca()
        f1 = np.dot(W, Yhat)   # social lag
        f2 = np.dot(W2, Yhat)  # spatial lag
        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])

    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']

    if SHUFFLE:
        f, Y = shuffle(f, Y)

    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)

    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0
    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []
    for train_idx, test_idx in splt:
        cnt += 1
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]

        # Write files for invoking NB regression in R.
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv", sep=",", index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv", sep=",", index=False)

        # NB regression
        nbres = subprocess.check_output(['Rscript', 'nbr_eval_kfold.R']).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)
        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()

        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()

        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)

    if CVmethod == 'leaveOneOut':
        print np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
        print np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2)
        return y_gnd, y_lr
    else:
        print np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1), np.mean(mre1), np.mean(sd_mre1), np.median(med_mre1),
        print np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2), np.mean(mre2), np.mean(sd_mre2), np.median(med_mre2)
        return mae1, mae2
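# A usage sketch (assumed, not from the original code): compare NB regression
# and linear regression with all four feature groups under 10-fold CV. This
# requires nbr_eval_kfold.R and the data helpers referenced above to be present.
if __name__ == '__main__':
    mae_nb, mae_lr = tenFoldCV_onChicagoCrimeData(
        features=['corina', 'sociallag', 'spatiallag', 'temporallag'],
        CVmethod='10Fold', NUM_ITER=10, SHUFFLE=True)
    print 'NB fold MAEs:', mae_nb
    print 'LR fold MAEs:', mae_lr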