import fnmatch
import logging
import os
import pickle
import time

import numpy as np
import pandas as pd
from pyearth import Earth
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Project-local names used below (PR, Config, metrics_for_model, etc.) come
# from this repo; their exact module paths are not shown in this excerpt.
logger = logging.getLogger(__name__)


def execution_time(train_df):
    """Time how long the partition-regression model (PR) takes to fit."""
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)

    # Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    lsnr = PR(mars_, vigil_x=0.01)
    start = time.time()
    lsnr.fit(X_train, y_train)
    return (time.time() - start, lsnr.get_number_of_l1(), lsnr.get_number_of_l2())
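# `PR` is this project's partition-regression model and is not defined in this
# file. The stand-in below is only a guess at the interface these benchmarks
# rely on (fit, get_model, get_number_of_l1/l2) -- NOT the real implementation.
# It lets the scripts run in isolation by serving one global model everywhere;
# alias it with `PR = _StubPR` only when the project package is unavailable.
class _StubPR:
    """Placeholder PR: fits a single global model and serves it for every query."""

    def __init__(self, base_model, vigil_x=None, vigil_theta=None):
        # Vigilance hyper-parameters are accepted but ignored by this stub.
        self.base_model = base_model

    def fit(self, X, y):
        self.base_model.fit(X, y)
        return self

    def get_model(self, q):
        # The real PR would route query q to its local model.
        return self.base_model

    def get_number_of_l1(self):
        return 1  # the real PR reports its level-1 partition count

    def get_number_of_l2(self):
        return 1  # the real PR reports its level-2 partition count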
def prediction_serving_time(train_df):
    """Measure prediction latency as query dimensionality doubles."""
    data = {
        'dimensionality': [],
        'prediction_serving_time': [],
        'l1': [],
        'l2': []
    }
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    for i in range(5):
        # Double the feature dimensionality on every iteration
        X_train = np.column_stack((X_train, X_train))
        sc = StandardScaler()
        sc.fit(X_train)
        X_train = sc.transform(X_train)
        mars_ = Earth(feature_importance_type='gcv')
        lsnr = PR(mars_, vigil_x=1.5, vigil_theta=1.5)
        lsnr.fit(X_train, y_train)
        for j in range(5):
            q = X_train[j, :].reshape(1, -1)
            start = time.time()
            m = lsnr.get_model(q).predict(q)
            end = time.time() - start
            data['dimensionality'].append(q.shape[1])
            data['prediction_serving_time'].append(end)
            data['l1'].append(lsnr.get_number_of_l1())
            data['l2'].append(lsnr.get_number_of_l2())
    return data
def explanation_serving_t(train_df):
    """Measure explanation (local-model retrieval) latency across vigil_theta levels."""
    data = {'vigil_t': [], 'explanation_serving_time': [], 'l1': [], 'l2': []}
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)

    # Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    vigilance_t = np.linspace(0.01, 3, Config.vigilance_t_frequency)
    for sens_t in vigilance_t:
        logger.info("Sensitivity Level {}".format(sens_t))
        lsnr = PR(mars_, vigil_theta=sens_t)
        lsnr.fit(X_train, y_train)
        for i in range(5):
            q = train_df.iloc[i].values[:4].reshape(1, -1)
            q = sc.transform(q)
            start = time.time()
            m = lsnr.get_model(q)
            end = time.time() - start
            data['vigil_t'].append(sens_t)
            data['explanation_serving_time'].append(end)
            data['l1'].append(lsnr.get_number_of_l1())
            data['l2'].append(lsnr.get_number_of_l2())
    return data
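# The three benchmarks above all consume the same train_df layout. A minimal
# driver sketch follows; the CSV path is hypothetical, and the column names
# (x, y, x_range, y_range, count) are taken from how train_df is indexed in
# this file rather than from a documented entry point.
def run_timing_benchmarks(train_path='input/Crimes_Workload/train_workload.csv'):
    train_df = pd.read_csv(train_path, index_col=0)
    fit_seconds, l1, l2 = execution_time(train_df)
    logger.info("fit time: {:.3f}s, l1={}, l2={}".format(fit_seconds, l1, l2))
    # Persist the per-query latency measurements for later plotting
    pd.DataFrame(prediction_serving_time(train_df)).to_csv('output/prediction_serving_time.csv')
    pd.DataFrame(explanation_serving_t(train_df)).to_csv('output/explanation_serving_time.csv')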
aggregates = ['count', 'sum_', 'avg']
agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
for agg in aggregates:
    logger.info("Evaluating Aggregates : {0}".format(agg))
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df[agg].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)

    # Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    vigilance_t = np.linspace(0.01, 3, Config.vigilance_t_frequency)
    for sens_t in vigilance_t:
        lsnr = PR(mars_, vigil_theta=sens_t)
        lsnr.fit(X_train, y_train)
        logger.info(
            "Accuracy Evaluation on Test set with vigil_t={0}\n====================="
            .format(sens_t))
        for i in range(1000):
            # Obtain query from test set
            dataset = p
            printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)
            q = test_df.iloc[i].values[:4].reshape(1, -1)
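# printProgressBar is called above but not defined in this excerpt. One widely
# circulated terminal progress-bar recipe with a matching signature -- an
# assumption, not necessarily this repo's version -- is:
def printProgressBar(iteration, total, prefix='', suffix='', length=50, fill='#'):
    """Render an in-place text progress bar on stdout."""
    percent = "{0:.1f}".format(100 * (iteration / float(total)))
    filled = int(length * iteration // total)
    bar = fill * filled + '-' * (length - filled)
    print('\r{} |{}| {}% {}'.format(prefix, bar, percent, suffix), end='\r')
    if iteration == total:
        print()  # newline once the bar completes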
def accuracy_on_crimes():
    logger.info("Finding datasets...")
    directory = os.fsencode('input/Crimes_Workload')
    directory_sub = os.fsencode('input/Subqueries/')
    patterns = {
        'gauss-gauss': '*x-gauss*-length-gauss*',
        'gauss-uni': '*x-gauss*-length-uniform*',
        'uni-gauss': '*x-uniform*-length-gauss*',
        'uni-uni': '*x-uniform*-length-uniform*',
    }
    train_datasets = {}
    test_datasets = {}
    sub_datasets = {}
    for p in patterns:
        # Expect exactly one train and one test file per workload pattern
        res = [os.fsdecode(n) for n in os.listdir(directory)
               if fnmatch.fnmatch(os.fsdecode(n), patterns[p])]
        train_datasets[p] = res[0] if res[0].startswith('train') else res[1]
        test_datasets[p] = res[0] if res[0].startswith('test') else res[1]
        sub_datasets[p] = [os.fsdecode(n) for n in os.listdir(directory_sub)
                           if fnmatch.fnmatch(os.fsdecode(n), patterns[p])][0]
    res_eval = {'model': [], 'dataset': [], 'aggregate_name': [],
                'kl': [], 'r2': [], 'md': [], 'nrmse': []}

    # Main
    for p in patterns:
        logger.info('Beginning Evaluation for {0}'.format(p))
        logger.info('Loading Datasets...')
        test_df = pd.read_csv(
            '/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(test_datasets[p]),
            index_col=0)
        train_df = pd.read_csv(
            '/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]),
            index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))
        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count', 'sum_', 'avg']
        agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)

            # Training Models
            logger.info("Model Training Initiation\n=====================")
            lr = Ridge()
            lsnr = PR(lr)
            lsnr.fit(X_train, y_train)
            lr_global = LinearRegression()
            lr_global.fit(X_train, y_train)
            logger.info("Accuracy Evaluation on Test set\n=====================")
            for i in range(1000):
                # Obtain query from the test set
                dataset = p
                printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)
                q = test_df.iloc[i].values[:4].reshape(1, -1)
                q = sc.transform(q)
                # Obtain subquery perturbations for query q from the test set
                q1 = sub[i]
                X = q1[:, :4]
                y = q1[:, agg_map[agg]]
                X = sc.transform(X)
                # Train local model (should be the best of the three)
                lr = LinearRegression()
                lr.fit(X, y)
                y_hat = lr.predict(X)
                metrics_for_model('local', dataset, agg, y_hat, X, y, lr, res_eval)
                # Obtain metrics for our model
                y_hat_s = lsnr.get_model(q).predict(X)
                metrics_for_model('ours', dataset, agg, y_hat_s, X, y, lsnr.get_model(q), res_eval)
                # Obtain metrics for the global model
                y_hat_g = lr_global.predict(X)
                metrics_for_model('global', dataset, agg, y_hat_g, X, y, lr_global, res_eval)
        logger.info("Finished Queries")
    eval_df = pd.DataFrame(res_eval)
    eval_df.to_csv('output/Accuracy/evaluation_results_linear.csv')
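# metrics_for_model is a project helper not shown in this section. Judging only
# from the res_eval columns it fills (model, dataset, aggregate_name, kl, r2,
# md, nrmse), a sketch could look like the following; reading 'md' as maximum
# deviation is a guess:
def metrics_for_model(name, dataset, agg, y_hat, X, y, model, res_eval):
    res_eval['model'].append(name)
    res_eval['dataset'].append(dataset)
    res_eval['aggregate_name'].append(agg)
    res_eval['kl'].append(kl_divergence_error(y, y_hat))
    res_eval['r2'].append(metrics.r2_score(y, y_hat))
    res_eval['md'].append(float(np.max(np.abs(y - y_hat))))  # assumed: max deviation
    res_eval['nrmse'].append(np.sqrt(metrics.mean_squared_error(y, y_hat)) / np.mean(y))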
def accuracy_on_higgs():
    logger.info("Starting Accuracy Tests on Higgs")
    logger.info("================================")
    # Drop rows with missing values before splitting features and labels so
    # X and y stay aligned
    df = pd.read_csv('input/sample_higgs_0.01.csv', index_col=0)
    df = df[['m_bb', 'm_wwbb', 'label']].dropna()
    X = df[['m_bb', 'm_wwbb']].values
    y = df['label'].values
    # Min-max normalise the features to [0, 1]
    min_ = np.min(X, axis=0)
    max_ = np.max(X, axis=0)
    X = (X - min_) / (max_ - min_)
    data = np.column_stack((X, y))

    # Lay a 7x7 grid of cluster centers over the unit square
    x = np.linspace(0.1, 0.9, 7)
    xx, yy = np.meshgrid(x, x)
    DIMS = X.shape[1]
    cov = np.identity(DIMS) * 0.001
    cluster_centers = np.column_stack((xx.ravel(), yy.ravel()))
    query_centers = []
    # Generate queries over cluster centers
    for c in cluster_centers:
        queries = np.random.multivariate_normal(np.array(c), cov, size=40)
        query_centers.append(queries)
    query_centers = np.array(query_centers).reshape(-1, DIMS)
    ranges = np.random.uniform(low=0.005**(1 / 3), high=0.25**(1 / 3),
                               size=(query_centers.shape[0], DIMS))
    queries = []
    empty = 0  # count of range queries that select no rows
    for q, r in zip(query_centers, ranges):
        b = generate_boolean_vector(data, q, r, 2)
        res = data[b]
        if res.shape[0] == 0:
            empty += 1
        ans = float(np.mean(res[:, -1])) if res.shape[0] != 0 else 0
        qt = q.tolist()
        qt += r.tolist()
        qt.append(ans)
        queries.append(qt)
    qs = np.array(queries).reshape(-1, 2 * DIMS + 1)
    X_train, X_test, y_train, y_test = train_test_split(
        qs[:, :qs.shape[1] - 1], qs[:, -1], test_size=0.4, random_state=0)

    earth = Earth()
    lsnr = PR(earth)
    lsnr.fit(X_train, y_train)
    y_hat = np.array([float(lsnr.get_model(x.reshape(1, -1)).predict(x.reshape(1, -1)))
                      for x in X_test])
    r2 = metrics.r2_score(y_test, y_hat)
    kl = kl_divergence_error(y_test, y_hat)
    nrmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat)) / np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2, nrmse, kl))

    # Linear regression comparison
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat_lr = lr.predict(X_test)
    r2_lr = metrics.r2_score(y_test, y_hat_lr)
    kl_lr = kl_divergence_error(y_test, y_hat_lr)
    nrmse_lr = np.sqrt(metrics.mean_squared_error(y_test, y_hat_lr)) / np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2_lr, nrmse_lr, kl_lr))
    dic = {}
    dic['LPM'] = [('r2', r2), ('kl', kl), ('nrmse', nrmse)]
    dic['LR'] = [('r2', r2_lr), ('kl', kl_lr), ('nrmse', nrmse_lr)]

    # Polynomial regression comparison
    for degree in np.arange(3, 10, 2):
        model = make_pipeline(PolynomialFeatures(degree), Ridge())
        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        r2_p = metrics.r2_score(y_test, y_hat)
        kl_p = kl_divergence_error(y_test, y_hat)
        nrmse_p = np.sqrt(metrics.mean_squared_error(y_test, y_hat)) / np.mean(y_test)
        dic["LR ({})".format(degree)] = [('r2', r2_p), ('kl', kl_p), ('nrmse', nrmse_p)]
        print("R2 for degree {} : {}".format(degree, r2_p))
    logger.info("==============================================")
    with open('output/Accuracy/multiple_methods_higgs.pkl', 'wb') as handle:
        pickle.dump(dic, handle)
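# generate_boolean_vector and kl_divergence_error are project helpers not shown
# here. Plausible sketches, reconstructed only from their call sites above --
# assumptions, not the repo's definitions -- are:
def generate_boolean_vector(data, q, r, dims):
    """Mask of rows whose first `dims` columns lie within q +/- r per axis.
    Treating r as a half-width is a guess; it may be the full side length."""
    b = np.ones(data.shape[0], dtype=bool)
    for d in range(dims):
        b &= (data[:, d] >= q[d] - r[d]) & (data[:, d] <= q[d] + r[d])
    return b


def kl_divergence_error(y_true, y_pred, eps=1e-9):
    """KL divergence between the true and predicted responses, each normalised
    to a probability vector; the normalisation scheme here is assumed."""
    p = np.abs(y_true) + eps
    q = np.abs(y_pred) + eps
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(p * np.log(p / q)))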
logger.info('Finished loading\nCommencing Evaluation')
aggregates = ['count', 'sum_', 'avg']
agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
for agg in aggregates:
    logger.info("Evaluating Aggregates : {0}".format(agg))
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df[agg].values
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)

    # Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    vigilance_x = np.linspace(0.01, 3, Config.vigilance_x_frequency)
    for sens_x in vigilance_x:
        lsnr = PR(mars_, vigil_x=sens_x)
        lsnr.fit(X_train, y_train)
        logger.info(
            "Accuracy Evaluation on Test set with vigil_x={0}\n====================="
            .format(sens_x))
        for i in range(1000):
            # Obtain query from the test set
            dataset = p
            printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)
            q = test_df.iloc[i].values[:4].reshape(1, -1)
            q = sc.transform(q)
            # Obtain subquery perturbations for query q from the test set
            q1 = sub[i]
            X = q1[:, :4]