def prediction_serving_time(train_df):
    """Measure end-to-end prediction-serving latency as dimensionality grows.

    Starting from the 4 base query features (x, y, x_range, y_range), the
    feature matrix is doubled (column-stacked with itself) 5 times.  At each
    dimensionality a fresh scaler and PR model are fitted, then the latency of
    `get_model(q).predict(q)` is timed for the first 5 training queries.

    Args:
        train_df: DataFrame with columns 'x', 'y', 'x_range', 'y_range'
            (query features) and 'count' (target).

    Returns:
        dict of parallel lists: 'dimensionality', 'prediction_serving_time'
        (seconds per query), 'l1' and 'l2' (PR model counts per level).
    """
    data = {
        'dimensionality': [],
        'prediction_serving_time': [],
        'l1': [],
        'l2': []
    }
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    for _ in range(5):
        # Double the dimensionality by duplicating the current columns.
        X_train = np.column_stack((X_train, X_train))
        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        mars_ = Earth(feature_importance_type='gcv')
        lsnr = PR(mars_, vigil_x=1.5, vigil_theta=1.5)
        lsnr.fit(X_train, y_train)
        for j in range(5):
            q = X_train[j, :].reshape(1, -1)
            # perf_counter is monotonic and higher-resolution than
            # time.time() for measuring short durations.
            start = time.perf_counter()
            lsnr.get_model(q).predict(q)
            elapsed = time.perf_counter() - start
            data['dimensionality'].append(q.shape[1])
            data['prediction_serving_time'].append(elapsed)
            data['l1'].append(lsnr.get_number_of_l1())
            data['l2'].append(lsnr.get_number_of_l2())

    return data
def explanation_serving_t(train_df):
    """Measure explanation (model-retrieval) latency across theta vigilance.

    Fits one PR model per theta-sensitivity value in a linspace over
    [0.01, 3] and times `lsnr.get_model(q)` for the first 5 training
    queries at each sensitivity level.

    Args:
        train_df: DataFrame with columns 'x', 'y', 'x_range', 'y_range'
            (query features, assumed to be the first 4 columns) and 'count'.

    Returns:
        dict of parallel lists: 'vigil_t', 'explanation_serving_time'
        (seconds per lookup), 'l1' and 'l2' (PR model counts per level).
    """
    data = {'vigil_t': [], 'explanation_serving_time': [], 'l1': [], 'l2': []}
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)

    #Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')
    vigilance_t = np.linspace(0.01, 3, Config.vigilance_t_frequency)
    for sens_t in vigilance_t:
        logger.info("Sensitivity Level {}".format(sens_t))
        lsnr = PR(mars_, vigil_theta=sens_t)
        lsnr.fit(X_train, y_train)
        for i in range(5):
            # First 4 columns of the row are the query features.
            q = train_df.iloc[i].values[:4].reshape(1, -1)
            q = sc.transform(q)
            # perf_counter: monotonic, high-resolution duration measurement.
            start = time.perf_counter()
            lsnr.get_model(q)
            elapsed = time.perf_counter() - start
            data['vigil_t'].append(sens_t)
            data['explanation_serving_time'].append(elapsed)
            data['l1'].append(lsnr.get_number_of_l1())
            data['l2'].append(lsnr.get_number_of_l2())
    return data
def execution_time(train_df):
    """Time a single PR fit on the standardized training queries.

    Args:
        train_df: DataFrame with columns 'x', 'y', 'x_range', 'y_range'
            (query features) and 'count' (target).

    Returns:
        Tuple of (fit_seconds, l1_model_count, l2_model_count).
    """
    X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
    y_train = train_df['count'].values
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    #Training Models
    logger.info("Model Training Initiation\n=====================")
    mars_ = Earth(feature_importance_type='gcv')

    lsnr = PR(mars_, vigil_x=0.01)
    # perf_counter: monotonic clock, appropriate for elapsed-time measurement.
    start = time.perf_counter()
    lsnr.fit(X_train, y_train)
    return (time.perf_counter() - start, lsnr.get_number_of_l1(),
            lsnr.get_number_of_l2())
        aggregates = ['count', 'sum_', 'avg']
        agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            kmeans = KMeans()
            mars_ = Earth(feature_importance_type='gcv', )
            vigilance_t = np.linspace(0.01, 3, Config.vigilance_t_frequency)
            for sens_t in vigilance_t:
                lsnr = PR(mars_, vigil_theta=sens_t)
                lsnr.fit(X_train, y_train)

                logger.info(
                    "Accuracy Evaluation on Test set with vigil_t={0}\n====================="
                    .format(sens_t))
                for i in range(1000):
                    #Obtain query from test-set
                    dataset = p
                    printProgressBar(i,
                                     1000,
                                     prefix='Progress:',
                                     suffix='Complete',
                                     length=50)

                    q = test_df.iloc[i].values[:4].reshape(1, -1)
# Exemple #5
# 0
def accuracy_on_crimes():
    """Evaluate local, PR ('ours') and global linear models on Crimes workloads.

    For each of the four workload patterns (gaussian/uniform query centre x
    gaussian/uniform range length) the matching train/test/subquery files are
    located by filename pattern; then, for each aggregate (count, sum_, avg):

    * a PR model with a Ridge base learner and a global LinearRegression are
      fitted on the standardized training queries;
    * for each of 1000 test queries, a per-query local LinearRegression is
      fitted on that query's subquery perturbations and all three models are
      scored on those perturbations via `metrics_for_model`.

    Results accumulate into a dict of lists and are written to
    'output/Accuracy/evaluation_results_linear.csv'.
    """
    logger.info("Finding datasets...")
    directory = os.fsencode('input/Crimes_Workload')
    directory_sub = os.fsencode('input/Subqueries/')
    patterns = {'gauss-gauss': '*x-gauss*-length-gauss*',
                'gauss-uni': '*x-gauss*-length-uniform*',
                'uni-gauss': '*x-uniform*-length-gauss*',
                'uni-uni': '*x-uniform*-length-uniform*'}
    train_datasets = {}
    test_datasets = {}
    sub_datasets = {}

    for p in patterns:
        # NOTE(review): assumes exactly one 'train*' and one 'test*' file
        # match each pattern -- confirm against directory contents.
        res = [os.fsdecode(n) for n in os.listdir(directory)
               if fnmatch.fnmatch(os.fsdecode(n), patterns[p])]
        train_datasets[p] = res[0] if res[0].startswith('train') else res[1]
        test_datasets[p] = res[0] if res[0].startswith('test') else res[1]
        sub_datasets[p] = [os.fsdecode(n) for n in os.listdir(directory_sub)
                           if fnmatch.fnmatch(os.fsdecode(n), patterns[p])][0]

    res_eval = {'model': [],
                'dataset': [],
                'aggregate_name': [],
                'kl': [],
                'r2': [],
                'md': [],
                'nrmse': []}
    #Main
    for p in patterns:
        logger.info('Beginning Evaluation for {0}'.format(p))
        logger.info('Loading Datasets...')

        # HACK: machine-specific absolute paths -- parameterize before reuse.
        test_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(test_datasets[p]), index_col=0)
        train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))

        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count', 'sum_', 'avg']
        # Column index of each aggregate inside a subquery row of `sub`.
        agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            ridge = Ridge()

            lsnr = PR(ridge)
            lsnr.fit(X_train, y_train)

            lr_global = LinearRegression()
            lr_global.fit(X_train, y_train)

            logger.info("Accuracy Evaluation on Test set\n=====================")
            for i in range(1000):
                #Obtain query from test-set
                dataset = p
                printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)

                # First 4 columns of the row are the query features.
                q = test_df.iloc[i].values[:4].reshape(1, -1)
                q = sc.transform(q)
                #Obtain subquery pertubations for query q from test set
                q1 = sub[i]
                X = q1[:, :4]
                y = q1[:, agg_map[agg]]
                X = sc.transform(X)
                # Local model fitted directly on the perturbations
                # (renamed from `lr` to avoid shadowing the Ridge base model).
                lr_local = LinearRegression()
                lr_local.fit(X, y)
                y_hat = lr_local.predict(X)
                metrics_for_model('local', dataset, agg, y_hat, X, y, lr_local, res_eval)

                #Obtain metrics for our
                y_hat_s = lsnr.get_model(q).predict(X)
                metrics_for_model('ours', dataset, agg, y_hat_s, X, y, lsnr.get_model(q), res_eval)

                #Obtain metrics for global
                y_hat_g = lr_global.predict(X)
                metrics_for_model('global', dataset, agg, y_hat_g, X, y, lr_global, res_eval)
            logger.info("Finished Queries")
    eval_df = pd.DataFrame(res_eval)
    eval_df.to_csv('output/Accuracy/evaluation_results_linear.csv')
# Exemple #6
# 0
def accuracy_on_higgs():
    logger.info("Starting Accuracy Tests on Higgs")
    logger.info("================================")
    df = pd.read_csv('input/sample_higgs_0.01.csv', index_col=0)
    X = df[['m_bb','m_wwbb']].dropna().values
    y = df['label']
    min_ = np.min(X, axis=0)
    max_ = np.max(X, axis=0)
    X = (X-min_) / (max_-min_)
    data = np.column_stack((X,y))
    x = np.linspace(0.1,0.9,7)
    xx,yy = np.meshgrid(x,x)
    DIMS = X.shape[1]
    cov = np.identity(DIMS)*0.001
    cluster_centers = np.column_stack((xx.ravel(),yy.ravel()))
    query_centers = []
    #Generate queries over cluster centers
    for c in cluster_centers:
        queries = np.random.multivariate_normal(np.array(c), cov, size=40)
        query_centers.append(queries)
    query_centers = np.array(query_centers).reshape(-1,DIMS)

    ranges = np.random.uniform(low=0.005**(1/3), high=0.25**(1/3), size=(query_centers.shape[0], DIMS))
    queries = []
    empty = 0
    for q,r in zip(query_centers,ranges):
            b = generate_boolean_vector(data,q,r,2)
            res = data[b]
            if res.shape[0]==0:
                empty+=1

            ans = float(np.mean(res[:,-1])) if res.shape[0]!=0 else 0
            qt = q.tolist()
            qt += r.tolist()
            qt.append(ans)
            queries.append(qt)
    qs = np.array(queries).reshape(-1, 2*DIMS+1)
    X_train, X_test, y_train, y_test = train_test_split(
         qs[:,:qs.shape[1]-1], qs[:,-1], test_size=0.4, random_state=0)
    earth  = Earth()
    lsnr = PR(earth)
    lsnr.fit(X_train, y_train)
    y_hat = np.array([float(lsnr.get_model(x.reshape(1,-1)).predict(x.reshape(1,-1))) for x in X_test])
    r2 = metrics.r2_score(y_test,y_hat)
    kl = kl_divergence_error(y_test, y_hat)
    nrmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat))/np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2, nrmse, kl))
    #Linear Regression comparsion
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat_lr = lr.predict(X_test)
    r2_lr = metrics.r2_score(y_test, y_hat_lr)
    kl_lr = kl_divergence_error(y_test, y_hat_lr)
    nrmse_lr = np.sqrt(metrics.mean_squared_error(y_test, y_hat_lr))/np.mean(y_test)
    logger.info("R2 Score : {}\nNRMSE : {}\nKL-Divergence : {}".format(r2_lr, kl_lr, nrmse_lr))
    dic = {}
    dic['LPM' ]= [('r2',r2), ('kl',kl), ('nrmse',nrmse)]
    dic['LR'] = [('r2',r2_lr), ('kl',kl_lr), ('nrmse',nrmse_lr)]
    #Polynomial regression comparsion
    for count, degree in enumerate(np.arange(3,10,2)):
         model = make_pipeline(PolynomialFeatures(degree), Ridge())
         model.fit(X_train, y_train)
         y_hat = model.predict(X_test)
         r2_p = metrics.r2_score(y_test,y_hat)
         kl_p = kl_divergence_error(y_test, y_hat)
         nrmse_p = np.sqrt(metrics.mean_squared_error(y_test, y_hat))/np.mean(y_test)
         dic["LR ({})".format(degree)] = [('r2',r2_p), ('kl',kl_p), ('nrmse',nrmse_p)]
         print("R2 for degree {} : {}".format(degree, metrics.r2_score(y_test, y_hat)))
    logger.info("==============================================")
    with open('output/Accuracy/multiple_methods_higgs.pkl', 'wb') as handle:
        pickle.dump(dic, handle)
        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count','sum_','avg']
        agg_map = {'count' :4, 'sum_':5, 'avg':6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x','y','x_range','y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)
            #Training Models
            logger.info("Model Training Initiation\n=====================")
            mars_ = Earth(feature_importance_type='gcv',)
            vigilance_x = np.linspace(0.01, 3, Config.vigilance_x_frequency)
            for sens_x in vigilance_x:
                lsnr = PR(mars_,vigil_x=sens_x)
                lsnr.fit(X_train,y_train)



                logger.info("Accuracy Evaluation on Test set with vigil_x={0}\n=====================".format(sens_x))
                for i in range(1000):
                    #Obtain query from test-set
                    dataset = p
                    printProgressBar(i, 1000,prefix = 'Progress:', suffix = 'Complete', length = 50)

                    q = test_df.iloc[i].values[:4].reshape(1,-1)
                    q = sc.transform(q)
                    #Obtain subquery pertubations for query q from test set
                    q1 = sub[i]
                    X = q1[:,:4]