Example no. 1
 def __init__(self,
              dataID,
              b_allow_repeated_value=True,
              logger_file=logger_file,
              num_of_points=None,
              base_models=[tools.app_linear]):
     self.logger = logs.QueryLogs(log=logger_file)
     # self.logger.set_no_output()
     self.data = dl.load2d(dataID)
     if not b_allow_repeated_value:
         self.data.remove_repeated_x_1d()
     self.cRegression = CRegression(logger_object=self.logger,
                                    base_models=base_models)
     self.cRegression.fit(self.data)
     # self.logger.set_logging(file_name=logger_file)
     if num_of_points is None:
         self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
     else:
         self.qe = QueryEngine(self.cRegression,
                               logger_object=self.logger,
                               num_training_points=num_of_points)
     self.qe.density_estimation()
     self.q_min = min(self.data.features)
     self.q_max = max(self.data.features)
     self.dataID = dataID
     del self.data
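
A side note on this constructor: base_models=[tools.app_linear] uses a mutable list as a default argument. Python evaluates the default once at definition time, so the same list object is shared by every call; if any caller mutates it, later instances see the change. A minimal sketch of the usual safer idiom (same behaviour, fresh list per call):

    def __init__(self, dataID, b_allow_repeated_value=True,
                 logger_file=logger_file, num_of_points=None,
                 base_models=None):
        # Build the default inside the body so each call gets its own list.
        if base_models is None:
            base_models = [tools.app_linear]
        # ... rest of the constructor unchanged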
Example no. 2
    def runNd(self, dataID, base_models=None, ensemble_models=None,
              classifier_type=tools.classifier_xgboost_name,
              b_show_plot=False, b_disorder=False, b_select_classifier=False):
        data = dl.loadNd(dataID)

        client = CRegression(logger_object=self.logger, base_models=base_models,
                             ensemble_models=ensemble_models,
                             classifier_type=classifier_type,
                             b_show_plot=b_show_plot,
                             b_select_classifier=b_select_classifier,
                             b_disorder=b_disorder)

        client.run(data)
        return client
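
For context, runNd just loads an N-dimensional dataset and fits the regression ensemble over it. A hypothetical call, where the owning instance exp and the dataID 5 are illustrative assumptions rather than anything shown in the source:

    # Sketch: assumes `exp` is an instance of the class that defines runNd.
    client = exp.runNd(5, base_models=None,
                       classifier_type=tools.classifier_xgboost_name)
    # client is the fitted CRegression object returned above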
Example no. 3
class Query_Engine_2d:
    def __init__(self, dataID, b_allow_repeated_value=True, logger_file=logger_file, num_of_points=None):
        self.logger = logs.QueryLogs(log=logger_file)
        # self.logger.set_no_output()
        self.data = dl.load2d(dataID)
        if not b_allow_repeated_value:
            self.data.remove_repeated_x_1d()
        self.cRegression = CRegression(logger_object=self.logger)
        self.cRegression.fit(self.data)
        # self.logger.set_logging(file_name=logger_file)
        if num_of_points is None:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
        else:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger, num_training_points=num_of_points)
        self.qe.density_estimation()
        self.q_min = min(self.data.features)
        self.q_max = max(self.data.features)
        self.dataID = dataID

        #warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    def query_2d_avg(self, l=0, h=100):
        """AVG query over a 2d data set.

        Args:
            l (int, optional): query lower boundary
            h (int, optional): query upper boundary
        """
        avgs, time = self.qe.approximate_avg_from_to(l, h, 0)  # 0.05E8,0.1E8,
        return avgs, time

    def query_2d_sum(self, l=0, h=100):
        """SUM query over a 2d data set.

        Args:
            l (int, optional): query lower boundary
            h (int, optional): query upper boundary
        """
        sums, time = self.qe.approximate_sum_from_to(l, h, 0)
        return sums, time
    def query_2d_count(self, l=0, h=100):
        count, time = self.qe.approximate_count_from_to(l, h, 0)
        return count, time

    def query_2d_variance_x(self, l=0, h=100):
        variance_x, time = self.qe.approximate_variance_x_from_to(l, h, 0)
        return variance_x, time

    def query_2d_variance_y(self, l=0, h=100):
        variance_y, time = self.qe.approximate_variance_y_from_to(l, h, 0)
        return variance_y, time

    def query_2d_covariance(self, l=0, h=100):
        covariance, time = self.qe.approximate_covar_from_to(l, h, 0)
        return covariance, time

    def query_2d_correlation(self, l=0, h=100):
        correlation, time = self.qe.approximate_corr_from_to(l, h, 0)
        return correlation, time

    def query_2d_percentile(self, p):
        percentile, time = self.qe.approximate_percentile_from_to(p, self.q_min, self.q_max)
        return percentile, time

    def mass_query_sum(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        random.seed(1.0)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT SUM("+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_sum(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)

            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_avg(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        random.seed(1.0)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT AVG("+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)


            approx_result, approx_time = self.query_2d_avg(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_count(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        random.seed(1.0)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT COUNT("+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_count(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_variance_x(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        # random.seed(1.0)
        query_centres = generate_random.make_user_distribution(self.qe.kde, 30, 100, n=number)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = query_centres[i] #random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT VARIANCE("+x+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_variance_x(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times


    def mass_query_variance_y(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        # random.seed(1.0)
        query_centres = generate_random.make_user_distribution(self.qe.kde, 30, 100, n=number)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = query_centres[i] #random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT VARIANCE("+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_variance_y(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_covariance(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        # random.seed(1.0)
        query_centres = generate_random.make_user_distribution(self.qe.kde, 30, 100, n=number)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = query_centres[i] #random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT COVARIANCE("+x+", "+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_covariance(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_correlation(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        q_range_half_length = (self.q_max-self.q_min)*percent/100.0/2.0
        # random.seed(1.0)
        query_centres = generate_random.make_user_distribution(self.qe.kde, 30, 100, n=number)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = query_centres[i] #random.uniform(self.q_min,self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length
            

            sqlStr = "SELECT CORR("+x+", "+y+") FROM " + str(table) +" WHERE  "+x+" BETWEEN " + str(q_left[0]) +" AND " +str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_correlation(l=q_left,h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        self.logger.logger.info("")
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_percentile(self, table, x="ss_list_price", y="ss_wholesale_cost", percent=5, number=default_mass_query_number):
        random.seed(1.0)
        
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No."+str(i+1) +" out of "+str(number))
            q_centre = random.uniform(0,1)
            sqlStr = "SELECT percentile_cont("+x+", "+str(q_centre)+") FROM " + str(table)
            self.logger.logger.info(sqlStr)

            
            
            approx_result, approx_time = self.query_2d_percentile(q_centre)
            self.logger.logger.info(approx_result)

            
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning("MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " + str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " + str(exact_times))
        self.logger.logger.warning("Approximate query results: " + str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " + str(approx_times))
        self.logger.logger.info("")
        return exact_results, approx_results, exact_times, approx_times

    def relative_error(self,exact_results, approx_results):
        # abs_errors = [abs(i - j) for i, j in zip(exact_results,approx_results) ]
        rel_errors = [abs(i - j)*1.0/i for i, j in zip(exact_results,approx_results) ]
        # abs_time_reduction = [(j - i) for i, j in zip(exact_times, approx_times) ]
        result = sum(rel_errors)/len(rel_errors)
        self.logger.logger.warning("Relative error is : " + str(result))
        return result

    def time_ratio(self, exact_times, approx_times):
        result = sum(approx_times)/sum(exact_times)
        self.logger.logger.warning("Time ratio is : " + str(result))
        return result




    def query2hive(self,sql="SHOW TABLES",use_server=True):
        if use_server:
            host = "137.205.118.65"
        else:
            host = "localhost"


        with pyhs2.connect(host=host,
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
            with conn.cursor() as cur:
                #Show databases
                # print cur.getDatabases()

                #Execute query
                # cur.execute("select * from src")
                start = datetime.now()
                cur.execute(sql)
                end = datetime.now()
                time_cost =  (end - start).total_seconds()
                #Return column info from query
                # print cur.getSchema()

                #Fetch table results
                self.logger.logger.info("Time spent for HIVE query: %.4fs." % time_cost)
                for i in cur.fetch():
                    self.logger.logger.info(i)
        # return the first column of the last row fetched
        # (assumes the query returned at least one row)
        return i[0], time_cost

    def query2mysql(self,sql="SHOW TABLES",use_server=True):
        # Open database connection
        # db = MySQLdb.connect("127.0.0.1","hiveuser","bayern","hivedb" )
        if use_server:
            db = pymysql.connect("137.205.118.65","u1796377","bayern","hivedb",port=3306 )
        else:
            db = pymysql.connect("127.0.0.1","hiveuser","bayern","hivedb",port=3306 )

        # prepare a cursor object using cursor() method
        cursor = db.cursor()

        # Drop table if it already exist using execute() method.
        # cursor.execute("DROP TABLE IF EXISTS EMPLOYEE")

        # Create table as per requirement
        # sql = sql
        start = datetime.now()
        cursor.execute(sql)
        results = cursor.fetchall()

        end = datetime.now()
        time_cost =  (end - start).total_seconds()
        self.logger.logger.info("Time spent for MYSQL query: %.4fs." % time_cost)
        for row in results:
            self.logger.logger.info(row)

        # disconnect from server
        db.close()
        # return the first column of the last row fetched
        # (assumes the query returned at least one row)
        return row[0], time_cost
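
Taken end to end, Example 3 fits CRegression on a 2d dataset, builds a QueryEngine with a density estimate, answers aggregates approximately, and checks them against exact MySQL answers. A minimal usage sketch under the same imports; the dataID, range, and table name below are illustrative assumptions:

    engine = Query_Engine_2d(5)
    avg, t_avg = engine.query_2d_avg(l=10, h=20)  # approximate AVG over [10, 20]
    exact, approx, t_exact, t_approx = engine.mass_query_sum("store_sales")
    engine.relative_error(exact, approx)  # mean relative error of the batch
    engine.time_ratio(t_exact, t_approx)  # total approx time / total exact time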
Example no. 4
    def init_whole_range(self, file, table, columnItems, num_of_points=None):
        """Build DBEst query engines for a table, with different combinations of column pairs.

        Args:
            file (str): path to the data file
            table (str): table name, for example 'store_sales'
            columnItems (list): column pairs of the table, for example
                        [["ss_quantity", "ss_ext_discount_amt"], ...]
            num_of_points (dict, optional): total number of points per table
        """
        start_time = datetime.now()

        tableColumnSets = [[table]+columnItems[i] for i in range(len(columnItems))]
        # self.logger.logger.info(tableColumnSets)

        if not hasattr(self, 'df'):
            self.df = {}

        if self.dataset == "tpcds":
            # initialise the column set, tables in TPC-DS dataset.
            if num_of_points is None:
                num_of_points = {}
                num_of_points["store_sales"] = 2685596178
                num_of_points["web_page"] = 3000
                num_of_points["time_dim"] = 86400
                num_of_points["web_sales"] = 720000376
            # merge dict num_of_points to self.num_of_points
            # self.logger.logger.info(num_of_points)
            self.num_of_points = {**self.num_of_points, **num_of_points}
            # self.logger.logger.info(self.num_of_points)

            # NOTE: tableColumnSets is assigned above from columnItems,
            # so this default branch is never taken.
            if tableColumnSets is None:
                tableColumnSets = [
                    ["store_sales", "ss_quantity", "ss_ext_discount_amt"]  # ,
                    # ["store_sales", "ss_quantity", "ss_ext_sales_price"],
                    # ["store_sales", "ss_quantity", "ss_ext_list_price"],
                    # ["store_sales", "ss_quantity", "ss_ext_tax"],
                    # ["store_sales", "ss_quantity", "ss_net_paid"],
                    # ["store_sales", "ss_quantity", "ss_net_paid_inc_tax"],
                    # ["store_sales", "ss_quantity", "ss_net_profit"],
                    # ["store_sales", "ss_quantity", "ss_list_price"],
                    # ["store_sales", "ss_list_price", "ss_list_price"],
                    # ["store_sales", "ss_coupon_amt", "ss_list_price"],
                    # ["store_sales", "ss_wholesale_cost", "ss_list_price"],
                    # ["store_sales", "ss_sales_price", "ss_quantity"],
                    # ["store_sales", "ss_net_profit", "ss_quantity"]  # ,
                    # ["web_page", "wp_char_count", "wp_link_count"],   # *
                    # ["time_dim", "t_minute", "t_hour"],               # *
                    # ["web_sales", "ws_sales_price", "ws_quantity"]
                ]
            # self.logger.logger.info(tableColumnSets)
            self.tableColumnSets = self.tableColumnSets+tableColumnSets
            # self.logger.logger.info(self.tableColumnSets)
        if self.dataset == "pp":
            if num_of_points is None:
                num_of_points = {}
                num_of_points["powerplant"] = 26000000000
            # merge dict num_of_points to self.num_of_points
            self.num_of_points = {**self.num_of_points, **num_of_points}

            # NOTE: as above, tableColumnSets is never None at this point.
            if tableColumnSets is None:
                tableColumnSets = [
                    # ["store_sales", "ss_quantity", "*"],
                    ["powerplant", "T", "EP"],
                    ["powerplant", "AP", "EP"],
                    ["powerplant", "RH", "EP"]
                ]
            self.tableColumnSets = self.tableColumnSets+tableColumnSets

        tables = [sublist[0] for sublist in self.tableColumnSets]
        self.uniqueTables = list(set(tables))
        self.logger.logger.info(
            "Dataset contains " + str(len(self.uniqueTables)) +
            " tables, which are:")
        self.logger.logger.info(self.uniqueTables)
        self.uniqueTCS = []
        for element in self.tableColumnSets:
            if element not in self.uniqueTCS:
                self.uniqueTCS.append(element)
        self.numOfCsOfTables = [tables.count(uniqueTableName) for
                                uniqueTableName in self.uniqueTables]
        self.logger.logger.info(
            "Talbes in the dataset need " + str(self.numOfCsOfTables) +
            " Column Sets.")

        # get column set in each table
        self.CSinTable = {}
        for uniqueTable in self.uniqueTables:
            columnSet = [[item[1], item[2]]
                         for item in self.uniqueTCS if item[0] == uniqueTable]
            self.logger.logger.debug(columnSet)
            self.CSinTable[uniqueTable] = columnSet
        self.logger.logger.debug(self.CSinTable)

        # load data
        for uniqueTable in self.uniqueTables:
            self.df[uniqueTable] = pd.read_csv(file)
            # self.logger.logger.info(df.to_string())
            self.df[uniqueTable] = self.df[uniqueTable].dropna()

            if uniqueTable in self.DBEstClients:
                DBEstiClient = self.DBEstClients[uniqueTable]
            else:
                DBEstiClient = {}  # store all QueryEngines within each table

            for columnItem in self.CSinTable[uniqueTable]:
                if ((uniqueTable in self.DBEstClients) and
                        (str(columnItem) not in self.DBEstClients[uniqueTable])
                        or uniqueTable not in self.DBEstClients):
                    # the condition above checks whether this model has
                    # already been trained; if so, skip re-training it
                    self.logger.logger.info(
                        "--------------------------------------------------")
                    self.logger.logger.info("Start training Qeury Engine for" +
                                            " Table " + uniqueTable +
                                            ", Column Set: " + str(columnItem))
                    headerX = columnItem[0]
                    headerY = columnItem[1]
                    x = self.df[uniqueTable][[headerX]].values
                    y = self.df[uniqueTable][[headerY]].values.reshape(-1)

                    data = DataSource()
                    data.features = x
                    data.labels = y
                    data.headers = columnItem
                    data.file = file
                    # self.data.removeNAN()

                    cRegression = CRegression(logger_object=self.logger, base_models=self.base_models)
                    cRegression.fit(data)

                    qe = QueryEngine(
                        cRegression, logger_object=self.logger,
                        num_training_points=self.num_of_points[uniqueTable])
                    qe.density_estimation()        
                    cRegression.clear_training_data()
                    # qe.get_size()
                    DBEstiClient[str(columnItem)] = qe

                    self.logger.logger.info("Finish training Qeury Engine " +
                                            "for Table " + uniqueTable +
                                            ", Column Set: " + str(columnItem))
                    self.logger.logger.info(
                        "--------------------------------------------------")
                    self.logger.logger.debug(DBEstiClient)
                    self.DBEstClients[uniqueTable] = DBEstiClient
                else:
                    self.logger.logger.info("This model exsits, not need to train, so just skip training! ")
        self.logger.logger.info(self.DBEstClients)
        # self.logger.logger.info(json.dumps(DBEstClients, indent=4))
        end_time = datetime.now()
        time_cost = (end_time - start_time).total_seconds()
        self.logger.logger.info(
            "DBEsti has been initialised, ready to serve... (%.1fs)"
            % time_cost)
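
The net result of init_whole_range is a nested dictionary: self.DBEstClients maps each table name to a dict of QueryEngines keyed by str(columnItem). A lookup sketch using the TPC-DS defaults above; the aggregate call mirrors the 2d engines in Example 3 and is an assumption here, not shown in this method:

    qe = self.DBEstClients["store_sales"][str(["ss_quantity", "ss_ext_discount_amt"])]
    sums, time_cost = qe.approximate_sum_from_to(0, 50, 0)  # assumed API, as in Example 3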
Example no. 5
    def init_groupby(self, file="../data/tpcDs10k/store_sales.csv",
                     table="store_sales", group="ss_store_sk",
                     columnItem=["ss_wholesale_cost", "ss_list_price"],
                     num_of_points_per_group=None):
        """ support simple group by,

        Args:
            table (str, optional): table name
            group (str, optional): column name of the group
            columnItem (list, optional): [x, y], in list format
            num_of_points_per_group (Dictionary, optional): store the total number of points of each group 
        """
        start_time = datetime.now()
        self.logger.logger.info("")
        self.logger.logger.info("Start building GROUP BY for Table " + table)
        self.df[table] = pd.read_csv(file)
        self.df[table] = self.df[table].dropna()
        # self.logger.logger.info(self.df[table]["ss_sold_date_sk"])
        # self.df[group] =pd.Series([], dtype=int)
        grouped = self.df[table].groupby(group)
        group_name = str([table, group])
        self.group_names[group_name] = []
        if table not in self.DBEstClients:
            self.DBEstClients[table] = {}

        # initiate the number of points per group
        if num_of_points_per_group is None:
            self.logger.logger.error("Please provide the information\
                (num_of_points_per_group) for init_groupby() ")
            sys.exit()
        else:
            self.num_of_points_per_group_tbls[str(
                [table, group])] = num_of_points_per_group


        # the loop variable is named grp to avoid shadowing the `group`
        # parameter (the column name used in the groupby above)
        for grp_name, grp in grouped:

            self.group_names[group_name].append(grp_name)
            columnItemGroup = str(columnItem) + "-" + str(grp_name)
            self.logger.logger.debug(
                "--------------------------------------------------")
            self.logger.logger.debug(
                "Start building group by for " + columnItemGroup)
            headerX = columnItem[0]
            headerY = columnItem[1]
            x = grp[[headerX]].values
            y = grp[[headerY]].values.reshape(-1)

            data = DataSource()
            data.features = x
            data.labels = y
            data.headers = columnItem
            data.file = columnItemGroup
            # self.data.removeNAN()

            cRegression = CRegression(
                logger_object=self.logger, b_cross_validation=True,
                base_models=self.base_models)
            cRegression.fit(data)
            # number of points
            self.logger.logger.info(grp_name)
            # self.logger.logger.info(grp)
            qe = QueryEngine(
                cRegression, logger_object=self.logger,
                num_training_points=int(
                    num_of_points_per_group[str(int(grp_name))]))
            qe.density_estimation()
            cRegression.clear_training_data()
            # qe.get_size()
            self.DBEstClients[table][columnItemGroup] = qe
            self.logger.logger.info(
                "Finish building group by for " + columnItemGroup)
            self.logger.logger.info(
                "--------------------------------------------------")

            # result, time = self.query_2d_percentile(float(line))
        end_time = datetime.now()
        time_cost = (end_time - start_time).total_seconds()
        self.logger.logger.info(
            "GROUP BY has been initialised, ready to serve... (%.1fs)"
            % time_cost)
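
Each per-group model is stored under the compound key str(columnItem) + "-" + str(grp_name), so answering a grouped query means rebuilding that key. A sketch, where the store key 7 is an illustrative assumption:

    key = str(["ss_wholesale_cost", "ss_list_price"]) + "-" + str(7)
    qe = self.DBEstClients["store_sales"][key]
    # qe is the QueryEngine trained on just that group's rows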
Example no. 6
        end = datetime.now()
        time_cost = (end - start).total_seconds()
        if self.b_print_time_cost:
            self.logger.info("Time spent for approximate CORR: %.4fs." %
                             time_cost)
        return result, time_cost


if __name__ == "__main__":
    import generate_random
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    logger = logs.QueryLogs()
    logger.set_no_output()
    data = dl.load2d(5)
    cRegression = CRegression(logger_object=logger)
    cRegression.fit(data)

    # cRegression.plot_training_data_2d()
    logger.set_logging()
    qe = QueryEngine(cRegression, logger_object=logger)
    qe.density_estimation()
    # qe.desngity_estimation_plt2d()

    r = generate_random.percentile(0.9,
                                   qe.kde,
                                   30,
                                   100,
                                   steps=200,
                                   n_bisect=100)
    print(r)
Example no. 7
class Query_Engine_2d:
    def __init__(self,
                 dataID,
                 b_allow_repeated_value=True,
                 logger_file=logger_file,
                 num_of_points=None,
                 base_models=[tools.app_linear]):
        self.logger = logs.QueryLogs(log=logger_file)
        # self.logger.set_no_output()
        self.data = dl.load2d(dataID)
        if not b_allow_repeated_value:
            self.data.remove_repeated_x_1d()
        self.cRegression = CRegression(logger_object=self.logger,
                                       base_models=base_models)
        self.cRegression.fit(self.data)
        # self.logger.set_logging(file_name=logger_file)
        if num_of_points is None:
            self.qe = QueryEngine(self.cRegression, logger_object=self.logger)
        else:
            self.qe = QueryEngine(self.cRegression,
                                  logger_object=self.logger,
                                  num_training_points=num_of_points)
        self.qe.density_estimation()
        self.q_min = min(self.data.features)
        self.q_max = max(self.data.features)
        self.dataID = dataID
        del self.data

        #warnings.filterwarnings(action='ignore', category=DeprecationWarning)

    def query_2d_avg(self, l=0, h=100):
        """query to 2d data sets.

        Args:
            l (int, optional): query lower boundary
            h (int, optional): query higher boundary
        """
        avgs, time = self.qe.approximate_avg_from_to(l, h, 0)  # 0.05E8,0.1E8,
        return avgs, time

    def query_2d_sum(self, l=0, h=100):
        """query to 2d data sets.

        Args:
            l (int, optional): query lower boundary
            h (int, optional): query higher boundary
        """
        sums, time = self.qe.approximate_sum_from_to(l, h, 0)
        return sums, time

    def query_2d_count(self, l=0, h=100):
        count, time = self.qe.approximate_count_from_to(l, h, 0)
        return count, time

    def query_2d_variance_x(self, l=0, h=100):
        variance_x, time = self.qe.approximate_variance_x_from_to(l, h, 0)
        return variance_x, time

    def query_2d_variance_y(self, l=0, h=100):
        variance_y, time = self.qe.approximate_variance_y_from_to(l, h, 0)
        return variance_y, time

    def query_2d_covariance(self, l=0, h=100):
        covariance, time = self.qe.approximate_covar_from_to(l, h, 0)
        return covariance, time

    def query_2d_correlation(self, l=0, h=100):
        correlation, time = self.qe.approximate_corr_from_to(l, h, 0)
        return correlation, time

    def query_2d_percentile(self, p):
        percentile, time = self.qe.approximate_percentile_from_to(
            p, self.q_min, self.q_max)
        return percentile, time

    def query_2d_min(self, l=0, h=100, ci=True, confidence=0.95):
        _min, time = self.qe.approximate_min_from_to(l,
                                                     h,
                                                     ci=ci,
                                                     confidence=confidence)
        return _min, time

    def query_2d_max(self, l=0, h=100, ci=True, confidence=0.95):
        _max, time = self.qe.approximate_max_from_to(l,
                                                     h,
                                                     ci=ci,
                                                     confidence=confidence)
        return _max, time

    def mass_query_sum(self,
                       table,
                       x="ss_list_price",
                       y="ss_wholesale_cost",
                       percent=5,
                       number=default_mass_query_number,
                       b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min, self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT SUM(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_sum(l=q_left, h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)

            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_avg(self,
                       table,
                       x="ss_list_price",
                       y="ss_wholesale_cost",
                       percent=5,
                       number=default_mass_query_number,
                       b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min, self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT AVG(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_avg(l=q_left, h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_count(self,
                         table,
                         x="ss_list_price",
                         y="ss_wholesale_cost",
                         percent=5,
                         number=default_mass_query_number,
                         b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))

        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min, self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT COUNT(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_count(l=q_left,
                                                             h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_variance_x(self,
                              table,
                              x="ss_list_price",
                              y="ss_wholesale_cost",
                              percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))

        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min,self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT VARIANCE(" + x + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_variance_x(l=q_left,
                                                                  h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_variance_y(self,
                              table,
                              x="ss_list_price",
                              y="ss_wholesale_cost",
                              percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        # random.seed(1.0)
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min,self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT VARIANCE(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_variance_y(l=q_left,
                                                                  h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_covariance(self,
                              table,
                              x="ss_list_price",
                              y="ss_wholesale_cost",
                              percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        # random.seed(1.0)
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            # random.uniform(self.q_min,self.q_max)
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT COVARIANCE(" + x + ", " + y + ") FROM " + str(
                table) + " WHERE  " + x + " BETWEEN " + str(
                    q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_covariance(l=q_left,
                                                                  h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times
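
    # Note: COVARIANCE(), CORR() and percentile_cont() below are not built-in
    # MySQL aggregates; the generated SQL assumes the backend provides them,
    # e.g. via a UDF or a Hive-compatible engine.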

    def mass_query_correlation(self,
                               table,
                               x="ss_list_price",
                               y="ss_wholesale_cost",
                               percent=5,
                               number=default_mass_query_number,
                               b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                query_centres.append(random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            q_centre = query_centres[i]
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT CORR(" + x + ", " + y + ") FROM " + str(
                table) + " WHERE  " + x + " BETWEEN " + str(
                    q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_correlation(l=q_left,
                                                                   h=q_right)
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        self.logger.logger.info("")
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_percentile(self,
                              table,
                              x="ss_list_price",
                              y="ss_wholesale_cost",
                              percent=5,
                              number=default_mass_query_number,
                              b_random_queries=True):
        random.seed(1.0)

        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            q_centre = random.uniform(0, 1)
            sqlStr = "SELECT percentile_cont(" + x + \
                ", " + str(q_centre) + ") FROM " + str(table)
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_percentile(q_centre)
            self.logger.logger.info(approx_result)

            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)
            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        self.logger.logger.info("")
        return exact_results, approx_results, exact_times, approx_times
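
    # Percentile queries take a fraction in (0, 1) rather than a range;
    # e.g. q_centre = 0.5 asks for the median of x:
    #   SELECT percentile_cont(ss_list_price, 0.5) FROM store_sales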

    def mass_query_min(self,
                       table,
                       x="ss_list_price",
                       y="ss_wholesale_cost",
                       percent=5,
                       number=default_mass_query_number,
                       b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        random.seed(1.0)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            q_centre = random.uniform(self.q_min, self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT MIN(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_min(l=q_left[0],
                                                           h=q_right[0])
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)

            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def mass_query_max(self,
                       table,
                       x="ss_list_price",
                       y="ss_wholesale_cost",
                       percent=5,
                       number=default_mass_query_number,
                       b_random_queries=True):
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        random.seed(1.0)
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []
        for i in range(number):
            self.logger.logger.info("start query No." + str(i + 1) +
                                    " out of " + str(number))
            q_centre = random.uniform(self.q_min, self.q_max)
            q_left = q_centre - q_range_half_length
            q_right = q_centre + q_range_half_length

            sqlStr = "SELECT MAX(" + y + ") FROM " + str(table) + " WHERE  " + \
                x + " BETWEEN " + str(q_left[0]) + " AND " + str(q_right[0])
            self.logger.logger.info(sqlStr)

            approx_result, approx_time = self.query_2d_max(l=q_left[0],
                                                           h=q_right[0])
            self.logger.logger.info(approx_result)
            exact_result, exact_time = self.query2mysql(sql=sqlStr)

            self.logger.logger.info(exact_result)

            if (exact_result is not None) and (exact_result != 0):
                exact_results.append(exact_result)
                exact_times.append(exact_time)
                approx_results.append(approx_result)
                approx_times.append(approx_time)
            else:
                self.logger.logger.warning(
                    "MYSQL returns None, so this record is ignored.")
        self.logger.logger.warning("MYSQL query results: " +
                                   str(exact_results))
        self.logger.logger.warning("MYSQL query time cost: " +
                                   str(exact_times))
        self.logger.logger.warning("Approximate query results: " +
                                   str(approx_results))
        self.logger.logger.warning("Approximate query time cost: " +
                                   str(approx_times))
        return exact_results, approx_results, exact_times, approx_times

    def relative_error(self, exact_results, approx_results):
        # Mean relative error: mean(|exact - approx| / exact). Zero exact
        # results are filtered out by the callers, so the division is safe.
        rel_errors = [
            abs(i - j) * 1.0 / i for i, j in zip(exact_results, approx_results)
        ]
        result = sum(rel_errors) / len(rel_errors)
        self.logger.logger.warning("Relative error is : " + str(result))
        return result
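
    # Worked example: exact = [10.0, 20.0], approx = [9.0, 22.0] gives
    # rel_errors = [0.1, 0.1], i.e. a mean relative error of 10%.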

    def time_ratio(self, exact_times, approx_times):
        result = sum(approx_times) / sum(exact_times)
        self.logger.logger.warning("Time ratio is : " + str(result))
        return result
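
    # Worked example: exact_times = [2.0, 2.0] and approx_times = [0.1, 0.3]
    # give a time ratio of 0.4 / 4.0 = 0.1, i.e. the approximate engine
    # needs 10% of the exact engine's total time.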

    # def query2hive(self, sql="SHOW TABLES", use_server=True):
    #     if use_server:
    #         host = "137.205.118.65"
    #     else:
    #         host = "localhost"

    #     with pyhs2.connect(host=host,
    #                        port=10000,
    #                        authMechanism="NOSASL",
    #                        user='******',
    #                        password='******',
    #                        database='default') as conn:
    #         with conn.cursor() as cur:
    #             # Show databases
    #             # print cur.getDatabases()

    #             # Execute query
    #             # cur.execute("select * from src")
    #             start = datetime.now()
    #             cur.execute(sql)
    #             end = datetime.now()
    #             time_cost = (end - start).total_seconds()
    #             # Return column info from query
    #             # print cur.getSchema()

    #             # Fetch table results
    #             self.logger.logger.info(
    #                 "Time spent for HIVE query: %.4fs." % time_cost)
    #             for i in cur.fetch():
    #                 self.logger.logger.info(i)
    #     return i[0], time_cost

    def query2hive2(self, sql="SHOW TABLES", use_server=True):
        if use_server:
            host = "137.205.118.65"
        else:
            host = "localhost"

        conn = hive.connect(host=host,
                            port=10000,
                            username='******',
                            auth='NOSASL')

        cursor = conn.cursor()
        start = datetime.now()
        cursor.execute(sql)
        end = datetime.now()
        time_cost = (end - start).total_seconds()
        rows = cursor.fetchall()
        for row in rows:
            self.logger.logger.info(row)
        conn.close()
        # Guard against empty result sets instead of raising a NameError.
        return (rows[-1][0] if rows else None), time_cost
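
    # Hedged usage sketch (the table name is illustrative):
    #   value, seconds = self.query2hive2("SELECT COUNT(*) FROM store_sales")
    # Only the first column of the last fetched row is returned, matching the
    # single-value aggregates issued by the mass-query methods.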

    def query2mysql(self, sql="SHOW TABLES", use_server=True):
        # Open database connection
        # db = MySQLdb.connect("127.0.0.1","hiveuser","bayern","hivedb" )
        if use_server:
            db = pymysql.connect(host="137.205.118.65",
                                 user="u1796377",
                                 password="bayern",
                                 database="hivedb",
                                 port=3306)
        else:
            db = pymysql.connect(host="127.0.0.1",
                                 user="hiveuser",
                                 password="bayern",
                                 database="hivedb",
                                 port=3306)

        # prepare a cursor object using cursor() method
        cursor = db.cursor()

        start = datetime.now()
        cursor.execute(sql)
        results = cursor.fetchall()

        end = datetime.now()
        time_cost = (end - start).total_seconds()
        self.logger.logger.info("Time spent for MYSQL query: %.4fs." %
                                time_cost)
        for row in results:
            self.logger.logger.info(row)

        # disconnect from server
        db.close()
        # Guard against empty result sets instead of raising a NameError.
        return (results[-1][0] if results else None), time_cost
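
    # The mass-query methods build SQL by string concatenation, which is fine
    # for these generated numeric bounds but unsafe for untrusted input. A
    # parameterized sketch with pymysql (column/table names illustrative):
    #   cursor.execute(
    #       "SELECT AVG(ss_wholesale_cost) FROM store_sales "
    #       "WHERE ss_list_price BETWEEN %s AND %s", (q_left, q_right))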

    def mass_query(self, file, agg_func='avg', ci=True, confidence=0.95):
        AQP_results = []
        time_costs = []
        index = 0
        with open(file) as fin:
            for line in fin:
                self.logger.logger.info("Starting Query " + str(index) + ":")
                index = index + 1
                if agg_func == 'percentile':
                    result, time = self.query_2d_percentile(float(line))
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'avg':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_avg(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'sum':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_sum(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'count':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_count(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'variance_x':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_variance_x(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'min':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_min(l,
                                                     h,
                                                     ci=ci,
                                                     confidence=confidence)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'max':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_max(l,
                                                     h,
                                                     ci=ci,
                                                     confidence=confidence)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'covar':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_covariance(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)
                elif agg_func == 'corr':
                    lh = line.split(",")
                    l = float(lh[0])
                    h = float(lh[1])
                    result, time = self.query_2d_correlation(l, h)
                    AQP_results.append(result)
                    time_costs.append(time)

        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results
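
    # Expected query-file format for mass_query(): one query per line.
    # For range aggregates each line holds "low,high", e.g.:
    #   10.5,25.0
    # For agg_func='percentile' each line holds a single fraction, e.g.:
    #   0.5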

    def generate_queries(self,
                         table,
                         x="ss_list_price",
                         y="ss_wholesale_cost",
                         percent=5,
                         number=default_mass_query_number,
                         b_random_queries=False,
                         mode=2,
                         file_pre="etrade_"):
        """Summary

        Args:
            table (TYPE): TABLE name
            x (str, optional): x
            y (str, optional): y
            percent (int, optional): query width, or query domain
            number (TYPE, optional): number of queries generated
            b_random_queries (bool, optional): Description
            mode (str, optional): 1 for percentile queries, QRegQL;
                                  2 for other queries, QRegQL;
                                  1 for percentile queries, SQL or HIVEQL;
                                  2 for other queries, SQL or HIVEQL;
        """
        if mode == 1:
            file_name_qreg = file_pre + "percentile" + str(percent) + ".qreg"
            file_name_hiveql = file_pre + \
                "percentile" + str(percent) + ".hiveql"
        elif mode == 2:
            file_name_qreg = file_pre + "queries" + str(percent) + ".qreg"
            file_name_hiveql = file_pre + "queries" + str(percent) + ".hiveql"
        else:
            self.logger.logger.error(
                "Failed to generate queries, no mode selected!")
            sys.exit(errno.EPERM)
        q_range_half_length = (self.q_max - self.q_min) * percent / 100.0 / 2.0
        self.logger.logger.info("Start generating queries")
        if b_random_queries:
            random.seed(1.0)
            query_centres = []
            for i in range(number):
                if mode == 1:
                    # Percentile fractions are sampled from [0.4, 1).
                    query_centres.append(random.uniform(0.4, 1))
                else:
                    query_centres.append(
                        random.uniform(self.q_min, self.q_max)[0])
        else:
            query_centres = generate_random.make_user_distribution(self.qe.kde,
                                                                   self.q_min,
                                                                   self.q_max,
                                                                   n=number)
        self.logger.logger.info("Finish generating " + str(number) +
                                " queries, the center points are:")
        self.logger.logger.info(str(query_centres))
        exact_results = []
        exact_times = []
        approx_results = []
        approx_times = []

        if mode == 1:
            with open(file_name_hiveql, 'w+') as f_hiveql:
                with open(file_name_qreg, 'w+') as f_qreg:
                    for i in range(number):
                        q_centre = query_centres[i]
                        qregStr = str(q_centre)
                        sqlStr = "SELECT percentile_cont(" + x + \
                            ", " + str(q_centre) + ") FROM " + str(table)
                        f_hiveql.write(sqlStr + "\n")
                        f_qreg.write(qregStr + "\n")
        else:
            with open(file_name_hiveql, 'w+') as f_hiveql:
                with open(file_name_qreg, 'w+') as f_qreg:
                    aggregates = ["COUNT", "SUM", "AVG", "MIN", "MAX"]
                    for aggregate in aggregates:
                        for i in range(number):
                            q_centre = query_centres[i]
                            q_left = q_centre - q_range_half_length
                            q_right = q_centre + q_range_half_length
                            qregStr = str(q_left[0]) + ", " + str(q_right[0])
                            sqlStr = "SELECT " + aggregate + "(" + y + ") FROM " + str(table) + " WHERE  " + \
                                x + " BETWEEN " + \
                                    str(q_left[0]) + " AND " + str(q_right[0])

                            f_hiveql.write(sqlStr + "\n")
                            if aggregate is "COUNT":
                                f_qreg.write(qregStr + "\n")
                    for i in range(number):
                        q_centre = query_centres[i]
                        q_left = q_centre - q_range_half_length
                        q_right = q_centre + q_range_half_length
                        qregStr = str(q_left[0]) + ", " + str(q_right[0])
                        sqlStr = "SELECT " + "VARIANCE" + "(" + x + ") FROM " + str(table) + " WHERE  " + \
                            x + " BETWEEN " + \
                                str(q_left[0]) + " AND " + str(q_right[0])

                        f_hiveql.write(sqlStr + "\n")

            self.logger.logger.info("Queries are written to file " +
                                    file_name_qreg + " and " +
                                    file_name_hiveql)
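
    # With mode=2 the two files contain matching entries, e.g. (values
    # illustrative):
    #   etrade_queries5.qreg:   10.5, 25.0
    #   etrade_queries5.hiveql: SELECT COUNT(ss_wholesale_cost) FROM
    #                           store_sales WHERE ss_list_price BETWEEN
    #                           10.5 AND 25.0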

    def mass_query2mysql(self, file):
        AQP_results = []
        time_costs = []
        index = 1
        with open(file) as fin:
            for line in fin:
                self.logger.logger.info("Starting Query " + str(index) + ":")
                index = index + 1
                self.logger.logger.info(line)
                result, time = self.query2mysql(sql=line)
                AQP_results.append(result)
                time_costs.append(time)
        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results

    def mass_query2hive(self, file):
        AQP_results = []
        time_costs = []
        index = 1
        with open(file) as fin:
            for line in fin:
                try:
                    self.logger.logger.info("Starting Query " + str(index) +
                                            ":")
                    index = index + 1
                    self.logger.logger.info(line)
                    result, time = self.query2hive2(sql=line)
                    AQP_results.append(result)
                    time_costs.append(time)
                except Exception as e:
                    self.logger.logger.error("HIVE query failed: " + str(e))
                    # Send a failure notification before aborting.
                    import smtplib

                    server = smtplib.SMTP('smtp.gmail.com', 587)
                    server.starttls()
                    server.login("*****@*****.**", "warwickpass")

                    msg = "failed!"
                    server.sendmail("*****@*****.**", "*****@*****.**",
                                    msg)
                    server.quit()

                    import sys
                    sys.exit("HIVE query failed: " + str(e))

        self.logger.logger.info(AQP_results)
        self.logger.logger.info(time_costs)
        return AQP_results
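
    # End-to-end driver sketch (hypothetical, not part of this class):
    # generate a workload, replay it approximately and exactly, then compare.
    #
    #   engine = Query_Engine_2d(dataID)   # dataID as used elsewhere
    #   engine.generate_queries("store_sales", percent=5, number=10, mode=2)
    #   aqp = engine.mass_query("etrade_queries5.qreg", agg_func='count')
    #   sql = engine.mass_query2mysql("etrade_queries5.hiveql")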