Example #1
    def __init__(self, query_manager, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        self.query_manager = query_manager

        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)

        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None

        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []

        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
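
# A minimal sketch (hypothetical values, not from the source) of the paired
# lookup tables initialized above: attribute_id_dict maps a numeric column id
# to an attribute name, and id_attribute_dict maps the name back to its id,
# matching how util_create_config reads them later in this section.
attribute_id_dict = {1: "sqft", 2: "price"}
id_attribute_dict = {name: col_id for col_id, name in attribute_id_dict.items()}
assert id_attribute_dict["price"] == 2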
Example #2
    def __init__(self, xml_elem, logCB = None, progressCB = None) :

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING) 

        #KNN tuning parameters
        self.k = 10 #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1

        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)

        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None
        
        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []
        
        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes :
            self.initialized_weights[attribute] = 1

        #Attributes whose values will be log-transformed to produce better results
        if xml_elem.hasAttribute('log_trans_attributes') :
            log_trans_string = xml_elem.attributes['log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)

        self.null_value_list = [] #NOT USED

        #Random information
        self.test_type = "LDOF"
Example #3
    def __init__(self, query_manager, logCB = None, progressCB = None) :
        
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)        


        self.query_manager = query_manager

        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)
        
        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None
        
        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []
        
        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
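
# Hedged sketch of the callback signatures implied by how PrintOutput is used
# in these examples: pLog-style logging passes a message string and progress
# passes an integer percentage. The bodies here are hypothetical stand-ins.
def logCB(message):
    print(message)

def progressCB(percent):
    print("progress: %d%%" % percent)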
Example #4
    def __init__(self, xml_elem, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        #KNN tuning parameters
        self.k = 10  #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1

        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)

        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None

        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []

        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes:
            self.initialized_weights[attribute] = 1

        #Attributes whose values will be log-transformed to produce better results
        if xml_elem.hasAttribute('log_trans_attributes'):
            log_trans_string = xml_elem.attributes[
                'log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)

        self.null_value_list = []  #NOT USED

        #Random information
        self.test_type = "LDOF"
Example #5
    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB = None, progressCB = None) :

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING) 

        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        
        self.test_classifier = "weka.classifiers.lazy.IBk"
        if xml_elem.hasAttribute("test_classifier") :
            self.test_classifier = xml_elem.attributes["test_classifier"].value

        self.test_options = "-I -K 20 -X -A weka.core.neighboursearch.KDTree"
        if xml_elem.hasAttribute("options") :
            self.test_options = xml_elem.attributes["options"].value

        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True

            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
            
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1 :
                self.using_pca = True
                
            #Attributes that the search class starts with (Not used with PCA)
            self.start_attributes = []
            if xml_elem.hasAttribute('fs_start_attributes') :
                self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)  
            
        #Attributes that are used to make the prediction        
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)

        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0 :
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v') :
    
                attribute = element.attributes['attribute'].value
                type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
    
                null_dict = {"attribute" : attribute, "type" : type}
    
                if vt == "int" :
                    null_dict["value"] = int(value)
                elif vt == "string" :
                    null_dict["value"] = str(value)
            
                self.null_value_list.append(null_dict)

        #Simply defined null values
        if xml_elem.hasAttribute("null_value") :
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {"attribute" : self.test_attribute, "type" : "E", "value" : int(null_value)}
            self.null_value_list.append(null_dict)

        #Random information
        self.test_type = "Num"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS           
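
# Hedged sketch of the <null_values> markup parsed above. The tag and attribute
# names ('null_values', 'v', 'attribute', 'type', 'value', 'vt') come from the
# code; the concrete values are hypothetical.
from xml.dom.minidom import parseString

doc = parseString(
    '<test test_attribute="price" train_attributes="sqft,beds">'
    '<null_values><v attribute="price" type="E" value="0" vt="int"/></null_values>'
    '</test>')
v = doc.documentElement.getElementsByTagName('null_values')[0] \
       .getElementsByTagName('v')[0]
print(v.attributes['vt'].value)  # int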
Example #6
class Num_model :

    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB = None, progressCB = None) :

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING) 

        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        
        self.test_classifier = "weka.classifiers.lazy.IBk"
        if xml_elem.hasAttribute("test_classifier") :
            self.test_classifier = xml_elem.attributes["test_classifier"].value

        self.test_options = "-I -K 20 -X -A weka.core.neighboursearch.KDTree"
        if xml_elem.hasAttribute("options") :
            self.test_options = xml_elem.attributes["options"].value

        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True

            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
            
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1 :
                self.using_pca = True
                
            #Attributes that the search class starts with (Not used with PCA)
            self.start_attributes = []
            if xml_elem.hasAttribute('fs_start_attributes') :
                self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)  
            
        #Attributes that are used to make the prediction        
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)

        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0 :
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v') :
    
                attribute = element.attributes['attribute'].value
                type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
    
                null_dict = {"attribute" : attribute, "type" : type}
    
                if vt == "int" :
                    null_dict["value"] = int(value)
                elif vt == "string" :
                    null_dict["value"] = str(value)
            
                self.null_value_list.append(null_dict)

        #Simply defined null values
        if xml_elem.hasAttribute("null_value") :
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {"attribute" : self.test_attribute, "type" : "E", "value" : int(null_value)}
            self.null_value_list.append(null_dict)

        #Random information
        self.test_type = "Num"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS           
        
    def get_predictions(self, query_manager) :

        #Filenames
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"


        #Creates (or clears) files that are used by the binary 
        IS_NUM_TEST = True
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]

        #If the run is only replacing null values and the test set
        #contains no null values, then terminate early
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0 :
            os.remove(test_filename)
            os.remove(train_filename)            
            return None


        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection :
            (test_filename, train_filename, selected_attributes) = feature_selection(test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes
                      
        #Running tests
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join( path, "models", "weka.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + " " + self.test_classifier + " -d " + model_name  + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " +  path_spef_weka + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename

        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
                    
        #Gathering results for each test instance
        self.printOut.pLog( "PRED- Getting results")
        f = open(result_filename)
        
        prediction_list = []
        confidence_list = []
        
        #For stat keeping
        absolute_diff_list = [] 
        relative_diff_list = []
        
        index = 0
        collect_results = False
        for line in f.readlines() :
            line_list = line.split()
            
            #Getting results
            if collect_results and len(line_list) > 1:
                prediction = float(line_list[2])
                prediction_list.append(prediction)
                confidence_list.append(0.0)
                
                #Getting difference between predicted and actual results
                #for non-null values
                if not target_value_null[index] :
                    actual = float(target_values[index])
                    diff = math.fabs(actual - prediction)
                    
                    absolute_diff_list.append(diff)
                    if actual > 0 :
                        relative_diff_list.append(diff / actual)
                    else :
                        relative_diff_list.append(-1)
                index += 1
            
            #Seeing if you are at the results portion of the file
            if line.find("inst#") > -1 :
                collect_results = True            
        
        f.close()

        #Gathering accuracy estimations
        f = open(train_log)

        cross_val_info = False
        get_k_value = False
        for line in f.readlines() :
        
            #Getting all performance related metrics
            if cross_val_info :
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')
        
                fields = line.split('  ')
                if len(fields) > 1:
                    attribute = fields[0]
                    value = float(fields[len(fields) - 1])
                    acc_est[attribute] = value
        
            #Getting parameter search results
            if get_k_value and line.find('using') > -1:
                fields = line.split(' ')
                k = int(fields[1])
                acc_est["1 Parameter: k value"] = k
                get_k_value = False
        
            #Finding cross validation info                                                                   
            if line.find('Cross-validation') > -1 :
                cross_val_info = True

            #Finding k value info
            if line.find('IB1 instance-based classifier') > -1 :
                get_k_value = True

        f.close()

        #Adding actual performance statistics
        absolute_diff_array = numpy.array(absolute_diff_list)
        relative_diff_array = numpy.array(relative_diff_list)

        absolute_mean = numpy.mean(absolute_diff_array)
        absolute_std = numpy.std(absolute_diff_array)

        relative_mean = numpy.mean(relative_diff_array)
        relative_std = numpy.std(relative_diff_array)
        
        acc_est["2 On test data: mean absolute diff"] = absolute_mean
        acc_est["2 On test data: std absolute diff"] = absolute_std
        acc_est["2 On test data: mean relative diff"] = relative_mean
        acc_est["2 On test data: std relative diff"] = relative_std
        

        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num / query_manager.group_max
        acc_est["block number"] = (len(query_manager.used_blocks) - 1)*query_manager.group_max + query_manager.group_count
         
        #Removing files
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(model_name)                  
                    
        return Test_result("Num", self.test_attribute, prediction_list, confidence_list, acc_est)
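
# Hedged sketch: the Weka training command assembled above, rebuilt with
# subprocess instead of os.system string concatenation. The classifier and
# options are the defaults from the code; file names and the weka.jar path
# are hypothetical.
import subprocess

cmd = ["java", "-Xmx1024m", "-cp", "models/weka.jar",
       "weka.classifiers.lazy.IBk", "-d", "saved_model",
       "-I", "-K", "20", "-X", "-A", "weka.core.neighboursearch.KDTree",
       "-t", "train.arff"]
with open("train_log.arff", "a") as log:
    subprocess.call(cmd, stdout=log, stderr=subprocess.STDOUT)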
Example #7
    def __init__(self, io_info_element, logCB = None, progressCB = None) :
        
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)        
        
        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value

        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class') :
            self.fclass_atts = util_get_attribute_list(io_info_element.attributes["force_to_class"].value)

        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric') :
            self.fnum_atts = util_get_attribute_list(io_info_element.attributes["force_to_numeric"].value)

        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size") :
            self.train_size = int(io_info_element.attributes["train_block_size"].value)
        
        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size") :
            self.test_size = int(io_info_element.attributes["test_block_size"].value)

        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)

        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list, self.attributes) = util_get_attribute_info(self.table, self.fclass_atts, self.fnum_atts)

        #Used for the parcel query
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0 :
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)

        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0 :
            ois_elem = elements[0] 
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)

        #Getting x/y boundaries of the parcels and number of rows 
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min, self.total_count) = self.util_spatial_boundaries() 
        
        self.rows_left = self.total_count

        #Tracking the parcel blocks that remain and those already used
        self.printOut.pLog("RET- Creating all parcel blocks...")
            
            
        self.block_list = self.util_create_parcel_block(self.x_max, self.y_max, self.x_min, self.y_min) 
        self.set_colors()
        self.used_blocks = []
        
        #Make sure the max/min values didn't leave any rows out
        #(can happen if the x and y attributes are varchars in the metadata)
        self.adjust_borders()
        
        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0
        
        self.table_current_test_rows = []

        #Parcel block information
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds') :
            self.group_max = int(io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max

        self.overall_is_test_list = []
        self.use_as_training = []

        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []
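
# Hedged sketch of the io_info element this constructor reads. The attribute
# names come from the code; the database URL and table/column values are
# hypothetical.
from xml.dom.minidom import parseString

doc = parseString('<io input_db_url="sqlite:///parcels.db"'
                  ' input_table_name="parcels" x_column="x" y_column="y"'
                  ' id_column="pid" train_block_size="20000"/>')
io_info_element = doc.documentElement
print(int(io_info_element.attributes['train_block_size'].value))  # 20000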
Example #8
class Query_manager :
    def __init__(self, io_info_element, logCB = None, progressCB = None) :
        
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)        
        
        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value

        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class') :
            self.fclass_atts = util_get_attribute_list(io_info_element.attributes["force_to_class"].value)

        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric') :
            self.fnum_atts = util_get_attribute_list(io_info_element.attributes["force_to_numeric"].value)

        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size") :
            self.train_size = int(io_info_element.attributes["train_block_size"].value)
        
        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size") :
            self.test_size = int(io_info_element.attributes["test_block_size"].value)

        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)

        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list, self.attributes) = util_get_attribute_info(self.table, self.fclass_atts, self.fnum_atts)

        #Used for the parcel query
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0 :
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)

        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0 :
            ois_elem = elements[0] 
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)

        #Getting x/y boundaries of the parcels and number of rows 
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min, self.total_count) = self.util_spatial_boundaries() 
        
        self.rows_left = self.total_count

        #Tracking the parcel blocks that remain and those already used
        self.printOut.pLog("RET- Creating all parcel blocks...")
            
            
        self.block_list = self.util_create_parcel_block(self.x_max, self.y_max, self.x_min, self.y_min) 
        self.set_colors()
        self.used_blocks = []
        
        #Make sure the max/min values didn't leave any rows out
        #(can happen if the x and y attributes are varchars in the metadata)
        self.adjust_borders()
        
        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0
        
        self.table_current_test_rows = []

        #Parcel block information
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds') :
            self.group_max = int(io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max

        self.overall_is_test_list = []
        self.use_as_training = []

        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []

    #Gets rows that represent a block of parcels
    #Returns None if there aren't any rows left
    def query_rows(self) :

        #FOR GUI
        self.printOut.progress(int((self.rows_left / float(self.total_count))*100))

        #Getting all new data loaded in data structures
        if self.group_count == self.group_max :

            #Reset the group count
            self.group_count = 1

            #Profiling (won't work if distributed) ############
            #Getting information about approximate time left
            if self.first_query_time == None :
                self.first_query_time = time.time()
            else :
                average_time = (time.time() - self.first_query_time) / (self.number_rows_tested)
                
                self.printOut.pLog( "PROFILE- Number of blocks remaining: " + str(len(self.block_list)))
                self.printOut.pLog( "PROFILE- Average time per unit: " +  str(average_time))
                self.printOut.pLog( "PROFILE- Number of rows remaining: " + str(self.rows_left))
                self.printOut.pLog( "PROFILE- Predicted remaining time (in minutes): " + str(int((average_time*(self.rows_left))/60)))
    
            ####################################################
            
            self.printOut.pLog( "RET- Retrieving training and test parcels from remaining: " + str(self.rows_left))      
            
            #Getting a block with a non zero parcel count
            block = None
            while self.block_list != [] and block == None :
                block = self.block_list.pop(0)
                if block.parcel_count == 0 :
                    block = None
    
            if block != None :
                
                self.current_test_block = block                
                training_rows_query = self.util_training_rows(block)
                
                start_time = int(time.time())
                
                #Getting the attribute values from the raw rows
                #Get rows into the proper format
                proper_rows = []            
                id_set = set([])
                for row in training_rows_query :
                    temp_row = {}
                    for attribute in self.attributes :
                        temp_row[attribute] = row[attribute]                                        
                    proper_rows.append(temp_row)
                    id_set.add(row[self.id_attribute])
        
                #Getting test and training rows (test is a subset of training)
                is_test_list = []
                test_number = []
                test_count = 0
                for index in range(len(proper_rows)) :
                    row = proper_rows[index]
    
                    #REQUIRES X and Y attributes to be included in the rows (may be a problem)
                    if block.row_within_block(self.x_attribute, self.y_attribute, row) :
                        is_test_list.append(True)
                        test_number.append(test_count)
                        test_count += 1
                    else :
                        is_test_list.append(False)
                        test_number.append(None)
                
                #Adjust block count (cause borders are modified in some cases)
                block.parcel_count = test_count
                
                self.used_blocks.append(block)
                self.rows_left -= block.parcel_count
                self.number_rows_tested += block.parcel_count
    
                #Adding the extreme values that need to be added to every data set
                #This helps outlier detection and null value predictions
                if self.ois_query_string != None :
                    s = self.table.select(and_(self.ois_query_string, self.query_string )).execute()
                    self.printOut.pLog( "RET- Num extra (rare) rows added: " + str(s.rowcount))
                    for row in s :
                        if row[self.id_attribute] not in id_set :
                            temp_row = {}
                            for attribute in self.attributes :
                                temp_row[attribute] = row[attribute]                                        
                            
                            proper_rows.append(temp_row)
                            is_test_list.append(False)
                            test_number.append(None)
    
                self.current_rows = proper_rows
                self.is_test_list = is_test_list
                self.overall_is_test_list = copy.deepcopy(is_test_list)
                self.test_number = test_number
    
                end_time = int(time.time())
                self.printOut.pLog( "RET- Time loading data structures: " + str(end_time - start_time))
    
            else :
                self.current_rows = []
                self.is_test_list = []
                self.test_number = []

                self.overall_is_test_list = []
                self.use_as_training = []

        #Increment group count
        else :
            self.group_count += 1

        #Use data that exists / loading temporary data structures
        self.is_test_list = []
        self.use_as_training = []

        test_num = 0
        test_count = 0
        train_count = 0

        #Going over every current row
        for index in range(len(self.current_rows)) :
            #if ONE group then all in test
            if self.group_max == 1 :
                if self.overall_is_test_list[index] :
                    self.is_test_list.append(True)
                    test_count += 1
                else :
                    self.is_test_list.append(False)
                    
                self.use_as_training.append(True)
                train_count += 1
            
            #If more than one group then split up test and training sets
            else :
                is_test = self.overall_is_test_list[index]
                if is_test :
                    test_num += 1
    
                #Deciding whether instance will be in the test set
                #Splits test set up
                #MISSING 4 VALUES IN SANITY CHECK
                used_as_test = False
                if is_test and test_num % self.group_max == (self.group_count - 1) : 
                    self.is_test_list.append(True)
                    used_as_test = True
                    test_count += 1
    
                else :
                    self.is_test_list.append(False)
                
                #Deciding whether instance should be in the training data set
                #FIND AN INTELLIGENT WAY TO STOP THE TRAINING SET FROM BEING TOO LARGE
                if not used_as_test :
                    train_count += 1
                    self.use_as_training.append(True)
                else :
                    self.use_as_training.append(False)

        self.printOut.pLog( "RET- Group: " + str(self.group_count))
        self.printOut.pLog( "RET- Test count: " + str(test_count))
        self.printOut.pLog( "RET- Train count: " + str(train_count))

    #Returns 0 if there are no blocks left
    #to be retrieved, otherwise 1
    def number_remaining_blocks(self) :
        if len(self.block_list) == 0 and self.group_count == self.group_max :
            return 0
        else :
            return 1


    #Used to setup basic query string
    def util_create_query_string(self, element):
        
        #Getting dictionary of column objects
        #Creating and clauses for all columns in test criteria combined
        qs = True
        and_list = []
        for or_tag in element.getElementsByTagName('or') :
            
            #Creating or clauses for a given "or list"
            or_list = []
            for elem in or_tag.getElementsByTagName('tc') :
                attribute = elem.attributes['attribute'].value
                type = elem.attributes['type'].value
                value = elem.attributes['value'].value

                #Getting the right form of the value
                vt = elem.attributes['vt'].value
                if vt == "string" :
                    value = str(value)
                elif vt == "int" :
                    value = int(value)
                
                #Creating clause for criteria
                if type == "E" :       
                    or_list.append(self.table.c[attribute] == value)
                elif type == "NE" :
                    or_list.append(self.table.c[attribute] != value)
                elif type == "GT" :
                    or_list.append(self.table.c[attribute] > value)
                elif type == "LT" :
                    or_list.append(self.table.c[attribute] < value)

            if len(or_list) > 0 :
                and_list.append(or_(*or_list))

        #Only make the query string equal to the list if there is something in the list
        if len(and_list) > 0 :   
            qs = and_(*and_list)

        return qs

    #util                                                                                               
    #Represents a single parcel block
    class Parcel_block :
        def __init__(self, x_max, y_max, x_min, y_min, parcel_count) :
            self.x_max = float(x_max)
            self.y_max = float(y_max)
            self.x_min = float(x_min)
            self.y_min = float(y_min)
            self.parcel_count = parcel_count
            
            self.right_border = False
            self.bottom_border = False
            
            #For visual representation
            self.color = None
                        
        #Sets values for whether sides are borders
        def set_border_bools(self, query_manager):
            if self.y_min == query_manager.y_min :
                self.bottom_border = True
            if self.x_max == query_manager.x_max :
                self.right_border = True
            
        def row_within_block(self, x_at, y_at, row) :
            #There are strict inequalities for the lower and right sides of the block
            #UNLESS that side borders the edge of space
            xa = float(row[x_at])
            ya = float(row[y_at])
            rb = self.right_border
            bb = self.bottom_border
            
            if xa >= self.x_min and ((rb and xa <= self.x_max) or (not rb and xa < self.x_max)) :
                if ya <= self.y_max and ((bb and ya >= self.y_min) or (not bb and ya > self.y_min)):
                    return True

            return False

    #util
    #Gets all the training rows (a superset of the test rows)
    def util_training_rows(self, block) :
        (cx_max, cy_max, cx_min, cy_min) = [block.x_max, block.y_max, block.x_min, block.y_min] 
        current_count = block.parcel_count

        if current_count == 0 :
            return [[], []]
        else :
            self.printOut.pLog( "RET- Current count inside training block: " + str(current_count))

            #setting easy variables
            x = self.x_attribute
            y = self.y_attribute
            t = self.table

            #ROOM FOR IMPROVEMENT
            #Make it so that this doesn't terribly overshoot the training size
            count_repeated = 0
            last_count = 0
            select_stat = t.select(and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ))
            while(current_count < self.train_size) :
                change = math.sqrt((self.train_size - current_count) / float(max(self.train_size/10, current_count)))
                cx_min -= (cx_max - cx_min)*change*.1 
                cx_max += (cx_max - cx_min)*change*.1
                cy_min -= (cy_max - cy_min)*change*.1
                cy_max += (cy_max - cy_min)*change*.1
                             
                select_stat = t.select(and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ))
                
                #Getting the number of instances inside the block
                s = select([func.count("*")], and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ), from_obj=[t]).execute()
                block_count = sql_get_agg(s, "int")
                
                last_count = current_count
                current_count = block_count
                
                self.printOut.pLog( "RET- Current count inside training block: " + str(current_count))
                          
                #Protects against cases in which current_count will never be bigger than train_size
                if last_count == current_count :
                    count_repeated += 1
                else :
                    count_repeated = 0
                if count_repeated == 5 :
                    break
            
            #Executing the training query
            s = select_stat.execute()
            
            #Used for parcel visual
            self.current_training_block = self.Parcel_block(cx_max, cy_max, cx_min, cy_min, "(training block)")
            self.current_training_block.color = self.current_test_block.color
            
            return s
         
    #util
    #Separates parcels in a grid-like fashion and creates
    #spatial objects for each grid
    def util_create_parcel_block(self, tx_max, ty_max, tx_min, ty_min) :
        t = self.table
        x = self.x_attribute
        y = self.y_attribute

        #NEEDS TO BE IMPROVED
        #The inequalities should be made strict in a certain way to ensure nothing is redundant
        s = select([func.count("*")], and_(t.c[x] >= tx_min, t.c[x] <= tx_max, t.c[y] >= ty_min, t.c[y] <= ty_max, self.query_string ), from_obj=[t]).execute()
        parcel_count = sql_get_agg(s, "int")

        #ROOM FOR IMPROVEMENT
        #Make it so that very small test blocks aren't created
        if parcel_count > self.test_size :
            x_mid = (tx_max - tx_min) / 2 + tx_min
            y_mid = (ty_max - ty_min) / 2 + ty_min
            
            temp_list = []
            
            #Always splits in such a way that the resulting rectangles are squarish
            x_diff = tx_max - tx_min
            y_diff = ty_max - ty_min
            if x_diff < y_diff :
                #Split horiz
                temp_list.extend(self.util_create_parcel_block(tx_max, ty_max, tx_min, y_mid))
                temp_list.extend(self.util_create_parcel_block(tx_max, y_mid, tx_min, ty_min))
            else :
                #Split vert
                temp_list.extend(self.util_create_parcel_block(tx_max, ty_max, x_mid, ty_min))
                temp_list.extend(self.util_create_parcel_block(x_mid, ty_max, tx_min, ty_min))
                
            return temp_list
        else :
            p = self.Parcel_block(tx_max, ty_max, tx_min, ty_min, parcel_count)
            self.printOut.pLog( "RET- Block size: " + str(parcel_count))

            p.set_border_bools(self)
            return [p]

    #util
    #Returns the max and min x and y coordinate values
    def util_spatial_boundaries(self) :
        self.printOut.pLog( "RET- Finding spatial boundaries of the database...")
    
        t = self.table
    
        #Setting overall values
        (x_max, y_max, x_min, y_min) = [None, None, None, None]
        s = select([func.count("*")], self.query_string, from_obj=[t]).execute()
        total_count = sql_get_agg(s, "int")

        s = select([func.max(t.c[self.x_attribute])]).execute()
        x_max = sql_get_agg(s, "float")
        
        s = select([func.min(t.c[self.x_attribute])]).execute()
        x_min = sql_get_agg(s, "float")
        
        s = select([func.max(t.c[self.y_attribute])]).execute()
        y_max = sql_get_agg(s, "float")
        
        s = select([func.min(t.c[self.y_attribute])]).execute()
        y_min = sql_get_agg(s, "float")

        return [x_max, y_max, x_min, y_min, total_count]
    
    #Creates a list that says whether each value is null or not
    def proc_is_null_list(self, test_object) :
        self.printOut.pLog( "RET- Test Attribute: " + str(test_object.test_attribute))

        is_null_list = [] 
        for i in range(len(self.current_rows)) :
            is_null = False
            for null_dict in test_object.null_value_list :

                value = null_dict["value"]
                type = null_dict["type"]
                row_value = self.current_rows[i][null_dict["attribute"]]
                
                if type == "GT" and row_value > value :
                    is_null = True
                    break
                elif type == "LT" and row_value < value :
                    is_null = True
                    break
                elif type == "E" and row_value == value :
                    is_null = True
                    break
                elif type == "NE" and row_value != value :
                    is_null = True
                    break

            is_null_list.append(is_null)
            
        self.is_null_list = is_null_list
        self.printOut.pLog( "RET- Found " + str(is_null_list.count(True)) + " null labels in whole training blocks")
            

    #makes it so class and num attribute lists only represent attributes being used in tests
    def update_att_lists(self, model_list):
        new_class_list = []
        new_num_list = []
        
        self.printOut.pLog( "RET- Checking that all needed attributes are in the table.")
        
        #finding all attributes being used
        for model in model_list :
            for attribute in model.attributes :
                if attribute in self.class_list :
                    new_class_list.append(attribute)
                elif attribute in self.numeric_list :
                    new_num_list.append(attribute)
                else :
                    self.printOut.pLog( "ERROR: Attribute Not in table- " + attribute)
            
            #Make sure the target attribute is included
            if model.test_attribute in self.class_list :
                new_class_list.append(model.test_attribute)
            elif model.test_attribute in self.numeric_list :
                new_num_list.append(model.test_attribute)
            elif model.test_attribute != None :
                self.printOut.pLog( "ERROR: Target Attribute Not in Table- " + model.test_attribute)

        self.printOut.pLog( "")
        
        self.class_list = new_class_list
        self.numeric_list = new_num_list

    #Finds color for each block
    #(Uses modified color scheme)
    def set_colors(self):
        color_list = ["red", "blue", "green", "yellow"]
        #Recording all touching blocks
        blocks_touching = {}
        for block in self.block_list :
            blocks_touching[block] = set([])
            #Checking which blocks are touching
            for ob in self.block_list :
                #left or right
                if block.x_min == ob.x_max or block.x_max == ob.x_min :
                    if not block.y_max <= ob.y_min and not block.y_min >= ob.y_max :
                        blocks_touching[block].add(ob)
                #top or bottom      
                elif block.y_min == ob.y_max or block.y_max == ob.y_min :
                    if not block.x_max <= ob.x_min and not block.x_min >= ob.x_max :
                        blocks_touching[block].add(ob)
        
        #Randomly coloring blocks,
        #avoiding as many color conflicts as possible
        conflict_count = 0
        for block in self.block_list :
            available_colors = copy.deepcopy(color_list)
            for nb in blocks_touching[block] :
                if nb.color in available_colors :
                    available_colors.remove(nb.color)
    
            if len(available_colors) > 0 :
                #Picking a color that a neighbor doesn't have
                index = random.randint(0, len(available_colors) - 1)
                block.color = available_colors[index]
            else :
                #Picking a random color
                index = random.randint(0, len(color_list) - 1)
                block.color = color_list[index]
                conflict_count += 1
        self.printOut.pLog( "RET- Color conflicts: " + str(conflict_count)) 
        
    #For cases in which location variables are strings (in the database)    
    def adjust_borders(self):
        
        new_x_max = round_up(self.x_max)        
        new_x_min = round_down(self.x_min)
        new_y_max = round_up(self.y_max)
        new_y_min = round_down(self.y_min)
        
        for block in self.block_list :
            if block.x_max == self.x_max :
                block.x_max = new_x_max
            if block.y_max == self.y_max :
                block.y_max = new_y_max
            if block.x_min == self.x_min :
                block.x_min = new_x_min
            if block.y_min == self.y_min :
                block.y_min = new_y_min
        
        self.x_max = new_x_max
        self.y_max = new_y_max
        self.x_min = new_x_min
        self.y_min = new_y_min        
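
# Hedged sketch of the and_/or_ clause composition used by
# util_create_query_string above, against a hypothetical two-column table;
# str() on the clause shows the generated SQL fragment.
from sqlalchemy import MetaData, Table, Column, Integer, and_, or_

t = Table('parcels', MetaData(), Column('zone', Integer), Column('acres', Integer))
qs = and_(or_(t.c['zone'] == 1, t.c['zone'] == 2), t.c['acres'] > 5)
print(str(qs))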
Example #9
class LDOF_model :

    def __init__(self, xml_elem, logCB = None, progressCB = None) :

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING) 

        #KNN tuning parameters
        self.k = 10 #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1

        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)

        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None
        
        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []
        
        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes :
            self.initialized_weights[attribute] = 1

        #Attributes whose values will be log-transformed to produce better results
        if xml_elem.hasAttribute('log_trans_attributes') :
            log_trans_string = xml_elem.attributes['log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)

        self.null_value_list = [] #NOT USED

        #Random information
        self.test_type = "LDOF"

                     
    def get_predictions(self, data_profiler) :

        #Creates (or clears) files that are used by the binary 
        self.initialize_files()

        #Put data into files
        self.create_input_files(data_profiler)

        #Create the configuration file
        self.util_create_config(data_profiler)
        
        #Getting list of predictions and confidences
        #(This runs the binary with the config as a input)
        os.system(BINARY + ' ' + self.config_file  + ' >> ' + self.results_file)
        (lof_lists, lof_id_lists) = self.util_get_results(self.results_file)        
                    
        #Removes files used by the binary
        self.remove_files()
                    
        return Test_result(lof_lists, lof_id_lists)

    
    #util function
    #Gets the results from the result file
    #Also removes the files that were being used
    def util_get_results(self, filename) :
        start_time = int(time.time())
        self.printOut.pLog( "LDOF- Getting results" )
        
        f = open(filename)
        predictions = []
        confidences = []
    
        lof_lists = []
        lof_id_lists = []
    
        results_portion = False
        index = 0
        for line in f.readlines() :
            
            #Find better way to identify comments
            if line.find('#') < 0 and results_portion:
                line_list = line.split(', ')
                
                predictions.append(0)
                confidences.append(0)
                
                #Getting lof values for the line
                lof_list = []
                lof_id_list = []
                num_check = (self.k*2) / self.num_mod
                for i in range(num_check + 1) :
                    if i == 0 :
                        id = line_list[i]
                    elif i % 2 == 1 :
                        lof_list.append(1000*float(line_list[i])) #Don't adjust value like this
                    else :
                        lof_id_list.append(int(line_list[i]))
                        
                lof_lists.append(lof_list)
                lof_id_lists.append(lof_id_list)
                
                index += 1

            else :
                stripped = line.rstrip('\n')
                self.printOut.pLog("LDOF- " + stripped)
    
            #Finding the portion of the output file that has results
            if line.find("Printing final results:") > -1 :
                results_portion = True
        
        end_time = int(time.time())
        self.printOut.pLog( "LDOF- Time reading results: " + str( end_time - start_time ))
        
        return [lof_lists, lof_id_lists]
    
    #util function
    #Creates a configuration file that is used by the 
    #binary in order to run the test
    def util_create_config(self, data_profiler) :
           
        #Modifying the list of attribute ids
        #making sure the ones that are connected are represented as such
        #This means they have the same id in the configuration file
        #Also, the weights that are specified are given as well
        weights_list = []
        for index in range(len(data_profiler.attribute_id_list)) :
            id = data_profiler.attribute_id_list[index]
            attribute = data_profiler.attribute_id_dict[id]

            #Setting right IDs
            for combo in self.attribute_combinations :
                if attribute in combo :
                    data_profiler.attribute_id_list[index] = data_profiler.id_attribute_dict[combo[0]]
            
            #Setting right weights
            if attribute in self.initialized_weights :
                weights_list.append(self.initialized_weights[attribute])
            else :
                weights_list.append(0)
           
        #Printing information into the configuration file
        f = open(self.config_file, 'w')
    
        f.write("test_file: " + self.test_filename + "\n")
        f.write("train_file: " + self.train_filename + "\n")
    
        f.write("number_neighbors: " + str(self.k) + "\n")
        f.write("number_display: " + str(self.num_display) + "\n")
        f.write("number_mod: " + str(self.num_mod) + "\n")
    
        f.write("columns_attribute:")
        for i in data_profiler.attribute_id_list :
            f.write(" " + str(i))
        f.write("\n")
        
        f.write("initial_weights:")
        for weight in weights_list :
            f.write(" " + str(weight))
        f.write("\n")
        
        f.close()
    
    #util function
    #Gets information from the data_profiler object to files
    def create_input_files(self, data_profiler):
        start_time = int(time.time())
              
        #Just loading data structures
        test_labels = []
        test_samples = []
        test_ids = []
        
        train_labels = []
        train_samples = []
        train_ids = []
      
        for i in range(len(data_profiler.labels)) :
            #Adding instances that are going to be tested to the list
            if data_profiler.query_manager.is_test_list[i] :
                test_labels.append(data_profiler.labels[i])
                test_samples.append(data_profiler.samples[i])
                test_ids.append(i)
            
            #Adding non null instances to the training set
            if not data_profiler.query_manager.is_null_list[i]:
                train_labels.append(data_profiler.labels[i])
                train_samples.append(data_profiler.samples[i])
                train_ids.append(i)
        
        #Create test and train files
        self.util_create_valid_file_from_samples(test_labels, test_samples, test_ids, self.test_filename)
        self.util_create_valid_file_from_samples(train_labels, train_samples, train_ids, self.train_filename)

        #Isn't used but something needs to be passed to KNN        
        self.util_create_valid_file_from_samples([], [], [], self.val_filename)
        
        #Re-setting transformation information in data profiler
        data_profiler.log_trans_atts = set([])
    
        end_time = int(time.time())
        self.printOut.pLog( "LDOF- Time creating input files: " + str( end_time - start_time ))
    
    #util function
    #Creates files that will be used by the KNN binary                                                                     
    def util_create_valid_file_from_samples(self, labels, samples, id_list, filename) :
        f = open(filename, 'w')
        index = 0
        for sample in samples :
            line = str( labels[index] )
            line += " " + str(id_list[index])
            keys = sample.keys()
            keys.sort()
            for key in keys :
                line += " " + str(key) + ":" + str(sample[key])
            line += "\n"
            f.write(line)
            index += 1
    
        f.close()



    def initialize_files(self):

        #Creating unique file names
        self.test_filename = 'LDOF_TEST_INPUT_FILE' + str(int(time.time())) + '.TXT'
        self.train_filename = 'LDOF_TRAIN_INPUT_FILE' + str(int(time.time())) + '.TXT'
        self.val_filename = 'LDOF_VAL_FILE' + str(int(time.time())) + '.TXT'        
        self.results_file = 'LDOF_RESULTS_FILE' + str(int(time.time())) + '.TXT'
        self.config_file = 'LDOF' + str(int(time.time())) + '.cfg'   
        
        #Initializing files
        os.system('echo \"\" > ' + self.config_file)
        os.system('echo \"\" > ' + self.results_file)
        os.system('echo \"\" > ' + self.test_filename)
        os.system('echo \"\" > ' + self.train_filename)
        os.system('echo \"\" > ' + self.val_filename)

    def remove_files(self):
        os.remove(self.config_file)
        os.remove(self.results_file)
        os.remove(self.test_filename)
        os.remove(self.train_filename) 
        os.remove(self.val_filename)
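
# Hedged sketch of the sparse "label id key:value ..." line format written by
# util_create_valid_file_from_samples above (sample values are hypothetical).
def format_sample_line(label, row_id, sample):
    parts = [str(label), str(row_id)]
    parts += ["%s:%s" % (key, sample[key]) for key in sorted(sample)]
    return " ".join(parts)

print(format_sample_line(0, 7, {1: 3.0, 2: 0.5}))  # 0 7 1:3.0 2:0.5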
Example #10
class LDOF_model:
    def __init__(self, xml_elem, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        #KNN tuning parameters
        self.k = 10  #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1

        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)

        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None

        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []

        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes:
            self.initialized_weights[attribute] = 1

        #Attributes whose values will be log-transformed to produce better results
        if xml_elem.hasAttribute('log_trans_attributes'):
            log_trans_string = xml_elem.attributes[
                'log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)

        self.null_value_list = []  #NOT USED

        #Random information
        self.test_type = "LDOF"

    def get_predictions(self, data_profiler):

        #Creates (or clears) files that are used by the binary
        self.initialize_files()

        #Put data into files
        self.create_input_files(data_profiler)

        #Create the configuration file
        self.util_create_config(data_profiler)

        #Getting list of predictions and confidences
        #(This runs the binary with the config as a input)
        os.system(BINARY + ' ' + self.config_file + ' >> ' + self.results_file)
        (lof_lists, lof_id_lists) = self.util_get_results(self.results_file)

        #Removes files used by the binary
        self.remove_files()

        return Test_result(lof_lists, lof_id_lists)

    #util function
    #Gets the results from the result file
    #(the files themselves are cleaned up later by remove_files)
    def util_get_results(self, filename):
        start_time = int(time.time())
        self.printOut.pLog("LDOF- Getting results")

        f = open(filename)
        predictions = []
        confidences = []

        lof_lists = []
        lof_id_lists = []

        results_portion = False
        index = 0
        for line in f.readlines():

            #Find better way to identify comments
            if line.find('#') < 0 and results_portion:
                line_list = line.split(', ')

                predictions.append(0)
                confidences.append(0)

                #Getting lof values for the line
                lof_list = []
                lof_id_list = []
                num_check = (self.k * 2) // self.num_mod  #integer division
                for i in range(num_check + 1):
                    if i == 0:
                        id = line_list[i]
                    elif i % 2 == 1:
                        lof_list.append(
                            1000 *
                            float(line_list[i]))  #Don't adjust value like this
                    else:
                        lof_id_list.append(int(line_list[i]))

                lof_lists.append(lof_list)
                lof_id_lists.append(lof_id_list)

                index += 1

            else:
                stripped = line.rstrip('\n')
                self.printOut.pLog("LDOF- " + stripped)

            #Finding the portion of the output file that has results
            if line.find("Printing final results:") > -1:
                results_portion = True

        f.close()

        end_time = int(time.time())
        self.printOut.pLog("LDOF- Time reading results: " +
                           str(end_time - start_time))

        return [lof_lists, lof_id_lists]

    #util function
    #Creates a configuration file that is used by the
    #binary in order to run the test
    def util_create_config(self, data_profiler):

        #Modifying the list of attribute ids
        #making sure the ones that are connected are represented as such
        #This means they have the same id in the configuration file
        #Also, the weights that are specified are given as well
        weights_list = []
        for index in range(len(data_profiler.attribute_id_list)):
            id = data_profiler.attribute_id_list[index]
            attribute = data_profiler.attribute_id_dict[id]

            #Setting right IDs
            for combo in self.attribute_combinations:
                if attribute in combo:
                    data_profiler.attribute_id_list[
                        index] = data_profiler.id_attribute_dict[combo[0]]

            #Setting right weights
            if attribute in self.initialized_weights:
                weights_list.append(self.initialized_weights[attribute])
            else:
                weights_list.append(0)

        #Printing information into the configuration file
        f = open(self.config_file, 'w')

        f.write("test_file: " + self.test_filename + "\n")
        f.write("train_file: " + self.train_filename + "\n")

        f.write("number_neighbors: " + str(self.k) + "\n")
        f.write("number_display: " + str(self.num_display) + "\n")
        f.write("number_mod: " + str(self.num_mod) + "\n")

        f.write("columns_attribute:")
        for i in data_profiler.attribute_id_list:
            f.write(" " + str(i))
        f.write("\n")

        f.write("initial_weights:")
        for weight in weights_list:
            f.write(" " + str(weight))
        f.write("\n")

        f.close()

    #util function
    #Gets information from the data_profiler object to files
    def create_input_files(self, data_profiler):
        start_time = int(time.time())

        #Just loading data structures
        test_labels = []
        test_samples = []
        test_ids = []

        train_labels = []
        train_samples = []
        train_ids = []

        for i in range(len(data_profiler.labels)):
            #Adding instances that are going to be tested to the list
            if data_profiler.query_manager.is_test_list[i]:
                test_labels.append(data_profiler.labels[i])
                test_samples.append(data_profiler.samples[i])
                test_ids.append(i)

            #Adding non-null instances to the training set
            if not data_profiler.query_manager.is_null_list[i]:
                train_labels.append(data_profiler.labels[i])
                train_samples.append(data_profiler.samples[i])
                train_ids.append(i)

        #Create test and train files
        self.util_create_valid_file_from_samples(test_labels, test_samples,
                                                 test_ids, self.test_filename)
        self.util_create_valid_file_from_samples(train_labels, train_samples,
                                                 train_ids,
                                                 self.train_filename)

        #Not used, but something must be passed to the KNN binary
        self.util_create_valid_file_from_samples([], [], [], self.val_filename)

        #Resetting the transformation information in the data profiler
        data_profiler.log_trans_atts = set([])

        end_time = int(time.time())
        self.printOut.pLog("LDOF- Time creating input files: " +
                           str(end_time - start_time))

    #util function
    #Creates files that will be used by the KNN binary
    def util_create_valid_file_from_samples(self, labels, samples, id_list,
                                            filename):
        f = open(filename, 'w')
        index = 0
        for sample in samples:
            line = str(labels[index])
            line += " " + str(id_list[index])
            keys = sorted(sample.keys())
            for key in keys:
                line += " " + str(key) + ":" + str(sample[key])
            line += "\n"
            f.write(line)
            index += 1

        f.close()

    def initialize_files(self):

        #Creating unique file names
        self.test_filename = 'LDOF_TEST_INPUT_FILE' + str(int(
            time.time())) + '.TXT'
        self.train_filename = 'LDOF_TRAIN_INPUT_FILE' + str(int(
            time.time())) + '.TXT'
        self.val_filename = 'LDOF_VAL_FILE' + str(int(time.time())) + '.TXT'
        self.results_file = 'LDOF_RESULTS_FILE' + str(int(
            time.time())) + '.TXT'
        self.config_file = 'LDOF' + str(int(time.time())) + '.cfg'

        #Initializing files
        os.system('echo \"\" > ' + self.config_file)
        os.system('echo \"\" > ' + self.results_file)
        os.system('echo \"\" > ' + self.test_filename)
        os.system('echo \"\" > ' + self.train_filename)
        os.system('echo \"\" > ' + self.val_filename)

    def remove_files(self):
        os.remove(self.config_file)
        os.remove(self.results_file)
        os.remove(self.test_filename)
        os.remove(self.train_filename)
        os.remove(self.val_filename)
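
#Editor's note: a minimal, hypothetical sketch (not in the original source)
#of the result-line format util_get_results expects after the
#"Printing final results:" marker: a comma-separated row id followed by
#alternating LOF-value / neighbor-id fields.
def demo_parse_ldof_line(line):
    fields = line.strip().split(', ')
    row_id = int(fields[0])
    lof_values = [float(v) for v in fields[1::2]]
    neighbor_ids = [int(v) for v in fields[2::2]]
    return row_id, lof_values, neighbor_ids

#demo_parse_ldof_line("17, 1.5, 3, 1.2, 8") -> (17, [1.5, 1.2], [3, 8])
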
class Cat_model :
    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB = None, progressCB = None) :

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)        
        
        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        self.test_classifier = "weka.classifiers.trees.J48"
        if xml_elem.hasAttribute("classifier") :
            self.test_classifier = xml_elem.attributes["classifier"].value

        self.test_options = ""
        if xml_elem.hasAttribute("options") :
            self.test_options = xml_elem.attributes["options"].value

        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True

            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
            
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1 :
                self.using_pca = True
                
            #Attributes that the search class starts with (Not used with PCA)
            self.start_attributes = []
            if xml_elem.hasAttribute('fs_start_attributes') :
                self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)            


        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)

        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0 :
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v') :
    
                attribute = element.attributes['attribute'].value
                type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
    
                null_dict = {"attribute" : attribute, "type" : type}
    
                if vt == "int" :
                    null_dict["value"] = int(value)
                elif vt == "string" :
                    null_dict["value"] = str(value)
            
                self.null_value_list.append(null_dict)

        #Simply defined null values
        if xml_elem.hasAttribute("null_value") :
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {"attribute" : self.test_attribute, "type" : "E", "value" : null_value}
            self.null_value_list.append(null_dict)

        
        #Information about the model
        self.test_type = "Cat"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS

    def get_predictions(self, query_manager) :
               
        #Filenames
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"

        #Creates (or clears) files that are used by the binary 
        IS_NUM_TEST = False
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        cat_att_mapping = file_creation_info["cat_att_mapping"]

        #If the run only replaces null values and the test set
        #contains no null values, terminate early
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0 :
            os.remove(test_filename)
            os.remove(train_filename)            
            return None

        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection :
            (test_filename, train_filename, selected_attributes) = feature_selection(test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes

        #Running tests
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join( path, "models", "weka.jar")
        path_spef_libsvm = os.path.join( path, "models", "libsvm.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + ":" +  path_spef_libsvm  + " " + self.test_classifier + " -d " + model_name  + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " +  path_spef_weka + ":" +  path_spef_libsvm  + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename

        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
                    
        #Gathering results for each test instance
        self.printOut.pLog( "PRED- Getting results")
        
        f = open(result_filename)
        prediction_list = []
        probability_list = []
        
        correctly_imputed = 0
        non_null_count = 0

        index = 0
        collect_results = False
        for line in f.readlines() :
            line_list = line.split()
            
            #Getting results
            if collect_results and len(line_list) > 1:
                tuple = line_list[2].split(":")
                prediction = str(tuple[1])
                
                if not target_value_null[index] and prediction == str(target_values[index]) :
                    correctly_imputed += 1
                if not target_value_null[index] :
                    non_null_count += 1
                
                prediction_list.append(prediction)
                probability_list.append(1)
                index += 1
            
            #Seeing if you are at the results portion of the file
            if line.find("inst#") > -1 :
                collect_results = True

        f.close()
        
        #Gathering accuracy estimations
        f = open(train_log)
        cross_val_info = False
        for line in f.readlines() :
            #Getting all performance related metrics
            if cross_val_info :
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')
        
                if line.find('Correctly Classified Instances') > -1 or line.find('Kappa statistic') > -1:
                    list = line.split('  ')
                    if len(list) > 1:
                        attribute = list[0]
                        value = list[len(list) - 1]
                        value = float(value)
                        acc_est[attribute] = value

                    
            #Finding cross validation info                                                                   
            if line.find('Stratified cross-validation') > -1 :
                cross_val_info = True
            elif line.find('Confusion Matrix') > -1 :
                cross_val_info = False

        f.close()
        
        #Actual Performance Stats
        acc_est["Actual Correctly Imputed Percent"] = (float(correctly_imputed) / non_null_count) * 100
        
        #Removing files used for test
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(model_name)

        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num
        acc_est["block number"] = len(query_manager.used_blocks)


        return Test_result(self.test_type, self.test_attribute, prediction_list, probability_list, acc_est)
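
#Editor's note: a hedged sketch (not from the original source) of the Weka
#"-p 0" prediction rows Cat_model.get_predictions parses above. After the
#"inst#" header line, each row carries the instance number, the actual and
#predicted classes as "index:label" pairs, and an error flag; the exact
#layout can vary between Weka versions.
def demo_parse_weka_prediction(line):
    fields = line.split()
    #fields ~ ['1', '2:yes', '1:no', '+']; the third field is the prediction
    return fields[2].split(":")[1]

#demo_parse_weka_prediction("1 2:yes 1:no +") -> 'no'
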
Example #12
0
class Data_profiler :

    def __init__(self, query_manager, logCB = None, progressCB = None) :
        
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)        


        self.query_manager = query_manager

        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)
        
        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None
        
        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []
        
        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}

    #util function
    #creates dictionary of possible values for each column in the table
    def get_possible_values(self, query_manager) :
    
        #Getting info from the query manager
        class_list = query_manager.class_list
        numeric_list = query_manager.numeric_list
        rows = query_manager.current_rows
    
        start_time = int(time.time())
        self.printOut.pLog("PREP- Class columns count: " + str(len(class_list)))
        self.printOut.pLog("PREP- Num columns count: " + str(len(numeric_list)))
    
        #Initializing data structures for storing info
        class_result_dict = {}
        class_att_value_count = {}
        numeric_result_dict = {}
        for c in class_list :
            class_result_dict[c] = []
            class_att_value_count[c] = {}
        for c in numeric_list :
            numeric_result_dict[c] = [None, None]
    
        #Finding all possible values for each column
        for row in rows :
    
            #gathering class info
            for c_name, list in class_result_dict.iteritems() :
                if c_name in row :
                    value = row[c_name]
        
                    #Getting information on class attribute values
                    if value not in list :
                        list.append(value)
                        class_att_value_count[c_name][value] = 1 #May need to worry about the value being Null
                    else :
                        class_att_value_count[c_name][value] += 1
        
            #gathering numeric info
            for c_name, list in numeric_result_dict.iteritems() :
                if c_name in row :
                    
                    value = row[c_name]
                    if value == "" or value == None:
                        value = 0 #May want to think of a more appropriate value
                    else :
                        value = float( value )
        
                    #finding min
                    if value != "" and (list[0] > value or list[0] == None ) :
                        list[0] = value
                    
                    #finding max
                    if value != "" and (list[1] < value or list[1] == None) :
                        list[1] = value
    
        #Deciding on the weight based on the count
        class_att_value_weight = {}
        for att_name, values in class_att_value_count.iteritems() :
            #Finding total number of values
            overall_count = 0
            for value, count in values.iteritems() :
                overall_count += count
            
            #Setting weights
            class_att_value_weight[att_name] = {}
            for value, count in values.iteritems() :
                class_att_value_weight[att_name][value] = float(count) / overall_count #Cast before dividing; integer division would truncate to 0
            
            
        self.numeric_result_dict = numeric_result_dict
        self.class_result_dict = class_result_dict
        self.class_att_value_weight = class_att_value_weight
    
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time getting values: " + str(end_time - start_time))
    
    #Prepares the data that will be used with SVM or KNN
    def load_data_structures(self, target, attributes) :
    
        start_time = int(time.time())
    
        #Getting list of columns, but making sure the columns are actually there
        column_list_local = []
        for attribute in attributes :
            if attribute in self.class_result_dict or attribute in self.numeric_result_dict :
                column_list_local.append(attribute)
        
        (class_lookup_table, id_lookup_table)  = self.util_get_class_lookup_table()
        
        self.label_id_lookup_table = id_lookup_table #Used by SVM to piece together test results
        self.class_lookup_table = class_lookup_table #Used by SVM to piece together test results
    
        labels = []
        samples = []
        
        #Getting all needed information from all rows
        for j in range(len(self.query_manager.current_rows)) :
            
            row = self.query_manager.current_rows[j]
            
            #for class target attributes
            value = row[target]
            if target in class_lookup_table :
                labels.append(class_lookup_table[target][value])
        
            #for numeric target attributes (might want to scale label)
            else :
                if value == "" or value == None:
                    value = -1
                labels.append(float(value)) #Numeric labels are kept as floats
        
            #getting sample data
            index = 0
            sample = {}
            for attribute in column_list_local :
                if attribute in row :
                    if attribute in class_lookup_table :
                        value = row[attribute]
                        attribute_value = class_lookup_table[attribute][value]
                            
                        for i in range( len( self.class_result_dict[attribute] ) ) :
                            if attribute_value == i :
                                #sample[index] = 0.5 #1
                                sample[index] = self.class_att_value_weight[attribute][value] #MAKE IT ONLY FOR LDOF
                                
                            index += 1
                            
                    elif attribute in self.numeric_result_dict :
                        value = row[attribute]
                        if value == "" or value == None:
                            value = 0
            
                        scaled = 0
                        max = self.numeric_result_dict[attribute][1]
                        min = self.numeric_result_dict[attribute][0]
            
                        #Transforming specified columns
                        if attribute in self.log_trans_atts :
                            value = math.log(value + 1)
                                
                            #Scaling values
                            denominator = math.log(max + 1) - math.log(min + 1)
                            if denominator > 0 :
                                    
                                #Scaling all attributes between 0 and 1
                                numerator = float( value ) - math.log(min + 1)
                                scaled =  numerator / denominator
                                                
                        else :
                                
                            #Non transformed columns
                            #Scaling values
                            denominator = max - min
                            if denominator > 0 :
                                    
                                #Scaling all attributes between 0 and 1
                                numerator = float( value ) - min
                                scaled =  numerator / denominator
                                    
                                
                        sample[index] = scaled
                        index += 1 
    
            samples.append(sample)
    
        #Used for KNN
        self.printOut.pLog( "PREP- Dimension / column mapping")

        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
        self.attribute_id_list = []
        id = 0
        for attribute in column_list_local :
            self.printOut.pLog( "PREP- ID: " + str(id) + ", Attribute: " + attribute)
        
            if attribute in class_lookup_table :
                for att_value in class_lookup_table[attribute].keys() :
                    self.attribute_id_list.append(id)
                self.attribute_id_dict[id] = attribute
                self.id_attribute_dict[attribute] = id 
            else :
                self.attribute_id_dict[id] = attribute
                self.id_attribute_dict[attribute] = id
                self.attribute_id_list.append(id)
            id += 1
            
        #Setting values for object variables / lists
        self.labels = labels
        self.samples = samples
    
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time loading data structures: " + str( end_time - start_time))
       
    #Setting up all the data for the test
    def prepare_test_data(self, test_object) : 
        
        #Making the data in the data profiler formatted correctly for the
        #Given test attribute and attributes used for the test           
        self.load_data_structures(test_object.test_attribute, test_object.attributes)

        #Setting transformation information in data profiler
        #only do this for KNN attributes
        if test_object.test_type == "KNN" :
            self.log_trans_atts = set(test_object.log_trans_atts)
    
        #Processing information about which rows are null
        self.query_manager.proc_is_null_list(test_object)
    
    
    #util function
    def util_get_class_lookup_table(self) :
        class_lookup_table = {}
        id_lookup_table = {}
        for class_name, values in self.class_result_dict.iteritems() :
            index = 0
            class_temp = {}
            id_temp = {}
            for value in values :
                class_temp[value] = index
                id_temp[index] = value
                index += 1
            class_lookup_table[class_name] = class_temp
            id_lookup_table[class_name] = id_temp
    
        return [class_lookup_table, id_lookup_table]        
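
#Editor's note: a standalone sketch (added for illustration) of the numeric
#scaling load_data_structures applies above: plain min-max scaling into
#[0, 1], or log(x + 1) scaling first when the attribute appears in
#log_trans_atts.
import math

def demo_scale(value, att_min, att_max, log_transform=False):
    if log_transform:
        value = math.log(value + 1)
        att_min = math.log(att_min + 1)
        att_max = math.log(att_max + 1)
    denominator = att_max - att_min
    if denominator > 0:
        return (float(value) - att_min) / denominator
    return 0

#demo_scale(50, 0, 100)      -> 0.5
#demo_scale(99, 0, 99, True) -> 1.0
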
Example #13
0
class Cat_model:
    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        self.test_classifier = "weka.classifiers.trees.J48"
        if xml_elem.hasAttribute("classifier"):
            self.test_classifier = xml_elem.attributes["classifier"].value

        self.test_options = ""
        if xml_elem.hasAttribute("options"):
            self.test_options = xml_elem.attributes["options"].value

        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True

            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes[
                "fs_evaluation_class"].value

            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1:
                self.using_pca = True

            #Attributes that the search class starts with (Not used with PCA)
            self.start_attributes = []
            if xml_elem.hasAttribute('fs_start_attributes'):
                self.start_attributes = util_get_attribute_list(
                    xml_elem.attributes['fs_start_attributes'].value)

        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)

        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0:
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v'):

                attribute = element.attributes['attribute'].value
                type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value

                null_dict = {"attribute": attribute, "type": type}

                if vt == "int":
                    null_dict["value"] = int(value)
                elif vt == "string":
                    null_dict["value"] = str(value)

                self.null_value_list.append(null_dict)

        #Simply defined null values
        if xml_elem.hasAttribute("null_value"):
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {
                "attribute": self.test_attribute,
                "type": "E",
                "value": null_value
            }
            self.null_value_list.append(null_dict)

        #Information about the model
        self.test_type = "Cat"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS

    def get_predictions(self, query_manager):

        #Filenames
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"

        #Creates (or clears) files that are used by the binary
        IS_NUM_TEST = False
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca,
                                                test_filename, train_filename,
                                                query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        cat_att_mapping = file_creation_info["cat_att_mapping"]

        #If the run only replaces null values and the test set
        #contains no null values, terminate early
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0:
            os.remove(test_filename)
            os.remove(train_filename)
            return None

        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection:
            (test_filename, train_filename,
             selected_attributes) = feature_selection(test_filename,
                                                      train_filename,
                                                      query_manager,
                                                      file_creation_info, self,
                                                      IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes

        #Running tests
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join(path, "models", "weka.jar")
        path_spef_libsvm = os.path.join(path, "models", "libsvm.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -d " + model_name + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename

        self.printOut.pLog("PRED- Training model")
        os.system(train_string)
        self.printOut.pLog("PRED- Making predictions")
        os.system(test_string)

        #Gathering results for each test instance
        self.printOut.pLog("PRED- Getting results")

        f = open(result_filename)
        prediction_list = []
        probability_list = []

        correctly_imputed = 0
        non_null_count = 0

        index = 0
        collect_results = False
        for line in f.readlines():
            line_list = line.split()

            #Getting results
            if collect_results and len(line_list) > 1:
                tuple = line_list[2].split(":")
                prediction = str(tuple[1])

                if not target_value_null[index] and prediction == str(
                        target_values[index]):
                    correctly_imputed += 1
                if not target_value_null[index]:
                    non_null_count += 1

                prediction_list.append(prediction)
                probability_list.append(1)
                index += 1

            #Seeing if you are at the results portion of the file
            if line.find("inst#") > -1:
                collect_results = True

        f.close()

        #Gathering accuracy estimations
        f = open(train_log)
        cross_val_info = False
        for line in f.readlines():
            #Getting all performance related metrics
            if cross_val_info:
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')

                if line.find('Correctly Classified Instances'
                             ) > -1 or line.find('Kappa statistic') > -1:
                    list = line.split('  ')
                    if len(list) > 1:
                        attribute = list[0]
                        value = list[len(list) - 1]
                        value = float(value)
                        acc_est[attribute] = value

            #Finding cross validation info
            if line.find('Stratified cross-validation') > -1:
                cross_val_info = True
            elif line.find('Confusion Matrix') > -1:
                cross_val_info = False

        f.close()

        #Actual Performance Stats
        acc_est["Actual Correctly Imputed Percent"] = (
            float(correctly_imputed) / non_null_count) * 100

        #Removing files used for test
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(model_name)

        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num
        acc_est["block number"] = len(query_manager.used_blocks)

        return Test_result(self.test_type, self.test_attribute,
                           prediction_list, probability_list, acc_est)
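
#Editor's note: a minimal, hypothetical sketch (not in the original source)
#of the train-log parsing above: inside the "Stratified cross-validation"
#section, summary rows such as "Correctly Classified Instances" end with a
#numeric value (sometimes followed by " %") that becomes the accuracy
#estimate keyed by the row's label.
def demo_parse_summary_row(line):
    line = line.rstrip('\n').rstrip(' %')
    fields = line.split('  ')
    if len(fields) > 1:
        return fields[0], float(fields[-1])
    return None

#demo_parse_summary_row(
#    "Correctly Classified Instances        8113               81.13   %")
#-> ('Correctly Classified Instances', 81.13)
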
Example #14
0
    def __init__(self, io_info_element, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value

        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class'):
            self.fclass_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_class"].value)

        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric'):
            self.fnum_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_numeric"].value)

        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size"):
            self.train_size = int(
                io_info_element.attributes["train_block_size"].value)

        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size"):
            self.test_size = int(
                io_info_element.attributes["test_block_size"].value)

        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)

        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list,
         self.attributes) = util_get_attribute_info(self.table,
                                                    self.fclass_atts,
                                                    self.fnum_atts)

        #Used for the parcel query
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0:
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)

        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0:
            ois_elem = elements[0]
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)

        #Getting x/y boundaries of the parcels and number of rows
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min,
         self.total_count) = self.util_spatial_boundaries()

        self.rows_left = self.total_count

        #Information that is being stored about the number of parcel blocks remaining and used
        self.printOut.pLog("RET- Creating all parcel blocks...")

        self.block_list = self.util_create_parcel_block(
            self.x_max, self.y_max, self.x_min, self.y_min)
        self.set_colors()
        self.used_blocks = []

        #Making sure the max/min values didn't leave any rows out
        #(can happen if the x and y attributes are varchars in the metadata)
        self.adjust_borders()

        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0

        self.table_current_test_rows = []

        #Parcel block information
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds'):
            self.group_max = int(
                io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max

        self.overall_is_test_list = []
        self.use_as_training = []

        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []
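
#Editor's note: a self-contained sketch (added for illustration, sharing the
#boundary redundancy of the original and assuming the points are not all
#identical) of the recursive blocking strategy util_create_parcel_block
#uses: keep halving the bounding box along its longer side until each block
#holds at most max_count points, which keeps the blocks roughly square.
def demo_split_blocks(points, x_max, y_max, x_min, y_min, max_count):
    inside = [(x, y) for (x, y) in points
              if x_min <= x <= x_max and y_min <= y <= y_max]
    if len(inside) <= max_count:
        return [(x_max, y_max, x_min, y_min, len(inside))]
    if (x_max - x_min) < (y_max - y_min):
        y_mid = (y_max - y_min) / 2.0 + y_min  #split horizontally
        return (demo_split_blocks(inside, x_max, y_max, x_min, y_mid, max_count) +
                demo_split_blocks(inside, x_max, y_mid, x_min, y_min, max_count))
    x_mid = (x_max - x_min) / 2.0 + x_min      #split vertically
    return (demo_split_blocks(inside, x_max, y_max, x_mid, y_min, max_count) +
            demo_split_blocks(inside, x_mid, y_max, x_min, y_min, max_count))
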
Example #15
0
class Query_manager:
    def __init__(self, io_info_element, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value

        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class'):
            self.fclass_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_class"].value)

        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric'):
            self.fnum_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_numeric"].value)

        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size"):
            self.train_size = int(
                io_info_element.attributes["train_block_size"].value)

        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size"):
            self.test_size = int(
                io_info_element.attributes["test_block_size"].value)

        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)

        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list,
         self.attributes) = util_get_attribute_info(self.table,
                                                    self.fclass_atts,
                                                    self.fnum_atts)

        #Used for the parcel query
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0:
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)

        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0:
            ois_elem = elements[0]
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)

        #Getting x/y boundaries of the parcels and number of rows
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min,
         self.total_count) = self.util_spatial_boundaries()

        self.rows_left = self.total_count

        #Information that is being stored about the number of parcel blocks remaining and used
        self.printOut.pLog("RET- Creating all parcel blocks...")

        self.block_list = self.util_create_parcel_block(
            self.x_max, self.y_max, self.x_min, self.y_min)
        self.set_colors()
        self.used_blocks = []

        #Making sure the max/min values didn't leave any rows out
        #(can happen if the x and y attributes are varchars in the metadata)
        self.adjust_borders()

        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0

        self.table_current_test_rows = []

        #Parcel block information
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds'):
            self.group_max = int(
                io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max

        self.overall_is_test_list = []
        self.use_as_training = []

        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []

    #Gets rows that represent a block of parcels
    #Returns None if there aren't any rows left
    def query_rows(self):

        #FOR GUI
        self.printOut.progress(
            int((self.rows_left / float(self.total_count)) * 100))

        #Getting all new data loaded in data structures
        if self.group_count == self.group_max:

            #Reset the group count
            self.group_count = 1

            #Profiling (won't work if distributed) ############
            #Getting information about approximate time left
            if self.first_query_time == None:
                self.first_query_time = time.time()
            else:
                average_time = (time.time() - self.first_query_time) / (
                    self.number_rows_tested)

                self.printOut.pLog("PROFILE- Number of blocks remaining: " +
                                   str(len(self.block_list)))
                self.printOut.pLog("PROFILE- Average time per unit: " +
                                   str(average_time))
                self.printOut.pLog("PROFILE- Number of rows remaining: " +
                                   str(self.rows_left))
                self.printOut.pLog(
                    "PROFILE- Predicted remaining time (in minutes): " +
                    str(int((average_time * (self.rows_left)) / 60)))

            ####################################################

            self.printOut.pLog(
                "RET- Retrieving training and test parcels from remaining: " +
                str(self.rows_left))

            #Getting a block with a non zero parcel count
            block = None
            while self.block_list != [] and block == None:
                block = self.block_list.pop(0)
                if block.parcel_count == 0:
                    block = None

            if block != None:

                self.current_test_block = block
                training_rows_query = self.util_training_rows(block)

                start_time = int(time.time())

                #Getting the attribute values from the raw rows
                #Get rows into the proper format
                proper_rows = []
                id_set = set([])
                for row in training_rows_query:
                    temp_row = {}
                    for attribute in self.attributes:
                        temp_row[attribute] = row[attribute]
                    proper_rows.append(temp_row)
                    id_set.add(row[self.id_attribute])

                #Getting test and training rows (test is a subset of training)
                is_test_list = []
                test_number = []
                test_count = 0
                for index in range(len(proper_rows)):
                    row = proper_rows[index]

                    #REQUIRES X and Y attributes to be included in the rows (may be a problem)
                    if block.row_within_block(self.x_attribute,
                                              self.y_attribute, row):
                        is_test_list.append(True)
                        test_number.append(test_count)
                        test_count += 1
                    else:
                        is_test_list.append(False)
                        test_number.append(None)

                #Adjust block count (cause borders are modified in some cases)
                block.parcel_count = test_count

                self.used_blocks.append(block)
                self.rows_left -= block.parcel_count
                self.number_rows_tested += block.parcel_count

                #Adding the extreme values that need to be added to every data set
                #This helps outlier detection and null value predictions
                if self.ois_query_string != None:
                    s = self.table.select(
                        and_(self.ois_query_string,
                             self.query_string)).execute()
                    self.printOut.pLog("RET- Num extra (rare) rows added: " +
                                       str(s.rowcount))
                    for row in s:
                        if row[self.id_attribute] not in id_set:
                            temp_row = {}
                            for attribute in self.attributes:
                                temp_row[attribute] = row[attribute]

                            proper_rows.append(temp_row)
                            is_test_list.append(False)
                            test_number.append(None)

                self.current_rows = proper_rows
                self.is_test_list = is_test_list
                self.overall_is_test_list = copy.deepcopy(is_test_list)
                self.test_number = test_number

                end_time = int(time.time())
                self.printOut.pLog("RET- Time loading data structures: " +
                                   str(end_time - start_time))

            else:
                self.current_rows = []
                self.is_test_list = []
                self.test_number = []

                self.overall_is_test_list = []
                self.use_as_training = []

        #Increment group count
        else:
            self.group_count += 1

        #Use data that exists / loading temporary data structures
        self.is_test_list = []
        self.use_as_training = []

        test_num = 0
        test_count = 0
        train_count = 0

        #Going over every current row
        for index in range(len(self.current_rows)):
            #if ONE group then all in test
            if self.group_max == 1:
                if self.overall_is_test_list[index]:
                    self.is_test_list.append(True)
                    test_count += 1
                else:
                    self.is_test_list.append(False)

                self.use_as_training.append(True)
                train_count += 1

            #If more than one group then split up test and training sets
            else:
                is_test = self.overall_is_test_list[index]
                if is_test:
                    test_num += 1

                #Deciding whether the instance will be in the test set:
                #test rows are dealt round-robin across the cross-validation groups
                #MISSING 4 VALUES IN SANITY CHECK
                used_as_test = False
                if is_test and test_num % self.group_max == (self.group_count -
                                                             1):
                    self.is_test_list.append(True)
                    used_as_test = True
                    test_count += 1

                else:
                    self.is_test_list.append(False)

                #Deciding whether the instance should be in the training set
                #FIND INTELLIGENT WAY TO STOP THE TRAINING SET FROM BEING TOO LARGE
                if not used_as_test:
                    train_count += 1
                    self.use_as_training.append(True)
                else:
                    self.use_as_training.append(False)

        self.printOut.pLog("RET- Group: " + str(self.group_count))
        self.printOut.pLog("RET- Test count: " + str(test_count))
        self.printOut.pLog("RET- Train count: " + str(train_count))

    #Returns 1 if any blocks or cross-validation groups
    #remain to be processed, 0 otherwise
    def number_remaining_blocks(self):
        if len(self.block_list) == 0 and self.group_count == self.group_max:
            return 0
        else:
            return 1

    #Used to setup basic query string
    def util_create_query_string(self, element):

        #Getting dictionary of column objects
        #Creating and clauses for all columns in test criteria combined
        qs = True
        and_list = []
        for or_tag in element.getElementsByTagName('or'):

            #Creating or clauses for a given "or list"
            or_list = []
            for elem in or_tag.getElementsByTagName('tc'):
                attribute = elem.attributes['attribute'].value
                type = elem.attributes['type'].value
                value = elem.attributes['value'].value

                #Getting the right form of the value
                vt = elem.attributes['vt'].value
                if vt == "string":
                    value = str(value)
                elif vt == "int":
                    value = int(value)

                #Creating clause for criteria
                if type == "E":
                    or_list.append(self.table.c[attribute] == value)
                elif type == "NE":
                    or_list.append(self.table.c[attribute] != value)
                elif type == "GT":
                    or_list.append(self.table.c[attribute] > value)
                elif type == "LT":
                    or_list.append(self.table.c[attribute] < value)

            if len(or_list) > 0:
                and_list.append(or_(*or_list))

        #Only make the query string equal to the list if there is something in the list
        if len(and_list) > 0:
            qs = and_(*and_list)

        return qs

    #util
    #keeps track of all parcel blocks
    class Parcel_block:
        def __init__(self, x_max, y_max, x_min, y_min, parcel_count):
            self.x_max = float(x_max)
            self.y_max = float(y_max)
            self.x_min = float(x_min)
            self.y_min = float(y_min)
            self.parcel_count = parcel_count

            self.right_border = False
            self.bottom_border = False

            #For visual representation
            self.color = None

        #Sets values for whether sides are borders
        def set_border_bools(self, query_manager):
            if self.y_min == query_manager.y_min:
                self.bottom_border = True
            if self.x_max == query_manager.x_max:
                self.right_border = True

        def row_within_block(self, x_at, y_at, row):
            #The lower and right sides of the block use strict inequalities
            #UNLESS that side borders the edge of space
            xa = float(row[x_at])
            ya = float(row[y_at])
            rb = self.right_border
            bb = self.bottom_border

            if xa >= self.x_min and ((rb and xa <= self.x_max) or
                                     (not rb and xa < self.x_max)):
                if ya <= self.y_max and ((bb and ya >= self.y_min) or
                                         (not bb and ya > self.y_min)):
                    return True

            return False

    #util
    #Gets all the training rows (super set of test rows)
    def util_training_rows(self, block):
        (cx_max, cy_max, cx_min,
         cy_min) = [block.x_max, block.y_max, block.x_min, block.y_min]
        current_count = block.parcel_count

        if current_count == 0:
            return [[], []]
        else:
            self.printOut.pLog("RET- Current count inside training block: " +
                               str(current_count))

            #setting easy variables
            x = self.x_attribute
            y = self.y_attribute
            t = self.table

            #ROOM FOR IMPROVEMENT
            #Make it so that this doesn't terribly overshoot the training size
            count_repeated = 0
            last_count = 0
            select_stat = t.select(
                and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min,
                     t.c[y] <= cy_max, self.query_string))
            while (current_count < self.train_size):
                change = math.sqrt(
                    (self.train_size - current_count) /
                    float(max(self.train_size / 10, current_count)))
                cx_min -= (cx_max - cx_min) * change * .1
                cx_max += (cx_max - cx_min) * change * .1
                cy_min -= (cy_max - cy_min) * change * .1
                cy_max += (cy_max - cy_min) * change * .1

                select_stat = t.select(
                    and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min,
                         t.c[y] <= cy_max, self.query_string))

                #Getting the number of instances inside the block
                s = select([func.count("*")],
                           and_(t.c[x] >= cx_min, t.c[x] <= cx_max,
                                t.c[y] >= cy_min, t.c[y] <= cy_max,
                                self.query_string),
                           from_obj=[t]).execute()
                block_count = sql_get_agg(s, "int")

                last_count = current_count
                current_count = block_count

                self.printOut.pLog(
                    "RET- Current count inside training block: " +
                    str(current_count))

                #Protects against cases in which current_count will never be bigger than train_size
                if last_count == current_count:
                    count_repeated += 1
                else:
                    count_repeated = 0
                if count_repeated == 5:
                    break

            #Executing the training query
            s = select_stat.execute()

            #Used for parcel visual
            self.current_training_block = self.Parcel_block(
                cx_max, cy_max, cx_min, cy_min, "(training block)")
            self.current_training_block.color = self.current_test_block.color

            return s
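
#Illustrative sketch (not part of the original source): how the expansion
#factor in the loop above behaves. Each side grows by change * 10% per
#iteration, and change shrinks as current_count approaches train_size:
import math

def _expansion_factor(train_size, current_count):
    return math.sqrt((train_size - current_count) /
                     float(max(train_size / 10, current_count)))

print(_expansion_factor(1000, 100))  #3.0   -> each side grows by 30%
print(_expansion_factor(1000, 900))  #~0.33 -> each side grows by ~3.3%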

    #util
    #Separates parcels in a grid-type fashion and creates
    #spatial objects for each grid
    def util_create_parcel_block(self, tx_max, ty_max, tx_min, ty_min):
        t = self.table
        x = self.x_attribute
        y = self.y_attribute

        #NEEDS TO BE IMPROVED
        #The inequalities should be made strict on shared edges to ensure nothing is counted twice
        s = select([func.count("*")],
                   and_(t.c[x] >= tx_min, t.c[x] <= tx_max, t.c[y] >= ty_min,
                        t.c[y] <= ty_max, self.query_string),
                   from_obj=[t]).execute()
        parcel_count = sql_get_agg(s, "int")

        #ROOM FOR IMPROVEMENT
        #Make it so that very small test blocks aren't created
        if parcel_count > self.test_size:
            x_mid = (tx_max - tx_min) / 2 + tx_min
            y_mid = (ty_max - ty_min) / 2 + ty_min

            temp_list = []

            #Always splits so that the resulting rectangles are roughly square
            x_diff = tx_max - tx_min
            y_diff = ty_max - ty_min
            if x_diff < y_diff:
                #Split horiz
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, ty_max, tx_min,
                                                  y_mid))
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, y_mid, tx_min,
                                                  ty_min))
            else:
                #Split vert
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, ty_max, x_mid,
                                                  ty_min))
                temp_list.extend(
                    self.util_create_parcel_block(x_mid, ty_max, tx_min,
                                                  ty_min))

            return temp_list
        else:
            p = self.Parcel_block(tx_max, ty_max, tx_min, ty_min, parcel_count)
            self.printOut.pLog("RET- Block size: " + str(parcel_count))

            p.set_border_bools(self)
            return [p]
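
#Illustrative sketch (not part of the original source): the method above is a
#recursive bisection that always cuts the longer axis, so blocks stay roughly
#square. Simplified version over an in-memory point list instead of SQL counts
#(like the original, points on a split line are counted by both halves):
def _split(points, x_max, y_max, x_min, y_min, max_count):
    inside = [(x, y) for (x, y) in points
              if x_min <= x <= x_max and y_min <= y <= y_max]
    if len(inside) <= max_count:
        return [(x_max, y_max, x_min, y_min, len(inside))]
    if (x_max - x_min) < (y_max - y_min):
        y_mid = (y_max - y_min) / 2.0 + y_min  #split horizontally
        return (_split(inside, x_max, y_max, x_min, y_mid, max_count) +
                _split(inside, x_max, y_mid, x_min, y_min, max_count))
    x_mid = (x_max - x_min) / 2.0 + x_min      #split vertically
    return (_split(inside, x_mid, y_max, x_min, y_min, max_count) +
            _split(inside, x_max, y_max, x_mid, y_min, max_count))

#4 clustered points, max 2 per block -> one vertical cut yields two blocks
print(_split([(1, 1), (2, 2), (8, 8), (9, 9)], 10.0, 10.0, 0.0, 0.0, 2))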

    #util
    #Returns the max and min x and y coordinate values
    def util_spatial_boundaries(self):
        self.printOut.pLog(
            "RET- Finding spatial boundaries of the database...")

        t = self.table

        #Setting overall values
        s = select([func.count("*")], self.query_string,
                   from_obj=[t]).execute()
        total_count = sql_get_agg(s, "int")

        s = select([func.max(t.c[self.x_attribute])]).execute()
        x_max = sql_get_agg(s, "float")

        s = select([func.min(t.c[self.x_attribute])]).execute()
        x_min = sql_get_agg(s, "float")

        s = select([func.max(t.c[self.y_attribute])]).execute()
        y_max = sql_get_agg(s, "float")

        s = select([func.min(t.c[self.y_attribute])]).execute()
        y_min = sql_get_agg(s, "float")

        return [x_max, y_max, x_min, y_min, total_count]

    #Creates a list that says whether each value is null or not
    def proc_is_null_list(self, test_object):
        self.printOut.pLog("RET- Test Attribute: " +
                           str(test_object.test_attribute))

        is_null_list = []
        for i in range(len(self.current_rows)):
            is_null = False
            for null_dict in test_object.null_value_list:

                value = null_dict["value"]
                type = null_dict["type"]
                row_value = self.current_rows[i][null_dict["attribute"]]

                if type == "GT" and row_value > value:
                    is_null = True
                    break
                elif type == "LT" and row_value < value:
                    is_null = True
                    break
                elif type == "E" and row_value == value:
                    is_null = True
                    break
                elif type == "NE" and row_value != value:
                    is_null = True
                    break

            is_null_list.append(is_null)

        self.is_null_list = is_null_list
        self.printOut.pLog("RET- Found " + str(is_null_list.count(True)) +
                           " null labels in whole training blocks")

    #makes it so class and num attribute lists only represent attributes being used in tests
    def update_att_lists(self, model_list):
        new_class_list = []
        new_num_list = []

        self.printOut.pLog(
            "RET- Checking that all needed attributes are in the table.")

        #finding all attributes being used
        for model in model_list:
            for attribute in model.attributes:
                if attribute in self.class_list:
                    new_class_list.append(attribute)
                elif attribute in self.numeric_list:
                    new_num_list.append(attribute)
                else:
                    self.printOut.pLog("ERROR: Attribute Not in table- " +
                                       attribute)

            #Make sure the target attribute is included
            if model.test_attribute in self.class_list:
                new_class_list.append(model.test_attribute)
            elif model.test_attribute in self.numeric_list:
                new_num_list.append(model.test_attribute)
            elif model.test_attribute is not None:
                self.printOut.pLog("ERROR: Target Attribute Not in Table- " +
                                   str(model.test_attribute))

        self.printOut.pLog("")

        self.class_list = new_class_list
        self.numeric_list = new_num_list

    #Finds a color for each block
    #(greedy scheme that avoids neighbors' colors where possible)
    def set_colors(self):
        color_list = ["red", "blue", "green", "yellow"]
        #Recording all touching blocks
        blocks_touching = {}
        for block in self.block_list:
            blocks_touching[block] = set([])
            #Checking which blocks are touching
            for ob in self.block_list:
                #left or right
                if block.x_min == ob.x_max or block.x_max == ob.x_min:
                    if not block.y_max <= ob.y_min and not block.y_min >= ob.y_max:
                        blocks_touching[block].add(ob)
                #top or bottom
                elif block.y_min == ob.y_max or block.y_max == ob.y_min:
                    if not block.x_max <= ob.x_min and not block.x_min >= ob.x_max:
                        blocks_touching[block].add(ob)

        #Randomly coloring blocks
        #but making sure as many conflicts can be avoided as possible
        conflict_count = 0
        for block in self.block_list:
            available_colors = copy.deepcopy(color_list)
            for nb in blocks_touching[block]:
                if nb.color in available_colors:
                    available_colors.remove(nb.color)

            if len(available_colors) > 0:
                #Picking a color that a neighbor doesn't have
                index = random.randint(0, len(available_colors) - 1)
                block.color = available_colors[index]
            else:
                #Picking a random color
                index = random.randint(0, len(color_list) - 1)
                block.color = color_list[index]
                conflict_count += 1
        self.printOut.pLog("RET- Color conflicts: " + str(conflict_count))

    #For cases in which location variables are strings (in the database)
    def adjust_borders(self):

        new_x_max = round_up(self.x_max)
        new_x_min = round_down(self.x_min)
        new_y_max = round_up(self.y_max)
        new_y_min = round_down(self.y_min)

        for block in self.block_list:
            if block.x_max == self.x_max:
                block.x_max = new_x_max
            if block.y_max == self.y_max:
                block.y_max = new_y_max
            if block.x_min == self.x_min:
                block.x_min = new_x_min
            if block.y_min == self.y_min:
                block.y_min = new_y_min

        self.x_max = new_x_max
        self.y_max = new_y_max
        self.x_min = new_x_min
        self.y_min = new_y_min
Example #16
0
class Data_profiler:
    def __init__(self, query_manager, logCB=None, progressCB=None):

        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)

        self.query_manager = query_manager

        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)

        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None

        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []

        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}

    #util function
    #creates dictionary of possible values for each column in the table
    def get_possible_values(self, query_manager):

        #Getting info from the query manager
        class_list = query_manager.class_list
        numeric_list = query_manager.numeric_list
        rows = query_manager.current_rows

        start_time = int(time.time())
        self.printOut.pLog("PREP- Class columns count: " +
                           str(len(class_list)))
        self.printOut.pLog("PREP- Num columns count: " +
                           str(len(numeric_list)))

        #Initializing data structures for storing info
        class_result_dict = {}
        class_att_value_count = {}
        numeric_result_dict = {}
        for c in class_list:
            class_result_dict[c] = []
            class_att_value_count[c] = {}
        for c in numeric_list:
            numeric_result_dict[c] = [None, None]

        #Finding all possible values for each column
        for row in rows:

            #gathering class info
            for c_name, value_list in class_result_dict.iteritems():
                if c_name in row:
                    value = row[c_name]

                    #Getting information on class attribute values
                    if value not in value_list:
                        value_list.append(value)
                        class_att_value_count[c_name][
                            value] = 1  #May need to worry about the value being Null
                    else:
                        class_att_value_count[c_name][value] += 1

            #gathering numeric info
            for c_name, min_max in numeric_result_dict.iteritems():
                if c_name in row:

                    value = row[c_name]
                    if value == "" or value == None:
                        value = 0  #May want to think of a more appropriate value
                    else:
                        value = float(value)

                    #finding min (value is always numeric here, so no "" check is needed)
                    if min_max[0] == None or value < min_max[0]:
                        min_max[0] = value

                    #finding max
                    if min_max[1] == None or value > min_max[1]:
                        min_max[1] = value

        #Deciding on the weight based on the count
        class_att_value_weight = {}
        for att_name, values in class_att_value_count.iteritems():
            #Finding total number of values
            overall_count = 0
            for value, count in values.iteritems():
                overall_count += count

            #Setting weights
            class_att_value_weight[att_name] = {}
            for value, count in values.iteritems():
                #float() must wrap the numerator; otherwise Python 2 integer
                #division truncates every weight to 0
                class_att_value_weight[att_name][value] = float(count) / overall_count

        self.numeric_result_dict = numeric_result_dict
        self.class_result_dict = class_result_dict
        self.class_att_value_weight = class_att_value_weight

        end_time = int(time.time())
        self.printOut.pLog("PREP- Time getting values: " +
                           str(end_time - start_time))
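
#Illustrative sketch (not part of the original source): the class-value
#weights computed above are plain relative frequencies. Hypothetical counts:
counts = {"residential": 75, "commercial": 25}
overall_count = float(sum(counts.values()))
weights = dict((value, count / overall_count) for value, count in counts.items())
print(weights)  #{'residential': 0.75, 'commercial': 0.25}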

    #Prepares the data that will be used with SVM or KNN
    def load_data_structures(self, target, attributes):

        start_time = int(time.time())

        #Getting list of columns, but making sure the columns are actually there
        column_list_local = []
        for attribute in attributes:
            if attribute in self.class_result_dict or attribute in self.numeric_result_dict:
                column_list_local.append(attribute)

        (class_lookup_table,
         id_lookup_table) = self.util_get_class_lookup_table()

        self.label_id_lookup_table = id_lookup_table  #Used by SVM to piece together test results
        self.class_lookup_table = class_lookup_table  #Used by SVM to piece together test results

        labels = []
        samples = []

        #Getting all needed information from all rows
        for j in range(len(self.query_manager.current_rows)):

            row = self.query_manager.current_rows[j]

            #for class target attributes
            value = row[target]
            if target in class_lookup_table:
                labels.append(class_lookup_table[target][value])

            #for numeric target attributes (might want to scale label)
            else:
                if value == "" or value == None:
                    value = -1
                labels.append(int(value))  #CHANGE: SHOULD BE FLOAT

            #getting sample data
            index = 0
            sample = {}
            for attribute in column_list_local:
                if attribute in row:
                    if attribute in class_lookup_table:
                        value = row[attribute]
                        attribute_value = class_lookup_table[attribute][value]

                        for i in range(len(self.class_result_dict[attribute])):
                            if attribute_value == i:
                                #sample[index] = 0.5 #1
                                sample[index] = self.class_att_value_weight[
                                    attribute][value]  #MAKE IT ONLY FOR LDOF

                            index += 1

                    elif attribute in self.numeric_result_dict:
                        value = row[attribute]
                        if value == "" or value == None:
                            value = 0

                        scaled = 0
                        #renamed from 'max'/'min' to avoid shadowing the builtins
                        att_max = self.numeric_result_dict[attribute][1]
                        att_min = self.numeric_result_dict[attribute][0]

                        #Transforming specified columns
                        if attribute in self.log_trans_atts:
                            value = math.log(value + 1)

                            #Scaling values
                            denominator = math.log(att_max + 1) - math.log(att_min + 1)
                            if denominator > 0:

                                #Scaling all attributes between 0 and 1
                                numerator = float(value) - math.log(att_min + 1)
                                scaled = numerator / denominator

                        else:

                            #Non transformed columns
                            #Scaling values
                            denominator = att_max - att_min
                            if denominator > 0:

                                #Scaling all attributes between 0 and 1
                                numerator = float(value) - att_min
                                scaled = numerator / denominator

                        sample[index] = scaled
                        index += 1

            samples.append(sample)

        #Used for KNN
        self.printOut.pLog("PREP- Dimension / column mapping")

        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
        self.attribute_id_list = []
        att_id = 0  #renamed from 'id' to avoid shadowing the builtin
        for attribute in column_list_local:
            self.printOut.pLog("PREP- ID: " + str(att_id) + ", Attribute: " +
                               attribute)

            if attribute in class_lookup_table:
                for att_value in class_lookup_table[attribute].keys():
                    self.attribute_id_list.append(att_id)
                self.attribute_id_dict[att_id] = attribute
                self.id_attribute_dict[attribute] = att_id
            else:
                self.attribute_id_dict[att_id] = attribute
                self.id_attribute_dict[attribute] = att_id
                self.attribute_id_list.append(att_id)
            att_id += 1

        #Setting values for object variables / lists
        self.labels = labels
        self.samples = samples

        end_time = int(time.time())
        self.printOut.pLog("PREP- Time loading data structures: " +
                           str(end_time - start_time))
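
#Illustrative sketch (not part of the original source): the two scaling paths
#above, reduced to a single function. Both map a column onto [0, 1]; the log
#variant compresses large values first:
import math

def _scale(value, att_min, att_max, log_transform=False):
    if log_transform:
        value = math.log(value + 1)
        att_min = math.log(att_min + 1)
        att_max = math.log(att_max + 1)
    denominator = att_max - att_min
    if denominator > 0:
        return (float(value) - att_min) / denominator
    return 0

print(_scale(50.0, 0.0, 100.0))                      #0.5
print(_scale(50.0, 0.0, 100.0, log_transform=True))  #~0.85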

    #Setting up all the data for the test
    def prepare_test_data(self, test_object):

        #Formatting the data in the data profiler for the given test
        #attribute and the attributes used by the test
        self.load_data_structures(test_object.test_attribute,
                                  test_object.attributes)

        #Setting transformation information in data profiler
        #only do this for KNN attributes
        if test_object.test_type == "KNN":
            self.log_trans_atts = set(test_object.log_trans_atts)

        #Processing information about which rows are null
        self.query_manager.proc_is_null_list(test_object)

    #util function
    def util_get_class_lookup_table(self):
        class_lookup_table = {}
        id_lookup_table = {}
        for class_name, values in self.class_result_dict.iteritems():
            index = 0
            class_temp = {}
            id_temp = {}
            for value in values:
                class_temp[value] = index
                id_temp[index] = value
                index += 1
            class_lookup_table[class_name] = class_temp
            id_lookup_table[class_name] = id_temp

        return [class_lookup_table, id_lookup_table]
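
#Illustrative sketch (not part of the original source): the paired tables
#built above are inverse mappings between class values and integer ids.
#Hypothetical column values:
values = ["residential", "commercial", "industrial"]
class_temp = dict((v, i) for i, v in enumerate(values))  #value -> id (encoding)
id_temp = dict((i, v) for i, v in enumerate(values))     #id -> value (decoding)
print(class_temp["commercial"])  #1
print(id_temp[1])                #'commercial'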