def __init__(self, query_manager, logCB=None, progressCB=None):
    """Initialize profiling state over the rows served by *query_manager*.

    Parameters:
        query_manager: provider of rows/attribute metadata; stored and also
            passed straight to self.get_possible_values().
        logCB: optional logging callback forwarded to PrintOutput.
        progressCB: optional progress callback forwarded to PrintOutput.
    """
    #For reporting results (PROFILING is a module-level flag)
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    self.query_manager = query_manager
    #Profile of information currently being dealt with
    self.class_result_dict = None
    self.class_att_value_weight = None
    self.numeric_result_dict = None
    # presumably fills the profile fields above -- confirm in get_possible_values
    self.get_possible_values(query_manager)
    #Used by SVM_model to piece together results
    self.label_id_lookup_table = None
    #Current data being stored
    self.labels = []
    self.samples = []
    self.is_null_list = []
    #Used by KNN: attributes to log-transform plus id<->attribute mappings
    self.log_trans_atts = set([])
    self.attribute_id_list = []
    self.attribute_id_dict = {}
    self.id_attribute_dict = {}
def __init__(self, xml_elem, logCB=None, progressCB=None):
    """Configure a KNN/LDOF outlier model from an XML element.

    Parameters:
        xml_elem: DOM element carrying an 'attributes' attribute (comma list)
            and optionally 'log_trans_attributes'.
        logCB: optional logging callback forwarded to PrintOutput.
        progressCB: optional progress callback forwarded to PrintOutput.
    """
    #For reporting results
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    #KNN tuning parameters
    self.k = 10
    #Make this 1 more than the number of columns
    self.num_display = 10
    self.num_mod = 1
    #Attributes that are used to make the prediction
    attributes_string = xml_elem.attributes['attributes'].value
    self.attributes = util_get_attribute_list(attributes_string)
    #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
    self.test_attribute = None
    #Sets of attributes that must be considered as a whole
    self.attribute_combinations = []
    #Set all weights to 1
    self.initialized_weights = {}
    for attribute in self.attributes:
        self.initialized_weights[attribute] = 1
    #Attributes whose values get log transformed to produce better results.
    #FIX: default to an empty set so the attribute always exists; previously
    #it was only assigned inside the if-branch, so readers hit an
    #AttributeError whenever 'log_trans_attributes' was absent from the XML.
    self.log_trans_atts = set()
    if xml_elem.hasAttribute('log_trans_attributes'):
        log_trans_string = xml_elem.attributes['log_trans_attributes'].value
        temp_atts_list = util_get_attribute_list(log_trans_string)
        self.log_trans_atts = set(temp_atts_list)
    self.null_value_list = []  #NOT USED
    #Random information
    self.test_type = "LDOF"
def __init__(self, query_manager, logCB = None, progressCB = None) :
    """Initialize profiling state over the rows served by *query_manager*.

    Parameters:
        query_manager: provider of rows/attribute metadata; stored and also
            passed straight to self.get_possible_values().
        logCB: optional logging callback forwarded to PrintOutput.
        progressCB: optional progress callback forwarded to PrintOutput.
    """
    #For reporting results (PROFILING is a module-level flag)
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    self.query_manager = query_manager
    #Profile of information currently being dealt with
    self.class_result_dict = None
    self.class_att_value_weight = None
    self.numeric_result_dict = None
    # presumably fills the profile fields above -- confirm in get_possible_values
    self.get_possible_values(query_manager)
    #Used by SVM_model to piece together results
    self.label_id_lookup_table = None
    #Current data being stored
    self.labels = []
    self.samples = []
    self.is_null_list = []
    #Used by KNN: attributes to log-transform plus id<->attribute mappings
    self.log_trans_atts = set([])
    self.attribute_id_list = []
    self.attribute_id_dict = {}
    self.id_attribute_dict = {}
def __init__(self, xml_elem, logCB=None, progressCB=None):
    """Configure a KNN/LDOF outlier model from an XML element.

    Parameters:
        xml_elem: DOM element carrying an 'attributes' attribute (comma list)
            and optionally 'log_trans_attributes'.
        logCB: optional logging callback forwarded to PrintOutput.
        progressCB: optional progress callback forwarded to PrintOutput.
    """
    #For reporting results
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    #KNN tuning parameters
    self.k = 10
    #Make this 1 more than the number of columns
    self.num_display = 10
    self.num_mod = 1
    #Attributes that are used to make the prediction
    attributes_string = xml_elem.attributes['attributes'].value
    self.attributes = util_get_attribute_list(attributes_string)
    #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
    self.test_attribute = None
    #Sets of attributes that must be considered as a whole
    self.attribute_combinations = []
    #Set all weights to 1
    self.initialized_weights = {}
    for attribute in self.attributes:
        self.initialized_weights[attribute] = 1
    #Attributes whose values get log transformed to produce better results.
    #FIX: default to an empty set so the attribute always exists; previously
    #it was only assigned inside the if-branch, so readers hit an
    #AttributeError whenever 'log_trans_attributes' was absent from the XML.
    self.log_trans_atts = set()
    if xml_elem.hasAttribute('log_trans_attributes'):
        log_trans_string = xml_elem.attributes['log_trans_attributes'].value
        temp_atts_list = util_get_attribute_list(log_trans_string)
        self.log_trans_atts = set(temp_atts_list)
    self.null_value_list = []  #NOT USED
    #Random information
    self.test_type = "LDOF"
def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB=None, progressCB=None):
    """Configure a Weka-backed numeric prediction model from an XML element.

    Parameters:
        xml_elem: DOM element with 'test_attribute' and 'train_attributes'
            (required), optional classifier/options overrides, optional
            feature-selection settings and null-value definitions.
        MAKE_ALL_PREDS: when False, prediction runs are expected to bail out
            early if the test set contains no null target values.
        logCB, progressCB: optional callbacks forwarded to PrintOutput.
    """
    #For reporting results
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    #Test specific information
    self.test_attribute = xml_elem.attributes["test_attribute"].value
    self.test_classifier = "weka.classifiers.lazy.IBk"
    #FIX: read the same attribute that was checked; previously this checked
    #hasAttribute("test_classifier") but read attributes["classifier"],
    #raising a KeyError whenever only 'test_classifier' was supplied.
    if xml_elem.hasAttribute("test_classifier"):
        self.test_classifier = xml_elem.attributes["test_classifier"].value
    self.test_options = "-I -K 20 -X -A weka.core.neighboursearch.KDTree"
    if xml_elem.hasAttribute("options"):
        self.test_options = xml_elem.attributes["options"].value
    #Feature selection information
    self.use_feature_selection = False
    self.using_pca = False
    self.search_class = ""
    self.evaluation_class = ""
    if xml_elem.hasAttribute('fs_evaluation_class'):
        self.use_feature_selection = True
        self.search_class = xml_elem.attributes["fs_search_class"].value
        self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
        #Checking for pca
        if self.evaluation_class.find("PrincipalComponents") > -1:
            self.using_pca = True
    #Attributes that the search class starts with (Not used with PCA)
    self.start_attributes = []
    if xml_elem.hasAttribute('fs_start_attributes'):
        self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)
    #Attributes that are used to make the prediction
    attributes_string = xml_elem.attributes["train_attributes"].value
    self.attributes = util_get_attribute_list(attributes_string)
    #Values that are considered null for the target attribute
    self.null_value_list = []
    elements = xml_elem.getElementsByTagName('null_values')
    if len(elements) > 0:
        null_val_element = elements[0]
        for element in null_val_element.getElementsByTagName('v'):
            attribute = element.attributes['attribute'].value
            #renamed from 'type' to avoid shadowing the builtin
            crit_type = element.attributes['type'].value
            value = element.attributes['value'].value
            vt = element.attributes['vt'].value
            null_dict = {"attribute": attribute, "type": crit_type}
            if vt == "int":
                null_dict["value"] = int(value)
            elif vt == "string":
                null_dict["value"] = str(value)
            self.null_value_list.append(null_dict)
    #Simply defined null values
    if xml_elem.hasAttribute("null_value"):
        null_value = xml_elem.attributes["null_value"].value
        null_dict = {"attribute": self.test_attribute, "type": "E", "value": int(null_value)}
        self.null_value_list.append(null_dict)
    #Random information
    self.test_type = "Num"
    self.MAKE_ALL_PREDS = MAKE_ALL_PREDS
class Num_model :
    """Weka-backed numeric prediction model configured from an XML element.

    Shells out to weka.jar via os.system to train a classifier and predict a
    numeric target attribute per row, then parses Weka's text output for the
    predictions and accuracy estimates.
    """

    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB = None, progressCB = None) :
        """Read classifier, options, feature-selection and null-value
        configuration from *xml_elem*.

        MAKE_ALL_PREDS: when False, get_predictions() returns None early if
        the test set contains no null target values.
        """
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        self.test_classifier = "weka.classifiers.lazy.IBk"
        #NOTE(review): checks hasAttribute("test_classifier") but reads
        #attributes["classifier"] -- one of the two names is likely wrong;
        #confirm against the XML schema.
        if xml_elem.hasAttribute("test_classifier") :
            self.test_classifier = xml_elem.attributes["classifier"].value
        self.test_options = "-I -K 20 -X -A weka.core.neighboursearch.KDTree"
        if xml_elem.hasAttribute("options") :
            self.test_options = xml_elem.attributes["options"].value
        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True
            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1 :
                self.using_pca = True
        #Attributes that the search class starts with (Not used with PCA)
        self.start_attributes = []
        if xml_elem.hasAttribute('fs_start_attributes') :
            self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)
        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)
        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0 :
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v') :
                attribute = element.attributes['attribute'].value
                type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
                null_dict = {"attribute" : attribute, "type" : type}
                if vt == "int" :
                    null_dict["value"] = int(value)
                elif vt == "string" :
                    null_dict["value"] = str(value)
                self.null_value_list.append(null_dict)
        #Simply defined null values
        if xml_elem.hasAttribute("null_value") :
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {"attribute" : self.test_attribute, "type" : "E", "value" : int(null_value)}
            self.null_value_list.append(null_dict)
        #Random information
        self.test_type = "Num"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS

    def get_predictions(self, query_manager) :
        """Train a Weka model on the current training block and predict the
        target attribute for each test row.

        Returns a Test_result("Num", ...) or None when MAKE_ALL_PREDS is
        False and no null target values exist in the test set.
        Side effects: creates then removes timestamped .arff/.txt/model files
        in the working directory; invokes java via os.system.
        """
        #Filenames (timestamped to avoid collisions between runs)
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"
        #Creates (or clears) files that are used by the binary
        IS_NUM_TEST = True
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        #If the run is only replacing null values then
        #terminate when there are no null values in the test set
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0 :
            os.remove(test_filename)
            os.remove(train_filename)
            return None
        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection :
            (test_filename, train_filename, selected_attributes) = feature_selection(test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes
        #Running tests: train (-d saves model), then predict (-l loads it)
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join( path, "models", "weka.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + " " + self.test_classifier + " -d " + model_name + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " + path_spef_weka + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename
        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
        #Gathering results for each test instance
        self.printOut.pLog( "PRED- Getting results")
        f = open(result_filename)
        prediction_list = []
        confidence_list = []
        #For stat keeping
        absolute_diff_list = []
        relative_diff_list = []
        index = 0
        collect_results = False
        for line in f.readlines() :
            line_list = line.split()
            #Getting results (column 3 of Weka "-p 0" output is the prediction)
            if collect_results and len(line_list) > 1:
                prediction = float(line_list[2])
                prediction_list.append(prediction)
                confidence_list.append(0.0)
                #Getting difference between predicted and actual results
                #For non null values
                if not target_value_null[index] :
                    actual = float(target_values[index])
                    diff = math.fabs(actual - prediction)
                    absolute_diff_list.append(diff)
                    if actual > 0 :
                        relative_diff_list.append(diff / actual)
                    else :
                        #sentinel for non-positive actuals (relative diff undefined)
                        relative_diff_list.append(-1)
                index += 1
            #Seeing if you are at the results portion of the file
            #(the "inst#" header precedes the prediction rows)
            if line.find("inst#") > -1 :
                collect_results = True
        f.close()
        #Gathering accuracy estimations from the training log
        f = open(train_log)
        cross_val_info = False
        get_k_value = False
        for line in f.readlines() :
            #Getting all performance related metrics
            #(lines after the Cross-validation header: "<name> ... <value>")
            if cross_val_info :
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')
                list = line.split(' ')
                if len(list) > 1:
                    attribute = list[0]
                    value = list[len(list) - 1]
                    value = float(value)
                    acc_est[attribute] = value
            #Getting parameter search results
            if get_k_value and line.find('using') > -1:
                list = line.split(' ')
                k = int(list[1])
                acc_est["1 Parameter: k value"] = k
                get_k_value = False
            #Finding cross validation info
            if line.find('Cross-validation') > -1 :
                cross_val_info = True
            #Finding k value info
            if line.find('IB1 instance-based classifier') > -1 :
                get_k_value = True
        f.close()
        #Adding actual performance statistics
        absolute_diff_array = numpy.array(absolute_diff_list)
        relative_diff_array = numpy.array(relative_diff_list)
        absolute_mean = numpy.mean(absolute_diff_array)
        absolute_std = numpy.std(absolute_diff_array)
        relative_mean = numpy.mean(relative_diff_array)
        relative_std = numpy.std(relative_diff_array)
        acc_est["2 On test data: mean absolute diff"] = absolute_mean
        acc_est["2 On test data: std absolute diff"] = absolute_std
        acc_est["2 On test data: mean relative diff"] = relative_mean
        acc_est["2 On test data: std relative diff"] = relative_std
        #Add number of test instances to the accuracy estimation
        #NOTE(review): '/' here is integer division under Python 2 -- confirm intended
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num / query_manager.group_max
        acc_est["block number"] = (len(query_manager.used_blocks) - 1)*query_manager.group_max + query_manager.group_count
        #Removing files
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(model_name)
        return Test_result("Num", self.test_attribute, prediction_list, confidence_list, acc_est)
def __init__(self, io_info_element, logCB=None, progressCB=None):
    """Build query-manager state from an io-info XML element.

    Reads DB/table/column configuration, connects to the table, computes the
    spatial boundaries of the data, and pre-partitions the rows into parcel
    blocks for block-wise test/training retrieval.

    Parameters:
        io_info_element: DOM element with 'input_db_url', 'input_table_name',
            'x_column', 'y_column', 'id_column' (required) plus optional
            sizing/criteria attributes and child elements.
        logCB, progressCB: optional callbacks forwarded to PrintOutput.
    """
    #For reporting results
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)
    #Storing all the information passed as parameters to the query manager
    self.db_url = io_info_element.attributes["input_db_url"].value
    self.table_name = io_info_element.attributes["input_table_name"].value
    self.x_attribute = io_info_element.attributes["x_column"].value
    self.y_attribute = io_info_element.attributes["y_column"].value
    self.id_attribute = io_info_element.attributes["id_column"].value
    #Forcing certain attributes to be categorical
    self.fclass_atts = []
    if io_info_element.hasAttribute('force_to_class'):
        self.fclass_atts = util_get_attribute_list(io_info_element.attributes["force_to_class"].value)
    #Forcing certain attributes to be numerical
    #(FIX: removed a dead getElementsByTagName('force_to_numeric') lookup --
    # its result was never used and the variable was reassigned below)
    self.fnum_atts = []
    if io_info_element.hasAttribute('force_to_numeric'):
        self.fnum_atts = util_get_attribute_list(io_info_element.attributes["force_to_numeric"].value)
    #Size of blocks that will be created
    self.train_size = 40000
    if io_info_element.hasAttribute("train_block_size"):
        self.train_size = int(io_info_element.attributes["train_block_size"].value)
    self.test_size = 40000
    if io_info_element.hasAttribute("test_block_size"):
        self.test_size = int(io_info_element.attributes["test_block_size"].value)
    #Getting access to the table
    self.table = util_get_table(self.db_url, self.table_name)
    #Getting all attributes from the table and what types they are
    (self.class_list, self.numeric_list, self.attributes) = util_get_attribute_info(self.table, self.fclass_atts, self.fnum_atts)
    #Used for the parcel query (True is a no-op clause inside and_())
    self.query_string = True
    elements = io_info_element.getElementsByTagName('test_criteria')
    if len(elements) > 0:
        tc_elem = elements[0]
        self.query_string = self.util_create_query_string(tc_elem)
    #Used for extreme rows that are included in every test done
    self.ois_query_string = None
    elements = io_info_element.getElementsByTagName('outlier_inc_set')
    if len(elements) > 0:
        ois_elem = elements[0]
        if len(ois_elem.getElementsByTagName('or')) > 0:
            self.ois_query_string = self.util_create_query_string(ois_elem)
    #Getting x/y boundaries of the parcels and number of rows
    #(may want to find a faster way to do this)
    (self.x_max, self.y_max, self.x_min, self.y_min, self.total_count) = self.util_spatial_boundaries()
    self.rows_left = self.total_count
    #Information about the parcel blocks remaining and used
    self.printOut.pLog("RET- Creating all parcel blocks...")
    self.block_list = self.util_create_parcel_block(self.x_max, self.y_max, self.x_min, self.y_min)
    self.set_colors()
    self.used_blocks = []
    #In order to make sure max, min vals didn't leave any out
    #Can happen if x and y attributes are varchars in metadata
    self.adjust_borders()
    #Used for profiling the speed at which the program is running
    self.first_query_time = None
    self.number_rows_tested = 0
    self.table_current_test_rows = []
    #Parcel block information
    self.current_test_block = None
    self.current_training_block = None
    self.group_max = 2
    self.group_count = 2
    if io_info_element.hasAttribute('num_cv_folds'):
        self.group_max = int(io_info_element.attributes['num_cv_folds'].value)
        self.group_count = self.group_max
    self.overall_is_test_list = []
    self.use_as_training = []
    #Current rows retrieved
    self.current_rows = []
    self.is_test_list = []
    self.is_null_list = []
    self.test_number = []
class Query_manager :
    """Serves spatially-blocked test/training row sets from a SQL table.

    Partitions the table's rows into rectangular parcel blocks, then hands
    out one block at a time as a test set together with a surrounding
    (expanded) training window, optionally split into cross-validation
    groups (group_max folds).
    """

    def __init__(self, io_info_element, logCB = None, progressCB = None) :
        """Build query-manager state from an io-info XML element (DB/table
        configuration, spatial boundaries, parcel blocks)."""
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value
        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class') :
            self.fclass_atts = util_get_attribute_list(io_info_element.attributes["force_to_class"].value)
        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        #NOTE(review): the result of this lookup is never used ('elements'
        #is reassigned below before first use) -- dead statement
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric') :
            self.fnum_atts = util_get_attribute_list(io_info_element.attributes["force_to_numeric"].value)
        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size") :
            self.train_size = int(io_info_element.attributes["train_block_size"].value)
        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size") :
            self.test_size = int(io_info_element.attributes["test_block_size"].value)
        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)
        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list, self.attributes) = util_get_attribute_info(self.table, self.fclass_atts, self.fnum_atts)
        #Used for the parcel query (True acts as a no-op clause in and_())
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0 :
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)
        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0 :
            ois_elem = elements[0]
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)
        #Getting x/y boundaries of the parcels and number of rows
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min, self.total_count) = self.util_spatial_boundaries()
        self.rows_left = self.total_count
        #Information that is being stored about the number of parcel blocks remaining and used
        self.printOut.pLog("RET- Creating all parcel blocks...")
        self.block_list = self.util_create_parcel_block(self.x_max, self.y_max, self.x_min, self.y_min)
        self.set_colors()
        self.used_blocks = []
        #In order to make sure max, min vals didn't leave any out
        #Can happen if x and y attributes are varchars in metadata
        self.adjust_borders()
        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0
        self.table_current_test_rows = []
        #Parcel block information
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds') :
            self.group_max = int(io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max
        self.overall_is_test_list = []
        self.use_as_training = []
        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []

    #Gets rows that represent a block of parcels
    #Returns None if there aren't any rows left
    def query_rows(self) :
        """Advance to the next cross-validation group, loading a fresh parcel
        block (and its training window) from the DB when the previous block's
        groups are exhausted.

        Populates self.current_rows, self.is_test_list, self.use_as_training,
        self.test_number and the profiling counters as side effects.
        """
        #FOR GUI
        self.printOut.progress(int((self.rows_left / float(self.total_count))*100))
        #Getting all new data loaded in data structures
        if self.group_count == self.group_max :
            #Reset the group count
            self.group_count = 1
            #Profiling (won't work if distributed) ############
            #Getting information about approximate time left
            if self.first_query_time == None :
                self.first_query_time = time.time()
            else :
                average_time = (time.time() - self.first_query_time) / (self.number_rows_tested)
                self.printOut.pLog( "PROFILE- Number of blocks remaining: " + str(len(self.block_list)))
                self.printOut.pLog( "PROFILE- Average time per unit: " + str(average_time))
                self.printOut.pLog( "PROFILE- Number of rows remaining: " + str(self.rows_left))
                self.printOut.pLog( "PROFILE- Predicted remaining time (in minutes): " + str(int((average_time*(self.rows_left))/60)))
            ####################################################
            self.printOut.pLog( "RET- Retrieving training and test parcels from remaining: " + str(self.rows_left))
            #Getting a block with a non zero parcel count
            block = None
            while self.block_list != [] and block == None :
                block = self.block_list.pop(0)
                if block.parcel_count == 0 :
                    block = None
            if block != None :
                self.current_test_block = block
                training_rows_query = self.util_training_rows(block)
                start_time = int(time.time())
                #Getting the attribute values from the raw rows
                #Get rows into the proper format
                proper_rows = []
                id_set = set([])
                for row in training_rows_query :
                    temp_row = {}
                    for attribute in self.attributes :
                        temp_row[attribute] = row[attribute]
                    proper_rows.append(temp_row)
                    id_set.add(row[self.id_attribute])
                #Getting test and training rows (test is a subset of training)
                is_test_list = []
                test_number = []
                test_count = 0
                for index in range(len(proper_rows)) :
                    row = proper_rows[index]
                    #REQUIRES X and Y attributes to be included in the rows (may be a problem)
                    if block.row_within_block(self.x_attribute, self.y_attribute, row) :
                        is_test_list.append(True)
                        test_number.append(test_count)
                        test_count += 1
                    else :
                        is_test_list.append(False)
                        test_number.append(None)
                #Adjust block count (cause borders are modified in some cases)
                block.parcel_count = test_count
                self.used_blocks.append(block)
                self.rows_left -= block.parcel_count
                self.number_rows_tested += block.parcel_count
                #Adding the extreme values that need to be added to every data set
                #This helps outlier detection and null value predictions
                if self.ois_query_string != None :
                    s = self.table.select(and_(self.ois_query_string, self.query_string )).execute()
                    self.printOut.pLog( "RET- Num extra (rare) rows added: " + str(s.rowcount))
                    for row in s :
                        if row[self.id_attribute] not in id_set :
                            temp_row = {}
                            for attribute in self.attributes :
                                temp_row[attribute] = row[attribute]
                            proper_rows.append(temp_row)
                            is_test_list.append(False)
                            test_number.append(None)
                self.current_rows = proper_rows
                self.is_test_list = is_test_list
                self.overall_is_test_list = copy.deepcopy(is_test_list)
                self.test_number = test_number
                end_time = int(time.time())
                self.printOut.pLog( "RET- Time loading data structures: " + str(end_time - start_time))
            else :
                self.current_rows = []
                self.is_test_list = []
                self.test_number = []
                self.overall_is_test_list = []
                self.use_as_training = []
        #Increment group count
        else :
            self.group_count += 1
        #Use data that exists / loading temporary data structures
        #NOTE(review): reconstructed as running for every call (fresh load or
        #not) so group 1 gets the same per-fold split as groups 2..max --
        #confirm against the original indentation.
        self.is_test_list = []
        self.use_as_training = []
        test_num = 0
        test_count = 0
        train_count = 0
        #Going over every current row
        for index in range(len(self.current_rows)) :
            #if ONE group then all in test
            if self.group_max == 1 :
                if self.overall_is_test_list[index] :
                    self.is_test_list.append(True)
                    test_count += 1
                else :
                    self.is_test_list.append(False)
                    self.use_as_training.append(True)
                    train_count += 1
            #If more than one group then split up test and training sets
            else :
                is_test = self.overall_is_test_list[index]
                if is_test :
                    test_num += 1
                #Deciding whether instance will be in the test set
                #Splits test set up
                #MISSING 4 VALUES IN SANITY CHECK
                used_as_test = False
                if is_test and test_num % self.group_max == (self.group_count - 1) :
                    self.is_test_list.append(True)
                    used_as_test = True
                    test_count += 1
                else :
                    self.is_test_list.append(False)
                #Deciding whether instance should be a training data set
                #FIND INTELIGENT WAY TO STOP THE TRAINING SET FROM BEING TO LARGE
                if not used_as_test :
                    train_count += 1
                if not used_as_test :
                    self.use_as_training.append(True)
                else :
                    self.use_as_training.append(False)
        self.printOut.pLog( "RET- Group: " + str(self.group_count))
        self.printOut.pLog( "RET- Test count: " + str(test_count))
        self.printOut.pLog( "RET- Train count: " + str(train_count))

    #Returns the number of rows that are left
    #to be retrieved
    def number_remaining_blocks(self) :
        """Return 0 when no blocks (and no CV groups) remain, else 1."""
        if len(self.block_list) == 0 and self.group_count == self.group_max :
            return 0
        else :
            return 1

    #Used to setup basic query string
    def util_create_query_string(self, element):
        """Translate <or>/<tc> criteria children of *element* into a
        SQLAlchemy clause: AND over the <or> groups, OR within each group.
        Returns True (no-op clause) when no criteria are present."""
        #Getting dictionary of column objects
        #Creating and clauses for all columns in test criteria combined
        qs = True
        and_list = []
        for or_tag in element.getElementsByTagName('or') :
            #Creating or clauses for a given "or list"
            or_list = []
            for elem in or_tag.getElementsByTagName('tc') :
                attribute = elem.attributes['attribute'].value
                type = elem.attributes['type'].value
                value = elem.attributes['value'].value
                #Getting the right form of the value
                vt = elem.attributes['vt'].value
                if vt == "string" :
                    value = str(value)
                elif vt == "int" :
                    value = int(value)
                #Creating clause for criteria
                if type == "E" :
                    or_list.append(self.table.c[attribute] == value)
                elif type == "NE" :
                    or_list.append(self.table.c[attribute] != value)
                elif type == "GT" :
                    or_list.append(self.table.c[attribute] > value)
                elif type == "LT" :
                    or_list.append(self.table.c[attribute] < value)
            if len(or_list) > 0 :
                and_list.append(or_(*or_list))
        #Only make the query string equal to the list if there is something in the list
        if len(and_list) > 0 :
            qs = and_(*and_list)
        return qs

    #util
    #keeps track of all parcel blocks
    class Parcel_block :
        """Axis-aligned rectangle of parcels with its row count and a color
        used for the visual representation."""

        def __init__(self, x_max, y_max, x_min, y_min, parcel_count) :
            self.x_max = float(x_max)
            self.y_max = float(y_max)
            self.x_min = float(x_min)
            self.y_min = float(y_min)
            # may also be a label string, e.g. "(training block)"
            self.parcel_count = parcel_count
            self.right_border = False
            self.bottom_border = False
            #For visual represantation
            self.color = None

        #Sets values for whether sides are borders
        def set_border_bools(self, query_manager):
            """Mark sides that coincide with the overall spatial extent."""
            if self.y_min == query_manager.y_min :
                self.bottom_border = True
            if self.x_max == query_manager.x_max :
                self.right_border = True

        def row_within_block(self, x_at, y_at, row) :
            """Return True when *row*'s (x, y) falls inside this block.

            There are strict equalties for the lower and right sides of the
            block UNLESS that side borders the edge of space (so adjacent
            blocks never both claim a boundary row).
            """
            xa = float(row[x_at])
            ya = float(row[y_at])
            rb = self.right_border
            bb = self.bottom_border
            if xa >= self.x_min and ((rb and xa <= self.x_max) or (not rb and xa < self.x_max)) :
                if ya <= self.y_max and ((bb and ya >= self.y_min) or (not bb and ya > self.y_min)):
                    return True
            return False

    #util
    #Gets all the training rows (super set of test rows)
    def util_training_rows(self, block) :
        """Return a result set of training rows around *block*, expanding the
        spatial window until it holds at least self.train_size rows (or the
        count stops growing)."""
        (cx_max, cy_max, cx_min, cy_min) = [block.x_max, block.y_max, block.x_min, block.y_min]
        current_count = block.parcel_count
        if current_count == 0 :
            return [[], []]
        else :
            self.printOut.pLog( "RET- Current count inside training block: " + str(current_count))
        #setting easy variables
        x = self.x_attribute
        y = self.y_attribute
        t = self.table
        #ROOM FOR IMPROVEMENT
        #Make it so that this doesn't terribly overshoot the training size
        count_repeated = 0
        last_count = 0
        select_stat = t.select(and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ))
        while(current_count < self.train_size) :
            #growth factor shrinks as the window approaches train_size
            change = math.sqrt((self.train_size - current_count) / float(max(self.train_size/10, current_count)))
            cx_min -= (cx_max - cx_min)*change*.1
            cx_max += (cx_max - cx_min)*change*.1
            cy_min -= (cy_max - cy_min)*change*.1
            cy_max += (cy_max - cy_min)*change*.1
            select_stat = t.select(and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ))
            #Getting the number of instances inside the block
            s = select([func.count("*")], and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min, t.c[y] <= cy_max, self.query_string ), from_obj=[t]).execute()
            block_count = parcel_count = sql_get_agg(s, "int")
            last_count = current_count
            current_count = block_count
            self.printOut.pLog( "RET- Current count inside training block: " + str(current_count))
            #Protects against cases in which current_count will never be bigger than train_size
            if last_count == current_count :
                count_repeated += 1
            else :
                count_repeated = 0
            if count_repeated == 5 :
                break
        #Executing the training query
        s = select_stat.execute()
        #Used for parcel visual
        self.current_training_block = self.Parcel_block(cx_max, cy_max, cx_min, cy_min, "(training block)")
        self.current_training_block.color = self.current_test_block.color
        return s

    #util
    #Seperates parcels in a grid type fashion and creates
    #spatial objects for each grid
    def util_create_parcel_block(self, tx_max, ty_max, tx_min, ty_min) :
        """Recursively bisect the given extent until each rectangle holds at
        most self.test_size rows; return the list of Parcel_block leaves."""
        t = self.table
        x = self.x_attribute
        y = self.y_attribute
        #NEED TO BE IMPROVED
        #The inequalities should be made strict in a certain way in order to insure nothings redundant
        s = select([func.count("*")], and_(t.c[x] >= tx_min, t.c[x] <= tx_max, t.c[y] >= ty_min, t.c[y] <= ty_max, self.query_string ), from_obj=[t]).execute()
        parcel_count = sql_get_agg(s, "int")
        #ROOM FOR IMPROVEMENT
        #Make it so that very small test blocks aren't created
        if parcel_count > self.test_size :
            x_mid = (tx_max - tx_min) / 2 + tx_min
            y_mid = (ty_max - ty_min) /2 + ty_min
            temp_list = []
            #Always splits in such a way that the the resulting rectangles are squarish
            x_diff = tx_max - tx_min
            y_diff = ty_max - ty_min
            if x_diff < y_diff :
                #Split horiz
                temp_list.extend(self.util_create_parcel_block(tx_max, ty_max, tx_min, y_mid))
                temp_list.extend(self.util_create_parcel_block(tx_max, y_mid, tx_min, ty_min))
            else :
                #Split vert
                temp_list.extend(self.util_create_parcel_block(tx_max, ty_max, x_mid, ty_min))
                temp_list.extend(self.util_create_parcel_block(x_mid, ty_max, tx_min, ty_min))
            return temp_list
        else :
            p = self.Parcel_block(tx_max, ty_max, tx_min, ty_min, parcel_count)
            self.printOut.pLog( "RET- Block size: " + str(parcel_count))
            p.set_border_bools(self)
            return [p]

    #util
    #Returns the max and min x and y coordinate values
    def util_spatial_boundaries(self) :
        """Query the table for [x_max, y_max, x_min, y_min, total_count]."""
        self.printOut.pLog( "RET- Finding spatial boundaries of the database...")
        t = self.table
        #Setting overall values
        (x_max, y_max, x_min, y_min) = [None, None, None, None]
        s = select([func.count("*")], self.query_string, from_obj=[t]).execute()
        total_count = sql_get_agg(s, "int")
        s = select([func.max(t.c[self.x_attribute])]).execute()
        x_max = sql_get_agg(s, "float")
        s = select([func.min(t.c[self.x_attribute])]).execute()
        x_min = sql_get_agg(s, "float")
        s = select([func.max(t.c[self.y_attribute])]).execute()
        y_max = sql_get_agg(s, "float")
        s = select([func.min(t.c[self.y_attribute])]).execute()
        y_min = sql_get_agg(s, "float")
        return [x_max, y_max, x_min, y_min, total_count]

    #Creates a list that says whether each value is null or not
    def proc_is_null_list(self, test_object) :
        """Set self.is_null_list: one bool per current row, True when any of
        test_object.null_value_list's criteria (GT/LT/E/NE) matches."""
        self.printOut.pLog( "RET- Test Attribute: " + str(test_object.test_attribute))
        is_null_list = []
        for i in range(len(self.current_rows)) :
            is_null = False
            for null_dict in test_object.null_value_list :
                value = null_dict["value"]
                type = null_dict["type"]
                row_value = self.current_rows[i][null_dict["attribute"]]
                if type == "GT" and row_value > value :
                    is_null = True
                    break
                elif type == "LT" and row_value < value :
                    is_null = True
                    break
                elif type == "E" and row_value == value :
                    is_null = True
                    break
                elif type == "NE" and row_value != value :
                    is_null = True
                    break
            is_null_list.append(is_null)
        self.is_null_list = is_null_list
        self.printOut.pLog( "RET- Found " + str(is_null_list.count(True)) + " null labels in whole training blocks")

    #makes it so class and num attribute lists only represent attributes being used in tests
    def update_att_lists(self, model_list):
        """Rebuild self.class_list / self.numeric_list keeping only the
        attributes (and target attributes) referenced by *model_list*;
        logs an error for attributes missing from the table."""
        new_class_list = []
        new_num_list = []
        self.printOut.pLog( "RET- Checking that all needed attributes are in the table.")
        #finding all attributes being used
        for model in model_list :
            for attribute in model.attributes :
                if attribute in self.class_list :
                    new_class_list.append(attribute)
                elif attribute in self.numeric_list :
                    new_num_list.append(attribute)
                else :
                    self.printOut.pLog( "ERROR: Attribute Not in table- " + attribute)
            #Make sure the target attribute is included
            if model.test_attribute in self.class_list :
                new_class_list.append(model.test_attribute)
            elif model.test_attribute in self.numeric_list :
                new_num_list.append(model.test_attribute)
            elif model.test_attribute != None :
                self.printOut.pLog( "ERROR: Target Attribute Not in Table- ", model.test_attribute)
        self.printOut.pLog( "")
        self.class_list = new_class_list
        self.numeric_list = new_num_list

    #Finds color for each block
    #(Uses modified color scheme)
    def set_colors(self):
        """Greedily 4-color the parcel blocks so touching blocks differ in
        color where possible; counts unavoidable conflicts."""
        color_list = ["red", "blue", "green", "yellow"]
        #Recording all touching blocks
        blocks_touching ={}
        for block in self.block_list :
            blocks_touching[block] = set([])
            #Checking which blocks are touching
            for ob in self.block_list :
                #left or right
                if block.x_min == ob.x_max or block.x_max == ob.x_min :
                    if not block.y_max <= ob.y_min and not block.y_min >= ob.y_max :
                        blocks_touching[block].add(ob)
                #top or bottom
                elif block.y_min == ob.y_max or block.y_max == ob.y_min :
                    if not block.x_max <= ob.x_min and not block.x_min >= ob.x_max :
                        blocks_touching[block].add(ob)
        #Randomly coloring blocks
        #but making sure as many conflicts can be avoided as possible
        conflict_count = 0
        for block in self.block_list :
            available_colors = copy.deepcopy(color_list)
            for nb in blocks_touching[block] :
                if nb.color in available_colors :
                    available_colors.remove(nb.color)
            if len(available_colors) > 0 :
                #Picking a color that a neighbor doesn't have
                index = random.randint(0, len(available_colors) - 1)
                block.color = available_colors[index]
            else :
                #Picking a random color
                index = random.randint(0, len(color_list) - 1)
                block.color = color_list[index]
                conflict_count += 1
        self.printOut.pLog( "RET- Color conflicts: " + str(conflict_count))

    #For cases in which location variables are strings (in the database)
    def adjust_borders(self):
        """Round the overall extent outward (round_up/round_down) and move
        every block edge that sat on the old extent onto the new one."""
        new_x_max = round_up(self.x_max)
        new_x_min = round_down(self.x_min)
        new_y_max = round_up(self.y_max)
        new_y_min = round_down(self.y_min)
        for block in self.block_list :
            if block.x_max == self.x_max :
                block.x_max = new_x_max
            if block.y_max == self.y_max :
                block.y_max = new_y_max
            if block.x_min == self.x_min :
                block.x_min = new_x_min
            if block.y_min == self.y_min :
                block.y_min = new_y_min
        self.x_max = new_x_max
        self.y_max = new_y_max
        self.x_min = new_x_min
        self.y_min = new_y_min
class LDOF_model :
    """Local Distance-based Outlier Factor (LDOF) outlier test.

    Configured from an XML element.  Writes the profiled data and a config
    file to disk, shells out to an external KNN/LDOF binary, parses the
    binary's output, and returns a Test_result of LOF scores per instance.
    """

    def __init__(self, xml_elem, logCB = None, progressCB = None) :
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #KNN tuning parameters
        self.k = 10 #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1
        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)
        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None
        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []
        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes :
            self.initialized_weights[attribute] = 1
        #Attributes that will get their values log transformed to produce better results
        #FIX: default to an empty set so the attribute always exists, even when
        #the XML element does not carry 'log_trans_attributes'
        self.log_trans_atts = set([])
        if xml_elem.hasAttribute('log_trans_attributes') :
            log_trans_string = xml_elem.attributes['log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)
        self.null_value_list = [] #NOT USED
        #Random information
        self.test_type = "LDOF"

    def get_predictions(self, data_profiler) :
        """Run the full LDOF pipeline and return a Test_result of LOF scores."""
        #Creates (or clears) files that are used by the binary
        self.initialize_files()
        #Put data into files
        self.create_input_files(data_profiler)
        #Create the configuration file
        self.util_create_config(data_profiler)
        #Getting list of predictions and confidences
        #(This runs the binary with the config as a input)
        os.system(BINARY + ' ' + self.config_file + ' >> ' + self.results_file)
        (lof_lists, lof_id_lists) = self.util_get_results(self.results_file)
        #Removes files used by the binary
        self.remove_files()
        return Test_result(lof_lists, lof_id_lists)

    #util function
    #Gets the results from the result file
    def util_get_results(self, filename) :
        """Parse the binary's output file.

        Returns [lof_lists, lof_id_lists]: per test instance, the LOF scores
        of its neighbors and the matching neighbor row ids.
        """
        start_time = int(time.time())
        self.printOut.pLog( "LDOF- Getting results" )
        f = open(filename)
        lof_lists = []
        lof_id_lists = []
        results_portion = False
        for line in f.readlines() :
            #Find better way to identify comments
            if line.find('#') < 0 and results_portion:
                line_list = line.split(', ')
                #Getting lof values for the line
                #Fields alternate: id, lof, id, lof, ... after the row id
                lof_list = []
                lof_id_list = []
                #Floor division keeps this an int on both Python 2 and 3
                num_check = (self.k*2) // self.num_mod
                for i in range(num_check + 1) :
                    if i == 0 :
                        pass #first field is the instance id; not used here
                    elif i % 2 == 1 :
                        lof_list.append(1000*float(line_list[i])) #Don't adjust value like this
                    else :
                        lof_id_list.append(int(line_list[i]))
                lof_lists.append(lof_list)
                lof_id_lists.append(lof_id_list)
            else :
                stripped = line.rstrip('\n')
                self.printOut.pLog("LDOF- " + stripped)
                #Finding the portion of the output file that has results
                if line.find("Printing final results:") > -1 :
                    results_portion = True
        f.close() #FIX: handle was previously leaked
        end_time = int(time.time())
        self.printOut.pLog( "LDOF- Time reading results: " + str( end_time - start_time ))
        return [lof_lists, lof_id_lists]

    #util function
    #Creates a configuration file that is used by the
    #binary in order to run the test
    def util_create_config(self, data_profiler) :
        #Modifying the list of attribute ids
        #making sure the ones that are connected are represented as such
        #This means they have the same id in the configuration file
        #Also, the weights that are specified are given as well
        weights_list = []
        for index in range(len(data_profiler.attribute_id_list)) :
            dim_id = data_profiler.attribute_id_list[index]
            attribute = data_profiler.attribute_id_dict[dim_id]
            #Setting right IDs
            for combo in self.attribute_combinations :
                if attribute in combo :
                    data_profiler.attribute_id_list[index] = data_profiler.id_attribute_dict[combo[0]]
            #Setting right weights
            if attribute in self.initialized_weights :
                weights_list.append(self.initialized_weights[attribute])
            else :
                weights_list.append(0)
        #Printing information into the configuration file
        f = open(self.config_file, 'w')
        f.write("test_file: " + self.test_filename + "\n")
        f.write("train_file: " + self.train_filename + "\n")
        f.write("number_neighbors: " + str(self.k) + "\n")
        f.write("number_display: " + str(self.num_display) + "\n")
        f.write("number_mod: " + str(self.num_mod) + "\n")
        f.write("columns_attribute:")
        for i in data_profiler.attribute_id_list :
            f.write(" " + str(i))
        f.write("\n")
        f.write("initial_weights:")
        for weight in weights_list :
            f.write(" " + str(weight))
        f.write("\n")
        f.close()

    #util function
    #Gets information from the data_profiler object to files
    def create_input_files(self, data_profiler):
        start_time = int(time.time())
        #Just loading data structures
        test_labels = []
        test_samples = []
        test_ids = []
        train_labels = []
        train_samples = []
        train_ids = []
        for i in range(len(data_profiler.labels)) :
            #Adding instances that are going to be tested to the list
            if data_profiler.query_manager.is_test_list[i] :
                test_labels.append(data_profiler.labels[i])
                test_samples.append(data_profiler.samples[i])
                test_ids.append(i)
            #Adding non null instances to the training set
            if not data_profiler.query_manager.is_null_list[i]:
                train_labels.append(data_profiler.labels[i])
                train_samples.append(data_profiler.samples[i])
                train_ids.append(i)
        #Create test and train files
        self.util_create_valid_file_from_samples(test_labels, test_samples, test_ids, self.test_filename)
        self.util_create_valid_file_from_samples(train_labels, train_samples, train_ids, self.train_filename)
        #Isn't used but something needs to be passed to KNN
        self.util_create_valid_file_from_samples([], [], [], self.val_filename)
        #Re-setting transformation information in data profiler
        data_profiler.log_trans_atts = set([])
        end_time = int(time.time())
        self.printOut.pLog( "LDOF- Time creating input files: " + str( end_time - start_time ))

    #util function
    #Creates files that will be used by the KNN binary
    def util_create_valid_file_from_samples(self, labels, samples, id_list, filename) :
        f = open(filename, 'w')
        index = 0
        for sample in samples :
            line = str( labels[index] )
            line += " " + str(id_list[index])
            #sorted() works on both Python 2 and 3 (keys() is a view in 3)
            keys = sorted(sample.keys())
            for key in keys :
                line += " " + str(key) + ":" + str(sample[key])
            line += "\n"
            f.write(line)
            index += 1
        f.close()

    def initialize_files(self):
        """Create (or truncate) the working files used by the binary."""
        #Creating unique file names
        #NOTE(review): all names share one timestamp; two runs within the same
        #second would collide - confirm single-process usage
        self.test_filename = 'LDOF_TEST_INPUT_FILE' + str(int(time.time())) + '.TXT'
        self.train_filename = 'LDOF_TRAIN_INPUT_FILE' + str(int(time.time())) + '.TXT'
        self.val_filename = 'LDOF_VAL_FILE' + str(int(time.time())) + '.TXT'
        self.results_file = 'LDOF_RESULTS_FILE' + str(int(time.time())) + '.TXT'
        self.config_file = 'LDOF' + str(int(time.time())) + '.cfg'
        #Initializing files directly instead of shelling out to `echo`
        #(echo "" > file wrote a single newline; reproduce that)
        for name in (self.config_file, self.results_file, self.test_filename,
                     self.train_filename, self.val_filename) :
            f = open(name, 'w')
            f.write("\n")
            f.close()

    def remove_files(self):
        """Delete every working file created by initialize_files."""
        os.remove(self.config_file)
        os.remove(self.results_file)
        os.remove(self.test_filename)
        os.remove(self.train_filename)
        os.remove(self.val_filename)
class LDOF_model:
    """Local Distance-based Outlier Factor (LDOF) outlier test.

    Duplicate (reformatted) definition of LDOF_model.  Writes the profiled
    data and a config file to disk, shells out to an external KNN/LDOF
    binary, parses its output, and returns a Test_result of LOF scores.
    """

    def __init__(self, xml_elem, logCB=None, progressCB=None):
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #KNN tuning parameters
        self.k = 10 #Make this 1 more than the number of columns
        self.num_display = 10
        self.num_mod = 1
        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes['attributes'].value
        self.attributes = util_get_attribute_list(attributes_string)
        #NOT ACTUALLY USED, JUST MAKES IT SO KNN LIBRARY CAN BE USED
        self.test_attribute = None
        #Sets of attributes that must be considered as a whole
        self.attribute_combinations = []
        #Set all weights to 1
        self.initialized_weights = {}
        for attribute in self.attributes:
            self.initialized_weights[attribute] = 1
        #Attributes that will get their values log transformed to produce better results
        #FIX: default to an empty set so the attribute always exists even
        #when the XML element has no 'log_trans_attributes'
        self.log_trans_atts = set([])
        if xml_elem.hasAttribute('log_trans_attributes'):
            log_trans_string = xml_elem.attributes['log_trans_attributes'].value
            temp_atts_list = util_get_attribute_list(log_trans_string)
            self.log_trans_atts = set(temp_atts_list)
        self.null_value_list = [] #NOT USED
        #Random information
        self.test_type = "LDOF"

    def get_predictions(self, data_profiler):
        """Run the full LDOF pipeline and return a Test_result of LOF scores."""
        #Creates (or clears) files that are used by the binary
        self.initialize_files()
        #Put data into files
        self.create_input_files(data_profiler)
        #Create the configuration file
        self.util_create_config(data_profiler)
        #Getting list of predictions and confidences
        #(This runs the binary with the config as a input)
        os.system(BINARY + ' ' + self.config_file + ' >> ' + self.results_file)
        (lof_lists, lof_id_lists) = self.util_get_results(self.results_file)
        #Removes files used by the binary
        self.remove_files()
        return Test_result(lof_lists, lof_id_lists)

    #util function
    #Gets the results from the result file
    def util_get_results(self, filename):
        """Parse the binary's output file into [lof_lists, lof_id_lists]."""
        start_time = int(time.time())
        self.printOut.pLog("LDOF- Getting results")
        f = open(filename)
        lof_lists = []
        lof_id_lists = []
        results_portion = False
        for line in f.readlines():
            #Find better way to identify comments
            if line.find('#') < 0 and results_portion:
                line_list = line.split(', ')
                #Getting lof values for the line
                #Fields alternate after the row id: lof, id, lof, id, ...
                lof_list = []
                lof_id_list = []
                #Floor division keeps this an int on Python 2 and 3
                num_check = (self.k * 2) // self.num_mod
                for i in range(num_check + 1):
                    if i == 0:
                        pass #first field is the instance id; not used here
                    elif i % 2 == 1:
                        lof_list.append(
                            1000 * float(line_list[i])) #Don't adjust value like this
                    else:
                        lof_id_list.append(int(line_list[i]))
                lof_lists.append(lof_list)
                lof_id_lists.append(lof_id_list)
            else:
                stripped = line.rstrip('\n')
                self.printOut.pLog("LDOF- " + stripped)
                #Finding the portion of the output file that has results
                if line.find("Printing final results:") > -1:
                    results_portion = True
        f.close() #FIX: handle was previously leaked
        end_time = int(time.time())
        self.printOut.pLog("LDOF- Time reading results: " +
                           str(end_time - start_time))
        return [lof_lists, lof_id_lists]

    #util function
    #Creates a configuration file that is used by the
    #binary in order to run the test
    def util_create_config(self, data_profiler):
        #Modifying the list of attribute ids
        #making sure the ones that are connected are represented as such
        #This means they have the same id in the configuration file
        #Also, the weights that are specified are given as well
        weights_list = []
        for index in range(len(data_profiler.attribute_id_list)):
            dim_id = data_profiler.attribute_id_list[index]
            attribute = data_profiler.attribute_id_dict[dim_id]
            #Setting right IDs
            for combo in self.attribute_combinations:
                if attribute in combo:
                    data_profiler.attribute_id_list[
                        index] = data_profiler.id_attribute_dict[combo[0]]
            #Setting right weights
            if attribute in self.initialized_weights:
                weights_list.append(self.initialized_weights[attribute])
            else:
                weights_list.append(0)
        #Printing information into the configuration file
        f = open(self.config_file, 'w')
        f.write("test_file: " + self.test_filename + "\n")
        f.write("train_file: " + self.train_filename + "\n")
        f.write("number_neighbors: " + str(self.k) + "\n")
        f.write("number_display: " + str(self.num_display) + "\n")
        f.write("number_mod: " + str(self.num_mod) + "\n")
        f.write("columns_attribute:")
        for i in data_profiler.attribute_id_list:
            f.write(" " + str(i))
        f.write("\n")
        f.write("initial_weights:")
        for weight in weights_list:
            f.write(" " + str(weight))
        f.write("\n")
        f.close()

    #util function
    #Gets information from the data_profiler object to files
    def create_input_files(self, data_profiler):
        start_time = int(time.time())
        #Just loading data structures
        test_labels = []
        test_samples = []
        test_ids = []
        train_labels = []
        train_samples = []
        train_ids = []
        for i in range(len(data_profiler.labels)):
            #Adding instances that are going to be tested to the list
            if data_profiler.query_manager.is_test_list[i]:
                test_labels.append(data_profiler.labels[i])
                test_samples.append(data_profiler.samples[i])
                test_ids.append(i)
            #Adding non null instances to the training set
            if not data_profiler.query_manager.is_null_list[i]:
                train_labels.append(data_profiler.labels[i])
                train_samples.append(data_profiler.samples[i])
                train_ids.append(i)
        #Create test and train files
        self.util_create_valid_file_from_samples(test_labels, test_samples,
                                                 test_ids, self.test_filename)
        self.util_create_valid_file_from_samples(train_labels, train_samples,
                                                 train_ids,
                                                 self.train_filename)
        #Isn't used but something needs to be passed to KNN
        self.util_create_valid_file_from_samples([], [], [],
                                                 self.val_filename)
        #Re-setting transformation information in data profiler
        data_profiler.log_trans_atts = set([])
        end_time = int(time.time())
        self.printOut.pLog("LDOF- Time creating input files: " +
                           str(end_time - start_time))

    #util function
    #Creates files that will be used by the KNN binary
    def util_create_valid_file_from_samples(self, labels, samples, id_list,
                                            filename):
        f = open(filename, 'w')
        index = 0
        for sample in samples:
            line = str(labels[index])
            line += " " + str(id_list[index])
            #sorted() works on both Python 2 and 3 (keys() is a view in 3)
            keys = sorted(sample.keys())
            for key in keys:
                line += " " + str(key) + ":" + str(sample[key])
            line += "\n"
            f.write(line)
            index += 1
        f.close()

    def initialize_files(self):
        """Create (or truncate) the working files used by the binary."""
        #Creating unique file names
        #NOTE(review): all names share one timestamp; two runs within the
        #same second would collide - confirm single-process usage
        self.test_filename = 'LDOF_TEST_INPUT_FILE' + str(int(
            time.time())) + '.TXT'
        self.train_filename = 'LDOF_TRAIN_INPUT_FILE' + str(int(
            time.time())) + '.TXT'
        self.val_filename = 'LDOF_VAL_FILE' + str(int(time.time())) + '.TXT'
        self.results_file = 'LDOF_RESULTS_FILE' + str(int(
            time.time())) + '.TXT'
        self.config_file = 'LDOF' + str(int(time.time())) + '.cfg'
        #Initializing files directly instead of shelling out to `echo`
        #(echo "" > file wrote a single newline; reproduce that)
        for name in (self.config_file, self.results_file, self.test_filename,
                     self.train_filename, self.val_filename):
            f = open(name, 'w')
            f.write("\n")
            f.close()

    def remove_files(self):
        """Delete every working file created by initialize_files."""
        os.remove(self.config_file)
        os.remove(self.results_file)
        os.remove(self.test_filename)
        os.remove(self.train_filename)
        os.remove(self.val_filename)
class Cat_model :
    """Categorical imputation model backed by a Weka classifier.

    Configured from an XML element.  Trains a classifier (J48 by default)
    through the Weka command line, predicts values for the test instances,
    and parses both the prediction output and the cross-validation log.
    """

    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB = None, progressCB = None) :
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        self.test_classifier = "weka.classifiers.trees.J48"
        if xml_elem.hasAttribute("classifier") :
            self.test_classifier = xml_elem.attributes["classifier"].value
        self.test_options = ""
        if xml_elem.hasAttribute("options") :
            self.test_options = xml_elem.attributes["options"].value
        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True
            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes["fs_evaluation_class"].value
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1 :
                self.using_pca = True
        #Attributes that the search class starts with (Not used with PCA)
        self.start_attributes = []
        if xml_elem.hasAttribute('fs_start_attributes') :
            self.start_attributes = util_get_attribute_list(xml_elem.attributes['fs_start_attributes'].value)
        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)
        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0 :
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v') :
                attribute = element.attributes['attribute'].value
                null_type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
                null_dict = {"attribute" : attribute, "type" : null_type}
                if vt == "int" :
                    null_dict["value"] = int(value)
                elif vt == "string" :
                    null_dict["value"] = str(value)
                self.null_value_list.append(null_dict)
        #Simply defined null values
        if xml_elem.hasAttribute("null_value") :
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {"attribute" : self.test_attribute, "type" : "E", "value" : null_value}
            self.null_value_list.append(null_dict)
        #Information about the model
        self.test_type = "Cat"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS

    def get_predictions(self, query_manager) :
        """Train, predict, and score one block of rows.

        Returns a Test_result, or None when there is nothing to impute
        (no null targets and MAKE_ALL_PREDS is off).
        """
        #Filenames
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"
        #Creates (or clears) files that are used by the binary
        IS_NUM_TEST = False
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        cat_att_mapping = file_creation_info["cat_att_mapping"]
        #If there are no null values in the test set
        #And the run is only replacing null values then terminate if no null values
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0 :
            os.remove(test_filename)
            os.remove(train_filename)
            return None
        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection :
            (test_filename, train_filename, selected_attributes) = feature_selection(test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes
        #Running tests
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join( path, "models", "weka.jar")
        path_spef_libsvm = os.path.join( path, "models", "libsvm.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -d " + model_name + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename
        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
        #Gathering results for each test instance
        self.printOut.pLog( "PRED- Getting results")
        f = open(result_filename)
        prediction_list = []
        probability_list = []
        correctly_imputed = 0
        non_null_count = 0
        index = 0
        collect_results = False
        for line in f.readlines() :
            line_list = line.split()
            #Getting results
            if collect_results and len(line_list) > 1:
                #Weka prints predictions as "<idx>:<label>"
                pred_pair = line_list[2].split(":")
                prediction = str(pred_pair[1])
                if not target_value_null[index] and prediction == str(target_values[index]) :
                    correctly_imputed += 1
                if not target_value_null[index] :
                    non_null_count += 1
                prediction_list.append(prediction)
                probability_list.append(1)
                index += 1
            #Seeing if you are at the results portion of the file
            if line.find("inst#") > -1 :
                collect_results = True
        f.close()
        #Gathering accuracy estimations
        f = open(train_log)
        cross_val_info = False
        for line in f.readlines() :
            #Getting all performance related metrics
            if cross_val_info :
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')
                if line.find('Correctly Classified Instances') > -1 or line.find('Kappa statistic') > -1:
                    metric_parts = line.split(' ')
                    if len(metric_parts) > 1:
                        attribute = metric_parts[0]
                        value = metric_parts[len(metric_parts) - 1]
                        value = float(value)
                        acc_est[attribute] = value
            #Finding cross validation info
            if line.find('Stratified cross-validation') > -1 :
                cross_val_info = True
            elif line.find('Confusion Matrix') > -1 :
                cross_val_info = False
        f.close()
        #Actual Performance Stats
        #FIX: guard against ZeroDivisionError when every test target is null
        #(possible when MAKE_ALL_PREDS is on)
        if non_null_count > 0 :
            acc_est["Actual Correctly Imputed Percent"] = (float(correctly_imputed) / non_null_count) * 100
        else :
            acc_est["Actual Correctly Imputed Percent"] = 0.0
        #Removing files used for test
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(model_name)
        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num
        acc_est["block number"] = len(query_manager.used_blocks)
        return Test_result(self.test_type, self.test_attribute, prediction_list, probability_list, acc_est)
class Data_profiler :
    """Profiles the rows held by a query manager and converts them into the
    label/sample structures consumed by the SVM and KNN models.

    Note: this module targets Python 2 (dict.iteritems(), None-vs-number
    comparisons in get_possible_values).
    """

    def __init__(self, query_manager, logCB = None, progressCB = None) :
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        self.query_manager = query_manager
        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)
        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None
        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []
        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}

    #util function
    #creates dictionary of possible values for each column in the table
    def get_possible_values(self, query_manager) :
        """Scan all current rows and record, per column, the distinct class
        values (with frequency weights) and the numeric min/max."""
        #Getting info from the query manager
        class_list = query_manager.class_list
        numeric_list = query_manager.numeric_list
        rows = query_manager.current_rows
        start_time = int(time.time())
        self.printOut.pLog("PREP- Class columns count: " + str(len(class_list)))
        self.printOut.pLog("PREP- Num columns count: " + str(len(numeric_list)))
        #Initializing data structures for storing info
        class_result_dict = {}
        class_att_value_count = {}
        numeric_result_dict = {}
        for c in class_list :
            class_result_dict[c] = []
            class_att_value_count[c] = {}
        for c in numeric_list :
            numeric_result_dict[c] = [None, None] #[min, max]
        #Finding all possible values for each column
        for row in rows :
            #gathering class info
            for c_name, value_list in class_result_dict.iteritems() :
                if c_name in row :
                    value = row[c_name]
                    #Getting information on class attribute values
                    if value not in value_list :
                        value_list.append(value)
                        class_att_value_count[c_name][value] = 1 #May need to worry about the value being Null
                    else :
                        class_att_value_count[c_name][value] += 1
            #gathering numeric info
            for c_name, bounds in numeric_result_dict.iteritems() :
                if c_name in row :
                    value = row[c_name]
                    if value == "" or value == None:
                        value = 0 #May want to think of a more appropriate value
                    else :
                        value = float( value )
                    #finding min (None compares low under Python 2)
                    if value != "" and (bounds[0] > value or bounds[0] == None ) :
                        bounds[0] = value
                    #finding max
                    if value != "" and (bounds[1] < value or bounds[1] == None) :
                        bounds[1] = value
        #Deciding on the weight based on the count
        class_att_value_weight = {}
        for att_name, values in class_att_value_count.iteritems() :
            #Finding total number of values
            overall_count = 0
            for value, count in values.iteritems() :
                overall_count += count
            #Setting weights
            class_att_value_weight[att_name] = {}
            for value, count in values.iteritems() :
                #BUG FIX: float(count / overall_count) truncated to 0 under
                #Python 2 integer division; convert before dividing
                class_att_value_weight[att_name][value] = float(count) / overall_count
        self.numeric_result_dict = numeric_result_dict
        self.class_result_dict = class_result_dict
        self.class_att_value_weight = class_att_value_weight
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time getting values: " + str(end_time - start_time))

    #Prepares the data that will be used with SVM or KNN
    def load_data_structures(self, target, attributes) :
        """Build self.labels / self.samples for the given target column and
        predictor attributes, plus the dimension/column id mappings for KNN."""
        start_time = int(time.time())
        #Getting list of columns, but making sure the columns are actually there
        column_list_local = []
        for attribute in attributes :
            if attribute in self.class_result_dict or attribute in self.numeric_result_dict :
                column_list_local.append(attribute)
        (class_lookup_table, id_lookup_table) = self.util_get_class_lookup_table()
        self.label_id_lookup_table = id_lookup_table #Used by SVM to piece together test results
        self.class_lookup_table = class_lookup_table #Used by SVM to piece together test results
        labels = []
        samples = []
        #Getting all needed information from all rows
        for j in range(len(self.query_manager.current_rows)) :
            row = self.query_manager.current_rows[j]
            #for class target attributes
            value = row[target]
            if target in class_lookup_table :
                labels.append(class_lookup_table[target][value])
            #for numeric target attributes (might want to scale label)
            else :
                if value == "" or value == None:
                    value = -1
                labels.append(int( value )) #CHANGE: SHOULD BE FLOAT
            #getting sample data
            index = 0
            sample = {}
            for attribute in column_list_local :
                if attribute in row :
                    if attribute in class_lookup_table :
                        #One dimension per possible class value; the matching
                        #dimension gets the value's frequency weight
                        value = row[attribute]
                        attribute_value = class_lookup_table[attribute][value]
                        for i in range( len( self.class_result_dict[attribute] ) ) :
                            if attribute_value == i :
                                #sample[index] = 0.5 #1
                                sample[index] = self.class_att_value_weight[attribute][value] #MAKE IT ONLY FOR LDOF
                            index += 1
                    elif attribute in self.numeric_result_dict :
                        value = row[attribute]
                        if value == "" or value == None:
                            value = 0
                        scaled = 0
                        col_max = self.numeric_result_dict[attribute][1]
                        col_min = self.numeric_result_dict[attribute][0]
                        #Transforming specified columns
                        if attribute in self.log_trans_atts :
                            value = math.log(value + 1)
                            #Scaling values
                            denominator = math.log(col_max + 1) - math.log(col_min + 1)
                            if denominator > 0 :
                                #Scaling all attributes between 0 and 1
                                numerator = float( value ) - math.log(col_min + 1)
                                scaled = numerator / denominator
                        else :
                            #Non transformed columns
                            #Scaling values
                            denominator = col_max - col_min
                            if denominator > 0 :
                                #Scaling all attributes between 0 and 1
                                numerator = float( value ) - col_min
                                scaled = numerator / denominator
                        sample[index] = scaled
                        index += 1
            samples.append(sample)
        #Used for KNN
        self.printOut.pLog( "PREP- Dimension / column mapping")
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
        self.attribute_id_list = []
        dim_id = 0
        for attribute in column_list_local :
            self.printOut.pLog( "PREP- ID: " + str(dim_id) + ", Attribute: " + attribute)
            if attribute in class_lookup_table :
                #Class columns occupy one entry per possible value,
                #all sharing the same column id
                for att_value in class_lookup_table[attribute].keys() :
                    self.attribute_id_list.append(dim_id)
                    self.attribute_id_dict[dim_id] = attribute
                    self.id_attribute_dict[attribute] = dim_id
            else :
                self.attribute_id_dict[dim_id] = attribute
                self.id_attribute_dict[attribute] = dim_id
                self.attribute_id_list.append(dim_id)
            dim_id += 1
        #Setting values for object variables / lists
        self.labels = labels
        self.samples = samples
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time loading data structures: " + str( end_time - start_time))

    #Setting up all the data for the test
    def prepare_test_data(self, test_object) :
        """Format the profiler's data for the given test object."""
        #Making the data in the data profiler formatted correctly for the
        #Given test attribute and attributes used for the test
        self.load_data_structures(test_object.test_attribute, test_object.attributes)
        #Setting transformation information in data profiler
        #only do this for KNN attributes
        if test_object.test_type == "KNN" :
            self.log_trans_atts = set(test_object.log_trans_atts)
        #Processing information about which rows are null
        self.query_manager.proc_is_null_list(test_object)

    #util function
    def util_get_class_lookup_table(self) :
        """Return [value->index table, index->value table] per class column."""
        class_lookup_table = {}
        id_lookup_table = {}
        for class_name, values in self.class_result_dict.iteritems() :
            index = 0
            class_temp = {}
            id_temp = {}
            for value in values :
                class_temp[value] = index
                id_temp[index] = value
                index += 1
            class_lookup_table[class_name] = class_temp
            id_lookup_table[class_name] = id_temp
        return [class_lookup_table, id_lookup_table]
class Cat_model:
    """Categorical imputation model backed by a Weka classifier.

    Duplicate (reformatted) definition of Cat_model.  Trains a classifier
    (J48 by default) through the Weka command line, predicts values for the
    test instances, and parses the prediction output and training log.
    """

    def __init__(self, xml_elem, MAKE_ALL_PREDS, logCB=None, progressCB=None):
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #Test specific information
        self.test_attribute = xml_elem.attributes["test_attribute"].value
        self.test_classifier = "weka.classifiers.trees.J48"
        if xml_elem.hasAttribute("classifier"):
            self.test_classifier = xml_elem.attributes["classifier"].value
        self.test_options = ""
        if xml_elem.hasAttribute("options"):
            self.test_options = xml_elem.attributes["options"].value
        #Feature selection information
        self.use_feature_selection = False
        self.using_pca = False
        self.search_class = ""
        self.evaluation_class = ""
        if xml_elem.hasAttribute('fs_evaluation_class'):
            self.use_feature_selection = True
            self.search_class = xml_elem.attributes["fs_search_class"].value
            self.evaluation_class = xml_elem.attributes[
                "fs_evaluation_class"].value
            #Checking for pca
            if self.evaluation_class.find("PrincipalComponents") > -1:
                self.using_pca = True
        #Attributes that the search class starts with (Not used with PCA)
        self.start_attributes = []
        if xml_elem.hasAttribute('fs_start_attributes'):
            self.start_attributes = util_get_attribute_list(
                xml_elem.attributes['fs_start_attributes'].value)
        #Attributes that are used to make the prediction
        attributes_string = xml_elem.attributes["train_attributes"].value
        self.attributes = util_get_attribute_list(attributes_string)
        #Values that are considered null for the target attribute
        self.null_value_list = []
        elements = xml_elem.getElementsByTagName('null_values')
        if len(elements) > 0:
            null_val_element = elements[0]
            for element in null_val_element.getElementsByTagName('v'):
                attribute = element.attributes['attribute'].value
                null_type = element.attributes['type'].value
                value = element.attributes['value'].value
                vt = element.attributes['vt'].value
                null_dict = {"attribute": attribute, "type": null_type}
                if vt == "int":
                    null_dict["value"] = int(value)
                elif vt == "string":
                    null_dict["value"] = str(value)
                self.null_value_list.append(null_dict)
        #Simply defined null values
        if xml_elem.hasAttribute("null_value"):
            null_value = xml_elem.attributes["null_value"].value
            null_dict = {
                "attribute": self.test_attribute,
                "type": "E",
                "value": null_value
            }
            self.null_value_list.append(null_dict)
        #Information about the model
        self.test_type = "Cat"
        self.MAKE_ALL_PREDS = MAKE_ALL_PREDS

    def get_predictions(self, query_manager):
        """Train, predict, and score one block of rows.

        Returns a Test_result, or None when there is nothing to impute
        (no null targets and MAKE_ALL_PREDS is off).
        """
        #Filenames
        test_filename = "test" + str(int(time.time())) + ".arff"
        train_filename = "train" + str(int(time.time())) + ".arff"
        train_log = "train_log" + str(int(time.time())) + ".arff"
        result_filename = "results" + str(int(time.time())) + ".txt"
        #Creates (or clears) files that are used by the binary
        IS_NUM_TEST = False
        file_creation_info = test_file_creation(IS_NUM_TEST, self.using_pca,
                                                test_filename, train_filename,
                                                query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        cat_att_mapping = file_creation_info["cat_att_mapping"]
        #If there are no null values in the test set
        #And the run is only replacing null values then terminate if no null values
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0:
            os.remove(test_filename)
            os.remove(train_filename)
            return None
        #Running feature selection process if needed
        acc_est = {}
        if self.use_feature_selection:
            (test_filename, train_filename,
             selected_attributes) = feature_selection(test_filename,
                                                      train_filename,
                                                      query_manager,
                                                      file_creation_info,
                                                      self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes
        #Running tests
        model_name = "saved_model" + str(int(time.time()))
        path_spef_weka = os.path.join(path, "models", "weka.jar")
        path_spef_libsvm = os.path.join(path, "models", "libsvm.jar")
        train_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -d " + model_name + " " + self.test_options + " -t " + train_filename + " >> " + train_log
        test_string = "java -Xmx1024m -cp " + path_spef_weka + ":" + path_spef_libsvm + " " + self.test_classifier + " -l " + model_name + " -T " + test_filename + " -p 0 >> " + result_filename
        self.printOut.pLog("PRED- Training model")
        os.system(train_string)
        self.printOut.pLog("PRED- Making predictions")
        os.system(test_string)
        #Gathering results for each test instance
        self.printOut.pLog("PRED- Getting results")
        f = open(result_filename)
        prediction_list = []
        probability_list = []
        correctly_imputed = 0
        non_null_count = 0
        index = 0
        collect_results = False
        for line in f.readlines():
            line_list = line.split()
            #Getting results
            if collect_results and len(line_list) > 1:
                #Weka prints predictions as "<idx>:<label>"
                pred_pair = line_list[2].split(":")
                prediction = str(pred_pair[1])
                if not target_value_null[index] and prediction == str(
                        target_values[index]):
                    correctly_imputed += 1
                if not target_value_null[index]:
                    non_null_count += 1
                prediction_list.append(prediction)
                probability_list.append(1)
                index += 1
            #Seeing if you are at the results portion of the file
            if line.find("inst#") > -1:
                collect_results = True
        f.close()
        #Gathering accuracy estimations
        f = open(train_log)
        cross_val_info = False
        for line in f.readlines():
            #Getting all performance related metrics
            if cross_val_info:
                line = line.rstrip('\n')
                line = line.rstrip('\t')
                line = line.rstrip('\b')
                line = line.rstrip(' %')
                if line.find('Correctly Classified Instances'
                             ) > -1 or line.find('Kappa statistic') > -1:
                    metric_parts = line.split(' ')
                    if len(metric_parts) > 1:
                        attribute = metric_parts[0]
                        value = metric_parts[len(metric_parts) - 1]
                        value = float(value)
                        acc_est[attribute] = value
            #Finding cross validation info
            if line.find('Stratified cross-validation') > -1:
                cross_val_info = True
            elif line.find('Confusion Matrix') > -1:
                cross_val_info = False
        f.close()
        #Actual Performance Stats
        #FIX: guard against ZeroDivisionError when every test target is null
        #(possible when MAKE_ALL_PREDS is on)
        if non_null_count > 0:
            acc_est["Actual Correctly Imputed Percent"] = (
                float(correctly_imputed) / non_null_count) * 100
        else:
            acc_est["Actual Correctly Imputed Percent"] = 0.0
        #Removing files used for test
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(model_name)
        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num
        acc_est["block number"] = len(query_manager.used_blocks)
        return Test_result(self.test_type, self.test_attribute,
                           prediction_list, probability_list, acc_est)
def __init__(self, io_info_element, logCB=None, progressCB=None):
    """Configure the query manager from an XML <io_info> DOM element.

    Reads connection/table/column names, optional type-forcing lists,
    block sizes, and test criteria, then partitions the table's spatial
    extent into parcel blocks.

    Parameters:
        io_info_element -- DOM element (xml.dom.minidom style) carrying the
            configuration attributes and child tags read below.
        logCB, progressCB -- optional callbacks forwarded to PrintOutput
            for logging and progress reporting.
    """
    #For reporting results
    self.printOut = PrintOutput(logCB, progressCB, PROFILING)

    #Storing all the information passed as parameters to the query manager
    self.db_url = io_info_element.attributes["input_db_url"].value
    self.table_name = io_info_element.attributes["input_table_name"].value
    self.x_attribute = io_info_element.attributes["x_column"].value
    self.y_attribute = io_info_element.attributes["y_column"].value
    self.id_attribute = io_info_element.attributes["id_column"].value

    #Forcing certain attributes to be categorical
    self.fclass_atts = []
    if io_info_element.hasAttribute('force_to_class'):
        self.fclass_atts = util_get_attribute_list(
            io_info_element.attributes["force_to_class"].value)

    #Forcing certain attributes to be numerical
    #(FIX: removed a dead `elements = ...getElementsByTagName('force_to_numeric')`
    # assignment whose result was never used; the branch below keys off
    # hasAttribute, exactly as the force_to_class case above.)
    self.fnum_atts = []
    if io_info_element.hasAttribute('force_to_numeric'):
        self.fnum_atts = util_get_attribute_list(
            io_info_element.attributes["force_to_numeric"].value)

    #Size of blocks that will be created (defaults kept at 40000)
    self.train_size = 40000
    if io_info_element.hasAttribute("train_block_size"):
        self.train_size = int(
            io_info_element.attributes["train_block_size"].value)
    self.test_size = 40000
    if io_info_element.hasAttribute("test_block_size"):
        self.test_size = int(
            io_info_element.attributes["test_block_size"].value)

    #Getting access to the table
    self.table = util_get_table(self.db_url, self.table_name)

    #Getting all attributes from the table and what types they are
    (self.class_list, self.numeric_list,
     self.attributes) = util_get_attribute_info(self.table,
                                                self.fclass_atts,
                                                self.fnum_atts)

    #Used for the parcel query; True acts as a no-op WHERE clause
    self.query_string = True
    elements = io_info_element.getElementsByTagName('test_criteria')
    if len(elements) > 0:
        tc_elem = elements[0]
        self.query_string = self.util_create_query_string(tc_elem)

    #Used for extreme rows that are included in every test done
    self.ois_query_string = None
    elements = io_info_element.getElementsByTagName('outlier_inc_set')
    if len(elements) > 0:
        ois_elem = elements[0]
        if len(ois_elem.getElementsByTagName('or')) > 0:
            self.ois_query_string = self.util_create_query_string(ois_elem)

    #Getting x/y boundaries of the parcels and number of rows
    #(may want to find a faster way to do this)
    (self.x_max, self.y_max, self.x_min, self.y_min,
     self.total_count) = self.util_spatial_boundaries()
    self.rows_left = self.total_count

    #Information about the parcel blocks remaining and used
    self.printOut.pLog("RET- Creating all parcel blocks...")
    self.block_list = self.util_create_parcel_block(
        self.x_max, self.y_max, self.x_min, self.y_min)
    self.set_colors()
    self.used_blocks = []

    #In order to make sure max, min vals didn't leave any out
    #Can happen if x and y attributes are varchars in metadata
    self.adjust_borders()

    #Used for profiling the speed at which the program is running
    self.first_query_time = None
    self.number_rows_tested = 0
    self.table_current_test_rows = []

    #Parcel block information; group_max is the number of CV folds
    self.current_test_block = None
    self.current_training_block = None
    self.group_max = 2
    self.group_count = 2
    if io_info_element.hasAttribute('num_cv_folds'):
        self.group_max = int(
            io_info_element.attributes['num_cv_folds'].value)
        self.group_count = self.group_max
    self.overall_is_test_list = []
    self.use_as_training = []

    #Current rows retrieved
    self.current_rows = []
    self.is_test_list = []
    self.is_null_list = []
    self.test_number = []
class Query_manager:
    """Retrieves parcel rows in spatial blocks and splits them into
    cross-validation test/training sets.

    The table's spatial extent is recursively halved into Parcel_block
    rectangles of at most test_size rows; query_rows() then serves one
    block's rows per group cycle, growing the bounding box until at least
    train_size training rows surround the test block.
    """

    def __init__(self, io_info_element, logCB=None, progressCB=None):
        """Configure from an XML <io_info> DOM element and pre-build blocks."""
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        #Storing all the information passed as parameters to the query manager
        self.db_url = io_info_element.attributes["input_db_url"].value
        self.table_name = io_info_element.attributes["input_table_name"].value
        self.x_attribute = io_info_element.attributes["x_column"].value
        self.y_attribute = io_info_element.attributes["y_column"].value
        self.id_attribute = io_info_element.attributes["id_column"].value
        #Forcing certain attributes to be categorical
        self.fclass_atts = []
        if io_info_element.hasAttribute('force_to_class'):
            self.fclass_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_class"].value)
        #Forcing certain attributes to be numerical
        self.fnum_atts = []
        #NOTE(review): this lookup result is never used; the branch below
        #tests hasAttribute instead. Looks like dead code — verify.
        elements = io_info_element.getElementsByTagName('force_to_numeric')
        if io_info_element.hasAttribute('force_to_numeric'):
            self.fnum_atts = util_get_attribute_list(
                io_info_element.attributes["force_to_numeric"].value)
        #Size of blocks that will be created
        self.train_size = 40000
        if io_info_element.hasAttribute("train_block_size"):
            self.train_size = int(
                io_info_element.attributes["train_block_size"].value)
        self.test_size = 40000
        if io_info_element.hasAttribute("test_block_size"):
            self.test_size = int(
                io_info_element.attributes["test_block_size"].value)
        #Getting access to the table
        self.table = util_get_table(self.db_url, self.table_name)
        #Getting all attributes from the table
        #Getting what types of attributes they are
        (self.class_list, self.numeric_list,
         self.attributes) = util_get_attribute_info(self.table,
                                                    self.fclass_atts,
                                                    self.fnum_atts)
        #Used for the parcel query (True acts as a no-op WHERE clause)
        self.query_string = True
        elements = io_info_element.getElementsByTagName('test_criteria')
        if len(elements) > 0:
            tc_elem = elements[0]
            self.query_string = self.util_create_query_string(tc_elem)
        #Used for extreme rows that are included in every test done
        self.ois_query_string = None
        elements = io_info_element.getElementsByTagName('outlier_inc_set')
        if len(elements) > 0:
            ois_elem = elements[0]
            if len(ois_elem.getElementsByTagName('or')) > 0:
                self.ois_query_string = self.util_create_query_string(ois_elem)
        #Getting x/y boundaries of the parcels and number of rows
        #(may want to find a faster way to do this)
        (self.x_max, self.y_max, self.x_min, self.y_min,
         self.total_count) = self.util_spatial_boundaries()
        self.rows_left = self.total_count
        #Information that is being stored about the number of parcel blocks
        #remaining and used
        self.printOut.pLog("RET- Creating all parcel blocks...")
        self.block_list = self.util_create_parcel_block(
            self.x_max, self.y_max, self.x_min, self.y_min)
        self.set_colors()
        self.used_blocks = []
        #In order to make sure max, min vals didn't leave any out
        #Can happen if x and y attributes are varchars in metadata
        self.adjust_borders()
        #Used for profiling the speed at which the program is running
        self.first_query_time = None
        self.number_rows_tested = 0
        self.table_current_test_rows = []
        #Parcel block information; group_max is the number of CV folds
        self.current_test_block = None
        self.current_training_block = None
        self.group_max = 2
        self.group_count = 2
        if io_info_element.hasAttribute('num_cv_folds'):
            self.group_max = int(
                io_info_element.attributes['num_cv_folds'].value)
            self.group_count = self.group_max
        self.overall_is_test_list = []
        self.use_as_training = []
        #Current rows retrieved
        self.current_rows = []
        self.is_test_list = []
        self.is_null_list = []
        self.test_number = []

    #Gets rows that represent a block of parcels
    #Returns None if there aren't any rows left
    def query_rows(self):
        """Load the next parcel block (or the next CV group of the current one).

        When group_count has reached group_max, pops the next non-empty
        block, queries its (expanded) training rows, and rebuilds
        current_rows / is_test_list / test_number. Otherwise advances the
        group counter and re-splits the cached rows into this group's
        test and training subsets.
        """
        #FOR GUI
        self.printOut.progress(
            int((self.rows_left / float(self.total_count)) * 100))
        #Getting all new data loaded in data structures
        if self.group_count == self.group_max:
            #Reset the group count
            self.group_count = 1
            #Profiling (won't work if distributed) ############
            #Getting information about approximate time left
            if self.first_query_time == None:
                self.first_query_time = time.time()
            else:
                average_time = (time.time() - self.first_query_time) / (
                    self.number_rows_tested)
                self.printOut.pLog("PROFILE- Number of blocks remaining: " +
                                   str(len(self.block_list)))
                self.printOut.pLog("PROFILE- Average time per unit: " +
                                   str(average_time))
                self.printOut.pLog("PROFILE- Number of rows remaining: " +
                                   str(self.rows_left))
                self.printOut.pLog(
                    "PROFILE- Predicted remaining time (in minutes): " +
                    str(int((average_time * (self.rows_left)) / 60)))
            ####################################################
            self.printOut.pLog(
                "RET- Retrieving training and test parcels from remaining: " +
                str(self.rows_left))
            #Getting a block with a non zero parcel count
            block = None
            while self.block_list != [] and block == None:
                block = self.block_list.pop(0)
                if block.parcel_count == 0:
                    block = None
            if block != None:
                self.current_test_block = block
                training_rows_query = self.util_training_rows(block)
                start_time = int(time.time())
                #Getting the attribute values from the raw rows
                #Get rows into the proper format
                proper_rows = []
                id_set = set([])
                for row in training_rows_query:
                    temp_row = {}
                    for attribute in self.attributes:
                        temp_row[attribute] = row[attribute]
                    proper_rows.append(temp_row)
                    id_set.add(row[self.id_attribute])
                #Getting test and training rows (test is a subset of training)
                is_test_list = []
                test_number = []
                test_count = 0
                for index in range(len(proper_rows)):
                    row = proper_rows[index]
                    #REQUIRES X and Y attributes to be included in the rows
                    #(may be a problem)
                    if block.row_within_block(self.x_attribute,
                                              self.y_attribute, row):
                        is_test_list.append(True)
                        test_number.append(test_count)
                        test_count += 1
                    else:
                        is_test_list.append(False)
                        test_number.append(None)
                #Adjust block count (cause borders are modified in some cases)
                block.parcel_count = test_count
                self.used_blocks.append(block)
                self.rows_left -= block.parcel_count
                self.number_rows_tested += block.parcel_count
                #Adding the extreme values that need to be added to every data
                #set. This helps outlier detection and null value predictions
                if self.ois_query_string != None:
                    s = self.table.select(
                        and_(self.ois_query_string, self.query_string)).execute()
                    self.printOut.pLog("RET- Num extra (rare) rows added: " +
                                       str(s.rowcount))
                    for row in s:
                        if row[self.id_attribute] not in id_set:
                            temp_row = {}
                            for attribute in self.attributes:
                                temp_row[attribute] = row[attribute]
                            proper_rows.append(temp_row)
                            is_test_list.append(False)
                            test_number.append(None)
                self.current_rows = proper_rows
                self.is_test_list = is_test_list
                self.overall_is_test_list = copy.deepcopy(is_test_list)
                self.test_number = test_number
                end_time = int(time.time())
                self.printOut.pLog("RET- Time loading data structures: " +
                                   str(end_time - start_time))
            else:
                #No blocks left: clear all per-block state
                self.current_rows = []
                self.is_test_list = []
                self.test_number = []
                self.overall_is_test_list = []
                self.use_as_training = []
        #Increment group count
        else:
            self.group_count += 1
            #Use data that exists / loading temporary data structures
            self.is_test_list = []
            self.use_as_training = []
            test_num = 0
            test_count = 0
            train_count = 0
            #Going over every current row
            for index in range(len(self.current_rows)):
                #if ONE group then all in test
                if self.group_max == 1:
                    if self.overall_is_test_list[index]:
                        self.is_test_list.append(True)
                        test_count += 1
                    else:
                        self.is_test_list.append(False)
                        self.use_as_training.append(True)
                        train_count += 1
                #If more than one group then split up test and training sets
                else:
                    is_test = self.overall_is_test_list[index]
                    if is_test:
                        test_num += 1
                    #Deciding whether instance will be in the test set
                    #Splits test set up
                    #MISSING 4 VALUES IN SANITY CHECK
                    used_as_test = False
                    if is_test and test_num % self.group_max == (
                            self.group_count - 1):
                        self.is_test_list.append(True)
                        used_as_test = True
                        test_count += 1
                    else:
                        self.is_test_list.append(False)
                    #Deciding whether instance should be a training data set
                    #FIND INTELIGENT WAY TO STOP THE TRAINING SET FROM BEING
                    #TO LARGE
                    if not used_as_test:
                        train_count += 1
                    if not used_as_test:
                        self.use_as_training.append(True)
                    else:
                        self.use_as_training.append(False)
            self.printOut.pLog("RET- Group: " + str(self.group_count))
            self.printOut.pLog("RET- Test count: " + str(test_count))
            self.printOut.pLog("RET- Train count: " + str(train_count))

    #Returns the number of rows that are left to be retrieved
    def number_remaining_blocks(self):
        """Return 0 when no blocks and no CV groups remain, else 1."""
        if len(self.block_list) == 0 and self.group_count == self.group_max:
            return 0
        else:
            return 1

    #Used to setup basic query string
    def util_create_query_string(self, element):
        """Build a SQLAlchemy clause from nested <or>/<tc> criteria tags.

        Each <or> tag's <tc> children are OR-ed together; the resulting
        groups are AND-ed. Returns True (no-op clause) when the element
        holds no criteria.
        """
        #Getting dictionary of column objects
        #Creating and clauses for all columns in test criteria combined
        qs = True
        and_list = []
        for or_tag in element.getElementsByTagName('or'):
            #Creating or clauses for a given "or list"
            or_list = []
            for elem in or_tag.getElementsByTagName('tc'):
                attribute = elem.attributes['attribute'].value
                type = elem.attributes['type'].value
                value = elem.attributes['value'].value
                #Getting the right form of the value
                vt = elem.attributes['vt'].value
                if vt == "string":
                    value = str(value)
                elif vt == "int":
                    value = int(value)
                #Creating clause for criteria
                if type == "E":
                    or_list.append(self.table.c[attribute] == value)
                elif type == "NE":
                    or_list.append(self.table.c[attribute] != value)
                elif type == "GT":
                    or_list.append(self.table.c[attribute] > value)
                elif type == "LT":
                    or_list.append(self.table.c[attribute] < value)
            if len(or_list) > 0:
                and_list.append(or_(*or_list))
        #Only make the query string equal to the list if there is something
        #in the list
        if len(and_list) > 0:
            qs = and_(*and_list)
        return qs

    #util
    #keeps track of all parcel blocks
    class Parcel_block:
        """Axis-aligned rectangle of parcels with a cached row count."""

        def __init__(self, x_max, y_max, x_min, y_min, parcel_count):
            self.x_max = float(x_max)
            self.y_max = float(y_max)
            self.x_min = float(x_min)
            self.y_min = float(y_min)
            #NOTE(review): parcel_count may be an int or the string
            #"(training block)" (see util_training_rows) — confirm callers.
            self.parcel_count = parcel_count
            self.right_border = False
            self.bottom_border = False
            #For visual represantation
            self.color = None

        #Sets values for whether sides are borders
        def set_border_bools(self, query_manager):
            """Mark sides that coincide with the overall spatial extent."""
            if self.y_min == query_manager.y_min:
                self.bottom_border = True
            if self.x_max == query_manager.x_max:
                self.right_border = True

        def row_within_block(self, x_at, y_at, row):
            """True if row's (x, y) falls inside this block.

            There are strict equalties for the lower and right sides of the
            block UNLESS that side borders the edge of space.
            """
            xa = float(row[x_at])
            ya = float(row[y_at])
            rb = self.right_border
            bb = self.bottom_border
            if xa >= self.x_min and ((rb and xa <= self.x_max) or
                                     (not rb and xa < self.x_max)):
                if ya <= self.y_max and ((bb and ya >= self.y_min) or
                                         (not bb and ya > self.y_min)):
                    return True
            return False

    #util
    #Gets all the training rows (super set of test rows)
    def util_training_rows(self, block):
        """Execute a query for training rows around `block`.

        Grows the bounding box until it covers at least train_size rows
        (or the count stops changing 5 times in a row), then executes and
        returns the SQLAlchemy result. Returns [[], []] for an empty block.
        """
        (cx_max, cy_max, cx_min, cy_min) = [block.x_max, block.y_max,
                                            block.x_min, block.y_min]
        current_count = block.parcel_count
        if current_count == 0:
            return [[], []]
        else:
            self.printOut.pLog("RET- Current count inside training block: " +
                               str(current_count))
            #setting easy variables
            x = self.x_attribute
            y = self.y_attribute
            t = self.table
            #ROOM FOR IMPROVEMENT
            #Make it so that this doesn't terribly overshoot the training size
            count_repeated = 0
            last_count = 0
            select_stat = t.select(
                and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min,
                     t.c[y] <= cy_max, self.query_string))
            while (current_count < self.train_size):
                change = math.sqrt(
                    (self.train_size - current_count) /
                    float(max(self.train_size / 10, current_count)))
                cx_min -= (cx_max - cx_min) * change * .1
                cx_max += (cx_max - cx_min) * change * .1
                cy_min -= (cy_max - cy_min) * change * .1
                cy_max += (cy_max - cy_min) * change * .1
                select_stat = t.select(
                    and_(t.c[x] >= cx_min, t.c[x] <= cx_max, t.c[y] >= cy_min,
                         t.c[y] <= cy_max, self.query_string))
                #Getting the number of instances inside the block
                s = select([func.count("*")],
                           and_(t.c[x] >= cx_min, t.c[x] <= cx_max,
                                t.c[y] >= cy_min, t.c[y] <= cy_max,
                                self.query_string),
                           from_obj=[t]).execute()
                block_count = parcel_count = sql_get_agg(s, "int")
                last_count = current_count
                current_count = block_count
                self.printOut.pLog(
                    "RET- Current count inside training block: " +
                    str(current_count))
                #Protects against cases in which current_count will never be
                #bigger than train_size
                if last_count == current_count:
                    count_repeated += 1
                else:
                    count_repeated = 0
                if count_repeated == 5:
                    break
            #Executing the training query
            s = select_stat.execute()
            #Used for parcel visual
            self.current_training_block = self.Parcel_block(
                cx_max, cy_max, cx_min, cy_min, "(training block)")
            self.current_training_block.color = self.current_test_block.color
            return s

    #util
    #Seperates parcels in a grid type fashion and creates
    #spatial objects for each grid
    def util_create_parcel_block(self, tx_max, ty_max, tx_min, ty_min):
        """Recursively bisect the extent into blocks of <= test_size rows."""
        t = self.table
        x = self.x_attribute
        y = self.y_attribute
        #NEED TO BE IMPROVED
        #The inequalities should be made strict in a certain way in order to
        #insure nothings redundant
        s = select([func.count("*")],
                   and_(t.c[x] >= tx_min, t.c[x] <= tx_max, t.c[y] >= ty_min,
                        t.c[y] <= ty_max, self.query_string),
                   from_obj=[t]).execute()
        parcel_count = sql_get_agg(s, "int")
        #ROOM FOR IMPROVEMENT
        #Make it so that very small test blocks aren't created
        if parcel_count > self.test_size:
            x_mid = (tx_max - tx_min) / 2 + tx_min
            y_mid = (ty_max - ty_min) / 2 + ty_min
            temp_list = []
            #Always splits in such a way that the the resulting rectangles
            #are squarish
            x_diff = tx_max - tx_min
            y_diff = ty_max - ty_min
            if x_diff < y_diff:
                #Split horiz
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, ty_max, tx_min,
                                                  y_mid))
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, y_mid, tx_min,
                                                  ty_min))
            else:
                #Split vert
                temp_list.extend(
                    self.util_create_parcel_block(tx_max, ty_max, x_mid,
                                                  ty_min))
                temp_list.extend(
                    self.util_create_parcel_block(x_mid, ty_max, tx_min,
                                                  ty_min))
            return temp_list
        else:
            p = self.Parcel_block(tx_max, ty_max, tx_min, ty_min, parcel_count)
            self.printOut.pLog("RET- Block size: " + str(parcel_count))
            p.set_border_bools(self)
            return [p]

    #util
    #Returns the max and min x and y coordinate values
    def util_spatial_boundaries(self):
        """Query the table for [x_max, y_max, x_min, y_min, total_count]."""
        self.printOut.pLog(
            "RET- Finding spatial boundaries of the database...")
        t = self.table
        #Setting overall values
        (x_max, y_max, x_min, y_min) = [None, None, None, None]
        s = select([func.count("*")], self.query_string,
                   from_obj=[t]).execute()
        total_count = sql_get_agg(s, "int")
        s = select([func.max(t.c[self.x_attribute])]).execute()
        x_max = sql_get_agg(s, "float")
        s = select([func.min(t.c[self.x_attribute])]).execute()
        x_min = sql_get_agg(s, "float")
        s = select([func.max(t.c[self.y_attribute])]).execute()
        y_max = sql_get_agg(s, "float")
        s = select([func.min(t.c[self.y_attribute])]).execute()
        y_min = sql_get_agg(s, "float")
        return [x_max, y_max, x_min, y_min, total_count]

    #Creates a list that says whether each value is null or not
    def proc_is_null_list(self, test_object):
        """Flag each current row as null per test_object.null_value_list.

        A row is "null" when ANY criterion dict (keys: attribute, type,
        value; type in GT/LT/E/NE) matches its value. Result is stored in
        self.is_null_list.
        """
        self.printOut.pLog("RET- Test Attribute: " +
                           str(test_object.test_attribute))
        is_null_list = []
        for i in range(len(self.current_rows)):
            is_null = False
            for null_dict in test_object.null_value_list:
                value = null_dict["value"]
                type = null_dict["type"]
                row_value = self.current_rows[i][null_dict["attribute"]]
                if type == "GT" and row_value > value:
                    is_null = True
                    break
                elif type == "LT" and row_value < value:
                    is_null = True
                    break
                elif type == "E" and row_value == value:
                    is_null = True
                    break
                elif type == "NE" and row_value != value:
                    is_null = True
                    break
            is_null_list.append(is_null)
        self.is_null_list = is_null_list
        self.printOut.pLog("RET- Found " + str(is_null_list.count(True)) +
                           " null labels in whole training blocks")

    #makes it so class and num attribute lists only represent attributes
    #being used in tests
    def update_att_lists(self, model_list):
        """Shrink class_list/numeric_list to attributes used by the models."""
        new_class_list = []
        new_num_list = []
        self.printOut.pLog(
            "RET- Checking that all needed attributes are in the table.")
        #finding all attributes being used
        for model in model_list:
            for attribute in model.attributes:
                if attribute in self.class_list:
                    new_class_list.append(attribute)
                elif attribute in self.numeric_list:
                    new_num_list.append(attribute)
                else:
                    self.printOut.pLog("ERROR: Attribute Not in table- " +
                                       attribute)
            #Make sure the target attribute is included
            if model.test_attribute in self.class_list:
                new_class_list.append(model.test_attribute)
            elif model.test_attribute in self.numeric_list:
                new_num_list.append(model.test_attribute)
            elif model.test_attribute != None:
                #NOTE(review): pLog is given two positional args here while
                #every other call concatenates one string — likely meant
                #"+ model.test_attribute"; verify pLog's signature.
                self.printOut.pLog("ERROR: Target Attribute Not in Table- ",
                                   model.test_attribute)
        self.printOut.pLog("")
        self.class_list = new_class_list
        self.numeric_list = new_num_list

    #Finds color for each block
    #(Uses modified color scheme)
    def set_colors(self):
        """Greedily 4-color the blocks so neighbors differ where possible."""
        color_list = ["red", "blue", "green", "yellow"]
        #Recording all touching blocks
        blocks_touching = {}
        for block in self.block_list:
            blocks_touching[block] = set([])
            #Checking which blocks are touching
            for ob in self.block_list:
                #left or right
                if block.x_min == ob.x_max or block.x_max == ob.x_min:
                    if not block.y_max <= ob.y_min and not block.y_min >= ob.y_max:
                        blocks_touching[block].add(ob)
                #top or bottom
                elif block.y_min == ob.y_max or block.y_max == ob.y_min:
                    if not block.x_max <= ob.x_min and not block.x_min >= ob.x_max:
                        blocks_touching[block].add(ob)
        #Randomly coloring blocks
        #but making sure as many conflicts can be avoided as possible
        conflict_count = 0
        for block in self.block_list:
            available_colors = copy.deepcopy(color_list)
            for nb in blocks_touching[block]:
                if nb.color in available_colors:
                    available_colors.remove(nb.color)
            if len(available_colors) > 0:
                #Picking a color that a neighbor doesn't have
                index = random.randint(0, len(available_colors) - 1)
                block.color = available_colors[index]
            else:
                #Picking a random color
                index = random.randint(0, len(color_list) - 1)
                block.color = color_list[index]
                conflict_count += 1
        self.printOut.pLog("RET- Color conflicts: " + str(conflict_count))

    #For cases in which location variables are strings (in the database)
    def adjust_borders(self):
        """Round the outer extent outward and update blocks that touch it."""
        new_x_max = round_up(self.x_max)
        new_x_min = round_down(self.x_min)
        new_y_max = round_up(self.y_max)
        new_y_min = round_down(self.y_min)
        for block in self.block_list:
            if block.x_max == self.x_max:
                block.x_max = new_x_max
            if block.y_max == self.y_max:
                block.y_max = new_y_max
            if block.x_min == self.x_min:
                block.x_min = new_x_min
            if block.y_min == self.y_min:
                block.y_min = new_y_min
        self.x_max = new_x_max
        self.y_max = new_y_max
        self.x_min = new_x_min
        self.y_min = new_y_min
class Data_profiler:
    """Converts the query manager's raw rows into numeric label/sample
    vectors (dict-of-index -> value) for the SVM and KNN models."""

    def __init__(self, query_manager, logCB=None, progressCB=None):
        #For reporting results
        self.printOut = PrintOutput(logCB, progressCB, PROFILING)
        self.query_manager = query_manager
        #Profile of information currently being dealt with
        self.class_result_dict = None
        self.class_att_value_weight = None
        self.numeric_result_dict = None
        self.get_possible_values(query_manager)
        #Used by SVM_model to piece together results
        self.label_id_lookup_table = None
        #Current data being stored
        self.labels = []
        self.samples = []
        self.is_null_list = []
        #Used by KNN
        self.log_trans_atts = set([])
        self.attribute_id_list = []
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}

    #util function
    #creates dictionary of possible values for each column in the table
    def get_possible_values(self, query_manager):
        """Profile current_rows: distinct values + frequency weights for
        class columns, [min, max] for numeric columns.

        Results are stored in self.class_result_dict,
        self.class_att_value_weight and self.numeric_result_dict.
        """
        #Getting info from the query manager
        class_list = query_manager.class_list
        numeric_list = query_manager.numeric_list
        rows = query_manager.current_rows
        start_time = int(time.time())
        self.printOut.pLog("PREP- Class columns count: " +
                           str(len(class_list)))
        self.printOut.pLog("PREP- Num columns count: " +
                           str(len(numeric_list)))
        #Initializing data structures for storing info
        class_result_dict = {}
        class_att_value_count = {}
        numeric_result_dict = {}
        for c in class_list:
            class_result_dict[c] = []
            class_att_value_count[c] = {}
        for c in numeric_list:
            numeric_result_dict[c] = [None, None]
        #Finding all possible values for each column
        for row in rows:
            #gathering class info
            for c_name, list in class_result_dict.iteritems():
                if c_name in row:
                    value = row[c_name]
                    #Getting information on class attribute values
                    if value not in list:
                        list.append(value)
                        #May need to worry about the value being Null
                        class_att_value_count[c_name][value] = 1
                    else:
                        class_att_value_count[c_name][value] += 1
            #gathering numeric info
            for c_name, list in numeric_result_dict.iteritems():
                if c_name in row:
                    value = row[c_name]
                    if value == "" or value == None:
                        #May want to think of a more appropriate value
                        value = 0
                    else:
                        value = float(value)
                    #finding min
                    if value != "" and (list[0] > value or list[0] == None):
                        list[0] = value
                    #finding max
                    if value != "" and (list[1] < value or list[1] == None):
                        list[1] = value
        #Deciding on the weight based on the count
        class_att_value_weight = {}
        for att_name, values in class_att_value_count.iteritems():
            #Finding total number of values
            overall_count = 0
            for value, count in values.iteritems():
                overall_count += count
            #Setting weights
            class_att_value_weight[att_name] = {}
            for value, count in values.iteritems():
                #NOTE(review): count / overall_count is integer division
                #under Python 2, so this weight is 0.0 unless count ==
                #overall_count; float(count) / overall_count was likely
                #intended — verify.
                class_att_value_weight[att_name][value] = float(count /
                                                                overall_count)
        self.numeric_result_dict = numeric_result_dict
        self.class_result_dict = class_result_dict
        self.class_att_value_weight = class_att_value_weight
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time getting values: " +
                           str(end_time - start_time))

    #Prepares the data that will be used with SVM or KNN
    def load_data_structures(self, target, attributes):
        """Build self.labels and self.samples for the given target column.

        Class attributes become one-hot dimensions weighted by value
        frequency; numeric attributes are scaled to [0, 1] (log-transformed
        first when listed in self.log_trans_atts). Also rebuilds the
        id <-> attribute mapping used by KNN.
        """
        start_time = int(time.time())
        #Getting list of columns, but making sure the columns are actually
        #there
        column_list_local = []
        for attribute in attributes:
            if attribute in self.class_result_dict or attribute in self.numeric_result_dict:
                column_list_local.append(attribute)
        (class_lookup_table,
         id_lookup_table) = self.util_get_class_lookup_table()
        #Used by SVM to piece together test results
        self.label_id_lookup_table = id_lookup_table
        #Used by SVM to piece together test results
        self.class_lookup_table = class_lookup_table
        labels = []
        samples = []
        #Getting all needed information from all rows
        for j in range(len(self.query_manager.current_rows)):
            row = self.query_manager.current_rows[j]
            #for class target attributes
            value = row[target]
            if target in class_lookup_table:
                labels.append(class_lookup_table[target][value])
            #for numeric target attributes (might want to scale label)
            else:
                if value == "" or value == None:
                    value = -1
                labels.append(int(value))  #CHANGE: SHOULD BE FLOAT
            #getting sample data
            index = 0
            sample = {}
            for attribute in column_list_local:
                if attribute in row:
                    if attribute in class_lookup_table:
                        #One-hot block: index advances once per possible
                        #value; only the matching position gets a weight
                        value = row[attribute]
                        attribute_value = class_lookup_table[attribute][value]
                        for i in range(len(self.class_result_dict[attribute])):
                            if attribute_value == i:
                                #sample[index] = 0.5 #1
                                #MAKE IT ONLY FOR LDOF
                                sample[index] = self.class_att_value_weight[
                                    attribute][value]
                            index += 1
                    elif attribute in self.numeric_result_dict:
                        value = row[attribute]
                        if value == "" or value == None:
                            value = 0
                        scaled = 0
                        max = self.numeric_result_dict[attribute][1]
                        min = self.numeric_result_dict[attribute][0]
                        #Transforming specified columns
                        if attribute in self.log_trans_atts:
                            value = math.log(value + 1)
                            #Scaling values
                            denominator = math.log(max + 1) - math.log(min + 1)
                            if denominator > 0:
                                #Scaling all attributes between 0 and 1
                                numerator = float(value) - math.log(min + 1)
                                scaled = numerator / denominator
                        #Non transformed columns
                        else:
                            #Scaling values
                            denominator = max - min
                            if denominator > 0:
                                #Scaling all attributes between 0 and 1
                                numerator = float(value) - min
                                scaled = numerator / denominator
                        sample[index] = scaled
                        index += 1
            samples.append(sample)
        #Used for KNN
        self.printOut.pLog("PREP- Dimension / column mapping")
        self.attribute_id_dict = {}
        self.id_attribute_dict = {}
        self.attribute_id_list = []
        id = 0
        for attribute in column_list_local:
            self.printOut.pLog("PREP- ID: " + str(id) + ", Attribute: " +
                               attribute)
            if attribute in class_lookup_table:
                #NOTE(review): id is incremented once per ATTRIBUTE below,
                #while sample indices above advance once per class VALUE —
                #these mappings appear inconsistent for class attributes;
                #verify against the KNN consumer.
                for att_value in class_lookup_table[attribute].keys():
                    self.attribute_id_list.append(id)
                    self.attribute_id_dict[id] = attribute
                    self.id_attribute_dict[attribute] = id
            else:
                self.attribute_id_dict[id] = attribute
                self.id_attribute_dict[attribute] = id
                self.attribute_id_list.append(id)
            id += 1
        #Setting values for object variables / lists
        self.labels = labels
        self.samples = samples
        end_time = int(time.time())
        self.printOut.pLog("PREP- Time loading data structures: " +
                           str(end_time - start_time))

    #Setting up all the data for the test
    def prepare_test_data(self, test_object):
        """Load label/sample structures for test_object and flag null rows."""
        #Making the data in the data profiler formatted correctly for the
        #Given test attribute and attributes used for the test
        self.load_data_structures(test_object.test_attribute,
                                  test_object.attributes)
        #Setting transformation information in data profiler
        #only do this for KNN attributes
        if test_object.test_type == "KNN":
            self.log_trans_atts = set(test_object.log_trans_atts)
        #Processing information about which rows are null
        self.query_manager.proc_is_null_list(test_object)

    #util function
    def util_get_class_lookup_table(self):
        """Return [value->index table, index->value table] per class column."""
        class_lookup_table = {}
        id_lookup_table = {}
        for class_name, values in self.class_result_dict.iteritems():
            index = 0
            class_temp = {}
            id_temp = {}
            for value in values:
                class_temp[value] = index
                id_temp[index] = value
                index += 1
            class_lookup_table[class_name] = class_temp
            id_lookup_table[class_name] = id_temp
        return [class_lookup_table, id_lookup_table]