    def get_predictions(self, query_manager):

        #Filenames share a single timestamp so every temp file gets the same suffix
        timestamp = str(int(time.time()))
        test_filename = "test" + timestamp + ".arff"
        train_filename = "train" + timestamp + ".arff"
        train_log = "train_log" + timestamp + ".txt"
        result_filename = "results" + timestamp + ".txt"

        #Create (or clear) the files consumed by the Weka run
        IS_NUM_TEST = False
        file_creation_info = test_file_creation(
            IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]
        cat_att_mapping = file_creation_info["cat_att_mapping"]
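
        #Keys inferred from how file_creation_info is used below
        #(test_file_creation itself is defined elsewhere):
        #  target_values     - actual target value for each test row
        #  target_value_null - parallel booleans, True where the value is null
        #  attribute_indexes / cat_att_mapping - attribute bookkeeping for the ARFF files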

        #If the run only replaces null values and the test set contains
        #no nulls, there is nothing to predict: clean up and stop early
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0:
            os.remove(test_filename)
            os.remove(train_filename)
            return None

        #Running the feature selection process if needed
        acc_est = {}
        if self.use_feature_selection:
            (test_filename, train_filename, selected_attributes) = feature_selection(
                test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes

        #Building the Weka train/test shell commands; os.pathsep keeps the
        #classpath separator portable across platforms
        model_name = "saved_model" + timestamp
        path_spef_weka = os.path.join(path, "models", "weka.jar")
        path_spef_libsvm = os.path.join(path, "models", "libsvm.jar")
        classpath = path_spef_weka + os.pathsep + path_spef_libsvm
        train_string = ("java -Xmx1024m -cp " + classpath + " " + self.test_classifier
                        + " -d " + model_name + " " + self.test_options
                        + " -t " + train_filename + " >> " + train_log)
        test_string = ("java -Xmx1024m -cp " + classpath + " " + self.test_classifier
                       + " -l " + model_name + " -T " + test_filename
                       + " -p 0 >> " + result_filename)
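
        #Weka command-line flags used here (standard weka.classifiers options):
        #  -t <arff>  train on this file (evaluation report goes to stdout)
        #  -d <file>  serialize the trained model to disk
        #  -l <file>  load a previously serialized model
        #  -T <arff>  predict on this test file
        #  -p 0       print one prediction row per test instance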

        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
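
        #Note: os.system ignores exit codes, so a failed java call just leaves
        #an empty log for the parsing below. A sketch of a stricter variant
        #(stdlib subprocess; the argv list shown is a hypothetical rewrite of
        #train_string, not wired in here):
        #    import subprocess
        #    with open(train_log, "a") as log:
        #        subprocess.check_call(
        #            ["java", "-Xmx1024m", "-cp", classpath, self.test_classifier,
        #             "-d", model_name] + self.test_options.split() + ["-t", train_filename],
        #            stdout=log)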
                    
        #Gathering results for each test instance. After the header line
        #containing "inst#", each "-p 0" row prints the actual and predicted
        #class as "<class index>:<label>"
        self.printOut.pLog("PRED- Getting results")

        prediction_list = []
        probability_list = []

        correctly_imputed = 0
        non_null_count = 0

        index = 0
        collect_results = False
        with open(result_filename) as f:
            for line in f:
                line_list = line.split()

                #Getting results
                if collect_results and len(line_list) > 1:
                    #Keep only the label half of "index:label"
                    predicted_pair = line_list[2].split(":")
                    prediction = predicted_pair[1]

                    if not target_value_null[index] and prediction == str(target_values[index]):
                        correctly_imputed += 1
                    if not target_value_null[index]:
                        non_null_count += 1

                    prediction_list.append(prediction)
                    probability_list.append(1)
                    index += 1

                #Seeing if you are at the results portion of the file
                if "inst#" in line:
                    collect_results = True
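
        #The ">> train_log" redirect captured Weka's evaluation report; the
        #metrics we want sit between the "=== Stratified cross-validation ==="
        #and "=== Confusion Matrix ===" section headers.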
        
        #Gathering accuracy estimations
        cross_val_info = False
        with open(train_log) as f:
            for line in f:
                #Getting all performance related metrics
                if cross_val_info:
                    line = line.rstrip('\n\t\b %')

                    if 'Correctly Classified Instances' in line or 'Kappa statistic' in line:
                        fields = line.split('  ')
                        if len(fields) > 1:
                            attribute = fields[0]
                            acc_est[attribute] = float(fields[-1])

                #Finding cross validation info
                if 'Stratified cross-validation' in line:
                    cross_val_info = True
                elif 'Confusion Matrix' in line:
                    cross_val_info = False
        
        #Actual Performance Stats (guarded: a block where every target value
        #is null would otherwise divide by zero)
        if non_null_count > 0:
            acc_est["Actual Correctly Imputed Percent"] = (float(correctly_imputed) / non_null_count) * 100
        
        #Removing files used for test
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(model_name)

        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num
        acc_est["block number"] = len(query_manager.used_blocks)

        return Test_result(self.test_type, self.test_attribute, prediction_list, probability_list, acc_est)
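
    #Numeric-target variant of the routine above: the same train/predict
    #pipeline, but predictions are parsed as floats and scored by absolute
    #and relative error instead of an imputation accuracy percentage.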

    def get_predictions(self, query_manager):

        #Filenames share a single timestamp so every temp file gets the same suffix
        timestamp = str(int(time.time()))
        test_filename = "test" + timestamp + ".arff"
        train_filename = "train" + timestamp + ".arff"
        train_log = "train_log" + timestamp + ".txt"
        result_filename = "results" + timestamp + ".txt"

        #Create (or clear) the files consumed by the Weka run
        IS_NUM_TEST = True
        file_creation_info = test_file_creation(
            IS_NUM_TEST, self.using_pca, test_filename, train_filename, query_manager, self)
        target_values = file_creation_info["target_values"]
        target_value_null = file_creation_info["target_value_null"]
        attribute_indexes = file_creation_info["attribute_indexes"]

        #If the run only replaces null values and the test set contains
        #no nulls, there is nothing to predict: clean up and stop early
        if not self.MAKE_ALL_PREDS and target_value_null.count(True) == 0:
            os.remove(test_filename)
            os.remove(train_filename)
            return None

        #Running the feature selection process if needed
        acc_est = {}
        if self.use_feature_selection:
            (test_filename, train_filename, selected_attributes) = feature_selection(
                test_filename, train_filename, query_manager, file_creation_info, self, IS_NUM_TEST)
            acc_est["selected attributes"] = selected_attributes

        #Building the Weka train/test shell commands (numeric classifier,
        #so only weka.jar is needed on the classpath)
        model_name = "saved_model" + timestamp
        path_spef_weka = os.path.join(path, "models", "weka.jar")
        train_string = ("java -Xmx1024m -cp " + path_spef_weka + " " + self.test_classifier
                        + " -d " + model_name + " " + self.test_options
                        + " -t " + train_filename + " >> " + train_log)
        test_string = ("java -Xmx1024m -cp " + path_spef_weka + " " + self.test_classifier
                       + " -l " + model_name + " -T " + test_filename
                       + " -p 0 >> " + result_filename)

        self.printOut.pLog( "PRED- Training model")
        os.system(train_string)
        self.printOut.pLog( "PRED- Making predictions")
        os.system(test_string)
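
        #With "-p 0" and a numeric class attribute, the rows following the
        #"inst#" header are plain numbers (inst#, actual, predicted, error),
        #so line_list[2] in the loop below is the predicted value.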
                    
        #Gathering results for each test instance
        self.printOut.pLog("PRED- Getting results")

        prediction_list = []
        confidence_list = []

        #For stat keeping
        absolute_diff_list = []
        relative_diff_list = []

        index = 0
        collect_results = False
        with open(result_filename) as f:
            for line in f:
                line_list = line.split()

                #Getting results
                if collect_results and len(line_list) > 1:
                    prediction = float(line_list[2])
                    prediction_list.append(prediction)
                    confidence_list.append(0.0)

                    #Difference between predicted and actual values,
                    #for non-null targets only
                    if not target_value_null[index]:
                        actual = float(target_values[index])
                        diff = math.fabs(actual - prediction)

                        absolute_diff_list.append(diff)
                        #Relative error only makes sense for positive actuals;
                        #skip the rest rather than recording a -1 sentinel that
                        #would skew the mean/std computed below
                        if actual > 0:
                            relative_diff_list.append(diff / actual)
                    index += 1

                #Seeing if you are at the results portion of the file
                if "inst#" in line:
                    collect_results = True
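
        #The metrics section of the numeric log starts at the "Cross-validation"
        #header. IBk-style classifiers also describe themselves as
        #"IB1 instance-based classifier using <k> nearest neighbour(s) ...",
        #which is where the chosen k value is picked up below.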

        #Gathering accuracy estimations
        cross_val_info = False
        get_k_value = False
        with open(train_log) as f:
            for line in f:

                #Getting all performance related metrics
                if cross_val_info:
                    line = line.rstrip('\n\t\b %')

                    fields = line.split('  ')
                    if len(fields) > 1:
                        attribute = fields[0]
                        try:
                            acc_est[attribute] = float(fields[-1])
                        except ValueError:
                            #Skip report lines whose last column is not numeric
                            pass

                #Getting parameter search results
                if get_k_value and 'using' in line:
                    parts = line.split(' ')
                    acc_est["1 Parameter: k value"] = int(parts[1])
                    get_k_value = False

                #Finding cross validation info
                if 'Cross-validation' in line:
                    cross_val_info = True

                #Finding k value info
                if 'IB1 instance-based classifier' in line:
                    get_k_value = True

        #Adding actual performance statistics (guarded: the lists are empty
        #when every target value in the block is null)
        if absolute_diff_list:
            absolute_diff_array = numpy.array(absolute_diff_list)
            acc_est["2 On test data: mean absolute diff"] = numpy.mean(absolute_diff_array)
            acc_est["2 On test data: std absolute diff"] = numpy.std(absolute_diff_array)
        if relative_diff_list:
            relative_diff_array = numpy.array(relative_diff_list)
            acc_est["2 On test data: mean relative diff"] = numpy.mean(relative_diff_array)
            acc_est["2 On test data: std relative diff"] = numpy.std(relative_diff_array)

        #Add number of test instances to the accuracy estimation
        current_test_num = query_manager.current_test_block.parcel_count
        acc_est["test instance count"] = current_test_num / query_manager.group_max
        acc_est["block number"] = (len(query_manager.used_blocks) - 1) * query_manager.group_max + query_manager.group_count

        #Removing files used for the test
        os.remove(test_filename)
        os.remove(train_filename)
        os.remove(train_log)
        os.remove(result_filename)
        os.remove(model_name)

        return Test_result("Num", self.test_attribute, prediction_list, confidence_list, acc_est)