def _audit_worker(params):
    global shared_all
    global shared_train
    global shared_test

    (model_or_factory, headers, ignored_features, feature_to_repair,
     repair_level, output_file, kdd, dump_all) = params

    SAVE_REPAIRED_DATA = True if dump_all else False
    SAVE_PREDICTION_DETAILS = True if dump_all else False

    index_to_repair = headers.index(feature_to_repair)
    repairer = Repairer(shared_all, index_to_repair, repair_level, kdd,
                        features_to_ignore=ignored_features)

    # Build a model on repaired training data if specified.
    if isinstance(model_or_factory, AbstractModelFactory):
        rep_train = repairer.repair(shared_train)
        model = model_or_factory.build(rep_train)

        # Log that this specific model was used for this repair level.
        with open(output_file + ".models.names.txt", "a") as f:
            f.write("{}: {}\n".format(repair_level, model.model_name))

        # Save the repaired version of the training data if specified.
        if SAVE_REPAIRED_DATA:
            with open(output_file + ".train.repaired_{}.data".format(repair_level), "w") as f:
                writer = csv.writer(f)
                for row in [headers] + rep_train:
                    writer.writerow(row)
    else:
        model = model_or_factory

    rep_test = repairer.repair(shared_test)
    test_name = "{}_{}".format(index_to_repair, repair_level)
    pred_tuples = model.test(rep_test, test_name=test_name)
    conf_table = get_conf_matrix(pred_tuples)

    # Save the repaired version of the test data if specified.
    if SAVE_REPAIRED_DATA:
        with open(output_file + ".test.repaired_{}.data".format(repair_level), "w") as f:
            writer = csv.writer(f)
            for row in [headers] + rep_test:
                writer.writerow(row)

    repaired = output_file + ".test.repaired_{}.data".format(repair_level)

    # Save the prediction tuples and the original values of the repaired feature.
    if SAVE_PREDICTION_DETAILS:
        with open(output_file + ".repaired_{}.predictions".format(repair_level), "w") as f:
            writer = csv.writer(f)
            file_headers = ["Pre-Repaired Feature", "Response", "Prediction"]
            writer.writerow(file_headers)
            for i, orig_row in enumerate(shared_test):
                row = [orig_row[index_to_repair], pred_tuples[i][0], pred_tuples[i][1]]
                writer.writerow(row)

    del repairer
    del rep_test
    gc.collect()

    return repaired, (repair_level, conf_table)
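# Illustrative only: a sketch of how the `params` tuple consumed by
# _audit_worker above might be assembled. The helper name and the example
# values are hypothetical; only the order and meaning of the fields are
# taken from the unpacking at the top of _audit_worker.
def _example_audit_worker_params(model_or_factory, headers):
    feature_to_repair = headers[0]           # header name of the feature being audited
    ignored_features = []                    # column indexes the Repairer should leave untouched
    repair_level = 0.5                       # degree of repair, typically in [0, 1]
    output_file = "audit_output/feature_0"   # prefix for the files written by the worker
    kdd = False                              # Repairer mode flag
    dump_all = False                         # also write repaired data and prediction details
    return (model_or_factory, headers, ignored_features, feature_to_repair,
            repair_level, output_file, kdd, dump_all)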
def __call__(self, data, output_dir=None, dump_all=False, features_to_audit=None):
    start_time = datetime.now()

    headers, train_set, test_set, response_header, features_to_ignore, correct_types = data
    self._audits_data = {
        "headers": headers,
        "train": train_set,
        "test": test_set,
        "response": response_header,
        "ignore": features_to_ignore,
        "types": correct_types,
        "full_audit": True if features_to_audit is None else False
    }

    if self.trained_model is None:
        """
        ModelFactories require a `build` method that accepts some training data
        with which to train a brand new model. This `build` method should output
        a Model object that has a `test` method -- which, when given test data
        in the same format as the training data, yields a confusion table
        detailing the correct and incorrect predictions of the model.
        """
        all_data = train_set + test_set
        model_factory = self.ModelFactory(all_data, headers, response_header,
                                          features_to_ignore=features_to_ignore,
                                          options=self.model_options)

    if self.trained_model is not None:
        model_or_factory = self.trained_model
    elif not self.RETRAIN_MODEL_PER_REPAIR:
        vprint("Training initial model.", self.verbose)
        model = model_factory.build(train_set)

        # Check the quality of the initial model on verbose runs.
        if self.verbose:
            print("Calculating original model statistics on test data:")
            print("\tTraining Set:")
            train_pred_tuples = model.test(train_set)
            train_conf_matrix = get_conf_matrix(train_pred_tuples)
            print("\t\tConf-Matrix:", train_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__, measurer(train_conf_matrix)))

            print("\tTesting Set:")
            test_pred_tuples = model.test(test_set)
            test_conf_matrix = get_conf_matrix(test_pred_tuples)
            print("\t\tConf-Matrix:", test_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__, measurer(test_conf_matrix)))

        model_or_factory = model
    else:
        model_or_factory = model_factory

    # Translate the headers into indexes for the auditor.
    audit_indices_to_ignore = [headers.index(f) for f in features_to_ignore]

    # Don't audit the response feature.
    audit_indices_to_ignore.append(headers.index(response_header))

    # Prepare the auditor.
    auditor = GradientFeatureAuditor(model_or_factory, headers, train_set, test_set,
                                     repair_steps=self.REPAIR_STEPS, kdd=self.kdd,
                                     features_to_ignore=audit_indices_to_ignore,
                                     features_to_audit=features_to_audit,
                                     output_dir=output_dir, dump_all=dump_all)

    # Perform the Gradient Feature Audit and dump the audit results into files.
    audit_filenames = auditor.audit(verbose=self.verbose)

    # Retrieve the repaired data from the audit.
    self._audits_data["rep_test"] = auditor._rep_test

    ranked_features = []
    for measurer in self.measurers:
        vprint("Ranking audit files by {}.".format(measurer.__name__), self.verbose)
        #ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR, measurer.__name__)
        ranks = rank_audit_files(audit_filenames, measurer)
        vprint("\t{}".format(ranks), self.verbose)
        ranked_features.append((measurer, ranks))

    end_time = datetime.now()

    # Store a summary of this experiment.
    model_id = model_factory.factory_name if self.trained_model is None else "Pretrained"
    model_name = model_factory.verbose_factory_name if self.trained_model is None else "Pretrained"
    summary = [
        "Audit Start Time: {}".format(start_time),
        "Audit End Time: {}".format(end_time),
        "Retrained Per Repair: {}".format(self.RETRAIN_MODEL_PER_REPAIR),
        "Model Factory ID: {}".format(model_id),
        "Model Type: {}".format(model_name),
        "Non-standard Model Options: {}".format(self.model_options),
        "Train Size: {}".format(len(train_set)),
        "Test Size: {}".format(len(test_set)),
        "Non-standard Ignored Features: {}".format(features_to_ignore),
        "Features: {}\n".format(headers)
    ]

    # Print the summary.
    for line in summary:
        print(line)

    for ranker, ranks in ranked_features:
        print("Ranked Features by {}: {}".format(ranker.__name__, ranks))
        groups = group_audit_ranks(audit_filenames, ranker)
        print("\tApprox. Trend Groups: {}\n".format(groups))
        if ranker.__name__ == "accuracy":
            self._audits_data["ranks"] = ranks

    # Dump all experiment results if requested.
    if dump_all:
        vprint("Dumping original training data.", self.verbose)

        # Dump the training data to the log.
        train_dump = "{}/original_train_data".format(auditor.OUTPUT_DIR)
        with open(train_dump + ".csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            for row in train_set:
                writer.writerow(row)

        if self.WRITE_ORIGINAL_PREDICTIONS:
            # Dump the predictions on the training data
            # (relies on the prediction tuples computed during the verbose model check above).
            with open(train_dump + ".predictions", "w") as f:
                writer = csv.writer(f)
                file_headers = ["Response", "Prediction"]
                writer.writerow(file_headers)
                for response, guess in train_pred_tuples:
                    writer.writerow([response, guess])

        vprint("Dumping original testing data.", self.verbose)

        # Dump the test data to the log.
        test_dump = "{}/original_test_data".format(auditor.OUTPUT_DIR)
        with open(test_dump + ".csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            for row in test_set:
                writer.writerow(row)

        if self.WRITE_ORIGINAL_PREDICTIONS:
            # Dump the predictions on the test data.
            with open(test_dump + ".predictions", "w") as f:
                writer = csv.writer(f)
                file_headers = ["Response", "Prediction"]
                writer.writerow(file_headers)
                for response, guess in test_pred_tuples:
                    writer.writerow([response, guess])

        # Graph the audit files.
        vprint("Graphing audit files.", self.verbose)
        for audit_filename in audit_filenames:
            audit_image_filename = audit_filename + ".png"
            graph_audit(audit_filename, self.measurers, audit_image_filename)

        # Store a graph of how many predictions change as features are repaired.
        vprint("Graphing prediction changes throughout repair.", self.verbose)
        output_image = auditor.OUTPUT_DIR + "/similarity_to_original_predictions.png"
        graph_prediction_consistency(auditor.OUTPUT_DIR, output_image)

        for measurer in self.measurers:
            ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR, measurer.__name__)
            graph_audits(audit_filenames, measurer, ranked_graph_filename)

        # Store a summary of this experiment to file.
        summary_file = "{}/summary.txt".format(auditor.OUTPUT_DIR)
        with open(summary_file, "w") as f:
            for line in summary:
                f.write(line + "\n")
            for ranker, ranks in ranked_features:
                f.write("Ranked Features by {}: {}\n".format(ranker.__name__, ranks))
                groups = group_audit_ranks(audit_filenames, ranker)
                f.write("\tApprox. Trend Groups: {}\n".format(groups))

        vprint("Summary file written to: {}\n".format(summary_file), self.verbose)
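# Illustrative only (hypothetical values): a sketch of the `data` tuple that
# __call__ above unpacks as (headers, train_set, test_set, response_header,
# features_to_ignore, correct_types). It is not used by the auditor itself.
def _example_audit_call_data():
    headers = ["age", "income", "outcome"]
    train_set = [[23, 40000, "good"], [51, 82000, "bad"]]
    test_set = [[37, 61000, "good"]]
    response_header = "outcome"
    features_to_ignore = ["income"]   # header names; __call__ translates them to indexes
    correct_types = [int, int, str]
    return (headers, train_set, test_set, response_header,
            features_to_ignore, correct_types)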
def train(self, train_set, test_set, headers, response_header, features_to_ignore=[]):
    """
    Train a model using a model factory.

    ModelFactories require a `build` method that accepts some training data
    with which to train a brand new model. This `build` method should output
    a Model object that has a `test` method -- which, when given test data
    in the same format as the training data, yields a confusion table
    detailing the correct and incorrect predictions of the model.

    Parameters
    ----------
    train_set, test_set : list of list or numpy.array with the dimensions
        (# of samples) * (# of features). Data for training and testing the model.
    headers : list of strings
        The headers of the data.
    response_header : string
        The response header of the data.
    features_to_ignore : list of strings (default = [])
        The features we want to ignore.
    """
    all_data = train_set + test_set
    model_factory = self.ModelFactory(all_data, headers, response_header,
                                      features_to_ignore=features_to_ignore,
                                      options=self.model_options)

    vprint("Training initial model.", self.verbose)
    model = model_factory.build(train_set)

    # Check the quality of the initial model on verbose runs.
    if self.verbose:
        print("Calculating original model statistics on test data:")
        print("\tTraining Set:")
        train_pred_tuples = model.test(train_set)
        train_conf_matrix = get_conf_matrix(train_pred_tuples)
        print("\t\tConf-Matrix:", train_conf_matrix)
        for measurer in self.measurers:
            print("\t\t{}: {}".format(measurer.__name__, measurer(train_conf_matrix)))

        print("\tTesting Set:")
        test_pred_tuples = model.test(test_set)
        test_conf_matrix = get_conf_matrix(test_pred_tuples)
        print("\t\tConf-Matrix:", test_conf_matrix)
        for measurer in self.measurers:
            print("\t\t{}: {}".format(measurer.__name__, measurer(test_conf_matrix)))

    return model
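# Minimal sketch of the ModelFactory/Model contract assumed by train() and
# __call__ above: the factory's `build` returns a Model whose `test` yields
# (response, prediction) tuples that get_conf_matrix can consume. The classes
# below are hypothetical illustrations (a trivial majority-class predictor),
# not the shipped AbstractModelFactory implementations.
from collections import Counter


class ExampleMajorityFactory(object):
    factory_name = "example_majority"
    verbose_factory_name = "Example Majority-Class Factory"

    def __init__(self, all_data, headers, response_header,
                 features_to_ignore=None, options=None):
        self.response_index = headers.index(response_header)

    def build(self, train_set):
        # Predict whichever response value is most common in the training data.
        responses = [row[self.response_index] for row in train_set]
        majority = Counter(responses).most_common(1)[0][0]
        return ExampleMajorityModel(majority, self.response_index)


class ExampleMajorityModel(object):
    model_name = "example_majority_model"

    def __init__(self, majority, response_index):
        self.majority = majority
        self.response_index = response_index

    def test(self, test_set, test_name=""):
        # Return (actual response, predicted response) pairs.
        return [(row[self.response_index], self.majority) for row in test_set]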