import csv
import gc

# Repairer, AbstractModelFactory, and get_conf_matrix are assumed to be provided
# by the surrounding auditing module; only the standard-library imports used
# directly in this worker are shown here.
def _audit_worker(params):
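  # These module-level datasets are assumed to be populated by the parent
  # auditing process before this worker runs; the worker only reads them.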
  global shared_all
  global shared_train
  global shared_test

  model_or_factory, headers, ignored_features, feature_to_repair, repair_level, output_file, kdd, dump_all = params

  SAVE_REPAIRED_DATA = dump_all
  SAVE_PREDICTION_DETAILS = dump_all

  index_to_repair = headers.index(feature_to_repair)

  repairer = Repairer(shared_all, index_to_repair,
                      repair_level, kdd, features_to_ignore=ignored_features)

  # Build a model on repaired training data if specified.
  if isinstance(model_or_factory, AbstractModelFactory):
    rep_train = repairer.repair(shared_train)
    model = model_or_factory.build(rep_train)

    # Log that this specific model was used for this repair level.
    with open(output_file + ".models.names.txt", "a") as f:
      f.write("{}: {}\n".format(repair_level, model.model_name))

    # Save the repaired version of the data if specified.
    if SAVE_REPAIRED_DATA:
      with open(output_file + ".train.repaired_{}.data".format(repair_level), "w") as f:
        writer = csv.writer(f)
        for row in [headers] + rep_train:
          writer.writerow(row)
  else:
    model = model_or_factory

  rep_test = repairer.repair(shared_test)

  test_name = "{}_{}".format(index_to_repair, repair_level)
  pred_tuples = model.test(rep_test, test_name=test_name)
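  # Each prediction tuple pairs the actual response with the model's prediction,
  # which is what get_conf_matrix and the predictions dump below expect.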
  conf_table = get_conf_matrix(pred_tuples)
  
  # Save the repaired version of the test data if specified.
  repaired = output_file + ".test.repaired_{}.data".format(repair_level)
  if SAVE_REPAIRED_DATA:
    with open(repaired, "w") as f:
      writer = csv.writer(f)
      for row in [headers] + rep_test:
        writer.writerow(row)

  # Save the prediction_tuples and the original values of the features to repair.
  if SAVE_PREDICTION_DETAILS:
    with open(output_file + ".repaired_{}.predictions".format(repair_level), "w") as f:
      writer = csv.writer(f)
      file_headers = ["Pre-Repaired Feature", "Response", "Prediction"]
      writer.writerow(file_headers)
      for i, orig_row in enumerate(shared_test):
        row = [orig_row[index_to_repair], pred_tuples[i][0], pred_tuples[i][1]]
        writer.writerow(row)

  del repairer
  del rep_test
  gc.collect() 

  return repaired, (repair_level, conf_table)
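
# -----------------------------------------------------------------------------
# Illustrative sketch (assumptions, not part of the original listing): the shape
# of one `params` tuple as _audit_worker unpacks it. The concrete values are
# made up; the index/name conventions follow how the audit driver below builds them.
# params = (model_or_factory,            # a trained model or an AbstractModelFactory
#           ["age", "income", "hired"],  # headers
#           [2],                         # ignored feature indices (e.g. the response)
#           "age",                       # feature_to_repair (a header name)
#           0.5,                         # repair_level in [0, 1]
#           "audit_output/age",          # output_file prefix for the dumps above
#           False,                       # kdd flag passed through to the Repairer
#           False)                       # dump_all: write repaired data and predictions
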
Example #2
    def __call__(self,
                 data,
                 output_dir=None,
                 dump_all=False,
                 features_to_audit=None):
        start_time = datetime.now()

        headers, train_set, test_set, response_header, features_to_ignore, correct_types = data

        self._audits_data = {
            "headers": headers,
            "train": train_set,
            "test": test_set,
            "response": response_header,
            "ignore": features_to_ignore,
            "types": correct_types,
            "full_audit": features_to_audit is None
        }

        if self.trained_model is None:
            # ModelFactories require a `build` method that accepts some training data
            # with which to train a brand new model. This `build` method should output
            # a Model object that has a `test` method -- which, when given test data
            # in the same format as the training data, yields a confusion table detailing
            # the correct and incorrect predictions of the model.

            all_data = train_set + test_set
            model_factory = self.ModelFactory(
                all_data,
                headers,
                response_header,
                features_to_ignore=features_to_ignore,
                options=self.model_options)

        if self.trained_model is not None:
            model_or_factory = self.trained_model
        elif not self.RETRAIN_MODEL_PER_REPAIR:
            vprint("Training initial model.", self.verbose)
            model = model_factory.build(train_set)

            # Check the quality of the initial model on verbose runs.
            if self.verbose:
                print("Calculating original model statistics on test data:")
                print("\tTraining Set:")
                train_pred_tuples = model.test(train_set)
                train_conf_matrix = get_conf_matrix(train_pred_tuples)
                print("\t\tConf-Matrix:", train_conf_matrix)
                for measurer in self.measurers:
                    print("\t\t{}: {}".format(measurer.__name__,
                                              measurer(train_conf_matrix)))

                print("\tTesting Set:")
                test_pred_tuples = model.test(test_set)
                test_conf_matrix = get_conf_matrix(test_pred_tuples)
                print("\t\tConf-Matrix:", test_conf_matrix)
                for measurer in self.measurers:
                    print("\t\t{}: {}".format(measurer.__name__,
                                              measurer(test_conf_matrix)))

            model_or_factory = model
        else:
            model_or_factory = model_factory

        # Translate the headers into indexes for the auditor.
        audit_indices_to_ignore = [
            headers.index(f) for f in features_to_ignore
        ]

        # Don't audit the response feature.
        audit_indices_to_ignore.append(headers.index(response_header))

        # Prepare the auditor.
        auditor = GradientFeatureAuditor(
            model_or_factory,
            headers,
            train_set,
            test_set,
            repair_steps=self.REPAIR_STEPS,
            kdd=self.kdd,
            features_to_ignore=audit_indices_to_ignore,
            features_to_audit=features_to_audit,
            output_dir=output_dir,
            dump_all=dump_all)

        # Perform the Gradient Feature Audit and dump the audit results into files.
        audit_filenames = auditor.audit(verbose=self.verbose)

        # Retrieve repaired data from audit
        self._audits_data["rep_test"] = auditor._rep_test

        ranked_features = []
        for measurer in self.measurers:
            vprint("Ranking audit files by {}.".format(measurer.__name__),
                   self.verbose)
            ranks = rank_audit_files(audit_filenames, measurer)
            vprint("\t{}".format(ranks), self.verbose)
            ranked_features.append((measurer, ranks))

        end_time = datetime.now()

        # Store a summary of this experiment.
        model_id = model_factory.factory_name if self.trained_model is None else "Pretrained"
        model_name = model_factory.verbose_factory_name if self.trained_model is None else "Pretrained"
        summary = [
            "Audit Start Time: {}".format(start_time),
            "Audit End Time: {}".format(end_time),
            "Retrained Per Repair: {}".format(self.RETRAIN_MODEL_PER_REPAIR),
            "Model Factory ID: {}".format(model_id),
            "Model Type: {}".format(model_name),
            "Non-standard Model Options: {}".format(self.model_options),
            "Train Size: {}".format(len(train_set)),
            "Test Size: {}".format(len(test_set)),
            "Non-standard Ignored Features: {}".format(features_to_ignore),
            "Features: {}\n".format(headers)
        ]

        # Print summary
        for line in summary:
            print(line)

        for ranker, ranks in ranked_features:
            print("Ranked Features by {}: {}".format(ranker.__name__, ranks))
            groups = group_audit_ranks(audit_filenames, ranker)
            print("\tApprox. Trend Groups: {}\n".format(groups))

            if ranker.__name__ == "accuracy":
                self._audits_data["ranks"] = ranks

        # Dump all experiment results if opted
        if dump_all:
            vprint("Dumping original training data.", self.verbose)
            # Dump the train data to the log.
            train_dump = "{}/original_train_data".format(auditor.OUTPUT_DIR)
            with open(train_dump + ".csv", "w") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for row in train_set:
                    writer.writerow(row)

            if self.WRITE_ORIGINAL_PREDICTIONS:
                # Dump the predictions on the training data (note: train_pred_tuples
                # is only computed above on verbose runs).
                with open(train_dump + ".predictions", "w") as f:
                    writer = csv.writer(f)
                    file_headers = ["Response", "Prediction"]
                    writer.writerow(file_headers)
                    for response, guess in train_pred_tuples:
                        writer.writerow([response, guess])

            vprint("Dumping original testing data.", self.verbose)
            # Dump the test data to the log.
            test_dump = "{}/original_test_data".format(auditor.OUTPUT_DIR)
            with open(test_dump + ".csv", "w") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for row in test_set:
                    writer.writerow(row)

            if self.WRITE_ORIGINAL_PREDICTIONS:
                # Dump the predictions on the test data.
                with open(test_dump + ".predictions", "w") as f:
                    writer = csv.writer(f)
                    file_headers = ["Response", "Prediction"]
                    writer.writerow(file_headers)
                    for response, guess in test_pred_tuples:
                        writer.writerow([response, guess])

            # Graph the audit files.
            vprint("Graphing audit files.", self.verbose)
            for audit_filename in audit_filenames:
                audit_image_filename = audit_filename + ".png"
                graph_audit(audit_filename, self.measurers,
                            audit_image_filename)

            # Store a graph of how many predictions change as features are repaired.
            vprint("Graphing prediction changes throughout repair.",
                   self.verbose)
            output_image = auditor.OUTPUT_DIR + "/similarity_to_original_predictions.png"
            graph_prediction_consistency(auditor.OUTPUT_DIR, output_image)

        for measurer in self.measurers:
            ranked_graph_filename = "{}/{}.png".format(auditor.OUTPUT_DIR,
                                                       measurer.__name__)
            graph_audits(audit_filenames, measurer, ranked_graph_filename)

        # Store a summary of this experiment to file.
        summary_file = "{}/summary.txt".format(auditor.OUTPUT_DIR)
        with open(summary_file, "w") as f:
            for line in summary:
                f.write(line + '\n')

            for ranker, ranks in ranked_features:
                f.write("Ranked Features by {}: {}\n".format(
                    ranker.__name__, ranks))
                groups = group_audit_ranks(audit_filenames, ranker)
                f.write("\tApprox. Trend Groups: {}\n".format(groups))

        vprint("Summary file written to: {}\n".format(summary_file),
               self.verbose)
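
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the original listing): the six-element `data`
# tuple unpacked at the top of __call__ above. The column names, toy rows, and
# the list-of-types form of `correct_types` are assumptions for illustration;
# the real loaders in the project may differ.
headers = ["age", "income", "hired"]
train_set = [[23, 30000, "no"], [41, 82000, "yes"], [35, 61000, "yes"]]
test_set = [[29, 45000, "no"], [52, 98000, "yes"]]
response_header = "hired"
features_to_ignore = []
correct_types = [int, int, str]
data = (headers, train_set, test_set, response_header, features_to_ignore, correct_types)
# auditor_instance(data, output_dir="audit_output", dump_all=False)  # assumed call site
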
Example #3
    def train(self,
              train_set,
              test_set,
              headers,
              response_header,
              features_to_ignore=[]):
        """
        A method to train a model using model factories.

        ModelFactories require a `build` method that accepts some training data
        with which to train a brand new model. This `build` method should output
        a Model object that has a `test` method -- which, when given test data
        in the same format as the training data, yields a confusion table detailing
        the correct and incorrect predictions of the model.

        Parameters
        ----------
        train_set, test_set : list of lists or numpy.array of shape (# of samples, # of features)
          Data for training and testing the model.

        headers : list of strings
          The headers of the data.

        response_header : string
          The response header of the data.

        features_to_ignore : list of strings (default = [])
          The features we want to ignore.
        """

        all_data = train_set + test_set
        model_factory = self.ModelFactory(
            all_data,
            headers,
            response_header,
            features_to_ignore=features_to_ignore,
            options=self.model_options)

        vprint("Training initial model.", self.verbose)
        model = model_factory.build(train_set)

        # Check the quality of the initial model on verbose runs.
        if self.verbose:
            print("Calculating original model statistics on test data:")
            print("\tTraining Set:")
            train_pred_tuples = model.test(train_set)
            train_conf_matrix = get_conf_matrix(train_pred_tuples)
            print("\t\tConf-Matrix:", train_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__,
                                          measurer(train_conf_matrix)))

            print("\tTesting Set:")
            test_pred_tuples = model.test(test_set)
            test_conf_matrix = get_conf_matrix(test_pred_tuples)
            print("\t\tConf-Matrix:", test_conf_matrix)
            for measurer in self.measurers:
                print("\t\t{}: {}".format(measurer.__name__,
                                          measurer(test_conf_matrix)))

        return model
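
# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the original listing): a minimal factory/model
# pair matching the contract described in the docstring above. It predicts the
# most common response seen during `build`; every name here is an assumption made
# for illustration, not an API of the original project.
from collections import Counter

class MajorityClassModel(object):
    model_name = "MajorityClass"

    def __init__(self, response_index, majority_response):
        self.response_index = response_index
        self.majority_response = majority_response

    def test(self, rows, test_name=""):
        # Return (actual_response, prediction) pairs for a confusion-matrix builder.
        return [(row[self.response_index], self.majority_response) for row in rows]

class MajorityClassModelFactory(object):
    factory_name = "majority_class"
    verbose_factory_name = "Majority-Class Baseline"

    def __init__(self, all_data, headers, response_header,
                 features_to_ignore=None, options=None):
        self.response_index = headers.index(response_header)

    def build(self, train_set):
        counts = Counter(row[self.response_index] for row in train_set)
        return MajorityClassModel(self.response_index, counts.most_common(1)[0][0])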