import os
import random

# Project-local dependencies; the exact import paths below are assumptions based
# on the class names used in this file.
from ModelHelper import ModelHelper  # hypothetical module path
from ForwardModel import ForwardModel  # hypothetical module path

class InterModel:
    """
	This class handles interpolation over our forward models to make the reverse predictions
	"""
    def __init__(self, regenerate_classifiers=True):
        """Make and save the interpolation models"""
        self.MH = ModelHelper.get_instance()  # type: ModelHelper
        self.fwd_model = ForwardModel(regenerate_classifiers)
        # Default: no regime constraint. interpolate() overwrites this, but setting
        #  it here keeps get_closest_point() usable on its own.
        self.constrained_regime = -1

    def get_closest_point(self,
                          desired_vals,
                          constraints=None,
                          max_drop_exp_error=-1,
                          skip_list=None):
        """Return closest real data point to our desired values that is within the given constraints
		Used to find a good starting point for our solution
		THIS IS FUNDAMENTALLY DIFFERENT FROM THE NEAREST DATA POINT FORWARD MODEL!
			Nearest data point forward model - Find the outputs of the closest data point to the prediction
			This method - Find the data point closest to the desired outputs

		We will try to find the point closest to the center of our constraints that is close to the target answer

		ALL INPUTS ARE NORMALIZED!

		By itself, this method really isn't all that bad at performing DAFD's main functionality: reverse model prediction
			Therefore, it should be the baseline level of accuracy for DAFD.
		"""

        # Guard against Python's shared-mutable-default pitfall
        if constraints is None:
            constraints = {}
        if skip_list is None:
            skip_list = []

        # Heuristic: if the largest allowed orifice is smaller than the desired
        #  droplet size, only regime 2 (jetting) can produce such drops, so
        #  restrict the search to regime 2 points.
        use_regime_2 = False
        if "orifice_size" in constraints and "droplet_size" in desired_vals and self.MH.denormalize(
                constraints["orifice_size"][1],
                "orifice_size") < self.MH.denormalize(
                    desired_vals["droplet_size"], "droplet_size"):
            use_regime_2 = True

        closest_point = {}
        min_val = float("inf")
        match_index = -1
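        # Score each candidate by the L1 distance (in normalized units) between
        #  its measured/predicted outputs and the desired outputs; constraint
        #  violations add a large penalty. The lowest-scoring point wins.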
        for i in range(self.MH.train_data_size):
            if i in skip_list:
                continue
            if use_regime_2 and self.MH.train_regime_dat[i] != 2:
                continue

            if max_drop_exp_error != -1 and "droplet_size" in desired_vals:
                exp_error = abs(
                    self.MH.denormalize(desired_vals["droplet_size"],
                                        "droplet_size") -
                    self.MH.train_labels_dat["droplet_size"][i])
                if exp_error > max_drop_exp_error:
                    continue

            feat_point = self.MH.train_features_dat_wholenorm[i]
            prediction = self.fwd_model.predict(feat_point, normalized=True)

            if prediction["regime"] != self.MH.train_regime_dat[i]:
                continue

            if self.constrained_regime != -1 and prediction[
                    "regime"] != self.constrained_regime:
                continue

            nval = sum([
                abs(
                    self.MH.normalize(self.MH.train_labels_dat[x][i], x) -
                    desired_vals[x]) for x in desired_vals
            ])
            if "droplet_size" in desired_vals:
                nval += abs(
                    self.MH.normalize(prediction["droplet_size"],
                                      "droplet_size") -
                    desired_vals["droplet_size"])

                denorm_feat_list = self.MH.denormalize_set(feat_point)
                denorm_feat = {
                    x: denorm_feat_list[j]
                    for j, x in enumerate(self.MH.input_headers)
                }
                denorm_feat["generation_rate"] = prediction["generation_rate"]

                _, _, inferred_size = self.MH.calculate_formulaic_relations(
                    denorm_feat)
                inferred_size_error = abs(
                    desired_vals["droplet_size"] -
                    self.MH.normalize(inferred_size, "droplet_size"))
                nval += inferred_size_error

            if "generation_rate" in desired_vals:
                nval += abs(
                    self.MH.normalize(prediction["generation_rate"],
                                      "generation_rate") -
                    desired_vals["generation_rate"])

            for j in range(len(self.MH.input_headers)):
                cname = self.MH.input_headers[j]
                if cname in constraints:
                    # Out-of-constraint points get a large flat penalty (1000) so
                    #  they rank behind every in-constraint point; the distance-to-
                    #  center term then orders the violators among themselves.
                    if feat_point[j] < constraints[cname][0] or feat_point[
                            j] > constraints[cname][1]:
                        nval += 1000
                        nval += abs(feat_point[j] -
                                    (constraints[cname][0] +
                                     constraints[cname][1]) / 2.0)

            if nval < min_val:
                closest_point = feat_point
                min_val = nval
                match_index = i

        return closest_point, match_index

    def model_error(self, x):
        """Returns how far each solution mapped on the model deviates from the desired value
		Used in our minimization function
		"""
        prediction = self.fwd_model.predict(x, normalized=True)
        val_dict = {
            self.MH.input_headers[i]:
            self.MH.denormalize(val, self.MH.input_headers[i])
            for i, val in enumerate(x)
        }
        val_dict["generation_rate"] = prediction["generation_rate"]
        _, _, droplet_inferred_size = self.MH.calculate_formulaic_relations(
            val_dict)
        if "droplet_size" in self.desired_vals_global:
            denorm_drop_error = abs(droplet_inferred_size -
                                    self.desired_vals_global["droplet_size"])
            drop_error = abs(
                self.MH.normalize(droplet_inferred_size, "droplet_size") -
                self.norm_desired_vals_global["droplet_size"])
        else:
            denorm_drop_error = abs(droplet_inferred_size -
                                    prediction["droplet_size"])
            drop_error = abs(
                self.MH.normalize(droplet_inferred_size, "droplet_size") -
                self.MH.normalize(prediction["droplet_size"], "droplet_size"))

        merrors = [
            abs(
                self.MH.normalize(prediction[head], head) -
                self.norm_desired_vals_global[head])
            for head in self.norm_desired_vals_global
        ]
        # Note: denorm_drop_error above does not feed the cost; only normalized
        #  errors are summed so all terms share a scale.
        return sum(merrors) + drop_error

    def callback_func(self, x):
        """Log the current solution and its prediction error to InterResults.csv
		Called as each optimization step completes so progress can be inspected later
		"""
        prediction = self.fwd_model.predict(x, normalized=True)
        val_dict = {
            self.MH.input_headers[i]:
            self.MH.denormalize(val, self.MH.input_headers[i])
            for i, val in enumerate(x)
        }
        val_dict["generation_rate"] = prediction["generation_rate"]
        _, _, droplet_inferred_size = self.MH.calculate_formulaic_relations(
            val_dict)
        if "droplet_size" in self.desired_vals_global:
            denorm_drop_error = abs(droplet_inferred_size -
                                    self.desired_vals_global["droplet_size"])
            drop_error = abs(
                self.MH.normalize(droplet_inferred_size, "droplet_size") -
                self.norm_desired_vals_global["droplet_size"])
        else:
            denorm_drop_error = abs(droplet_inferred_size -
                                    prediction["droplet_size"])
            drop_error = abs(
                self.MH.normalize(droplet_inferred_size, "droplet_size") -
                self.MH.normalize(prediction["droplet_size"], "droplet_size"))
        print(prediction["droplet_size"])
        print(prediction["generation_rate"])
        merrors = [
            abs(
                self.MH.normalize(prediction[head], head) -
                self.norm_desired_vals_global[head])
            for head in self.norm_desired_vals_global
        ]
        all_errors = sum(merrors) + drop_error
        print(all_errors)
        print()

        with open("InterResults.csv", "a") as f:
            f.write(",".join(map(str, self.MH.denormalize_set(x))) + "," +
                    str(prediction['regime']) + "," +
                    str(prediction['generation_rate']) + "," +
                    str(prediction['droplet_size']) + "," + str(all_errors) +
                    "\n")

    def correct_by_constraints(self, values, constraints):
        """Clamp each value into its constraint range in place (values may be normalized or not, as long as they match the constraints' scale)"""
        for i, head in enumerate(self.MH.input_headers):
            if head in constraints:
                if values[i] < constraints[head][0]:
                    values[i] = constraints[head][0]
                elif values[i] > constraints[head][1]:
                    values[i] = constraints[head][1]

    def interpolate(self, desired_val_dict, constraints):
        """Return an input set within the given constraints that produces the output set
		The core part of DAFD
		Args:
			desired_val_dict: Dict with output type as the key and desired value as the value
				Just don't include other output type if you just want to optimize on one

			constraints: Dict with input type as key and acceptable range as the value
				The acceptable range should be a tuple with the min as the first val and the max as the second val
				Again, just leave input types you don't care about blank
		"""

        self.constrained_regime = constraints.pop("regime", -1)

        norm_constraints = {}
        for cname in constraints:
            cons_low = self.MH.normalize(constraints[cname][0], cname)
            cons_high = self.MH.normalize(constraints[cname][1], cname)
            norm_constraints[cname] = (cons_low, cons_high)

        norm_desired_vals = {}
        for lname in desired_val_dict:
            norm_desired_vals[lname] = self.MH.normalize(
                desired_val_dict[lname], lname)

        self.norm_desired_vals_global = norm_desired_vals
        self.desired_vals_global = desired_val_dict

        # This loop runs until either the algorithm finds a point that is close enough to the user's desires or until
        #  every point has been searched and the model needs to move on to optimization.
        skip_list = []  # List of points we've already tried
        while True:
            # Get the closest point we haven't tried already
            start_pos, closest_index = self.get_closest_point(
                norm_desired_vals,
                constraints=norm_constraints,
                max_drop_exp_error=5,
                skip_list=skip_list)
            if closest_index == -1:
                # We have tried every point; fall back to the overall closest
                #  point and move on to optimization from there.
                start_pos, closest_index = self.get_closest_point(
                    norm_desired_vals, constraints=norm_constraints)
                break
            skip_list.append(closest_index)

            prediction = self.fwd_model.predict(start_pos, normalized=True)
            all_dat_labels = ["chip_number"] + self.MH.input_headers + [
                "regime"
            ] + self.MH.output_headers
            print(",".join(all_dat_labels))
            print("Starting point")
            print(self.MH.all_dat[closest_index])
            print([self.MH.all_dat[closest_index][x] for x in all_dat_labels])
            print("Start pred")
            print(prediction)

            # These flags start True ("OK to return the experimental point as-is")
            #  and any failed check below flips the corresponding flag to False.
            should_skip_optim_rate = True
            should_skip_optim_size = True
            should_skip_optim_constraints = True

            # The point must satisfy every constraint to be returned directly
            for constraint in constraints:
                cons_range = constraints[constraint]
                this_val = self.MH.all_dat[closest_index][constraint]
                if this_val < cons_range[0] or this_val > cons_range[1]:
                    should_skip_optim_constraints = False

            if self.constrained_regime != -1 and self.MH.all_dat[
                    closest_index]["regime"] != self.constrained_regime:
                should_skip_optim_constraints = False

            # If the rate deviates too far, the point cannot be returned directly
            if "generation_rate" in desired_val_dict:
                if desired_val_dict["generation_rate"] > 100:
                    pred_rate_error = abs(
                        desired_val_dict["generation_rate"] -
                        prediction["generation_rate"]
                    ) / desired_val_dict["generation_rate"]
                    exp_rate_error = abs(
                        desired_val_dict["generation_rate"] -
                        self.MH.all_dat[closest_index]["generation_rate"]
                    ) / desired_val_dict["generation_rate"]
                    if pred_rate_error > 0.15 or exp_rate_error > 0.15:
                        should_skip_optim_rate = False
                else:
                    pred_rate_error = abs(desired_val_dict["generation_rate"] -
                                          prediction["generation_rate"])
                    exp_rate_error = abs(
                        desired_val_dict["generation_rate"] -
                        self.MH.all_dat[closest_index]["generation_rate"])
                    if pred_rate_error > 15 or exp_rate_error > 15:
                        should_skip_optim_rate = False

            # If the size deviates too far, the point cannot be returned directly
            if "droplet_size" in desired_val_dict:
                pred_size_error = abs(desired_val_dict["droplet_size"] -
                                      prediction["droplet_size"])
                exp_size_error = abs(
                    desired_val_dict["droplet_size"] -
                    self.MH.all_dat[closest_index]["droplet_size"])
                print(self.MH.all_dat[closest_index])
                pred_point = {
                    x: self.MH.all_dat[closest_index][x]
                    for x in self.MH.all_dat[closest_index]
                }
                pred_point["generation_rate"] = prediction["generation_rate"]
                print(self.MH.all_dat[closest_index])
                _, _, inferred_size = self.MH.calculate_formulaic_relations(
                    pred_point)
                inferred_size_error = abs(desired_val_dict["droplet_size"] -
                                          inferred_size)
                print(inferred_size)
                print(inferred_size_error)
                if pred_size_error > 10 or inferred_size_error > 10 or exp_size_error > 5:
                    should_skip_optim_size = False

            # Return experimental point if it meets criteria
            if should_skip_optim_rate and should_skip_optim_size and should_skip_optim_constraints:
                results = {
                    x: self.MH.all_dat[closest_index][x]
                    for x in self.MH.input_headers
                }
                results["point_source"] = "Experimental"
                print(results)
                return results

        with open("InterResults.csv", "w") as f:
            f.write("Experimental outputs:" +
                    str(self.MH.all_dat[closest_index]["generation_rate"]) +
                    "," + str(self.MH.all_dat[closest_index]["droplet_size"]) +
                    "\n")

            if "generation_rate" not in desired_val_dict:
                des_rate = "-1"
            else:
                des_rate = str(desired_val_dict["generation_rate"])

            if "droplet_size" not in desired_val_dict:
                des_size = "-1"
            else:
                des_size = str(desired_val_dict["droplet_size"])

            f.write("Desired outputs:" + des_rate + "," + des_size + "\n")
            f.write(",".join(self.MH.input_headers) +
                    ",regime,generation_rate,droplet_size,cost_function\n")

        pos = list(start_pos)  # Copy so the starting data point is not mutated
        self.callback_func(pos)

        self.correct_by_constraints(pos, norm_constraints)

        loss = self.model_error(pos)
        stepsize = 1e-2  # Size of each coordinate-descent probe (normalized units)
        ftol = 1e-9  # Stop once an iteration improves the loss by less than this

        # I log the values here so I can use them for visualization
        with open("AlgorithmProcess.csv", "w") as f:
            double_headers = []
            for header in self.MH.input_headers:
                double_headers.append(header + "_pos")
                double_headers.append(header + "_neg")
            f.write(",".join(double_headers) + "\n")

        # Iterate the optimization
        for i in range(5000):  # 5000 is an arbitrary upper bound on optimization steps
            new_pos = pos
            new_loss = loss
            for index, val in enumerate(pos):
                candidate = list(pos)
                candidate[index] = val + stepsize
                self.correct_by_constraints(candidate, norm_constraints)
                error = self.model_error(candidate)
                if error < new_loss:
                    new_pos = candidate
                    new_loss = error

                with open("AlgorithmProcess.csv", "a") as f:
                    f.write(str(error) + ",")

                candidate = list(pos)
                candidate[index] = val - stepsize
                self.correct_by_constraints(candidate, norm_constraints)
                error = self.model_error(candidate)
                if error < new_loss:
                    new_pos = candidate
                    new_loss = error

                with open("AlgorithmProcess.csv", "a") as f:
                    f.write(str(error) + ",")

            with open("AlgorithmProcess.csv", "a") as f:
                f.write("\n")

            # If we failed to decrease the loss by more than ftol, break the loop
            if loss - new_loss < ftol:
                print(loss)
                print(new_loss)
                break

            pos = new_pos
            loss = new_loss

            self.callback_func(pos)

        self.last_point = pos

        # Denormalize results
        results = {
            x: self.MH.denormalize(pos[i], x)
            for i, x in enumerate(self.MH.input_headers)
        }
        prediction = self.fwd_model.predict(
            [results[x] for x in self.MH.input_headers])
        print("Final Suggestions")
        print(",".join(self.MH.input_headers) + "," + "desired_size" + "," +
              "predicted_generation_rate" + "," + "predicted_droplet_size")
        output_string = ",".join(
            [str(results[x]) for x in self.MH.input_headers])
        output_string += "," + str(desired_val_dict["droplet_size"])
        output_string += "," + str(prediction["generation_rate"])
        output_string += "," + str(prediction["droplet_size"])
        print(output_string)
        print("Final Prediction")
        print(prediction)
        results["point_source"] = "Predicted"
        return results
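

# The sketch below is not part of DAFD itself; it is a minimal, self-contained
# illustration of the coordinate-descent scheme that interpolate() uses, applied
# to a toy quadratic so it runs without ModelHelper or ForwardModel. The names
# coordinate_descent and toy_loss are hypothetical.
def coordinate_descent(loss, pos, stepsize=1e-2, ftol=1e-9, max_steps=5000):
    """Greedy axis-aligned descent: probe +/- stepsize on each coordinate."""
    best = loss(pos)
    for _ in range(max_steps):
        new_pos, new_best = pos, best
        for index, val in enumerate(pos):
            for candidate_val in (val + stepsize, val - stepsize):
                candidate = list(pos)
                candidate[index] = candidate_val
                error = loss(candidate)
                if error < new_best:
                    new_pos, new_best = candidate, error
        if best - new_best < ftol:  # No meaningful improvement; stop
            break
        pos, best = new_pos, new_best
    return pos, best


def toy_loss(x):
    """A convex bowl with its minimum at (0.3, 0.7)."""
    return (x[0] - 0.3) ** 2 + (x[1] - 0.7) ** 2


# Example: coordinate_descent(toy_loss, [0.0, 0.0]) converges near (0.3, 0.7).

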
class ForwardModelTester:
    """ This class is used to test the accuracy of the forward models. It is simply a helpful utility and not a requirement
		for the function of the system"""
    def __init__(self):
        self.MH = ModelHelper.get_instance()  # type: ModelHelper

    def train(self):
        """ Train the model and stop. Uses all data."""
        data_size = len(self.MH.train_features_dat_wholenorm)
        self.MH.make_train_data([x for x in range(data_size)])
        self.forward_model = ForwardModel()

    def cross_validate(self, folds):
        """ Typical cross-validation of data to determine accuracy
				folds is the division of the dataset (such as 10 for 10-fold CV)
		"""
        data_size = len(self.MH.all_dat)

        if folds == -1:
            folds = data_size  # Leave one out cross validation

        group_size = int(data_size / folds)
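        # Note: integer division means up to (data_size % folds) trailing points
        #  never land in a test fold; they remain in training for every fold.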

        rand_indices = random.sample([x for x in range(data_size)], data_size)

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        # Go through each fold, train on everything else, and test it
        for i in range(folds):
            train_indices = [
                x for x in (rand_indices[:i * group_size] +
                            rand_indices[(i + 1) * group_size:])
            ]
            self.MH.make_train_data(train_indices)
            self.forward_model = ForwardModel()
            test_indices = rand_indices[i * group_size:(i + 1) * group_size]
            test_dat = [self.MH.all_dat[x] for x in test_indices]
            for dat_point in test_dat:
                ret_vals = self.validate_model(dat_point)
                for header in ret_vals:
                    validations[header].append(
                        ret_vals[header]
                    )  # Validations dict says how well we did for the point

        # Data for the cross validation is written out at all_preds_droplet_size.csv and all_preds_generation_rate.csv
        # This data only has the shown file headers. For bulk statistics (like coefficient of determination), you will
        #  need to run the data through DAFD/model_data/disp_graphs.py. See that file for more information.
        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open("all_preds_" + header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

    def cross_validate_regime(self, folds, fileprefix=""):
        """ This class is pretty much the same as the normal cross validation class, but we assume that the regime
				classifier is 100% accurate. This allows us to determine the accuracy of our regressors more precisely"""
        data_size = len(self.MH.all_dat)

        regime1_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 1
        ]
        regime2_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 2
        ]

        random.shuffle(regime1_points)
        random.shuffle(regime2_points)

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        # Regime 1 validations
        for i in range(folds):
            group_size = int(len(regime1_points) / folds)
            train_indices = [
                x for x in (regime1_points[:i * group_size] +
                            regime1_points[(i + 1) * group_size:])
            ]
            test_indices = regime1_points[i * group_size:(i + 1) * group_size]
            self.MH.make_train_data(train_indices)
            self.MH.make_test_data(test_indices)
            self.forward_model = ForwardModel(
                should_generate_regime_classifier=False)
            test_dat = [self.MH.all_dat[x] for x in test_indices]
            for dat_point in test_dat:
                ret_vals = self.validate_model(dat_point, given_regime=1)
                for header in ret_vals:
                    validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r1_all_preds_" + header +
                    ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        # I added the call to disp_graphs.py here to speed up testing
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_generation_rate.csv >> model_data/r1rate.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_droplet_size.csv >> model_data/r1size.txt")

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        # Now it is time for regime 2 validations
        for i in range(folds):
            group_size = int(len(regime2_points) / folds)
            train_indices = [
                x for x in (regime2_points[:i * group_size] +
                            regime2_points[(i + 1) * group_size:])
            ]
            test_indices = regime2_points[i * group_size:(i + 1) * group_size]
            self.MH.make_train_data(train_indices)
            self.MH.make_test_data(test_indices)
            self.forward_model = ForwardModel(
                should_generate_regime_classifier=False)
            test_dat = [self.MH.all_dat[x] for x in test_indices]
            for dat_point in test_dat:
                ret_vals = self.validate_model(dat_point, given_regime=2)
                for header in ret_vals:
                    validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r2_all_preds_" + header +
                    ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_generation_rate.csv >> model_data/r2rate.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_droplet_size.csv >> model_data/r2size.txt")

    def hold_out_classifier(self, hold_out_percent):
        """ Hold out accuracy tests for the regime classifier"""
        data_size = len(self.MH.all_dat)
        all_indices = [x for x in range(data_size)]
        random.seed(400)
        random.shuffle(all_indices)
        train_indices = all_indices[int(data_size * hold_out_percent):]
        test_indices = all_indices[:int(data_size * hold_out_percent)]

        self.MH.make_train_data(train_indices)
        self.MH.make_test_data(test_indices)

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        self.forward_model = ForwardModel(
            should_generate_regime_classifier=True)
        test_dat = [self.MH.all_dat[x] for x in test_indices]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        # We still print out everything, but we really only care about classifier accuracy for this section
        # You should use the method hold_out if you care about regressor accuracy
        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open("model_data/all_preds_" + header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system(
            "python3 model_data/disp_graphs.py model_data/all_preds_generation_rate.csv >> model_data/rate.txt"
        )
        os.system(
            "python3 model_data/disp_graphs.py model_data/all_preds_droplet_size.csv >> model_data/size.txt"
        )

    def hold_out(self, hold_out_percent, fileprefix=""):
        """ Hold out accuracy for our regressors. Assumes 100% accurate classifier"""
        data_size = len(self.MH.all_dat)

        regime1_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 1
        ]
        regime2_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 2
        ]

        random.shuffle(regime1_points)
        random.shuffle(regime2_points)

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        # Regime 1 accuracy tests
        train_indices = regime1_points[
            int(len(regime1_points) * hold_out_percent):]
        test_indices = regime1_points[:int(
            len(regime1_points) * hold_out_percent)]
        self.MH.make_train_data(train_indices)
        self.MH.make_test_data(test_indices)
        self.forward_model = ForwardModel(
            should_generate_regime_classifier=False)
        test_dat = [self.MH.all_dat[x] for x in test_indices]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=1)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r1_all_preds_" + header +
                    ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_generation_rate.csv >> model_data/r1rate.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_droplet_size.csv >> model_data/r1size.txt")

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        # Regime 2 accuracy tests
        train_indices = regime2_points[
            int(len(regime2_points) * hold_out_percent):]
        test_indices = regime2_points[:int(
            len(regime2_points) * hold_out_percent)]
        self.MH.make_train_data(train_indices)
        self.MH.make_test_data(test_indices)
        self.forward_model = ForwardModel(
            should_generate_regime_classifier=False)
        test_dat = [self.MH.all_dat[x] for x in test_indices]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=2)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r2_all_preds_" + header +
                    ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_generation_rate.csv >> model_data/r2rate.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_droplet_size.csv >> model_data/r2size.txt")

    def hold_out_double_test(self, hold_out_percent, fileprefix=""):
        """ This class was built for a very specific experiment

			I wanted to prove that if we ran the training a million times and chose the model with the best test
			accuracy, we wouldn't necessarily have chosen the best overall model, just one that happened to fit our
			specific test data really nicely.

			The experiment goes as follows:
				Choose 10% of data to hold out
				Choose another 10% of the data to hold out as a second set
				Train on the remaining 80%
				Check if there is rank correlation between models that do the best on hold out set 1 vs hold out set 2

			If the models that do best on hold out set 1 are also the models that do best on hold out set 2, then I was
			wrong and we should try to pick out the model that does best on testing.

			If there is no rank correlation (i.e. a model's accuracy on set 1 does not correlate with its accuracy on
			set 2), then it does not matter which model we choose as long as it converged during training.

			Turns out that there is not really any rank correlation, so we don't have to worry about choosing one model
			over another GIVEN THEY HAVE THE SAME HYPERPARAMETERS
			"""
        data_size = len(self.MH.all_dat)

        regime1_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 1
        ]
        regime2_points = [
            i for i in range(data_size) if self.MH.all_dat[i]["regime"] == 2
        ]

        random.shuffle(regime1_points)
        random.shuffle(regime2_points)

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        train_indices = regime1_points[
            int(len(regime1_points) * hold_out_percent):]
        test_indices1 = regime1_points[:int(
            len(regime1_points) * hold_out_percent * 0.5)]
        test_indices2 = regime1_points[
            int(len(regime1_points) * hold_out_percent *
                0.5):int(len(regime1_points) * hold_out_percent)]
        self.MH.make_train_data(train_indices)
        self.MH.make_test_data(test_indices1)
        self.forward_model = ForwardModel(
            should_generate_regime_classifier=False)

        test_dat = [self.MH.all_dat[x] for x in test_indices1]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=1)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r1_all_preds_set1_" +
                    header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_set1_generation_rate.csv >> model_data/r1rate_set1.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_set1_droplet_size.csv >> model_data/r1size_set1.txt")

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        test_dat = [self.MH.all_dat[x] for x in test_indices2]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=1)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r1_all_preds_set2_" +
                    header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_set2_generation_rate.csv >> model_data/r1rate_set2.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r1_all_preds_set2_droplet_size.csv >> model_data/r1size_set2.txt")

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        train_indices = regime2_points[
            int(len(regime2_points) * hold_out_percent):]
        test_indices1 = regime2_points[:int(
            len(regime2_points) * hold_out_percent * 0.5)]
        test_indices2 = regime2_points[
            int(len(regime2_points) * hold_out_percent *
                0.5):int(len(regime2_points) * hold_out_percent)]
        self.MH.make_train_data(train_indices)
        self.MH.make_test_data(test_indices1)
        self.forward_model = ForwardModel(
            should_generate_regime_classifier=False)

        test_dat = [self.MH.all_dat[x] for x in test_indices1]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=2)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r2_all_preds_set1_" +
                    header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_set1_generation_rate.csv >> model_data/r2rate_set1.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_set1_droplet_size.csv >> model_data/r2size_set1.txt")

        validations = {}
        for header in self.MH.output_headers:
            validations[header] = []

        test_dat = [self.MH.all_dat[x] for x in test_indices2]
        for dat_point in test_dat:
            ret_vals = self.validate_model(dat_point, given_regime=2)
            for header in ret_vals:
                validations[header].append(ret_vals[header])

        for header in validations:
            file_headers = [
                "actual_val", "pred_val", "deviation", "deviation_percent",
                "actual_regime", "pred_regime", "chip_number"
            ]
            with open(
                    "model_data/" + fileprefix + "r2_all_preds_set2_" +
                    header + ".csv", "w") as f:
                f.write(",".join(file_headers) + "\n")
                for x in validations[header]:
                    f.write(",".join([str(xi) for xi in x]) + "\n")

        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_set2_generation_rate.csv >> model_data/r2rate_set2.txt")
        os.system("python3 model_data/disp_graphs.py model_data/" + fileprefix +
                  "r2_all_preds_set2_droplet_size.csv >> model_data/r2size_set2.txt")

    def validate_model(self, dat_point, given_regime=0):
        """ Get test accuracies for a data point """
        features = [dat_point[x] for x in self.MH.input_headers]
        labels = {x: dat_point[x] for x in self.MH.output_headers}
        regime = dat_point["regime"]
        chip_number = dat_point["chip_number"]

        pred_vals = self.forward_model.predict(features, regime=given_regime)

        ret_val = {}
        for header in labels:
            actual_val = labels[header]
            pred_val = pred_vals[header]
            actual_regime = regime
            pred_regime = pred_vals["regime"]
            ret_val[header] = [
                actual_val, pred_val,
                abs(pred_val - actual_val),
                abs(pred_val - actual_val) / actual_val, actual_regime,
                pred_regime, chip_number
            ]

        return ret_val
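

# --- Hedged usage sketch (not part of the original file) ---
# A minimal driver showing how the tester might be invoked, assuming the data
# files ModelHelper expects are already in place. The scipy.stats.spearmanr call
# shows how the rank-correlation check described in hold_out_double_test could
# be computed on two lists of per-model errors; errors_set1/errors_set2 are
# hypothetical names for accuracies collected over repeated training runs.
if __name__ == "__main__":
    tester = ForwardModelTester()
    tester.cross_validate(10)  # 10-fold cross-validation
    tester.hold_out(0.2)  # Hold out 20% per regime for the regressors

    # Rank-correlation check (sketch):
    # from scipy.stats import spearmanr
    # rho, pval = spearmanr(errors_set1, errors_set2)
    # print("Spearman rho:", rho, "p-value:", pval)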