Example #1
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import yaml

# Oracle, numpy2python and namespace2dict are provided by the surrounding
# project; their imports are omitted here.


def main(args):
    # generate an initial dataset from the oracle
    oracle = Oracle(args)
    samples_dict = oracle.initializeDataset(save=False, returnData=True)
    scores = samples_dict["scores"]
    samples_mat = samples_dict["samples"]
    # letter representation of the integer-encoded samples
    seq_letters = oracle.numbers2letters(samples_mat)
    # digit-string representation; entries <= 0 (padding) are dropped
    seq_ints = [
        "".join([str(el) for el in seq if el > 0]) for seq in samples_mat
    ]
    if isinstance(scores, dict):
        scores.update({"letters": seq_letters, "indices": seq_ints})
        df = pd.DataFrame(scores)
    else:
        df = pd.DataFrame({
            "letters": seq_letters,
            "indices": seq_ints,
            "scores": scores
        })
    if args.output:
        # write the run configuration next to the CSV as a .yml file
        output_yml = Path(args.output).with_suffix(".yml")
        with open(output_yml, "w") as f:
            yaml.dump(numpy2python(namespace2dict(args)),
                      f,
                      default_flow_style=False)
        if args.no_indices:
            df.drop(columns="indices", inplace=True)
        df.to_csv(args.output)
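The function above only reads args.output and args.no_indices directly; the Oracle consumes additional configuration fields from the same namespace. A minimal command-line wiring could look like the following sketch (the flag names and defaults here are assumptions for illustration, not taken from the project):

# Hypothetical CLI wiring for main(); the real project builds a much richer
# argument namespace that Oracle() reads its configuration from.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default=None,
                        help="path of the CSV file to write")
    parser.add_argument("--no_indices", action="store_true",
                        help="drop the integer-encoded 'indices' column")
    main(parser.parse_args())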


# updateModelState is a method of the project's sampler/querier class (it
# relies on self.config, self.device and a previously stored self.model_state);
# the enclosing class here is only a hypothetical stub so the snippet parses.
class Sampler:
    def updateModelState(self, model_state, model):
        """
        update the model state and store it for later sampling
        :param model_state:
        :return:
        """
        model_state_dict = model_state
        previous_model_state = self.model_state
        # components of the new model state vector:
        # test loss and standard deviation across the proxy models
        self.model_state = torch.stack(
            (
                torch.tensor(model_state_dict["test loss"]),
                torch.tensor(model_state_dict["test std"]),
            )
        )

        # sample energies
        self.model_state = torch.cat(
            (self.model_state, torch.tensor(model_state_dict["best cluster energies"]))
        )

        # sample uncertainties
        self.model_state = torch.cat(
            (self.model_state, torch.tensor(model_state_dict["best cluster deviations"]))
        )

        # internal dist, dataset dist, random set dist
        self.model_state = torch.cat(
            (self.model_state, torch.tensor(model_state_dict["best clusters internal diff"]))
        )
        self.model_state = torch.cat(
            (self.model_state, torch.tensor(model_state_dict["best clusters dataset diff"]))
        )
        self.model_state = torch.cat(
            (self.model_state, torch.tensor(model_state_dict["best clusters random set diff"]))
        )

        # scalar features: n proxy models, clustering cutoff, progress fraction
        singletons = torch.stack(
            (
                torch.tensor(model_state_dict["n proxy models"]),
                torch.tensor(model_state_dict["clustering cutoff"]),
                torch.tensor(model_state_dict["iter"] / model_state_dict["budget"]),
            )
        )

        self.model_state = torch.cat((self.model_state, singletons))
        self.model_state = self.model_state.to(self.device)

        # the proxy model is passed directly from the main program and should
        # already be on the correct device
        self.proxyModel = model

        # get data to compute distances
        # model state samples
        self.modelStateSamples = model_state_dict["best cluster samples"]
        # training dataset
        self.trainingSamples = np.load(
            'datasets/' + self.config.dataset.oracle + '.npy', allow_pickle=True
        ).item()
        self.trainingSamples = self.trainingSamples['samples']
        # large random sample: either 1e4, or 1% of the sample space, whichever is smaller
        numSamples = min(
            int(1e4),
            self.config.dataset.dict_size ** self.config.dataset.max_length // 100,
        )
        dataoracle = Oracle(self.config)
        # get a large random dataset from the oracle
        self.randomSamples = dataoracle.initializeDataset(
            save=False, returnData=True, customSize=numSamples
        )
        self.randomSamples = self.randomSamples['samples']

        return previous_model_state, self.model_state
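
For reference, the keys that updateModelState reads can be mocked up as shown below. The values are illustrative placeholders only (the per-cluster lists would normally all share one length), and sampler / proxy_model are hypothetical names for the sampler object and the trained proxy model.

# Illustrative model_state payload; all values are placeholders, not project output.
example_model_state = {
    "test loss": 0.12,
    "test std": 0.03,
    "best cluster energies": [-1.2, -0.9, -0.7],
    "best cluster deviations": [0.05, 0.08, 0.11],
    "best clusters internal diff": [0.4, 0.5, 0.6],
    "best clusters dataset diff": [0.7, 0.6, 0.8],
    "best clusters random set diff": [0.9, 0.8, 0.7],
    "best cluster samples": [[1, 2, 3, 4], [2, 2, 1, 3], [4, 1, 1, 2]],
    "n proxy models": 10,
    "clustering cutoff": 0.25,
    "iter": 3,
    "budget": 20,
}
# previous_state, new_state = sampler.updateModelState(example_model_state, proxy_model)
# would then flatten these entries into a single state vector stored on self.device.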