    def pretrain(self, trainloader, validloader, lr=0.001, batch_size=128, num_epochs=10, corrupt=0.2, loss_type="cross-entropy"):
        trloader = trainloader
        valoader = validloader
        daeLayers = []
        # Greedy layer-wise pretraining: train one denoising autoencoder per adjacent layer pair.
        for l in range(1, len(self.layers)):
            infeatures = self.layers[l-1]
            outfeatures = self.layers[l]
            if l != len(self.layers)-1:
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation=self.activation, dropout=corrupt)
            else:
                # The final layer is linear ("none" activation) and is not corrupted.
                dae = DenoisingAutoencoder(infeatures, outfeatures, activation="none", dropout=0)
            print(dae)
            if l == 1:
                # The first layer reconstructs the raw inputs with the requested loss.
                dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type=loss_type)
            else:
                # Deeper layers reconstruct the previous layer's codes; the loss follows the activation.
                if self.activation == "sigmoid":
                    dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type="cross-entropy")
                else:
                    dae.fit(trloader, valoader, lr=lr, batch_size=batch_size, num_epochs=num_epochs, corrupt=corrupt, loss_type="mse")
            # Encode both loaders so the next layer trains on the current layer's representations.
            data_x = dae.encodeBatch(trloader)
            valid_x = dae.encodeBatch(valoader)
            trainset = Dataset(data_x, data_x)
            trloader = torch.utils.data.DataLoader(
                trainset, batch_size=batch_size, shuffle=True, num_workers=0)
            validset = Dataset(valid_x, valid_x)
            valoader = torch.utils.data.DataLoader(
                validset, batch_size=1000, shuffle=False, num_workers=0)
            daeLayers.append(dae)

        self.copyParam(daeLayers)
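
A minimal usage sketch for the pretraining loop above (hypothetical: the StackedDAE class name, its constructor arguments, and the train_x/valid_x tensors are assumptions; Dataset(x, x) is the same wrapper used in the later examples):

# Hypothetical driver for greedy layer-wise pretraining.
import torch

sdae = StackedDAE(layers=[784, 500, 500, 2000, 10], activation="relu")
train_loader = torch.utils.data.DataLoader(Dataset(train_x, train_x),
                                           batch_size=128, shuffle=True)
valid_loader = torch.utils.data.DataLoader(Dataset(valid_x, valid_x),
                                           batch_size=1000, shuffle=False)
sdae.pretrain(train_loader, valid_loader, lr=0.001, batch_size=128,
              num_epochs=10, corrupt=0.2, loss_type="cross-entropy")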
Example #2
    def loop_through_data_for_training(self,
                                       model: Union[Model, nn.DataParallel],
                                       optimizer: Optimizer,
                                       mapping: Mapping,
                                       reconstruction: bool = False,
                                       **loader_params):
        """A generator function that takes in a model, optimizer, and data mapping, and loops through the data with the
        model.

        Example:
            default_args = self.get_training_args(model=different_model, reconstruction=True)
            for (input_images, labels, loss, predictions) in self.loop_through_data_for_training(**default_args):
                # do stuff with loss or predictions

        Args:
            model: The model to train the data with.
            optimizer: The optimizer with the model's parameters set as target.
            mapping: A lib.util.mapping.Mapping object.
            reconstruction: Whether to calculate the reconstruction loss instead of the classification loss.
            **loader_params: Any key-value arguments applicable to the torch.utils.data.DataLoader class, see a list
                here https://pytorch.org/docs/stable/data.html
        """
        loader = Dataset.build_loader(mapping,
                                      label_encoder=self.label_encoder,
                                      image_transforms=self.provide_image_transforms(),
                                      **loader_params)
        for iter_idx, (images, labels) in enumerate(loader):
            model.to(device=self.device)
            # `images` may be a single tensor or a pair of tensors (e.g. two augmented views).
            if len(images) != 2:
                images = images.float().to(device=self.device)
            else:
                images[0] = images[0].float().to(device=self.device)
                images[1] = images[1].float().to(device=self.device)
            optimizer.zero_grad()
           
            pred = None

            if reconstruction:
                if type(model) == torch.nn.DataParallel:
                    loss = model.module.reconstruction_loss(images)
                else:
                    loss = model.reconstruction_loss(images)
            else:
                labels = labels.squeeze().long().to(device=self.device)
                if type(model) == torch.nn.DataParallel:
                    loss, pred = model.module.classification_loss(images, labels)
                else:
                    loss, pred = model.classification_loss(images, labels)
           
            loss.backward()
            optimizer.step()
      
            # If a pair was provided, yield only the first view.
            if len(images) == 2:
                images = images[0]
            yield images.detach().cpu(), \
                  labels.detach().cpu(), \
                  loss.detach().cpu(), \
                  pred.detach().cpu() if pred is not None else pred
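Example #3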
    def train(self, train_ds, test_ds):
        """
        train_ds: [enc_train_ds, dec_train_ds]
        val_ds: [enc_val_ds, dec_val_ds]
        test_ds: [enc_test_ds, dec_test_ds]
        """
        root_model_dir = generate_model_dir(self.flags.root_model_dir)
        create_path(root_model_dir)
        checkpoint_dir = os.path.join(root_model_dir, 'training_checkpoints')
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        checkpoint = tf.train.Checkpoint(
            model=self.model)

        # save absl flags
        flags_txt = (
            self.flags.flags_into_string() +
            '--enc_vocab_size={}\n'.format(self.model.enc_vocab_size) +
            '--dec_vocab_size={}\n'.format(self.model.dec_vocab_size))
        with open(os.path.join(root_model_dir, 'flags.txt'), 'w') as flags_file:
            flags_file.write(flags_txt)

        batch_count = 0
        epoch_count = -1
        batch_manager = Dataset(train_ds[0], train_ds[1], self.flags.num_epochs)
        while batch_manager.epochs_done <= self.flags.num_epochs:
            batch_count += 1
            enc_input, dec_input, dec_output_true = batch_manager.batch(self.flags.batch_size)
            batch_loss = self.train_step(enc_input, dec_input, dec_output_true)

            if epoch_count != batch_manager.epochs_done:
                test_enc_input = keras.preprocessing.sequence.pad_sequences(
                    test_ds[0], padding='post', value=0)
                test_dec_input = keras.preprocessing.sequence.pad_sequences(
                    test_ds[1], padding='post', value=0)
                test_dec_true = np.roll(test_dec_input, -1, axis=-1)  # shift each sequence left by one token
                test_dec_true[:, -1] = 0
                test_loss = self.eval(test_enc_input, test_dec_input, test_dec_true)

                checkpoint.save(file_prefix=checkpoint_prefix)
                epoch_count += 1

            if batch_count % 100 == 0:
                logging.info('Epoch {}, Batch {}, Loss {}, Last test loss: {}'.format(
                    batch_manager.epochs_done, batch_count, batch_loss, test_loss))
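
The teacher-forcing targets above are the decoder inputs shifted left by one position, with the freed final slot zero-padded. A small NumPy-only illustration with toy values (assumes 0 is the padding id):

import numpy as np

dec_input = np.array([[5, 7, 9, 0],   # two padded decoder sequences
                      [3, 4, 0, 0]])
dec_true = np.roll(dec_input, -1, axis=-1)  # shift each row left by one
dec_true[:, -1] = 0                         # the last slot becomes padding
# dec_true is now [[7, 9, 0, 0],
#                  [4, 0, 0, 0]]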
Example #4
    def __init__(self):
        Object.__init__(self)
        ABC.__init__(self)

        self.dataset: Optional[Dataset] = Dataset(self.config.dataset_path)
        self.model: Optional[Union[Model, DataParallel]] = self.build_model()
        self.optimizer: Optional[Optimizer] = Engine.build_optimizer(
            **self.build_optimizer_params())
        self.lr_scheduler: Optional[
            lr_scheduler._LRScheduler] = self.build_lr_scheduler()
Example #5
    def graph_without_dev(self) -> Optional[np.ndarray]:
        """For NELL-995, the returned graph triplets have dev set removed for RGCN training.
        """
        dataset_path = self.config.dataset_path

        if "nell-995" in dataset_path:
            triplets = Dataset.load_triplets_from_file(
                f"{dataset_path}/graph_without_dev.txt")
            return self.dataset.triplets_to_idx(triplets).T
        else:
            return self.graph_data
Example #6
    def encodeBatch(self, x, batch_size=256):
        use_cuda = torch.cuda.is_available()
        dataset = Dataset(x, x)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=2)

        encoded = []
        for batch_idx, (inputs, _) in enumerate(dataloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            hidden = self.encode(inputs, train=False)
            encoded.append(hidden.data.cpu())

        encoded = torch.cat(encoded, dim=0)
        return encoded
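
A brief usage sketch (assumes `dae` is a trained instance of the autoencoder class that defines encodeBatch, and `features_x` is a FloatTensor of flattened samples):

# Hypothetical: extract hidden representations for a whole tensor, 256 samples at a time.
embeddings = dae.encodeBatch(features_x, batch_size=256)
print(embeddings.shape)  # (num_samples, hidden_dim)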
Example #7
    def loop_through_data_for_testing(self, model: Union[Model,
                                                         nn.DataParallel],
                                      mapping: Mapping,
                                      **loader_params) -> Result:
        """A function that loops through the provided data and returns a lib.utils.result.Result object, which can then
        be used to calculate scores.

        Args:
            model: The model to make predictions.
            mapping: A lib.util.mapping.Mapping object.
            **loader_params: Any key-value arguments applicable to the torch.utils.data.DataLoader class, see a list
                here https://pytorch.org/docs/stable/data.html

        Returns: A lib.utils.result.Result object.
        """
        loader = Dataset.build_loader(
            mapping,
            label_encoder=self.label_encoder,
            image_transforms=self.provide_image_transforms(),
            **loader_params)

        result = Result(label_encoder=self.label_encoder)

        for iter_idx, (images, labels) in enumerate(loader):
            model.eval()
            model.to(device=self.device)

            images = images.float().to(device=self.device)
            labels = labels.squeeze().long().to(device=self.device)

            if type(model) == torch.nn.DataParallel:
                loss, scores = model.module.classification_loss(images, labels)
            else:
                loss, scores = model.classification_loss(images, labels)

            result.append_scores(scores, labels)
            result.append_loss(loss)

        return result
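
A hedged usage sketch from inside the same class (valid_mapping and the batch size are placeholders; any DataLoader keyword argument can be forwarded through **loader_params):

# Hypothetical evaluation call; the returned Result accumulates scores, labels, and losses.
result = self.loop_through_data_for_testing(model=self.model,
                                            mapping=valid_mapping,
                                            batch_size=64,
                                            shuffle=False)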
Example #8
    def fit(self,
            data_x,
            valid_x,
            lr=0.001,
            batch_size=128,
            num_epochs=10,
            corrupt=0.5,
            loss_type="mse"):
        """
        data_x: FloatTensor
        valid_x: FloatTensor
        """
        # pdb.set_trace()
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            self.cuda()
        print("=====Denoising Autoencoding layer=======")
        print("Loss: ", loss_type)
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      self.parameters()),
                               lr=lr)
        # optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.parameters()), lr=lr, momentum=0.9)
        # criterion = nn.MSELoss(size_average=False)
        # criterion = nn.MSELoss()
        if loss_type == "mse":
            criterion = MSELoss()
        elif loss_type == "cross-entropy":
            criterion = BCELoss()

        if isinstance(data_x, data.Dataset):
            trainset = data_x
        else:
            trainset = Dataset(data_x, data_x)
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=2)
        if isinstance(valid_x, data.Dataset):
            validset = valid_x
        else:
            validset = Dataset(valid_x, valid_x)
        validloader = torch.utils.data.DataLoader(validset,
                                                  batch_size=1000,
                                                  shuffle=False,
                                                  num_workers=2)

        # validate
        total_loss = 0.0
        total_num = 0
        for batch_idx, (inputs, _) in enumerate(validloader):
            inputs = inputs.view(inputs.size(0), -1).float()
            if use_cuda:
                inputs = inputs.cuda()
            inputs = Variable(inputs)
            hidden = self.encode(inputs)
            outputs = self.decode(hidden)
            if loss_type == "cross-entropy":
                outputs = F.sigmoid(outputs)

            valid_recon_loss = criterion(outputs, inputs)
            total_loss += valid_recon_loss.data * inputs.size()[0]
            total_num += inputs.size()[0]

        valid_loss = total_loss / total_num
        print("#Epoch 0: Valid Reconstruct Loss: %.3f" % (valid_loss))

        for epoch in range(num_epochs):
            # train 1 epoch
            for batch_idx, (inputs, _) in enumerate(trainloader):
                inputs = inputs.view(inputs.size(0), -1).float()
                inputs_corr = masking_noise(inputs, corrupt)
                if use_cuda:
                    inputs = inputs.cuda()
                    inputs_corr = inputs_corr.cuda()
                optimizer.zero_grad()
                inputs = Variable(inputs)
                inputs_corr = Variable(inputs_corr)

                hidden = self.encode(inputs_corr)
                outputs = self.decode(hidden)
                if loss_type == "cross-entropy":
                    outputs = F.sigmoid(outputs)

                recon_loss = criterion(outputs, inputs)
                recon_loss.backward()
                optimizer.step()
                # print("    #Iter %3d: Reconstruct Loss: %.3f" % (
                #     batch_idx, recon_loss.data[0]))

            # validate
            total_loss = 0.0
            total_num = 0
            for batch_idx, (inputs, _) in enumerate(validloader):
                inputs = inputs.view(inputs.size(0), -1).float()
                if use_cuda:
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                hidden = self.encode(inputs, train=False)
                outputs = self.decode(hidden)
                if loss_type == "cross-entropy":
                    outputs = F.sigmoid(outputs)

                valid_recon_loss = criterion(outputs, inputs)
                total_loss += valid_recon_loss.data * inputs.size()[0]
                total_num += inputs.size()[0]

            valid_loss = total_loss / total_num
            print(
                "#Epoch %3d: Reconstruct Loss: %.3f, Valid Reconstruct Loss: %.3f"
                % (epoch, recon_loss.data[0], valid_loss))
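
A minimal sketch of training a single denoising layer with the fit method above (the layer sizes and the train_x/valid_x FloatTensors are assumptions):

# Hypothetical single-layer run; "cross-entropy" applies a sigmoid to the reconstruction.
dae = DenoisingAutoencoder(784, 500, activation="relu", dropout=0.2)
dae.fit(train_x, valid_x, lr=0.001, batch_size=128, num_epochs=10,
        corrupt=0.2, loss_type="cross-entropy")
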
@ray.remote
def enumerate_path_wrapper(src_dst_pair, output_file_name, num_hops,
                           pickled_graph, entity_dict, relation_dict):
    candidate_paths = enumerate_paths(src_dst_pair, num_hops, pickled_graph,
                                      entity_dict, relation_dict)

    with open(f"{dataset_path}/{output_file_name}", "w") as file:
        json.dump(candidate_paths, file)


if __name__ == "__main__":
    print(
        f"Started generating paths with input: {input_set}, graph: {graph_set}, and output: {output_path}."
    )
    start_time = time()
    mapping = Dataset(dataset_path=dataset_path)

    input_triplets = torch.from_numpy(mapping.get(input_set).T)
    graph_triplets = mapping.get(graph_set).T

    graph, relations, _ = \
        build_test_graph(mapping.num_entities, mapping.num_relations, graph_triplets, inverse=False)
    graph.ndata.update({
        "id":
        torch.arange(0, mapping.num_entities, dtype=torch.long).view(-1, 1)
    })
    graph.edata.update({"type": torch.from_numpy(relations)})

    padding = ray.put([mapping.num_relations, mapping.num_entities])
    src_dst_pairs = list(
        set(map(lambda x: (x[0].item(), x[2].item()), input_triplets)))