Example #1
    def __getitem__(self, idx):
        # Load the image named in the DataFrame row and scale it to [0, 1].
        img = cv2.imread(
            os.path.join('..', 'input', 'train_images',
                         self.df.iloc[idx]['ImageId']))
        img = img.astype(np.float32) / 255.

        # The 'train' and 'valid' branches are identical: both return an
        # (image, mask) pair in channels-first layout.
        if self.mode in ('train', 'valid'):
            mask = build_mask(self.df.iloc[idx]).astype(np.float32)
            if self.transforms is not None:
                # Note: only the image is transformed; the mask is left as-is.
                transformed = self.transforms(image=img)
                img = transformed['image']
            img = img.transpose(2, 0, 1)   # HWC -> CHW
            mask = mask.transpose(2, 0, 1)
            return img, mask

        elif self.mode == 'test':
            return img
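build_mask itself is not shown in this example; it turns a DataFrame row into a per-class segmentation mask. A minimal sketch, assuming run-length-encoded labels stored in hypothetical EncodedPixels_1 .. EncodedPixels_4 columns (the column names, image size, and class count are all assumptions):

import numpy as np

def build_mask(row, height=256, width=1600, n_classes=4):
    # Hypothetical sketch: decode one RLE string per class into a
    # (height, width, n_classes) binary mask. All defaults are assumptions.
    mask = np.zeros((height, width, n_classes), dtype=np.uint8)
    for c in range(n_classes):
        rle = row.get('EncodedPixels_{}'.format(c + 1))
        if not isinstance(rle, str):
            continue  # no annotation for this class
        runs = np.asarray(rle.split(), dtype=int)
        starts, lengths = runs[0::2] - 1, runs[1::2]
        flat = mask[:, :, c].flatten(order='F')  # RLE counts pixels column-wise
        for start, length in zip(starts, lengths):
            flat[start:start + length] = 1
        mask[:, :, c] = flat.reshape((height, width), order='F')
    return mask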
Example #2
    def __init__(self,
                 base,
                 in_chan=1,
                 n_classes=2,
                 imsize=150,
                 kernel_size=5,
                 N=None,
                 quiet=True):
        super(VanillaLeNet, self).__init__()
        kernel_size = int(kernel_size)
        imsize = int(imsize)
        out_chan = int(n_classes)

        # Spatial size after two 2x2 max-pooling stages.
        z = imsize // 2 // 2

        self.mask = utils.build_mask(imsize, margin=1)

        self.conv1 = nn.Conv2d(in_chan,
                               6,
                               kernel_size,
                               padding=kernel_size // 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size, padding=kernel_size // 2)
        self.fc1 = nn.Linear(16 * z * z, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, out_chan)
        self.drop = nn.Dropout(p=0.5)

        # dummy parameter for tracking device
        self.dummy = nn.Parameter(torch.empty(0))
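utils.build_mask(imsize, margin=1) is also not shown. Given that the result is stored as self.mask alongside square imsize x imsize inputs, a plausible reading is a circular aperture used to blank the image corners; a minimal sketch under that assumption:

import numpy as np
import torch

def build_mask(imsize, margin=0):
    # Hypothetical sketch: a boolean disc of radius imsize/2 - margin,
    # suitable for zeroing the corners of square cutout images.
    yy, xx = np.mgrid[:imsize, :imsize]
    centre = (imsize - 1) / 2.0
    r = np.sqrt((xx - centre) ** 2 + (yy - centre) ** 2)
    return torch.from_numpy(r <= imsize / 2 - margin)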
Example #3
def train_model(config, epochs, batch_size=64, sentence_max_len=64, lr=1e-4):
    text_input_folder = os.path.join(
        config.data_folder, 'input/text_files/{}/'.format(config.inventory))
    input_folder = os.path.join(config.data_folder,
                                'input/matrices/{}/'.format(config.inventory))

    # The fine-grained and coarse-grained settings differ only in the
    # vocabulary file they read.
    vocab_file = 'sensekeys.pkl' if config.finegrained else 'domains.pkl'
    domains_vocab = pkl.load(
        open(os.path.join(text_input_folder, vocab_file), 'rb'))
    labels = sorted([x for x in domains_vocab if x != 'untagged'])
    labels_dict = {label: k + 1 for k, label in enumerate(labels)}
    labels_dict[None] = 0
    reverse_labels_dict = {v: k for k, v in labels_dict.items()}

    gold_folder = os.path.join(config.data_folder,
                               'gold/{}/'.format(config.inventory))

    mapping = pkl.load(open(config.mapping_path, 'rb'))
    train_x = pkl.load(
        open(
            os.path.join(input_folder,
                         "{}_words.pkl".format(config.training_name)),
            "rb")).tolist()

    # Likewise, only the label-file suffix depends on config.finegrained.
    label_suffix = 'sensekeys' if config.finegrained else 'domains'
    train_y = pkl.load(
        open(
            os.path.join(
                input_folder,
                '{}_{}.pkl'.format(config.training_name, label_suffix)),
            'rb')).tolist()
    dev_y = pkl.load(
        open(
            os.path.join(input_folder,
                         '{}_{}.pkl'.format(config.dev_name, label_suffix)),
            'rb')).tolist()

    txt_file = os.path.join(text_input_folder,
                            '{}_input.txt'.format(config.dev_name))
    dev_x = pkl.load(
        open(
            os.path.join(input_folder, "{}_words.pkl".format(config.dev_name)),
            "rb")).tolist()
    dev_y_idx = process_labels(dev_y,
                               labels_dict).detach().cpu().numpy().tolist()
    tokens = process_tokens(
        pkl.load(
            open(
                os.path.join(input_folder,
                             '{}_tokens.pkl'.format(config.dev_name)),
                'rb')).tolist())

    candidate_domains = utils.build_possible_senses(
        labels_dict, os.path.join(text_input_folder, 'semcor_input.txt'))

    dev_mask = utils.build_mask(words=dev_x,
                                true_y=dev_y_idx,
                                labels_dict=labels_dict,
                                tokens=tokens,
                                file_txt=txt_file,
                                candidate=candidate_domains)

    gold_dictionary = {
        line.strip().split()[0]: line.strip().split()[1:]
        for line in open(
            os.path.join(gold_folder, '{}.gold.txt'.format(config.dev_name)))
    }

    if config.model_name == 'BertDense':
        model = BertDense(len(labels_dict)).train()

    elif config.model_name == 'BertLSTM':
        model = BertLSTM(len(labels_dict)).train()

    criterion = NLLLoss(ignore_index=0)
    optimizer = Adam(model.parameters(), lr=lr)
    writer = SummaryWriter(os.path.join(config.experiment_folder, 'logs'))
    output_file = open(
        os.path.join(config.experiment_folder,
                     '{}.output.tsv'.format(config.dev_name)), 'w')
    if config.start_from_checkpoint:
        load_checkpoints_path = os.path.join(
            config.experiment_folder, 'weights',
            'checkpoint_{}.tar'.format(config.starting_epoch))
        model.load_state_dict(
            torch.load(load_checkpoints_path)['model_state_dict'])
        optimizer.load_state_dict(
            torch.load(load_checkpoints_path)['optimizer_state_dict'])

    for j in tqdm.tqdm(range(epochs - config.starting_epoch)):
        epoch = j + config.starting_epoch
        if config.starting_epoch > 0:
            epoch += 1
        path_checkpoints = os.path.join(config.experiment_folder, 'weights',
                                        'checkpoint_{}.tar'.format(epoch))
        total, correct = 0, 0
        for i in tqdm.tqdm(range(0, len(train_x), batch_size)):
            inputs = train_x[i:i + batch_size]
            temp_lab = train_y[i:i + batch_size]
            if any(len(sublist) > sentence_max_len for sublist in inputs):
                inputs = [x[:sentence_max_len] for x in inputs]
                temp_lab = [x[:sentence_max_len] for x in temp_lab]
            labels = process_labels(temp_lab, labels_dict)
            optimizer.zero_grad()
            outputs = model.train()(inputs)
            loss = criterion(outputs.view(-1, outputs.size(2)),
                             labels.view(-1))
            loss.backward()
            optimizer.step()
        writer.add_scalar('training_loss', loss.item(), epoch)
        eval_outs = torch.exp(model.eval()(dev_x))
        eval_outs *= dev_mask.cuda()
        predicted = torch.argmax(eval_outs, 2)
        for a, token_sent in enumerate(tokens):
            for b, instance_id in enumerate(token_sent):
                if instance_id is not None and instance_id != 'untagged':
                    gold_label = gold_dictionary[instance_id]
                    predicted_label = reverse_labels_dict[
                        predicted[a, b].item()]
                    if predicted_label is None:
                        predicted_label = utils.getMFS(instance_id, mapping,
                                                       txt_file,
                                                       config.finegrained)
                    if predicted_label in gold_label:
                        correct += 1
                        output_file.write('c\t')
                    else:
                        output_file.write('w\t')
                    total += 1
                    output_file.write(instance_id + '\t' + predicted_label +
                                      '\t' + str(gold_label) + '\n')

        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                'eval_acc': correct / total
            }, path_checkpoints)
        writer.add_scalar('eval_acc', correct / total, epoch)
        del loss
        del eval_outs
    writer.close()
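The evaluation step above multiplies the exponentiated log-probabilities by dev_mask before the argmax, so the prediction can only land on a candidate sense. A toy illustration of that masked-argmax step:

import torch

# One token over three labels; label 1 is not a candidate, so its
# probability is zeroed out and the argmax must pick label 0 or 2.
probs = torch.tensor([[[0.1, 0.6, 0.3]]])        # (batch=1, seq=1, labels=3)
candidate_mask = torch.tensor([[[1., 0., 1.]]])
predicted = torch.argmax(probs * candidate_mask, 2)
print(predicted)  # tensor([[2]])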
Example #4
if args.cuda:
    autoencoder.cuda()
    criterion.cuda()
    image_tensor = image_tensor.cuda()
    image_noise = image_noise.cuda()
    label = label.cuda()
    noise = noise.cuda()
    fixed_noise = fixed_noise.cuda()

autoencoder.apply(weights_init)

for epoch in range(args.epochs):
    for i, data in enumerate(train_loader):
        # Generate Mask
        mask = build_mask((64, 64), 26, 26, 'center')
        images, _ = data

        if args.cuda:
            images = images.cuda()

        image_tensor.resize_as_(images).copy_(images)
        image_var = Variable(image_tensor)

        # Forward propagation
        output = autoencoder(image_var.masked_fill(mask, 0))
        loss = criterion(output, image_var)

        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
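build_mask((64, 64), 26, 26, 'center') marks the region that masked_fill blanks out before the autoencoder sees the image, i.e. the patch to be inpainted. A minimal sketch of such a helper (the original's exact semantics are an assumption):

import torch

def build_mask(shape, h, w, mode='center'):
    # Hypothetical sketch: a boolean mask that is True over an h x w
    # block in the centre of the image (the region to reconstruct).
    mask = torch.zeros(shape, dtype=torch.bool)
    if mode == 'center':
        top, left = (shape[0] - h) // 2, (shape[1] - w) // 2
        mask[top:top + h, left:left + w] = True
    return mask

image_var.masked_fill(mask, 0) then broadcasts the (64, 64) mask over the batch and channel dimensions, blanking the central 26 x 26 patch.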
Example #5
    def test_build_mask_no_padding(self):
        seq_lengths = [3, 3]
        correct_mask = [[1, 1, 1], [1, 1, 1]]
        # assertEqual: the mask entries are exact, so no tolerance is needed.
        self.assertEqual(build_mask(seq_lengths).tolist(), correct_mask)
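This test pins down the contract: build_mask maps a list of sequence lengths to a (batch, max_len) matrix with ones over real positions and zeros over padding. A minimal implementation consistent with it:

import torch

def build_mask(seq_lengths):
    # 1.0 at positions before each sequence's length, 0.0 over padding.
    lengths = torch.tensor(seq_lengths).unsqueeze(1)          # (batch, 1)
    positions = torch.arange(max(seq_lengths)).unsqueeze(0)   # (1, max_len)
    return (positions < lengths).float()

print(build_mask([3, 3]).tolist())  # [[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]
print(build_mask([3, 1]).tolist())  # [[1.0, 1.0, 1.0], [1.0, 0.0, 0.0]]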
Example #6
def train_running_save(data_path,
                       model,
                       optimizer,
                       criterion,
                       device,
                       logger,
                       args,
                       step=10):

    if not os.path.exists("results"):
        os.mkdir("results")
    if not os.path.exists(args.checkpoint_dir):
        os.mkdir(args.checkpoint_dir)

    data_loader = DataLoader(data_path, args.verbose)

    X, y, seq = data_loader.run_pipeline(args.split_rate)

    train_iter = DataIterator(X[0], y[0], seq[0], batch_size=args.batch_size)
    test_iter = DataIterator(X[1], y[1], seq[1], batch_size=args.batch_size)

    train_err, test_err = [], []
    train_acc, test_acc = [], []

    logger.info(model)

    for epoch in range(args.epoch):

        logger.info("Epoch: {} / {}".format(epoch + 1, args.epoch))

        ### TRAIN LOOP ###
        err = []
        acc = []
        model.train()
        for proteins, sequence_lengths, targets in (tqdm(
                train_iter,
                ascii=False,
                desc="Training",
                total=int(len(X[0]) / args.batch_size),
                unit="batch") if args.verbose else train_iter):

            inputs = proteins.to(device)
            seq_lens = sequence_lengths.to(device)
            targets = targets.to(device)

            predictions = model(inputs, seq_lens)

            mask = build_mask(sequence_lengths).to(device)

            optimizer.zero_grad()
            batch_loss = criterion(predictions, targets, mask)
            batch_loss.backward()
            optimizer.step()

            cos_sim = cosine_similarity(predictions, targets, mask)

            err.append(batch_loss.cpu().item())
            acc.append(cos_sim.cpu().item())

        epoch_training_error = sum(err) / len(err)
        epoch_training_accuracy = sum(acc) / len(acc)
        train_err.append(epoch_training_error)
        train_acc.append(epoch_training_accuracy)

        ### TEST LOOP ###
        err = []
        acc = []
        model.eval()
        for proteins, sequence_lengths, targets in (tqdm(
                test_iter,
                ascii=False,
                desc="Testing",
                total=int(len(X[1]) / args.batch_size),
                unit="batch") if args.verbose else test_iter):

            inputs = proteins.to(device)
            seq_lens = sequence_lengths.to(device)
            targets = targets.to(device)

            predictions = model(inputs, seq_lens)

            mask = build_mask(sequence_lengths).to(device)

            batch_loss = criterion(predictions, targets, mask)

            cos_sim = cosine_similarity(predictions, targets, mask)

            err.append(batch_loss.cpu().item())
            acc.append(cos_sim.cpu().item())

        epoch_test_error = sum(err) / len(err)
        epoch_test_accuracy = sum(acc) / len(acc)
        test_err.append(epoch_test_error)
        test_acc.append(epoch_test_accuracy)

        logger.info(
            "Training error: {0:.4f},\tTest error: {1:.4f}\t\tTraining accuracy: {2:.4f}\tTest accuracy: {3:.4f}"
            .format(epoch_training_error, epoch_test_error,
                    epoch_training_accuracy, epoch_test_accuracy))

        if epoch % step == 0:

            logger.info("Saving checkpoint")

            performance_path = os.path.join("results", "{}-epoch{}.pk".format(
                args.results_name.split(".")[0], epoch))  # temporary name
            checkpoint_name = "{}-epoch{}.pt".format(
                args.checkpoint_name.split(".")[0], epoch)  # temporary name
            results = (train_err, test_err), (train_acc, test_acc)
            with open(performance_path, "wb") as file:
                pickle.dump(results, file)
            torch.save(
                {
                    "epoch": args.epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict()
                }, os.path.join(args.checkpoint_dir, checkpoint_name))

    return (train_err, test_err), (train_acc, test_acc)
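cosine_similarity(predictions, targets, mask) reduces per-residue similarity to one scalar per batch, which the loop reads with .item(). A plausible sketch, assuming a mask-weighted mean over non-padding positions (the original's exact reduction is an assumption):

import torch
import torch.nn.functional as F

def cosine_similarity(predictions, targets, mask):
    # Hypothetical sketch: cosine similarity over the feature axis,
    # averaged across unmasked (non-padding) positions only.
    sim = F.cosine_similarity(predictions, targets, dim=-1)  # (batch, seq)
    return (sim * mask).sum() / mask.sum()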
Example #7
def train(data_path, model, optimizer, criterion, device, logger, args):
    data_loader = DataLoader(data_path, args.verbose)

    X, y, seq = data_loader.run_pipeline(args.split_rate)

    train_iter = DataIterator(X[0], y[0], seq[0], batch_size=args.batch_size)
    test_iter = DataIterator(X[1], y[1], seq[1], batch_size=args.batch_size)

    train_err, test_err = [], []
    train_acc, test_acc = [], []

    logger.info(model)

    for epoch in range(args.epoch):

        logger.info("Epoch: {} / {}".format(epoch + 1, args.epoch))

        ### TRAIN LOOP ###
        err = []
        acc = []
        model.train()
        for proteins, sequence_lengths, targets in (tqdm(
                train_iter,
                ascii=False,
                desc="Training",
                total=int(len(X[0]) / args.batch_size),
                unit="batch") if args.verbose else train_iter):

            inputs = proteins.to(device)
            seq_lens = sequence_lengths.to(device)
            targets = targets.to(device)

            predictions = model(inputs, seq_lens)

            mask = build_mask(sequence_lengths).to(device)

            optimizer.zero_grad()
            batch_loss = criterion(predictions, targets, mask)
            batch_loss.backward()
            optimizer.step()

            cos_sim = cosine_similarity(predictions, targets, mask)

            err.append(batch_loss.cpu().item())
            acc.append(cos_sim.cpu().item())

        epoch_training_error = sum(err) / len(err)
        epoch_training_accuracy = sum(acc) / len(acc)
        train_err.append(epoch_training_error)
        train_acc.append(epoch_training_accuracy)

        ### TEST LOOP ###
        err = []
        acc = []
        model.eval()
        for proteins, sequence_lengths, targets in (tqdm(
                test_iter,
                ascii=False,
                desc="Testing",
                total=int(len(X[1]) / args.batch_size),
                unit="batch") if args.verbose else test_iter):

            inputs = proteins.to(device)
            seq_lens = sequence_lengths.to(device)
            targets = targets.to(device)

            predictions = model(inputs, seq_lens)

            mask = build_mask(sequence_lengths).to(device)

            batch_loss = criterion(predictions, targets, mask)

            cos_sim = cosine_similarity(predictions, targets, mask)

            err.append(batch_loss.cpu().item())
            acc.append(cos_sim.cpu().item())

        epoch_test_error = sum(err) / len(err)
        epoch_test_accuracy = sum(acc) / len(acc)
        test_err.append(epoch_test_error)
        test_acc.append(epoch_test_accuracy)

        logger.info(
            "Training error: {0:.4f},\tTest error: {1:.4f}\t\tTraining accuracy: {2:.4f}\tTest accuracy: {3:.4f}"
            .format(epoch_training_error, epoch_test_error,
                    epoch_training_accuracy, epoch_test_accuracy))

    return (train_err, test_err), (train_acc, test_acc)
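The criterion(predictions, targets, mask) signature implies a loss that ignores padded positions. A minimal sketch of one such criterion, assuming a masked mean squared error (the loss actually used is an assumption):

import torch

def masked_mse(predictions, targets, mask):
    # Squared error per position, averaged over unmasked positions only.
    se = ((predictions - targets) ** 2).mean(dim=-1)  # (batch, seq)
    return (se * mask).sum() / mask.sum()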