コード例 #1
0
ファイル: TestInterface.py プロジェクト: tianfuzeng/helen
def test_interface(test_file, batch_size, gpu_mode, num_workers, model_path,
                   output_directory, print_details):
    """
    Evaluate a saved model on a test set and save its confusion matrices.

    The model is loaded from model_path, evaluated through test() and the resulting
    statistics are handed to save_rle_confusion_matrix() and save_base_confusion_matrix().
    :param test_file: Path to directory containing test images
    :param batch_size: Batch size used during evaluation
    :param gpu_mode: If true the model is moved to CUDA before evaluation
    :param num_workers: Number of workers for data loading
    :param model_path: Path to a saved model
    :param output_directory: Path to output_directory
    :param print_details: If true then logs will be printed (this forces gpu_mode to False)
    :return: None. The process exits with status 1 if model_path is not a file.
    """
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    output_directory = FileManager.handle_output_directory(output_directory)

    if not os.path.isfile(model_path):
        # terminate the colour sequence like every other message here, and exit through
        # sys.exit rather than the interactive-only exit() helper
        sys.stderr.write(TextColor.RED + "ERROR: INVALID PATH TO MODEL\n" + TextColor.END)
        sys.exit(1)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADING\n" + TextColor.END)

    # prev_ite (iterations stored with the model) is returned by the loader but not needed here
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model(model_path,
                                       input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                       image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                       seq_len=ImageSizeOptions.SEQ_LENGTH,
                                       num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                       num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADED\n" + TextColor.END)

    # detailed printing and GPU mode are mutually exclusive; printing wins
    if print_details and gpu_mode:
        sys.stderr.write(TextColor.GREEN +
                         "INFO: GPU MODE NOT AVAILABLE WHEN PRINTING DETAILS. "
                         "SETTING GPU MODE TO FALSE.\n" + TextColor.END)
        gpu_mode = False

    if gpu_mode:
        transducer_model = transducer_model.cuda()

    stats_dictionary = test(
        test_file,
        batch_size,
        gpu_mode,
        transducer_model,
        num_workers,
        gru_layers,
        hidden_size,
        num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS,
        output_directory=output_directory,
        print_details=print_details)

    save_rle_confusion_matrix(stats_dictionary, output_directory)
    save_base_confusion_matrix(stats_dictionary, output_directory)

    sys.stderr.write(TextColor.PURPLE + 'DONE\n' + TextColor.END)
コード例 #2
0
ファイル: train_distributed.py プロジェクト: tianfuzeng/helen
def save_best_model(transducer_model, model_optimizer, hidden_size, layers, epoch,
                    file_name):
    """
    Write a checkpoint for the given model to file_name, replacing an existing file.
    :param transducer_model: Model whose state_dict() is stored
    :param model_optimizer: Optimizer whose state_dict() is stored
    :param hidden_size: Value stored under the 'hidden_size' key
    :param layers: Value stored under the 'gru_layers' key
    :param epoch: Epoch/iteration number, stored under the 'epochs' key
    :param file_name: Output file name
    :return: None
    """
    # remove a previous checkpoint with the same name before writing the new one
    previous_checkpoint_exists = os.path.isfile(file_name)
    if previous_checkpoint_exists:
        os.remove(file_name)

    checkpoint = {
        'model_state_dict': transducer_model.state_dict(),
        'model_optimizer': model_optimizer.state_dict(),
        'hidden_size': hidden_size,
        'gru_layers': layers,
        'epochs': epoch,
    }
    ModelHandler.save_checkpoint(checkpoint, file_name)

    success_message = "\nMODEL SAVED SUCCESSFULLY.\n"
    sys.stderr.write(TextColor.RED + success_message + TextColor.END)
コード例 #3
0
ファイル: train_distributed.py プロジェクト: tianfuzeng/helen
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, train_mode,
          world_size, rank, device_id):
    """
    Run the training loop for one process (rank) of a distributed training job.

    Every epoch iterates once over this rank's share of the training set and optimizes over
    sliding windows of each sequence. After each epoch rank 0 evaluates the model with test()
    and, when train_mode is True, saves a checkpoint and appends to the statistics files.
    :param train_file: Path to train image set, passed to SequenceDataset
    :param test_file: Path to test image set, passed to test()
    :param batch_size: Batch size for minibatch operation
    :param epoch_limit: Number of epochs to run; when retraining in train mode the epochs stored
    with the loaded model are added to it
    :param gpu_mode: If True, the model, loss functions and tensors are moved to device_id
    :param num_workers: Number of workers, passed to test()
    :param retrain_model: If True, load the model and optimizer from retrain_model_path
    :param retrain_model_path: Path to a previously trained model
    :param gru_layers: Number of GRU layers (replaced by the loaded model's value when retraining)
    :param hidden_size: Hidden size (replaced by the loaded model's value when retraining)
    :param lr: Learning rate for the optimizer
    :param decay: Weight decay for the optimizer
    :param model_dir: Prefix of the path where checkpoints are written
    :param stats_dir: Prefix of the path where the statistics files are written
    :param train_mode: If True, rank 0 writes checkpoints and statistics; if False the early
    stopping check is applied instead
    :param world_size: Number of replicas given to the distributed sampler
    :param rank: Rank of this process; rank 0 does the logging, evaluation and saving
    :param device_id: CUDA device used by this process
    :return: Tuple of (transducer_model, model_optimizer, stats)
    """
    # function-scope import so this block does not depend on the file's import section
    import contextlib

    # the ExitStack closes every statistics file on any exit from this function
    with contextlib.ExitStack() as open_log_files:
        if train_mode is True and rank == 0:
            train_loss_logger = open_log_files.enter_context(open(stats_dir + "train_loss.csv", 'w'))
            test_loss_logger = open_log_files.enter_context(open(stats_dir + "test_loss.csv", 'w'))
            confusion_matrix_logger = open_log_files.enter_context(
                open(stats_dir + "confusion_matrix.txt", 'w'))
        else:
            train_loss_logger = None
            test_loss_logger = None
            confusion_matrix_logger = None

        torch.cuda.set_device(device_id)

        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

        train_data_set = SequenceDataset(train_file)

        # the sampler splits the dataset between the world_size replicas
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data_set,
            num_replicas=world_size,
            rank=rank
        )

        # NOTE(review): the loader is fixed to num_workers=0 and does not use the num_workers
        # argument (which is only passed to test()) - confirm this is intentional.
        train_loader = torch.utils.data.DataLoader(
            dataset=train_data_set,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=True,
            sampler=train_sampler)

        num_base_classes = ImageSizeOptions.TOTAL_BASE_LABELS
        num_rle_classes = ImageSizeOptions.TOTAL_RLE_LABELS

        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                # reset the colour after the message and exit through sys.exit
                sys.stderr.write(TextColor.RED +
                                 "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                                 TextColor.END)
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" + TextColor.END)
            # the sizes stored with the model replace the ones passed as arguments
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model(retrain_model_path,
                                               input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                               image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                               seq_len=ImageSizeOptions.SEQ_LENGTH,
                                               num_base_classes=num_base_classes,
                                               num_rle_classes=num_rle_classes)

            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" + TextColor.END)
        else:
            transducer_model = ModelHandler.get_new_gru_model(input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                              image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                              gru_layers=gru_layers,
                                                              hidden_size=hidden_size,
                                                              num_base_classes=num_base_classes,
                                                              num_rle_classes=num_rle_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        if rank == 0:
            sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count) + "\n" + TextColor.END)

        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" + TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(model_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" + TextColor.END)

        if gpu_mode:
            transducer_model = transducer_model.to(device_id)
            transducer_model = nn.parallel.DistributedDataParallel(transducer_model, device_ids=[device_id])

        class_weights = torch.Tensor(TrainOptions.CLASS_WEIGHTS)
        # we perform a multi-task classification, so we need two loss functions, each performing a single task
        # criterion base is the loss function for base prediction
        criterion_base = nn.CrossEntropyLoss()
        # criterion rle is the loss function for RLE prediction
        criterion_rle = nn.CrossEntropyLoss(weight=class_weights)

        if gpu_mode is True:
            criterion_base = criterion_base.to(device_id)
            criterion_rle = criterion_rle.to(device_id)

        start_epoch = prev_ite

        # Train the Model
        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
            sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1) + ' End: ' + str(epoch_limit) + "\n")

        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []

        for epoch in range(start_epoch, epoch_limit, 1):
            # without set_epoch the distributed sampler reuses the same ordering every epoch
            train_sampler.set_epoch(epoch)

            total_loss_base = 0
            total_loss_rle = 0
            total_loss = 0
            total_images = 0
            if rank == 0:
                sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) + "\n")

            batch_no = 1
            # only rank 0 shows a progress bar
            if rank == 0:
                progress_bar = tqdm(
                    total=len(train_loader),
                    ncols=100,
                    leave=True,
                    position=rank,
                    desc="Loss: ",
                )
            else:
                progress_bar = None

            # make sure the model is in train mode
            transducer_model.train()
            for images, label_base, label_rle in train_loader:
                # convert the tensors to the proper datatypes.
                images = images.type(torch.FloatTensor)
                label_base = label_base.type(torch.LongTensor)
                label_rle = label_rle.type(torch.LongTensor)

                # size the initial hidden state from the model actually in use (the arguments, or
                # the values stored with a retrained model) instead of the TrainOptions constants
                hidden = torch.zeros(images.size(0), 2 * gru_layers, hidden_size)

                if gpu_mode:
                    hidden = hidden.to(device_id)
                    images = images.to(device_id)
                    label_base = label_base.to(device_id)
                    label_rle = label_rle.to(device_id)

                # slide a window over the sequence and optimize over each chunk
                for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                    model_optimizer.zero_grad()

                    # stop when the window would run past the end of the sequence
                    if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                        break

                    image_chunk = images[:, i:i+TrainOptions.TRAIN_WINDOW]
                    label_base_chunk = label_base[:, i:i+TrainOptions.TRAIN_WINDOW]
                    label_rle_chunk = label_rle[:, i:i+TrainOptions.TRAIN_WINDOW]

                    # get the inference from the model
                    output_base, output_rle, hidden = transducer_model(image_chunk, hidden)

                    # calculate loss for base prediction
                    loss_base = criterion_base(output_base.contiguous().view(-1, num_base_classes),
                                               label_base_chunk.contiguous().view(-1))
                    # calculate loss for RLE prediction
                    loss_rle = criterion_rle(output_rle.contiguous().view(-1, num_rle_classes),
                                             label_rle_chunk.contiguous().view(-1))

                    # sum the losses to have a single optimization over multiple tasks
                    loss = loss_base + loss_rle

                    # backpropagation and weight update
                    loss.backward()
                    model_optimizer.step()

                    # update the loss values
                    total_loss += loss.item()
                    total_loss_base += loss_base.item()
                    total_loss_rle += loss_rle.item()
                    total_images += image_chunk.size(0)

                    # detach the hidden from the graph as the next chunk will be a new optimization
                    hidden = hidden.detach()

                # running average used for the train loss log
                avg_loss = (total_loss / total_images) if total_images else 0

                if train_mode is True and rank == 0:
                    train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")

                # update the progress bar
                if rank == 0:
                    progress_bar.set_description("Base: " + str(round(total_loss_base, 4)) +
                                                 ", RLE: " + str(round(total_loss_rle, 4)) +
                                                 ", TOTAL: " + str(round(total_loss, 4)))
                    progress_bar.refresh()
                    progress_bar.update(1)
                    batch_no += 1

            if rank == 0:
                progress_bar.close()
            # wait for every rank to finish the epoch
            dist.barrier()

            # only rank 0 evaluates; the other ranks wait at the barrier below
            if rank == 0:
                stats_dictionary = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                                        gru_layers, hidden_size, num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
                stats['loss'] = stats_dictionary['loss']
                stats['accuracy'] = stats_dictionary['accuracy']
                stats['loss_epoch'].append((epoch, stats_dictionary['loss']))
                stats['accuracy_epoch'].append((epoch, stats_dictionary['accuracy']))
            dist.barrier()

            # update the loggers
            if train_mode is True and rank == 0:
                ModelHandler.save_model(transducer_model, model_optimizer,
                                        hidden_size, gru_layers,
                                        epoch, model_dir + "HELEN_epoch_" + str(epoch + 1) + '_checkpoint.pkl')
                sys.stderr.write(TextColor.RED + "\nMODEL SAVED SUCCESSFULLY.\n" + TextColor.END)

                test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + "," + str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(str(epoch + 1) + "\n" + str(stats_dictionary['base_confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            elif train_mode is False:
                # this setup is for hyperband
                # NOTE(review): stats['accuracy'] is only filled in on rank 0, and an early return on
                # one rank leaves the others waiting at the next barrier - confirm this path is only
                # used with a single process.
                if epoch + 1 >= 10 and stats['accuracy'] < 98:
                    sys.stderr.write(TextColor.PURPLE + 'EARLY STOPPING AS THE MODEL NOT DOING WELL\n' + TextColor.END)
                    return transducer_model, model_optimizer, stats

        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return transducer_model, model_optimizer, stats
コード例 #4
0
def predict_cpu(file_chunks, output_filepath, model_path, batch_size,
                total_callers, threads, num_workers):
    """
    Export the trained model to ONNX when no export exists yet, then spawn the prediction callers.
    :param file_chunks: Path to chunked files
    :param output_filepath: Path to output directory
    :param model_path: Path to a trained model; the ONNX file is written next to it as model_path + ".onnx"
    :param batch_size: Batch size used for prediction
    :param total_callers: Number of callers to spawn
    :param threads: Number of threads to use per caller
    :param num_workers: Number of workers to be used by the dataloader
    :return: None; the spawned callers receive the output path and write from there
    """
    # load the model from disk; only the model itself is used below
    transducer_model, _hidden_size, _gru_layers, _prev_ite = ModelHandler.load_simple_model(
        model_path,
        input_channels=ImageSizeOptions.IMAGE_CHANNELS,
        image_features=ImageSizeOptions.IMAGE_HEIGHT,
        seq_len=ImageSizeOptions.SEQ_LENGTH,
        num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
    transducer_model.eval()

    sys.stderr.write("INFO: MODEL LOADING TO ONNX\n")
    # dummy inputs handed to the exporter: one image window and one hidden state
    dummy_image = torch.zeros(1, TrainOptions.TRAIN_WINDOW, ImageSizeOptions.IMAGE_HEIGHT)
    dummy_hidden = torch.zeros(1, 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

    onnx_path = model_path + ".onnx"
    if not os.path.isfile(onnx_path):
        sys.stderr.write("INFO: SAVING MODEL TO ONNX\n")

        onnx_inputs = ['input_image', 'input_hidden']
        onnx_outputs = ['output_pred', 'output_rle', 'output_hidden']
        # axis 0 of every input and output is exported as a dynamic 'batch_size' axis
        batch_axes = {tensor_name: {0: 'batch_size'}
                      for tensor_name in onnx_inputs + onnx_outputs}

        # export the model as ONNX mode
        torch.onnx.export(
            transducer_model, (dummy_image, dummy_hidden),
            onnx_path,
            training=False,
            opset_version=10,
            do_constant_folding=True,
            input_names=onnx_inputs,
            output_names=onnx_outputs,
            dynamic_axes=batch_axes)

    # arguments forwarded to every spawned caller
    caller_args = (output_filepath, model_path, batch_size, num_workers, threads)

    # spawn the processes that call the prediction method and wait for them
    mp.spawn(setup,
             args=(total_callers, caller_args, file_chunks),
             nprocs=total_callers,
             join=True)
コード例 #5
0
ファイル: train.py プロジェクト: tianfuzeng/helen
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode,
          num_workers, retrain_model, retrain_model_path, gru_layers,
          hidden_size, lr, decay, model_dir, stats_dir, not_hyperband):
    """
    This method implements the training scheme of HELEN. It takes a set of training images and a set of testing
    images and trains the model on the training images and after each epoch it evaluates the trained model on the
    test image set. When not_hyperband is True it saves the model state after each epoch regardless of its
    performance, together with some statistics of the saved models and the training run.
    :param train_file: Path to train image set.
    :param test_file: Path to test image set.
    :param batch_size: Batch size for minibatch operation
    :param epoch_limit: Number of iterations the training will go for
    :param gpu_mode: If True, training and testing will be done on GPU
    :param num_workers: Number of workers for dataloader
    :param retrain_model: If True then it will load a previously-trained model for retraining
    :param retrain_model_path: Path to a previously trained model
    :param gru_layers: Number of GRU layers in the model
    :param hidden_size: Hidden size of the model
    :param lr: Learning Rate for the optimizer
    :param decay: Weight Decay for the optimizer
    :param model_dir: Directory where models will be saved
    :param stats_dir: Directory where statistics of this run will be saved
    :param not_hyperband: This is used by hyperband. If True then hyperband is not running.
    :return: Tuple of (transducer_model, model_optimizer, stats), the same shape the distributed
    train() returns; callers that ignore the return value are unaffected.
    """
    # function-scope import so this block does not depend on the file's import section
    import contextlib

    # the ExitStack closes every statistics file on any exit from this function
    with contextlib.ExitStack() as open_log_files:
        # if hyperband is not running then create stat logger for train, test and confusion
        if not_hyperband is True:
            train_loss_logger = open_log_files.enter_context(
                open(stats_dir + "train_loss.csv", 'w'))
            test_loss_logger = open_log_files.enter_context(
                open(stats_dir + "test_loss.csv", 'w'))
            confusion_matrix_logger = open_log_files.enter_context(
                open(stats_dir + "confusion_matrix.txt", 'w'))
        else:
            train_loss_logger = None
            test_loss_logger = None
            confusion_matrix_logger = None

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
        # initialize training dataset loader
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)
        num_base_classes = ImageSizeOptions.TOTAL_BASE_LABELS
        num_rle_classes = ImageSizeOptions.TOTAL_RLE_LABELS

        # if retrain model is true then load the model from the model path
        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                # reset the colour after the message and exit through sys.exit
                sys.stderr.write(
                    TextColor.RED +
                    "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                    TextColor.END
                )
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" +
                             TextColor.END)
            # the sizes stored with the model replace the ones passed as arguments
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model(retrain_model_path,
                                               input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                               image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                               seq_len=ImageSizeOptions.SEQ_LENGTH,
                                               num_base_classes=num_base_classes,
                                               num_rle_classes=num_rle_classes)

            if not_hyperband is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" +
                             TextColor.END)
        else:
            # if training from scratch, then create a new model
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_base_classes=num_base_classes,
                num_rle_classes=num_rle_classes)
            prev_ite = 0

        # count the number of trainable parameters for reporting
        param_count = sum(p.numel() for p in transducer_model.parameters()
                          if p.requires_grad)
        sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t" +
                         str(param_count) + "\n" + TextColor.END)

        # create a model optimizer
        model_optimizer = torch.optim.Adam(transducer_model.parameters(),
                                           lr=lr,
                                           weight_decay=decay)

        # if retrain model is true then load the optimizer
        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" +
                             TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(
                model_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" +
                             TextColor.END)

        # this learning rate scheduler reduces learning rate when model reaches plateau.
        # It is created after the optional reload above so that it is always bound to the
        # optimizer object that is actually stepped during training.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            model_optimizer, 'min')

        class_weights = torch.Tensor(TrainOptions.CLASS_WEIGHTS)
        # we perform a multi-task classification, so we need two loss functions, each performing a single task
        # criterion base is the loss function for base prediction
        criterion_base = nn.CrossEntropyLoss()
        # criterion rle is the loss function for RLE prediction
        criterion_rle = nn.CrossEntropyLoss(weight=class_weights)

        # if gpu mode is true then transfer the model and loss functions to cuda
        if gpu_mode is True:
            transducer_model = torch.nn.DataParallel(transducer_model).cuda()
            criterion_base = criterion_base.cuda()
            criterion_rle = criterion_rle.cuda()

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        # create stats dicts
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1) +
                         ' End: ' + str(epoch_limit) + "\n")

        # for each epoch we iterate over the training dataset once
        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss_base = 0
            total_loss_rle = 0
            total_loss = 0
            total_images = 0
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) +
                             "\n")
            batch_no = 1

            # tqdm is the progress bar we use for logging; the with-block closes it
            with tqdm(total=len(train_loader), desc='Loss', leave=True,
                      ncols=100) as progress_bar:
                # make sure the model is in train mode
                transducer_model.train()
                for images, label_base, label_rle in train_loader:
                    # convert the tensors to the proper datatypes.
                    images = images.type(torch.FloatTensor)
                    label_base = label_base.type(torch.LongTensor)
                    label_rle = label_rle.type(torch.LongTensor)

                    # initialize the hidden input for the first chunk, sized from the model
                    # actually in use (the arguments, or the values stored with a retrained
                    # model) instead of the TrainOptions constants
                    hidden = torch.zeros(images.size(0),
                                         2 * gru_layers,
                                         hidden_size)

                    # if gpu_mode is true then transfer all tensors to cuda
                    if gpu_mode:
                        images = images.cuda()
                        label_base = label_base.cuda()
                        label_rle = label_rle.cuda()
                        hidden = hidden.cuda()

                    # perform a sliding window on the entire image sequence length
                    for i in range(0, ImageSizeOptions.SEQ_LENGTH,
                                   TrainOptions.WINDOW_JUMP):
                        # we optimize over each chunk
                        model_optimizer.zero_grad()

                        # if current position + window size goes beyond the sequence length,
                        # that means we've reached the end
                        if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                            break

                        # get the chunks for this window
                        window_end = i + TrainOptions.TRAIN_WINDOW
                        image_chunk = images[:, i:window_end]
                        label_base_chunk = label_base[:, i:window_end]
                        label_rle_chunk = label_rle[:, i:window_end]

                        # get the inference from the model
                        output_base, output_rle, hidden = transducer_model(
                            image_chunk, hidden)

                        # calculate loss for base prediction
                        loss_base = criterion_base(
                            output_base.contiguous().view(-1, num_base_classes),
                            label_base_chunk.contiguous().view(-1))
                        # calculate loss for RLE prediction
                        loss_rle = criterion_rle(
                            output_rle.contiguous().view(-1, num_rle_classes),
                            label_rle_chunk.contiguous().view(-1))

                        # sum the losses to have a single optimization over multiple tasks
                        loss = loss_base + loss_rle

                        # backpropagation and weight update
                        loss.backward()
                        model_optimizer.step()

                        # update the loss values
                        total_loss += loss.item()
                        total_loss_base += loss_base.item()
                        total_loss_rle += loss_rle.item()
                        total_images += image_chunk.size(0)

                        # detach the hidden from the graph as the next chunk will be a new optimization
                        hidden = hidden.detach()

                    # update the progress bar
                    avg_loss = (total_loss / total_images) if total_images else 0
                    progress_bar.set_description("Base: " +
                                                 str(round(total_loss_base, 4)) +
                                                 ", RLE: " +
                                                 str(round(total_loss_rle, 4)) +
                                                 ", TOTAL: " +
                                                 str(round(total_loss, 4)))

                    if not_hyperband is True:
                        train_loss_logger.write(
                            str(epoch + 1) + "," + str(batch_no) + "," +
                            str(avg_loss) + "\n")
                    progress_bar.refresh()
                    progress_bar.update(1)
                    batch_no += 1

            # after each epoch, evaluate the current state of the model
            stats_dictionary = test(
                test_file,
                batch_size,
                gpu_mode,
                transducer_model,
                num_workers,
                gru_layers,
                hidden_size,
                num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
            stats['loss'] = stats_dictionary['loss']
            stats['accuracy'] = stats_dictionary['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictionary['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictionary['accuracy']))

            # the scheduler watches the test loss ('min' mode)
            lr_scheduler.step(stats['loss'])

            # save the model after each epoch and update the loggers after each epoch
            if not_hyperband is True:
                ModelHandler.save_model(
                    transducer_model, model_optimizer, hidden_size, gru_layers,
                    epoch, model_dir + "HELEN_epoch_" + str(epoch + 1) +
                    '_checkpoint.pkl')
                sys.stderr.write(TextColor.RED + "\nMODEL SAVED SUCCESSFULLY.\n" +
                                 TextColor.END)

                test_loss_logger.write(
                    str(epoch + 1) + "," + str(stats['loss']) + "," +
                    str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(
                    str(epoch + 1) + "\n" +
                    str(stats_dictionary['base_confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            # NOTE: early stopping for hyperband is currently disabled:
            # else:
            #     # this setup is for hyperband
            #     if epoch + 1 >= 2 and stats['accuracy'] < 90:
            #         sys.stderr.write(TextColor.PURPLE + 'EARLY STOPPING AS THE MODEL NOT DOING WELL\n' + TextColor.END)
            #         return transducer_model, model_optimizer, stats

        # notify that the model has finished training.
        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        # hand the collected statistics back instead of discarding them
        return transducer_model, model_optimizer, stats
コード例 #6
0
ファイル: predict_gpu.py プロジェクト: tianfuzeng/helen
def predict(test_file, output_filename, model_path, batch_size, num_workers,
            rank, device_id):
    """
    Run minibatch inference on one device and write the predictions to an hdf file.

    The model is loaded from ``model_path``, moved to ``device_id`` and wrapped in
    DistributedDataParallel. Every image batch is processed with a sliding window; the
    softmax outputs of all windows are summed per position and the arg-max base and
    run-length labels are written to ``<output_filename>_<rank>.hdf``.

    NOTE(review): DistributedDataParallel presumably needs the caller to have initialised
    the torch.distributed process group before this function is called -- confirm.

    :param test_file: File to predict on (passed to SequenceDataset as ``file_list``)
    :param output_filename: Name and path prefix of the output file
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for minibatch prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param rank: Rank of this caller; only rank 0 writes progress messages
    :param device_id: Device id the model and tensors are moved to
    :return: None, the predictions are written to the output file
    """
    # create the output hdf5 file where all the predictions will be saved
    prediction_data_file = DataStore(output_filename + "_" + str(rank) +
                                     ".hdf",
                                     mode='w')

    # load the model using the model path
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model(model_path,
                                       input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                       image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                       seq_len=ImageSizeOptions.SEQ_LENGTH,
                                       num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                       num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    # place the model on the requested device in evaluation mode, then wrap it
    torch.cuda.set_device(device_id)
    transducer_model.to(device_id)
    transducer_model.eval()
    transducer_model = DistributedDataParallel(transducer_model,
                                               device_ids=[device_id])

    # only output for rank 0 caller
    if rank == 0:
        print(output_filename + "_" + str(rank) + ".hdf")
        sys.stderr.write(TextColor.GREEN + 'INFO: TORCH THREADS SET TO: ' +
                         str(torch.get_num_threads()) + ".\n" + TextColor.END)
        # notify that the process has started and loading data
        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    # create a pytorch dataset and dataloader that loads the data in mini_batches
    test_data = SequenceDataset(image_directory=None, file_list=test_file)
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    # iterate over the data in minibatches
    with torch.no_grad():
        # keep an eye for batch
        total_batches = len(test_loader)
        batch_iterator = 0

        # the dataloader loop, iterates in minibatches
        for contig, contig_start, contig_end, chunk_id, images, position, filename in test_loader:
            start_time = time.time()
            # the images are usually in uint8, convert them to FloatTensor and move them to the
            # device once, before any window is sliced, so every chunk is already on the device
            images = images.type(torch.FloatTensor).to(device_id)
            # initialize the first hidden input as all zeros, on the same device
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS,
                                 TrainOptions.HIDDEN_SIZE).to(device_id)

            # this is a multi-task neural network where we predict a base and a run-length. We use two
            # accumulator tensors: the predictions of every window are added to them and the labels are
            # picked once the whole sequence has been processed.
            prediction_base_tensor = torch.zeros(
                (images.size(0), images.size(1),
                 ImageSizeOptions.TOTAL_BASE_LABELS))
            prediction_rle_tensor = torch.zeros(
                (images.size(0), images.size(1),
                 ImageSizeOptions.TOTAL_RLE_LABELS))

            # move the accumulators to the device
            prediction_base_tensor = prediction_base_tensor.to(device_id)
            prediction_rle_tensor = prediction_rle_tensor.to(device_id)

            # now the images usually contain 1000 bases, we iterate on a sliding window basis where we process
            # the window size then jump to the next window
            for i in range(0, ImageSizeOptions.SEQ_LENGTH,
                           TrainOptions.WINDOW_JUMP):
                # if current position + window size goes beyond the sequence length, we've reached the end
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break
                chunk_start = i
                chunk_end = i + TrainOptions.TRAIN_WINDOW

                # get the image chunk
                image_chunk = images[:, chunk_start:chunk_end]

                # run inference; the returned hidden state feeds the next window
                output_base, output_rle, hidden = transducer_model(
                    image_chunk, hidden)

                # now calculate how much padding is on the top and bottom of this chunk so we can do a simple
                # add operation
                top_zeros = chunk_start
                bottom_zeros = ImageSizeOptions.SEQ_LENGTH - chunk_end

                # we run a softmax and a padding to make the output tensor compatible for adding
                inference_layers = nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, top_zeros, bottom_zeros)))
                inference_layers = inference_layers.to(device_id)

                # run the softmax and padding layers
                base_prediction = inference_layers(output_base)
                rle_prediction = inference_layers(output_rle)

                # now simply add the tensor to the global counter
                prediction_base_tensor = torch.add(prediction_base_tensor,
                                                   base_prediction)
                prediction_rle_tensor = torch.add(prediction_rle_tensor,
                                                  rle_prediction)

            # all done, bring the accumulators back to the CPU and pick the best label per position
            prediction_base_tensor = prediction_base_tensor.cpu()
            prediction_rle_tensor = prediction_rle_tensor.cpu()

            _, base_labels = torch.max(prediction_base_tensor, 2)
            _, rle_labels = torch.max(prediction_rle_tensor, 2)

            predicted_base_labels = base_labels.cpu().numpy()
            predicted_rle_labels = rle_labels.cpu().numpy()

            batch_iterator += 1

            if rank == 0:
                # estimate the time left: duration of this batch times the batches still to do
                eta = (time.time() - start_time) * (total_batches -
                                                    batch_iterator)
                # split the whole seconds into hours / minutes / seconds
                hours, remainder = divmod(int(eta), 3600)
                mins, secs = divmod(remainder, 60)
                time_stamp = str(hours) + " HOURS " + str(mins) + " MINS " + str(secs) + " SECS."
                batch_string = "BATCHES DONE: " + str(
                    batch_iterator) + "/" + str(total_batches) + ". "
                time_left = "ESTIMATED TIME LEFT: " + str(time_stamp)
                sys.stderr.write(TextColor.GREEN + "INFO: " + batch_string +
                                 time_left + "\n" + TextColor.END)

            # go to each of the images and save the predictions to the file
            for i in range(images.size(0)):
                prediction_data_file.write_prediction(
                    contig[i], contig_start[i], contig_end[i], chunk_id[i],
                    position[i], predicted_base_labels[i],
                    predicted_rle_labels[i], filename[i])
コード例 #7
0
ファイル: predict.py プロジェクト: tianfuzeng/helen
def predict(test_file, output_filename, model_path, batch_size, num_workers,
            threads, gpu_mode):
    """
    Run minibatch inference and write the predictions to an hdf5 file.

    The model is loaded from ``model_path`` and, when ``gpu_mode`` is set, wrapped in
    DataParallel and moved to CUDA. Every image batch is processed with a sliding window;
    the softmax outputs of all windows are summed per position and the arg-max base and
    run-length labels are written to ``output_filename``.

    :param test_file: File to predict on
    :param output_filename: Name and path to the output file
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for minibatch prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param threads: Number of threads to use with pytorch
    :param gpu_mode: If true, predictions will be done over GPU
    :return: None, the predictions are written to the output file
    """
    # create the output hdf5 file where all the predictions will be saved
    prediction_data_file = DataStore(output_filename, mode='w')
    torch.set_num_threads(threads)
    sys.stderr.write(TextColor.GREEN + 'INFO: TORCH THREADS SET TO: ' +
                     str(torch.get_num_threads()) + ".\n" + TextColor.END)

    # notify that the process has started and loading data
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    # create a pytorch dataset and dataloader that loads the data in mini_batches
    test_data = SequenceDataset(test_file)
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    # load the model using the model path
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model(model_path,
                                       input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                       image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                       seq_len=ImageSizeOptions.SEQ_LENGTH,
                                       num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                       num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    # set the model to evaluation mode.
    transducer_model.eval()

    # if gpu mode is True, then load the model in the GPUs
    if gpu_mode:
        transducer_model = torch.nn.DataParallel(transducer_model).cuda()

    # notify that the model has loaded successfully; close the colour like the other status lines do
    sys.stderr.write(TextColor.CYAN + 'MODEL LOADED\n' + TextColor.END)

    # iterate over the data in minibatches
    with torch.no_grad():
        # the dataloader loop, iterates in minibatches. tqdm is the progress logger.
        for contig, contig_start, contig_end, chunk_id, images, position, filename in tqdm(
                test_loader, ncols=50):
            # the images are usually in uint8, convert them to FloatTensor
            images = images.type(torch.FloatTensor)
            # initialize the first hidden input as all zeros
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS,
                                 TrainOptions.HIDDEN_SIZE)

            # if gpu_mode is True, transfer the image and hidden tensors to the GPU
            if gpu_mode:
                images = images.cuda()
                hidden = hidden.cuda()

            # this is a multi-task neural network where we predict a base and a run-length. We use two
            # accumulator tensors: the predictions of every window are added to them and the labels are
            # picked once the whole sequence has been processed.
            prediction_base_tensor = torch.zeros(
                (images.size(0), images.size(1),
                 ImageSizeOptions.TOTAL_BASE_LABELS))
            prediction_rle_tensor = torch.zeros(
                (images.size(0), images.size(1),
                 ImageSizeOptions.TOTAL_RLE_LABELS))

            if gpu_mode:
                prediction_base_tensor = prediction_base_tensor.cuda()
                prediction_rle_tensor = prediction_rle_tensor.cuda()

            # now the images usually contain 1000 bases, we iterate on a sliding window basis where we process
            # the window size then jump to the next window
            for i in range(0, ImageSizeOptions.SEQ_LENGTH,
                           TrainOptions.WINDOW_JUMP):
                # if current position + window size goes beyond the sequence length, we've reached the end
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break
                chunk_start = i
                chunk_end = i + TrainOptions.TRAIN_WINDOW

                # get the image chunk
                image_chunk = images[:, chunk_start:chunk_end]

                # run inference; the returned hidden state feeds the next window
                output_base, output_rle, hidden = transducer_model(
                    image_chunk, hidden)

                # now calculate how much padding is on the top and bottom of this chunk so we can do a simple
                # add operation
                top_zeros = chunk_start
                bottom_zeros = ImageSizeOptions.SEQ_LENGTH - chunk_end

                # we run a softmax and a padding to make the output tensor compatible for adding
                inference_layers = nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, top_zeros, bottom_zeros)))
                if gpu_mode:
                    inference_layers = inference_layers.cuda()

                # run the softmax and padding layers
                base_prediction = inference_layers(output_base)
                rle_prediction = inference_layers(output_rle)

                # now simply add the tensor to the global counter
                prediction_base_tensor = torch.add(prediction_base_tensor,
                                                   base_prediction)
                prediction_rle_tensor = torch.add(prediction_rle_tensor,
                                                  rle_prediction)

            # all done, bring the accumulators back to the CPU and pick the best label per position
            prediction_base_tensor = prediction_base_tensor.cpu()
            prediction_rle_tensor = prediction_rle_tensor.cpu()

            _, base_labels = torch.max(prediction_base_tensor, 2)
            _, rle_labels = torch.max(prediction_rle_tensor, 2)

            predicted_base_labels = base_labels.cpu().numpy()
            predicted_rle_labels = rle_labels.cpu().numpy()

            # go to each of the images and save the predictions to the file
            for i in range(images.size(0)):
                prediction_data_file.write_prediction(
                    contig[i], contig_start[i], contig_end[i], chunk_id[i],
                    position[i], predicted_base_labels[i],
                    predicted_rle_labels[i], filename[i])