def collect_eval_dir(root_uri):
    eval_json_uris = list_paths(join(root_uri, 'eval'), ext='eval.json')
    for eval_json_uri in eval_json_uris:
        eval_json = file_to_json(eval_json_uri)
        print(basename(dirname(eval_json_uri)))
        print(eval_json['overall'][-1]['f1'])
        print()
def collect_experiment(key, root_uri, output_dir, get_pred_package=False):
    print('\nCollecting experiment {}...\n'.format(key))

    if root_uri.startswith('s3://'):
        predict_package_uris = list_paths(join(root_uri, key, 'bundle'), ext='predict_package.zip')
        eval_json_uris = list_paths(join(root_uri, key, 'eval'), ext='eval.json')
    else:
        predict_package_uris = glob.glob(join(root_uri, key, 'bundle', '*', 'predict_package.zip'))
        eval_json_uris = glob.glob(join(root_uri, key, 'eval', '*', 'eval.json'))

    if len(predict_package_uris) > 1 or len(eval_json_uris) > 1:
        print('Cannot collect from key with multiple experiments!!!')
        return

    if len(predict_package_uris) == 0 or len(eval_json_uris) == 0:
        print('Missing output!!!')
        return

    predict_package_uri = predict_package_uris[0]
    eval_json_uri = eval_json_uris[0]
    make_dir(join(output_dir, key))
    if get_pred_package:
        download_or_copy(predict_package_uri, join(output_dir, key))

    download_or_copy(eval_json_uri, join(output_dir, key))

    eval_json = file_to_json(join(output_dir, key, 'eval.json'))
    pprint.pprint(eval_json['overall'], indent=4)
Esempio n. 3
0
    def __init__(self, img_dir, annotation_uris, transforms=None):
        self.img_dir = img_dir
        self.annotation_uris = annotation_uris
        self.transforms = transforms

        self.imgs = []
        self.img2id = {}
        self.id2img = {}
        self.id2boxes = defaultdict(lambda: [])
        self.id2labels = defaultdict(lambda: [])
        self.label2name = {}
        for annotation_uri in annotation_uris:
            ann_json = file_to_json(annotation_uri)
            for img in ann_json['images']:
                self.imgs.append(img['file_name'])
                self.img2id[img['file_name']] = img['id']
                self.id2img[img['id']] = img['file_name']
            for ann in ann_json['annotations']:
                img_id = ann['image_id']
                box = ann['bbox']
                label = ann['category_id']
                box = torch.tensor(
                    [[box[1], box[0], box[1] + box[3], box[0] + box[2]]])
                self.id2boxes[img_id].append(box)
                self.id2labels[img_id].append(label)
        self.id2boxes = dict([(id, torch.cat(boxes).float())
                              for id, boxes in self.id2boxes.items()])
        self.id2labels = dict([(id, torch.tensor(labels))
                               for id, labels in self.id2labels.items()])
Esempio n. 4
0
def get_label_names(coco_path):
    categories = file_to_json(coco_path)['categories']
    label2name = dict([(cat['id'], cat['name']) for cat in categories])
    labels = ['background'
              ] + [label2name[i] for i in range(1,
                                                len(label2name) + 1)]
    return labels
Esempio n. 5
0
    def from_model_bundle(model_bundle_uri, tmp_dir):
        model_bundle_path = download_if_needed(model_bundle_uri, tmp_dir)
        model_bundle_dir = join(tmp_dir, 'model-bundle')
        unzip(model_bundle_path, model_bundle_dir)

        config_path = join(model_bundle_dir, 'config.json')
        model_path = join(model_bundle_dir, 'model.pth')
        cfg = build_config(file_to_json(config_path))
        return cfg.get_learner()(cfg, tmp_dir, model_path=model_path)
Esempio n. 6
0
def _run_command(cfg_json_uri, command, split_ind, num_splits):
    tmp_root_dir = '/opt/data/tmp'
    make_dir(tmp_root_dir)
    tmp_dir_obj = tempfile.TemporaryDirectory(dir=tmp_root_dir)
    tmp_dir = tmp_dir_obj.name

    pipeline_cfg_dict = file_to_json(cfg_json_uri)
    cfg = build_config(pipeline_cfg_dict)
    pipeline = cfg.get_pipeline()(cfg, tmp_dir)

    # TODO generalize this to work outside batch
    if split_ind is None:
        split_ind = int(os.environ.get('AWS_BATCH_JOB_ARRAY_INDEX', 0))
    command_fn = getattr(pipeline, command)

    if num_splits > 1:
        print('Running {} command split {}/{}...'.format(
            command, split_ind + 1, num_splits))
        command_fn(split_ind=split_ind, num_splits=num_splits)
    else:
        print('Running {} command...'.format(command))
        command_fn()
Esempio n. 7
0
    def train(self, tmp_dir):
        """Train a model.

        This downloads any previous output saved to the train_uri,
        starts training (or resumes from a checkpoint), periodically
        syncs contents of train_dir to train_uri and after training finishes.

        Args:
            tmp_dir: (str) path to temp directory
        """
        self.log_options()

        # Sync output of previous training run from cloud.
        train_uri = self.backend_opts.train_uri
        train_dir = get_local_path(train_uri, tmp_dir)
        make_dir(train_dir)
        sync_from_dir(train_uri, train_dir)

        # Get zip file for each group, and unzip them into chip_dir.
        chip_dir = join(tmp_dir, 'chips')
        make_dir(chip_dir)
        for zip_uri in list_paths(self.backend_opts.chip_uri, 'zip'):
            zip_path = download_if_needed(zip_uri, tmp_dir)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(chip_dir)

        # Setup data loader.
        batch_size = self.train_opts.batch_size
        chip_size = self.task_config.chip_size
        class_names = self.class_map.get_class_names()
        databunch = build_databunch(chip_dir, chip_size, batch_size,
                                    class_names)
        log.info(databunch)
        num_labels = len(databunch.label_names)
        if self.train_opts.debug:
            make_debug_chips(databunch, self.class_map, tmp_dir, train_uri)

        # Setup model
        num_labels = len(databunch.label_names)
        model = get_model(self.train_opts.model_arch,
                          num_labels,
                          pretrained=True)
        model = model.to(self.device)
        model_path = join(train_dir, 'model')

        # Load weights from a pretrained model.
        pretrained_uri = self.backend_opts.pretrained_uri
        if pretrained_uri:
            log.info('Loading weights from pretrained_uri: {}'.format(
                pretrained_uri))
            pretrained_path = download_if_needed(pretrained_uri, tmp_dir)
            model.load_state_dict(
                torch.load(pretrained_path, map_location=self.device))

        # Possibly resume training from checkpoint.
        start_epoch = 0
        train_state_path = join(train_dir, 'train_state.json')
        if isfile(train_state_path):
            log.info('Resuming from checkpoint: {}\n'.format(model_path))
            train_state = file_to_json(train_state_path)
            start_epoch = train_state['epoch'] + 1
            model.load_state_dict(
                torch.load(model_path, map_location=self.device))

        # Write header of log CSV file.
        metric_names = ['precision', 'recall', 'f1']
        log_path = join(train_dir, 'log.csv')
        if not isfile(log_path):
            with open(log_path, 'w') as log_file:
                log_writer = csv.writer(log_file)
                row = ['epoch', 'time', 'train_loss'] + metric_names
                log_writer.writerow(row)

        # Setup Tensorboard logging.
        if self.train_opts.log_tensorboard:
            log_dir = join(train_dir, 'tb-logs')
            make_dir(log_dir)
            tb_writer = SummaryWriter(log_dir=log_dir)
            if self.train_opts.run_tensorboard:
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(log_dir)])
                terminate_at_exit(tensorboard_process)

        # Setup optimizer, loss, and LR scheduler.
        loss_fn = torch.nn.CrossEntropyLoss()
        lr = self.train_opts.lr
        opt = optim.Adam(model.parameters(), lr=lr)
        step_scheduler, epoch_scheduler = None, None
        num_epochs = self.train_opts.num_epochs

        if self.train_opts.one_cycle and num_epochs > 1:
            steps_per_epoch = len(databunch.train_ds) // batch_size
            total_steps = num_epochs * steps_per_epoch
            step_size_up = (num_epochs // 2) * steps_per_epoch
            step_size_down = total_steps - step_size_up
            step_scheduler = CyclicLR(opt,
                                      base_lr=lr / 10,
                                      max_lr=lr,
                                      step_size_up=step_size_up,
                                      step_size_down=step_size_down,
                                      cycle_momentum=False)
            for _ in range(start_epoch * steps_per_epoch):
                step_scheduler.step()

        # Training loop.
        for epoch in range(start_epoch, num_epochs):
            # Train one epoch.
            log.info('-----------------------------------------------------')
            log.info('epoch: {}'.format(epoch))
            start = time.time()
            train_loss = train_epoch(model, self.device, databunch.train_dl,
                                     opt, loss_fn, step_scheduler)
            if epoch_scheduler:
                epoch_scheduler.step()
            log.info('train loss: {}'.format(train_loss))

            # Validate one epoch.
            metrics = validate_epoch(model, self.device, databunch.valid_dl,
                                     num_labels)
            log.info('validation metrics: {}'.format(metrics))

            # Print elapsed time for epoch.
            end = time.time()
            epoch_time = datetime.timedelta(seconds=end - start)
            log.info('epoch elapsed time: {}'.format(epoch_time))

            # Save model and state.
            torch.save(model.state_dict(), model_path)
            train_state = {'epoch': epoch}
            json_to_file(train_state, train_state_path)

            # Append to log CSV file.
            with open(log_path, 'a') as log_file:
                log_writer = csv.writer(log_file)
                row = [epoch, epoch_time, train_loss]
                row += [metrics[k] for k in metric_names]
                log_writer.writerow(row)

            # Write to Tensorboard log.
            if self.train_opts.log_tensorboard:
                for key, val in metrics.items():
                    tb_writer.add_scalar(key, val, epoch)
                tb_writer.add_scalar('train_loss', train_loss, epoch)
                for name, param in model.named_parameters():
                    tb_writer.add_histogram(name, param, epoch)

            if (train_uri.startswith('s3://')
                    and (((epoch + 1) % self.train_opts.sync_interval) == 0)):
                sync_to_dir(train_dir, train_uri)

        # Close Tensorboard.
        if self.train_opts.log_tensorboard:
            tb_writer.close()
            if self.train_opts.run_tensorboard:
                tensorboard_process.terminate()

        # Since model is exported every epoch, we need some other way to
        # show that training is finished.
        str_to_file('done!', self.backend_opts.train_done_uri)

        # Sync output to cloud.
        sync_to_dir(train_dir, self.backend_opts.train_uri)