Example 1
def make_data_iterators(train_images, train_targets, test_images, test_targets,
                        crop, split, args):
    import multiprocessing as mp  # needed below when num_workers < 0
    from topaz.utils.data.sampler import StratifiedCoordinateSampler
    from torch.utils.data.dataloader import DataLoader

    ## training parameters
    minibatch_size = args.minibatch_size
    epoch_size = args.epoch_size
    num_epochs = args.num_epochs
    num_workers = args.num_workers
    if num_workers < 0:  # set num workers to use all CPUs
        num_workers = mp.cpu_count()

    testing_batch_size = args.test_batch_size
    balance = args.minibatch_balance  # ratio of positive to negative in minibatch
    if args.natural:
        balance = None
    report('minibatch_size={}, epoch_size={}, num_epochs={}'.format(
        minibatch_size, epoch_size, num_epochs))

    ## create augmented training dataset
    train_dataset = make_traindataset(train_images, train_targets, crop)
    test_dataset = None
    if test_targets is not None:
        test_dataset = make_testdataset(test_images, test_targets)

    ## create minibatch iterators
    labels = train_dataset.data.labels
    sampler = StratifiedCoordinateSampler(labels,
                                          size=epoch_size * minibatch_size,
                                          balance=balance,
                                          split=split)
    train_iterator = DataLoader(train_dataset,
                                batch_size=minibatch_size,
                                sampler=sampler,
                                num_workers=num_workers)

    test_iterator = None
    if test_dataset is not None:
        test_iterator = DataLoader(test_dataset,
                                   batch_size=testing_batch_size,
                                   num_workers=0)

    return train_iterator, test_iterator
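The sampler above is topaz's own StratifiedCoordinateSampler, which draws coordinates so that each minibatch respects the requested positive/negative balance. As a rough generic sketch of the same idea (not topaz's API), a plain torch.utils.data.WeightedRandomSampler over a toy labelled dataset achieves a similar effect; the dataset, labels, and weights below are invented for illustration.

# Minimal sketch (not topaz's StratifiedCoordinateSampler): balance positive and
# negative examples in each minibatch with a weighted sampler.
import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

# toy labelled data: 90 negatives, 10 positives
labels = torch.cat([torch.zeros(90), torch.ones(10)])
features = torch.randn(100, 4)
dataset = TensorDataset(features, labels)

# weight each sample inversely to its class frequency so minibatches
# come out roughly half positive, half negative
class_counts = torch.bincount(labels.long())
weights = 1.0 / class_counts[labels.long()].float()
sampler = WeightedRandomSampler(weights, num_samples=len(labels), replacement=True)

loader = DataLoader(dataset, batch_size=16, sampler=sampler, num_workers=0)
for x, y in loader:
    print(y.mean().item())  # fraction of positives in this minibatch, roughly 0.5
    break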
Example 2
def report_data_stats(train_images, train_targets, test_images, test_targets):
    report('source\tsplit\tp_observed\tnum_positive_regions\ttotal_regions')
    num_positive_regions = 0
    total_regions = 0
    for i in range(len(train_images)):
        p = sum(train_targets[i][j].sum()
                for j in range(len(train_targets[i])))
        p = int(p)
        total = sum(train_targets[i][j].size
                    for j in range(len(train_targets[i])))
        num_positive_regions += p
        total_regions += total
        p_observed = p / total
        p_observed = '{:.3g}'.format(p_observed)
        report(
            str(i) + '\t' + 'train' + '\t' + p_observed + '\t' + str(p) +
            '\t' + str(total))
        if test_targets is not None:
            p = sum(test_targets[i][j].sum()
                    for j in range(len(test_targets[i])))
            p = int(p)
            total = sum(test_targets[i][j].size
                        for j in range(len(test_targets[i])))
            p_observed = p / total
            p_observed = '{:.3g}'.format(p_observed)
            report(
                str(i) + '\t' + 'test' + '\t' + p_observed + '\t' + str(p) +
                '\t' + str(total))
    return num_positive_regions, total_regions
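For each image source, the function reports p_observed, the fraction of labelled regions that are positive. A toy sketch of that statistic, assuming the targets are binary numpy masks (the masks below are invented for illustration):

# Toy illustration of the p_observed statistic: fraction of positively
# labelled pixels across all target masks for one source.
import numpy as np

# two hypothetical 4x4 binary target masks for a single source
targets = [np.zeros((4, 4), dtype=np.uint8), np.zeros((4, 4), dtype=np.uint8)]
targets[0][1:3, 1:3] = 1  # 4 positive pixels

p = int(sum(t.sum() for t in targets))   # 4 positive regions
total = sum(t.size for t in targets)     # 32 regions in total
print('{:.3g}'.format(p / total))        # p_observed = 0.125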
Example 3
def make_model(args):
    from topaz.model.factory import get_feature_extractor
    import topaz.model.classifier as C

    report('Loading model:', args.model)
    if args.model.endswith('.sav'):  # loading pretrained model
        model = torch.load(args.model)
        model.train()
        return model

    report('Model parameters: units={}, dropout={}, bn={}'.format(
        args.units, args.dropout, args.bn))

    # initialize the model
    units = args.units
    dropout = args.dropout
    bn = args.bn == 'on'
    pooling = args.pooling
    unit_scaling = args.unit_scaling

    feature_extractor = get_feature_extractor(args.model,
                                              units,
                                              dropout=dropout,
                                              bn=bn,
                                              unit_scaling=unit_scaling,
                                              pooling=pooling)
    classifier = C.LinearClassifier(feature_extractor)

    ## if the method is generative, create the generative model as well
    generative = None
    if args.autoencoder > 0:
        from topaz.model.generative import ConvGenerator
        ngf = args.ngf
        depth = int(np.log2(classifier.width + 1) - 3)
        generative = ConvGenerator(classifier.latent_dim,
                                   units=ngf,
                                   depth=depth)
        ## attach the generative model
        classifier.generative = generative
        report('Generator: units={}, size={}'.format(ngf, generative.width))

    report('Receptive field:', classifier.width)

    return classifier
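When the autoencoder branch is active, the generator depth is derived from the classifier's receptive field as int(log2(width + 1) - 3). A quick check of that arithmetic for a hypothetical receptive field of 71 pixels:

# Check of the generator-depth arithmetic for a hypothetical
# classifier receptive field of 71 pixels.
import numpy as np

width = 71                           # classifier.width (assumed value)
depth = int(np.log2(width + 1) - 3)  # log2(72) - 3 ~ 3.17 -> 3
print(depth)                         # 3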
Example 4
def report_data_stats(train_images, train_targets, test_images, test_targets):
    report('source\tsplit\tp\ttotal')
    num_positive_regions = 0
    total_regions = 0
    for i in range(len(train_images)):
        p = sum(train_targets[i][j].sum()
                for j in range(len(train_targets[i])))
        total = sum(train_targets[i][j].size
                    for j in range(len(train_targets[i])))
        num_positive_regions += p
        total_regions += total
        p = p / total
        report(str(i) + '\t' + 'train' + '\t' + str(p) + '\t' + str(total))
        if test_targets is not None:
            p = sum(test_targets[i][j].sum()
                    for j in range(len(test_targets[i])))
            total = sum(test_targets[i][j].size
                        for j in range(len(test_targets[i])))
            p = p / total
            report(str(i) + '\t' + 'test' + '\t' + str(p) + '\t' + str(total))
    return num_positive_regions, total_regions
Example 5
def main(args):
    ## initialize the model
    classifier = make_model(args)

    if args.describe:
        ## only print a description of the model and terminate
        print(classifier)
        sys.exit()

    ## set the device
    """
    use_cuda = False
    if args.device >= 0:
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            torch.cuda.set_device(args.device)
        else:
            print('WARNING: you specified GPU (device={}) but no GPUs were detected. This may mean there is a mismatch between your system CUDA version and your pytorch CUDA version.'.format(args.device), file=sys.stderr)
    """

    use_cuda = topaz.cuda.set_device(args.device)
    report('Using device={} with cuda={}'.format(args.device, use_cuda))

    if use_cuda:
        classifier.cuda()

    ## load the data
    radius = args.radius  # number of pixels around coordinates to label as positive
    train_images, train_targets, test_images, test_targets = \
            load_data(args.train_images,
                      args.train_targets,
                      args.test_images,
                      args.test_targets,
                      radius,
                      format_=args.format_,
                      k_fold=args.k_fold,
                      fold=args.fold,
                      cross_validation_seed=args.cross_validation_seed,
                      image_ext=args.image_ext
                     )
    num_positive_regions, total_regions = report_data_stats(
        train_images, train_targets, test_images, test_targets)

    ## make the training step method
    if args.num_particles > 0:
        expected_num_particles = args.num_particles
        # make this expected particles in training set rather than per micrograph
        num_micrographs = sum(len(images) for images in train_images)
        expected_num_particles *= num_micrographs

        # given the expected number of particles and the radius
        # calculate what pi should be
        # pi = pixels_per_particle*expected_number_of_particles/pixels_in_dataset
        grid = np.linspace(-radius, radius, 2 * radius + 1)
        xx = np.zeros((2 * radius + 1, 2 * radius + 1)) + grid[:, np.newaxis]
        yy = np.zeros((2 * radius + 1, 2 * radius + 1)) + grid[np.newaxis]
        d2 = xx**2 + yy**2
        mask = (d2 <= radius**2).astype(int)
        pixels_per_particle = mask.sum()

        # total_regions is number of regions in the data
        pi = pixels_per_particle * expected_num_particles / total_regions

        report(
            'Specified expected number of particles per micrograph = {}'.format(
                args.num_particles))
        report('With radius = {}'.format(radius))
        report('Setting pi = {}'.format(pi))
    else:
        pi = args.pi
        report('pi = {}'.format(pi))

    trainer, criteria, split = make_training_step_method(
        classifier,
        num_positive_regions,
        num_positive_regions / total_regions,
        lr=args.learning_rate,
        l2=args.l2,
        method=args.method,
        pi=pi,
        slack=args.slack,
        autoencoder=args.autoencoder)

    ## training parameters
    train_iterator, test_iterator = make_data_iterators(
        train_images, train_targets, test_images, test_targets,
        classifier.width, split, args)

    ## fit the model, report train/test stats, save model if required
    output = sys.stdout if args.output is None else open(args.output, 'w')
    save_prefix = args.save_prefix
    #if not os.path.exists(os.path.dirname(save_prefix)):
    #    os.makedirs(os.path.dirname(save_prefix))
    fit_epochs(classifier,
               criteria,
               trainer,
               train_iterator,
               test_iterator,
               args.num_epochs,
               save_prefix=save_prefix,
               use_cuda=use_cuda,
               output=output)

    report('Done!')
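The pi prior above is pixels_per_particle * expected_num_particles / total_regions. A standalone sketch of that calculation with assumed numbers (radius 8, 100 expected particles per micrograph, 50 micrographs of 1024x1024 pixels):

# Standalone sketch of the pi calculation, with assumed numbers.
import numpy as np

radius = 8
expected_per_micrograph = 100
num_micrographs = 50
total_regions = num_micrographs * 1024 * 1024

# circular mask of pixels labelled positive around each particle coordinate
grid = np.linspace(-radius, radius, 2 * radius + 1)
xx = grid[:, np.newaxis] + np.zeros((2 * radius + 1, 2 * radius + 1))
yy = grid[np.newaxis] + np.zeros((2 * radius + 1, 2 * radius + 1))
pixels_per_particle = int(((xx**2 + yy**2) <= radius**2).sum())

pi = pixels_per_particle * expected_per_micrograph * num_micrographs / total_regions
print(pixels_per_particle, pi)  # 197 positive pixels per particle, pi ~ 0.019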
Example 6
def load_data(train_images,
              train_targets,
              test_images,
              test_targets,
              radius,
              k_fold=0,
              fold=0,
              cross_validation_seed=42,
              format_='auto',
              image_ext=''):

    # if train_images is a directory path, map to all images in the directory
    if os.path.isdir(train_images):
        paths = glob.glob(train_images + os.sep + '*' + image_ext)
        valid_paths = []
        image_names = []
        for path in paths:
            name = os.path.basename(path)
            name, ext = os.path.splitext(name)
            if ext in ['.mrc', '.tiff', '.png']:
                image_names.append(name)
                valid_paths.append(path)
        train_images = pd.DataFrame({
            'image_name': image_names,
            'path': valid_paths
        })
    else:
        train_images = pd.read_csv(train_images,
                                   sep='\t')  # training image file list
    #train_targets = pd.read_csv(train_targets, sep='\t') # training particle coordinates file
    train_targets = file_utils.read_coordinates(train_targets, format=format_)

    # check for source columns
    if 'source' not in train_images and 'source' not in train_targets:
        train_images['source'] = 0
        train_targets['source'] = 0
    # load the images and create target masks from the particle coordinates
    train_images = load_images_from_list(train_images.image_name,
                                         train_images.path,
                                         sources=train_images.source)

    # discard coordinates for micrographs not in the set of images
    # and warn the user if any are discarded
    names = set()
    for k, d in train_images.items():
        for name in d.keys():
            names.add(name)
    check = train_targets.image_name.apply(lambda x: x in names)
    missing = train_targets.image_name.loc[~check].unique().tolist()
    if len(missing) > 0:
        print(
            'WARNING: {} micrographs listed in the coordinates file are missing from the training images. Image names are listed below.'
            .format(len(missing)),
            file=sys.stderr)
        print('WARNING: missing micrographs are: {}'.format(missing),
              file=sys.stderr)
    train_targets = train_targets.loc[check]

    # check that the particles roughly fit within the images
    # if they don't, the user may not have scaled the particles/images correctly
    width = 0
    height = 0
    for k, d in train_images.items():
        for image in d.values():
            w, h = image.size
            if w > width:
                width = w
            if h > height:
                height = h
    out_of_bounds = (train_targets.x_coord > width) | (train_targets.y_coord > height)
    count = out_of_bounds.sum()
    # arbitrary cutoff: warn if more than 10% of particles are out of bounds
    if count > int(0.1 * len(train_targets)):
        print(
            'WARNING: {} particle coordinates are out of the micrograph dimensions. Did you scale the micrographs and particle coordinates correctly?'
            .format(count),
            file=sys.stderr)
    #  also check that the coordinates fill most of the micrograph
    x_max = train_targets.x_coord.max()
    y_max = train_targets.y_coord.max()
    if x_max < 0.7 * width and y_max < 0.7 * height:  # more arbitrary cutoffs
        print(
            'WARNING: no coordinates are observed with x_coord > {} or y_coord > {}. Did you scale the micrographs and particle coordinates correctly?'
            .format(x_max, y_max),
            file=sys.stderr)

    num_micrographs = sum(len(train_images[k]) for k in train_images.keys())
    num_particles = len(train_targets)
    report('Loaded {} training micrographs with {} labeled particles'.format(
        num_micrographs, num_particles))

    train_images, train_targets = match_images_targets(train_images,
                                                       train_targets, radius)

    if test_images is not None:
        if os.path.isdir(test_images):
            paths = glob.glob(test_images + os.sep + '*' + image_ext)
            valid_paths = []
            image_names = []
            for path in paths:
                name = os.path.basename(path)
                name, ext = os.path.splitext(name)
                if ext in ['.mrc', '.tiff', '.png']:
                    image_names.append(name)
                    valid_paths.append(path)
            test_images = pd.DataFrame({
                'image_name': image_names,
                'path': valid_paths
            })
        else:
            test_images = pd.read_csv(test_images, sep='\t')
        #test_targets = pd.read_csv(test_targets, sep='\t')
        test_targets = file_utils.read_coordinates(test_targets,
                                                   format=format_)
        # check for source columns
        if 'source' not in test_images and 'source' not in test_targets:
            test_images['source'] = 0
            test_targets['source'] = 0
        test_images = load_images_from_list(test_images.image_name,
                                            test_images.path,
                                            sources=test_images.source)

        # discard coordinates for micrographs not in the set of images
        # and warn the user if any are discarded
        names = set()
        for k, d in test_images.items():
            for name in d.keys():
                names.add(name)
        check = test_targets.image_name.apply(lambda x: x in names)
        missing = test_targets.image_name.loc[~check].unique().tolist()
        if len(missing) > 0:
            print(
                'WARNING: {} micrographs listed in the coordinates file are missing from the test images. Image names are listed below.'
                .format(len(missing)),
                file=sys.stderr)
            print('WARNING: missing micrographs are: {}'.format(missing),
                  file=sys.stderr)
        test_targets = test_targets.loc[check]

        num_micrographs = sum(len(test_images[k]) for k in test_images.keys())
        num_particles = len(test_targets)
        report('Loaded {} test micrographs with {} labeled particles'.format(
            num_micrographs, num_particles))

        test_images, test_targets = match_images_targets(
            test_images, test_targets, radius)
    elif k_fold > 1:
        ## seed for partitioning the data
        random = np.random.RandomState(cross_validation_seed)
        ## make the split
        train_images, train_targets, test_images, test_targets = cross_validation_split(
            k_fold, fold, train_images, train_targets, random=random)

        n_train = sum(len(images) for images in train_images)
        n_test = sum(len(images) for images in test_images)
        report('Split into {} train and {} test micrographs'.format(
            n_train, n_test))

    return train_images, train_targets, test_images, test_targets
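When train_images or test_images is a directory, load_data builds the image list by scanning for .mrc/.tiff/.png files. A minimal standalone sketch of that pattern, assuming a hypothetical micrographs/ directory:

# Minimal sketch of the directory-scanning branch: build an image list
# DataFrame from a hypothetical 'micrographs/' directory.
import glob
import os
import pandas as pd

image_dir = 'micrographs'  # hypothetical path
paths = glob.glob(os.path.join(image_dir, '*'))

image_names, valid_paths = [], []
for path in paths:
    name, ext = os.path.splitext(os.path.basename(path))
    if ext in ['.mrc', '.tiff', '.png']:
        image_names.append(name)
        valid_paths.append(path)

train_images = pd.DataFrame({'image_name': image_names, 'path': valid_paths})
print(train_images.head())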
Example 7
def main(args):
    ## initialize the model
    classifier = make_model(args)

    if args.describe:
        ## only print a description of the model and terminate
        print(classifier)
        sys.exit()

    ## set the device
    use_cuda = False
    if args.device >= 0:
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            torch.cuda.set_device(args.device)
    report('Using device={} with cuda={}'.format(args.device, use_cuda))

    if use_cuda:
        classifier.cuda()

    ## load the data
    radius = args.radius  # number of pixels around coordinates to label as positive
    train_images, train_targets, test_images, test_targets = \
            load_data(args.train_images,
                      args.train_targets,
                      args.test_images,
                      args.test_targets,
                      radius,
                      k_fold=args.k_fold,
                      fold=args.fold,
                      cross_validation_seed=args.cross_validation_seed,
                     )
    num_positive_regions, total_regions = report_data_stats(
        train_images, train_targets, test_images, test_targets)

    ## make the training step method
    trainer, criteria, split = make_training_step_method(
        classifier, num_positive_regions, num_positive_regions / total_regions,
        args)

    ## training parameters
    train_iterator, test_iterator = make_data_iterators(
        train_images, train_targets, test_images, test_targets,
        classifier.width, split, args)

    ## fit the model, report train/test stats, save model if required
    output = sys.stdout if args.output is None else open(args.output, 'w')
    save_prefix = args.save_prefix
    #if not os.path.exists(os.path.dirname(save_prefix)):
    #    os.makedirs(os.path.dirname(save_prefix))
    fit_epochs(classifier,
               criteria,
               trainer,
               train_iterator,
               test_iterator,
               args.num_epochs,
               save_prefix=save_prefix,
               use_cuda=use_cuda,
               output=output)

    report('Done!')
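This variant selects the device inline: GPU use is gated on args.device >= 0 and torch.cuda.is_available(). A small standalone sketch of that pattern, with a hypothetical device index and model:

# Standalone sketch of the device-selection pattern used above,
# with a hypothetical device index.
import torch

device = 0  # analogous to args.device; a negative value means CPU only
use_cuda = False
if device >= 0 and torch.cuda.is_available():
    torch.cuda.set_device(device)
    use_cuda = True
print('Using device={} with cuda={}'.format(device, use_cuda))

model = torch.nn.Linear(4, 2)
if use_cuda:
    model = model.cuda()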