Example 1
import os
import sys

import numpy as np
import pandas as pd

# file_utils (coordinate I/O) and get_image_path (image path resolution) are
# assumed to be provided by the surrounding package.
def main(args):

    seed = args.seed
    random = np.random.RandomState(seed)

    n = args.number

    ## load the labels

    path = args.file
    format_ = args.format_
    coords = file_utils.read_coordinates(path, format=format_)

    ## split the coordinates up by image name
    image_names = []
    groups = []
    for name, group in coords.groupby('image_name'):
        image_names.append(name)
        groups.append(group)

    print(
        '# splitting {} micrographs with {} labeled particles into {} train and {} test micrographs'
        .format(len(image_names), len(coords),
                len(image_names) - n, n),
        file=sys.stderr)

    ## randomly split the labels by micrograph
    order = random.permutation(len(image_names))

    image_names_test = []
    groups_test = []
    for i in range(n):
        j = order[i]
        image_names_test.append(image_names[j])
        groups_test.append(groups[j])

    image_names_train = []
    groups_train = []
    for i in range(n, len(image_names)):
        j = order[i]
        image_names_train.append(image_names[j])
        groups_train.append(groups[j])

    targets_train = pd.concat(groups_train, axis=0)
    targets_test = pd.concat(groups_test, axis=0)

    ## if the image-dir is specified, make the image list files
    root = args.image_dir
    ext = args.image_ext

    paths_train = []
    for image_name in image_names_train:
        path = get_image_path(image_name, root, ext)
        if path is not None:
            paths_train.append(path)

    paths_test = []
    for image_name in image_names_test:
        path = get_image_path(image_name, root, ext)
        if path is not None:
            paths_test.append(path)

    image_list_train = pd.DataFrame({
        'image_name': image_names_train,
        'path': paths_train
    })
    image_list_test = pd.DataFrame({
        'image_name': image_names_test,
        'path': paths_test
    })

    ## write the files to the same location as the original labels
    root = os.path.dirname(args.file)
    basename = os.path.splitext(args.file)[0]

    ## write the split targets table
    path = basename + '_train.txt'
    print('# writing:', path, file=sys.stderr)
    targets_train.to_csv(path, sep='\t', index=False)

    path = basename + '_test.txt'
    print('# writing:', path, file=sys.stderr)
    targets_test.to_csv(path, sep='\t', index=False)

    ## write the image list tables
    path = root + os.sep + 'image_list_train.txt'
    print('# writing:', path, file=sys.stderr)
    image_list_train.to_csv(path, sep='\t', index=False)

    path = root + os.sep + 'image_list_test.txt'
    print('# writing:', path, file=sys.stderr)
    image_list_test.to_csv(path, sep='\t', index=False)
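
For reference, a minimal sketch of how this main function might be invoked from the command line. The flag names are assumptions inferred from the attributes the function reads (seed, number, file, format_, image_dir, image_ext); the original script's actual CLI may differ.

# Hypothetical driver; flag names are inferred from the args attributes
# read by main() and are not confirmed by the original script.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='split labeled particles into train/test micrographs')
    parser.add_argument('file', help='coordinates file with labeled particles')
    parser.add_argument('--format', dest='format_', default='auto',
                        help='coordinates file format')
    parser.add_argument('-n', '--number', type=int, default=10,
                        help='number of micrographs to hold out as the test set')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed for the split')
    parser.add_argument('--image-dir', help='directory of micrograph images')
    parser.add_argument('--image-ext', default='mrc',
                        help='image file extension')
    main(parser.parse_args())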
Example 2
import glob
import os
import sys

import numpy as np
import pandas as pd

# file_utils, load_images_from_list, match_images_targets,
# cross_validation_split, and report are assumed to be provided by the
# surrounding package.
def load_data(train_images,
              train_targets,
              test_images,
              test_targets,
              radius,
              k_fold=0,
              fold=0,
              cross_validation_seed=42,
              format_='auto',
              image_ext=''):

    # if train_images is a directory path, map to all images in the directory
    if os.path.isdir(train_images):
        paths = glob.glob(train_images + os.sep + '*' + image_ext)
        valid_paths = []
        image_names = []
        for path in paths:
            name = os.path.basename(path)
            name, ext = os.path.splitext(name)
            if ext in ['.mrc', '.tiff', '.png']:
                image_names.append(name)
                valid_paths.append(path)
        train_images = pd.DataFrame({
            'image_name': image_names,
            'path': valid_paths
        })
    else:
        train_images = pd.read_csv(train_images,
                                   sep='\t')  # training image file list
    #train_targets = pd.read_csv(train_targets, sep='\t') # training particle coordinates file
    train_targets = file_utils.read_coordinates(train_targets, format=format_)

    # check for source columns
    if 'source' not in train_images and 'source' not in train_targets:
        train_images['source'] = 0
        train_targets['source'] = 0
    # load the images and create target masks from the particle coordinates
    train_images = load_images_from_list(train_images.image_name,
                                         train_images.path,
                                         sources=train_images.source)

    # discard coordinates for micrographs not in the set of images
    # and warn the user if any are discarded
    names = set()
    for k, d in train_images.items():
        for name in d.keys():
            names.add(name)
    check = train_targets.image_name.apply(lambda x: x in names)
    missing = train_targets.image_name.loc[~check].unique().tolist()
    if len(missing) > 0:
        print(
            'WARNING: {} micrographs listed in the coordinates file are missing from the training images. Image names are listed below.'
            .format(len(missing)),
            file=sys.stderr)
        print('WARNING: missing micrographs are: {}'.format(missing),
              file=sys.stderr)
    train_targets = train_targets.loc[check]

    # check that the particles roughly fit within the images
    # if they don't, the user may not have scaled the particles/images correctly
    width = 0
    height = 0
    for k, d in train_images.items():
        for image in d.values():
            w, h = image.size
            if w > width:
                width = w
            if h > height:
                height = h
    out_of_bounds = (train_targets.x_coord > width) | (train_targets.y_coord > height)
    count = out_of_bounds.sum()
    # arbitrary cutoff: warn if more than 10% of the particles are out of bounds
    if count > int(0.1 * len(train_targets)):
        print(
            'WARNING: {} particle coordinates are out of the micrograph dimensions. Did you scale the micrographs and particle coordinates correctly?'
            .format(count),
            file=sys.stderr)
    # also check that the coordinates fill most of the micrograph
    x_max = train_targets.x_coord.max()
    y_max = train_targets.y_coord.max()
    if x_max < 0.7 * width and y_max < 0.7 * height:  # more arbitrary cutoffs
        print(
            'WARNING: no coordinates are observed with x_coord > {} or y_coord > {}. Did you scale the micrographs and particle coordinates correctly?'
            .format(x_max, y_max),
            file=sys.stderr)

    num_micrographs = sum(len(train_images[k]) for k in train_images.keys())
    num_particles = len(train_targets)
    report('Loaded {} training micrographs with {} labeled particles'.format(
        num_micrographs, num_particles))

    train_images, train_targets = match_images_targets(train_images,
                                                       train_targets, radius)

    if test_images is not None:
        if os.path.isdir(test_images):
            paths = glob.glob(test_images + os.sep + '*' + image_ext)
            valid_paths = []
            image_names = []
            for path in paths:
                name = os.path.basename(path)
                name, ext = os.path.splitext(name)
                if ext in ['.mrc', '.tiff', '.png']:
                    image_names.append(name)
                    valid_paths.append(path)
            test_images = pd.DataFrame({
                'image_name': image_names,
                'path': valid_paths
            })
        else:
            test_images = pd.read_csv(test_images, sep='\t')
        #test_targets = pd.read_csv(test_targets, sep='\t')
        test_targets = file_utils.read_coordinates(test_targets,
                                                   format=format_)
        # check for source columns
        if 'source' not in test_images and 'source' not in test_targets:
            test_images['source'] = 0
            test_targets['source'] = 0
        test_images = load_images_from_list(test_images.image_name,
                                            test_images.path,
                                            sources=test_images.source)

        # discard coordinates for micrographs not in the set of images
        # and warn the user if any are discarded
        names = set()
        for k, d in test_images.items():
            for name in d.keys():
                names.add(name)
        check = test_targets.image_name.apply(lambda x: x in names)
        missing = test_targets.image_name.loc[~check].unique().tolist()
        if len(missing) > 0:
            print(
                'WARNING: {} micrographs listed in the coordinates file are missing from the test images. Image names are listed below.'
                .format(len(missing)),
                file=sys.stderr)
            print('WARNING: missing micrographs are: {}'.format(missing),
                  file=sys.stderr)
        test_targets = test_targets.loc[check]

        num_micrographs = sum(len(test_images[k]) for k in test_images.keys())
        num_particles = len(test_targets)
        report('Loaded {} test micrographs with {} labeled particles'.format(
            num_micrographs, num_particles))

        test_images, test_targets = match_images_targets(
            test_images, test_targets, radius)
    elif k_fold > 1:
        ## seed for partitioning the data
        random = np.random.RandomState(cross_validation_seed)
        ## make the split
        train_images, train_targets, test_images, test_targets = cross_validation_split(
            k_fold, fold, train_images, train_targets, random=random)

        n_train = sum(len(images) for images in train_images)
        n_test = sum(len(images) for images in test_images)
        report('Split into {} train and {} test micrographs'.format(
            n_train, n_test))

    return train_images, train_targets, test_images, test_targets
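
A usage sketch follows, with illustrative paths and parameter values; none of these file names come from the original code.

# Illustrative call with a held-out test set; paths and radius are placeholders.
train_images, train_targets, test_images, test_targets = load_data(
    'data/image_list_train.txt',  # or a directory of .mrc/.tiff/.png images
    'data/particles_train.txt',
    'data/image_list_test.txt',
    'data/particles_test.txt',
    radius=3,
)

# Alternatively, with no test set, split the training data by cross-validation:
# 5 folds, training on folds 1-4 and testing on fold 0.
train_images, train_targets, test_images, test_targets = load_data(
    'data/image_list_train.txt',
    'data/particles_train.txt',
    None,
    None,
    radius=3,
    k_fold=5,
    fold=0,
)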
Example 3
import glob
import os
import sys

import numpy as np
import pandas as pd

# file_utils, star, load_image, and mirror_y_axis are assumed to be provided
# by the surrounding package.
def main(args):

    verbose = args.verbose

    form = args._from
    from_forms = [form for _ in range(len(args.files))]

    # detect the input file formats
    if form == 'auto':
        try:
            from_forms = [
                file_utils.detect_format(path) for path in args.files
            ]
        except file_utils.UnknownFormatError as e:
            print('Error: unrecognized input coordinates file extension (' +
                  e.ext + ')',
                  file=sys.stderr)
            sys.exit(1)
    formats_detected = list(set(from_forms))
    if verbose > 0:
        print('# INPUT formats detected: ' + str(formats_detected),
              file=sys.stderr)

    # determine the output file format
    output_path = args.output
    output = None
    to_form = args.to
    if output_path is None:
        output = sys.stdout
        # if output is to stdout and form is not set
        # then raise an error
        if to_form == 'auto':
            if len(formats_detected) == 1:
                # write the same output format as input format
                to_form = from_forms[0]
            else:
                print(
                    'Error: writing to stdout with multiple input formats present and no output format (--to) set! Please specify the output format.',
                    file=sys.stderr)
                sys.exit(1)
        if to_form == 'box' or to_form == 'json':
            print(
                'Error: writing BOX or JSON output files requires a destination directory. Please set the --output parameter!',
                file=sys.stderr)
            sys.exit(1)

    image_ext = args.image_ext
    boxsize = args.boxsize
    if to_form == 'auto':
        # first check for directory
        if output_path[-1] == '/':
            # image-ext must be set for these file formats
            if image_ext is None:
                print(
                    'Error: writing BOX or JSON output files requires setting the image file extension!',
                    file=sys.stderr)
                sys.exit(1)
            # format is either json or box, check for boxsize to decide
            if boxsize > 0:
                # write boxes!
                if verbose > 0:
                    print(
                        '# Detected output format is BOX, because OUTPUT is a directory and boxsize > 0.',
                        file=sys.stderr)
                to_form = 'box'
            else:
                if verbose > 0:
                    print(
                        '# Detected output format is JSON, because OUTPUT is a directory and no boxsize set.',
                        file=sys.stderr)
                to_form = 'json'
        else:
            try:
                to_form = file_utils.detect_format(output_path)
            except file_utils.UnknownFormatError as e:
                print(
                    'Error: unrecognized output coordinates file extension (' +
                    e.ext + ')',
                    file=sys.stderr)
                sys.exit(1)
    if verbose > 0:
        print('# OUTPUT format: ' + to_form, file=sys.stderr)

    suffix = args.suffix

    t = args.threshold
    down_scale = args.down_scale
    up_scale = args.up_scale
    scale = up_scale / down_scale

    # special case when inputs and outputs are all star files
    if (len(formats_detected) == 1 and formats_detected[0] == 'star'
            and to_form == 'star'):
        dfs = []
        for path in args.files:
            with open(path, 'r') as f:
                table = star.parse(f)
            dfs.append(table)
        table = pd.concat(dfs, axis=0)
        # apply the score threshold (if a score column is present)
        if star.SCORE_COLUMN_NAME in table.columns:
            table = table.loc[table[star.SCORE_COLUMN_NAME] >= t]
        # scale coordinates
        if scale != 1:
            x_coord = table[star.X_COLUMN_NAME].values
            x_coord = np.round(scale * x_coord).astype(int)
            table[star.X_COLUMN_NAME] = x_coord
            y_coord = table[star.Y_COLUMN_NAME].values
            y_coord = np.round(scale * y_coord).astype(int)
            table[star.Y_COLUMN_NAME] = y_coord
        # add metadata if specified
        if args.voltage > 0:
            table[star.VOLTAGE] = args.voltage
        if args.detector_pixel_size > 0:
            table[star.DETECTOR_PIXEL_SIZE] = args.detector_pixel_size
        if args.magnification > 0:
            table[star.MAGNIFICATION] = args.magnification
        if args.amplitude_contrast > 0:
            table[star.AMPLITUDE_CONTRAST] = args.amplitude_contrast
        # write output file
        if output is None:
            with open(output_path, 'w') as f:
                star.write(table, f)
        else:
            star.write(table, output)

    else:  # general case

        # read the input files
        dfs = []
        for i in range(len(args.files)):
            path = args.files[i]
            coords = file_utils.read_coordinates(path, format=from_forms[i])
            dfs.append(coords)
        coords = pd.concat(dfs, axis=0)

        # threshold particles by score (if there is a score)
        if 'score' in coords.columns:
            coords = coords.loc[coords['score'] >= t]

        # scale coordinates
        if scale != 1:
            x_coord = coords['x_coord'].values
            x_coord = np.round(scale * x_coord).astype(int)
            coords['x_coord'] = x_coord
            y_coord = coords['y_coord'].values
            y_coord = np.round(scale * y_coord).astype(int)
            coords['y_coord'] = y_coord

        # add metadata if specified
        if args.voltage > 0:
            coords['voltage'] = args.voltage
        if args.detector_pixel_size > 0:
            coords['detector_pixel_size'] = args.detector_pixel_size
        if args.magnification > 0:
            coords['magnification'] = args.magnification
        if args.amplitude_contrast > 0:
            coords['amplitude_contrast'] = args.amplitude_contrast

        # invert y-axis coordinates if specified
        invert_y = args.invert_y
        if invert_y:
            if args.imagedir is None:
                print(
                    'Error: --imagedir must specify the directory of images in order to mirror the y-axis coordinates',
                    file=sys.stderr)
                sys.exit(1)
            dfs = []
            for image_name, group in coords.groupby('image_name'):
                impath = os.path.join(args.imagedir,
                                      image_name) + '.' + args.image_ext
                # use glob in case image_ext is '*'
                impath = glob.glob(impath)[0]
                im = load_image(impath)
                height = im.height

                group = mirror_y_axis(group, height)
                dfs.append(group)
            coords = pd.concat(dfs, axis=0)

        # output file format is decided and coordinates are processed, now write files
        if output is None and to_form != 'box' and to_form != 'json':
            output = open(output_path, 'w')
        if to_form == 'box' or to_form == 'json':
            output = output_path

        file_utils.write_coordinates(output,
                                     coords,
                                     format=to_form,
                                     boxsize=boxsize,
                                     image_ext=image_ext,
                                     suffix=suffix)
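
The snippet above relies on a mirror_y_axis helper that is not shown. Below is a minimal sketch of what it plausibly does, assuming the x_coord/y_coord column convention used throughout; the real implementation may differ.

# Plausible sketch of mirror_y_axis (an assumption, not the original code):
# flip y-coordinates so the origin moves between the top and bottom edge of
# a micrograph with the given pixel height.
def mirror_y_axis(coords, height):
    coords = coords.copy()
    coords['y_coord'] = height - 1 - coords['y_coord']
    return coords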