import glob
import os
import sys

import numpy as np
import pandas as pd

# NOTE: the package-local helpers used below (file_utils, star, get_image_path,
# load_image, load_images_from_list, match_images_targets,
# cross_validation_split, mirror_y_axis, report) are assumed to be imported
# from the surrounding package.


def main(args):
    seed = args.seed
    random = np.random.RandomState(seed)

    n = args.number

    ## load the labels
    path = args.file
    format_ = args.format_
    coords = file_utils.read_coordinates(path, format=format_)

    ## split the coordinates up by image name
    image_names = []
    groups = []
    for name, group in coords.groupby('image_name'):
        image_names.append(name)
        groups.append(group)

    print('# splitting {} micrographs with {} labeled particles into {} train and {} test micrographs'
          .format(len(image_names), len(coords), len(image_names) - n, n),
          file=sys.stderr)

    ## randomly split the labels by micrograph
    order = random.permutation(len(image_names))

    image_names_test = []
    groups_test = []
    for i in range(n):
        j = order[i]
        image_names_test.append(image_names[j])
        groups_test.append(groups[j])

    image_names_train = []
    groups_train = []
    for i in range(n, len(image_names)):
        j = order[i]
        image_names_train.append(image_names[j])
        groups_train.append(groups[j])

    targets_train = pd.concat(groups_train, axis=0)
    targets_test = pd.concat(groups_test, axis=0)

    ## if the image-dir is specified, make the image list files
    root = args.image_dir
    ext = args.image_ext

    # keep the names and paths in sync so the DataFrame columns below have
    # equal length even when some images are missing
    names_train = []
    paths_train = []
    for image_name in image_names_train:
        path = get_image_path(image_name, root, ext)
        if path is not None:
            names_train.append(image_name)
            paths_train.append(path)

    names_test = []
    paths_test = []
    for image_name in image_names_test:
        path = get_image_path(image_name, root, ext)
        if path is not None:
            names_test.append(image_name)
            paths_test.append(path)

    image_list_train = pd.DataFrame({'image_name': names_train, 'path': paths_train})
    image_list_test = pd.DataFrame({'image_name': names_test, 'path': paths_test})

    ## write the files to the same location as the original labels
    root = os.path.dirname(args.file)
    basename = os.path.splitext(args.file)[0]

    ## write the split targets tables
    path = basename + '_train.txt'
    print('# writing:', path, file=sys.stderr)
    targets_train.to_csv(path, sep='\t', index=False)

    path = basename + '_test.txt'
    print('# writing:', path, file=sys.stderr)
    targets_test.to_csv(path, sep='\t', index=False)

    ## write the image list tables
    path = root + os.sep + 'image_list_train.txt'
    print('# writing:', path, file=sys.stderr)
    image_list_train.to_csv(path, sep='\t', index=False)

    path = root + os.sep + 'image_list_test.txt'
    print('# writing:', path, file=sys.stderr)
    image_list_test.to_csv(path, sep='\t', index=False)
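# --- usage sketch (not part of the module) ----------------------------------
# A minimal example of driving main() directly instead of through the CLI;
# the attribute names mirror those read above, but the values and file paths
# are hypothetical.
#
#   import argparse
#
#   args = argparse.Namespace(
#       seed=42,                    # RNG seed for the micrograph permutation
#       number=10,                  # micrographs to hold out for the test set
#       file='particles.txt',       # labeled particle coordinates
#       format_='auto',             # let file_utils infer the coordinate format
#       image_dir='micrographs/',   # where the image files live
#       image_ext='mrc',            # image file extension
#   )
#   main(args)
# -----------------------------------------------------------------------------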
def load_data(train_images, train_targets, test_images, test_targets, radius,
              k_fold=0, fold=0, cross_validation_seed=42, format_='auto',
              image_ext=''):
    """Load training (and optionally test) micrographs and particle coordinates,
    matching coordinates to images and building target masks of the given radius.
    If no test set is given and k_fold > 1, a cross-validation split of the
    training micrographs is used instead.
    """

    # if train_images is a directory path, map to all images in the directory
    if os.path.isdir(train_images):
        paths = glob.glob(train_images + os.sep + '*' + image_ext)
        valid_paths = []
        image_names = []
        for path in paths:
            name = os.path.basename(path)
            name, ext = os.path.splitext(name)
            if ext in ['.mrc', '.tiff', '.png']:
                image_names.append(name)
                valid_paths.append(path)
        train_images = pd.DataFrame({'image_name': image_names, 'path': valid_paths})
    else:
        train_images = pd.read_csv(train_images, sep='\t')  # training image file list

    # training particle coordinates file
    train_targets = file_utils.read_coordinates(train_targets, format=format_)

    # check for source columns
    if 'source' not in train_images and 'source' not in train_targets:
        train_images['source'] = 0
        train_targets['source'] = 0

    # load the images and create target masks from the particle coordinates
    train_images = load_images_from_list(train_images.image_name,
                                         train_images.path,
                                         sources=train_images.source)

    # discard coordinates for micrographs not in the set of images
    # and warn the user if any are discarded
    names = set()
    for k, d in train_images.items():
        for name in d.keys():
            names.add(name)
    check = train_targets.image_name.apply(lambda x: x in names)
    missing = train_targets.image_name.loc[~check].unique().tolist()
    if len(missing) > 0:
        print('WARNING: {} micrographs listed in the coordinates file are missing from the training images. Image names are listed below.'
              .format(len(missing)), file=sys.stderr)
        print('WARNING: missing micrographs are: {}'.format(missing), file=sys.stderr)
    train_targets = train_targets.loc[check]

    # check that the particles roughly fit within the images;
    # if they don't, the user may not have scaled the particles/images correctly
    width = 0
    height = 0
    for k, d in train_images.items():
        for image in d.values():
            w, h = image.size
            if w > width:
                width = w
            if h > height:
                height = h

    out_of_bounds = (train_targets.x_coord > width) | (train_targets.y_coord > height)
    count = out_of_bounds.sum()
    # arbitrary cutoff of more than 10% of particles being out of bounds...
    if count > int(0.1 * len(train_targets)):
        print('WARNING: {} particle coordinates are out of the micrograph dimensions. Did you scale the micrographs and particle coordinates correctly?'
              .format(count), file=sys.stderr)

    # also check that the coordinates fill most of the micrograph
    x_max = train_targets.x_coord.max()
    y_max = train_targets.y_coord.max()
    if x_max < 0.7 * width and y_max < 0.7 * height:  # more arbitrary cutoffs
        print('WARNING: no coordinates are observed with x_coord > {} or y_coord > {}. Did you scale the micrographs and particle coordinates correctly?'
              .format(x_max, y_max), file=sys.stderr)

    num_micrographs = sum(len(train_images[k]) for k in train_images.keys())
    num_particles = len(train_targets)
    report('Loaded {} training micrographs with {} labeled particles'.format(
        num_micrographs, num_particles))

    train_images, train_targets = match_images_targets(train_images, train_targets, radius)

    if test_images is not None:
        # if test_images is a directory path, map to all images in the directory
        if os.path.isdir(test_images):
            paths = glob.glob(test_images + os.sep + '*' + image_ext)
            valid_paths = []
            image_names = []
            for path in paths:
                name = os.path.basename(path)
                name, ext = os.path.splitext(name)
                if ext in ['.mrc', '.tiff', '.png']:
                    image_names.append(name)
                    valid_paths.append(path)
            test_images = pd.DataFrame({'image_name': image_names, 'path': valid_paths})
        else:
            test_images = pd.read_csv(test_images, sep='\t')

        test_targets = file_utils.read_coordinates(test_targets, format=format_)

        # check for source columns
        if 'source' not in test_images and 'source' not in test_targets:
            test_images['source'] = 0
            test_targets['source'] = 0

        test_images = load_images_from_list(test_images.image_name,
                                            test_images.path,
                                            sources=test_images.source)

        # discard coordinates for micrographs not in the set of images
        # and warn the user if any are discarded
        names = set()
        for k, d in test_images.items():
            for name in d.keys():
                names.add(name)
        check = test_targets.image_name.apply(lambda x: x in names)
        missing = test_targets.image_name.loc[~check].unique().tolist()
        if len(missing) > 0:
            print('WARNING: {} micrographs listed in the coordinates file are missing from the test images. Image names are listed below.'
                  .format(len(missing)), file=sys.stderr)
            print('WARNING: missing micrographs are: {}'.format(missing), file=sys.stderr)
        test_targets = test_targets.loc[check]

        num_micrographs = sum(len(test_images[k]) for k in test_images.keys())
        num_particles = len(test_targets)
        report('Loaded {} test micrographs with {} labeled particles'.format(
            num_micrographs, num_particles))

        test_images, test_targets = match_images_targets(test_images, test_targets, radius)

    elif k_fold > 1:
        ## seed for partitioning the data
        random = np.random.RandomState(cross_validation_seed)
        ## make the split
        train_images, train_targets, test_images, test_targets = cross_validation_split(
            k_fold, fold, train_images, train_targets, random=random)

        n_train = sum(len(images) for images in train_images)
        n_test = sum(len(images) for images in test_images)
        report('Split into {} train and {} test micrographs'.format(n_train, n_test))

    return train_images, train_targets, test_images, test_targets
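# --- usage sketch (not part of the module) ----------------------------------
# Two hypothetical ways to call load_data(); the file names are placeholders.
# radius is the pixel radius used when converting particle coordinates into
# target masks. With an explicit test set:
#
#   train_images, train_targets, test_images, test_targets = load_data(
#       'image_list_train.txt',   # or a directory of .mrc/.tiff/.png images
#       'particles_train.txt',
#       'image_list_test.txt',
#       'particles_test.txt',
#       radius=3,
#   )
#
# With test_images=None and k_fold > 1, the training micrographs are instead
# partitioned and the held-out fold is returned as the test set:
#
#   splits = load_data('image_list_train.txt', 'particles_train.txt',
#                      None, None, radius=3, k_fold=5, fold=0)
# -----------------------------------------------------------------------------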
def main(args):
    verbose = args.verbose

    form = args._from
    from_forms = [form for _ in range(len(args.files))]

    # detect the input file formats
    if form == 'auto':
        try:
            from_forms = [file_utils.detect_format(path) for path in args.files]
        except file_utils.UnknownFormatError as e:
            print('Error: unrecognized input coordinates file extension (' + e.ext + ')',
                  file=sys.stderr)
            sys.exit(1)
    formats_detected = list(set(from_forms))
    if verbose > 0:
        print('# INPUT formats detected: ' + str(formats_detected), file=sys.stderr)

    # determine the output file format
    output_path = args.output
    output = None
    to_form = args.to
    if output_path is None:
        output = sys.stdout
        # if output is to stdout and the output format is not set,
        # then raise an error
        if to_form == 'auto':
            if len(formats_detected) == 1:
                # write the same output format as the input format
                to_form = from_forms[0]
            else:
                print('Error: writing to stdout with multiple input formats present and no output format (--to) set! Please specify the output format!',
                      file=sys.stderr)
                sys.exit(1)
        if to_form == 'box' or to_form == 'json':
            print('Error: writing BOX or JSON output files requires a destination directory. Please set the --output parameter!',
                  file=sys.stderr)
            sys.exit(1)

    image_ext = args.image_ext
    boxsize = args.boxsize
    if to_form == 'auto':
        # first check for a directory
        if output_path[-1] == '/':
            # image-ext must be set for these file formats
            if image_ext is None:
                print('Error: writing BOX or JSON output files requires setting the image file extension!',
                      file=sys.stderr)
                sys.exit(1)
            # format is either json or box; check for boxsize to decide
            if boxsize > 0:
                # write boxes!
                if verbose > 0:
                    print('# Detected output format is BOX, because OUTPUT is a directory and boxsize > 0.',
                          file=sys.stderr)
                to_form = 'box'
            else:
                if verbose > 0:
                    print('# Detected output format is JSON, because OUTPUT is a directory and no boxsize set.',
                          file=sys.stderr)
                to_form = 'json'
        else:
            try:
                to_form = file_utils.detect_format(output_path)
            except file_utils.UnknownFormatError as e:
                print('Error: unrecognized output coordinates file extension (' + e.ext + ')',
                      file=sys.stderr)
                sys.exit(1)
    if verbose > 0:
        print('# OUTPUT format: ' + to_form, file=sys.stderr)

    suffix = args.suffix

    t = args.threshold

    down_scale = args.down_scale
    up_scale = args.up_scale
    scale = up_scale / down_scale

    # special case when inputs and outputs are all star files
    if len(formats_detected) == 1 and formats_detected[0] == 'star' and to_form == 'star':
        dfs = []
        for path in args.files:
            with open(path, 'r') as f:
                table = star.parse(f)
            dfs.append(table)
        table = pd.concat(dfs, axis=0)

        # apply the score threshold if a score column is present
        if star.SCORE_COLUMN_NAME in table.columns:
            table = table.loc[table[star.SCORE_COLUMN_NAME] >= t]

        # scale coordinates
        if scale != 1:
            x_coord = table[star.X_COLUMN_NAME].values
            x_coord = np.round(scale * x_coord).astype(int)
            table[star.X_COLUMN_NAME] = x_coord
            y_coord = table[star.Y_COLUMN_NAME].values
            y_coord = np.round(scale * y_coord).astype(int)
            table[star.Y_COLUMN_NAME] = y_coord

        # add metadata if specified
        if args.voltage > 0:
            table[star.VOLTAGE] = args.voltage
        if args.detector_pixel_size > 0:
            table[star.DETECTOR_PIXEL_SIZE] = args.detector_pixel_size
        if args.magnification > 0:
            table[star.MAGNIFICATION] = args.magnification
        if args.amplitude_contrast > 0:
            table[star.AMPLITUDE_CONTRAST] = args.amplitude_contrast

        # write output file
        if output is None:
            with open(output_path, 'w') as f:
                star.write(table, f)
        else:
            star.write(table, output)

    else:  # general case
        # read the input files
        dfs = []
        for i in range(len(args.files)):
            path = args.files[i]
            coords = file_utils.read_coordinates(path, format=from_forms[i])
            dfs.append(coords)
        coords = pd.concat(dfs, axis=0)

        # threshold particles by score (if there is a score)
        if 'score' in coords.columns:
            coords = coords.loc[coords['score'] >= t]

        # scale coordinates
        if scale != 1:
            x_coord = coords['x_coord'].values
            x_coord = np.round(scale * x_coord).astype(int)
            coords['x_coord'] = x_coord
            y_coord = coords['y_coord'].values
            y_coord = np.round(scale * y_coord).astype(int)
            coords['y_coord'] = y_coord

        # add metadata if specified
        if args.voltage > 0:
            coords['voltage'] = args.voltage
        if args.detector_pixel_size > 0:
            coords['detector_pixel_size'] = args.detector_pixel_size
        if args.magnification > 0:
            coords['magnification'] = args.magnification
        if args.amplitude_contrast > 0:
            coords['amplitude_contrast'] = args.amplitude_contrast

        # invert y-axis coordinates if specified
        invert_y = args.invert_y
        if invert_y:
            if args.imagedir is None:
                print('Error: --imagedir must specify the directory of images in order to mirror the y-axis coordinates',
                      file=sys.stderr)
                sys.exit(1)
            dfs = []
            for image_name, group in coords.groupby('image_name'):
                impath = os.path.join(args.imagedir, image_name) + '.' + args.image_ext
                # use glob in case image_ext is '*'
                impath = glob.glob(impath)[0]
                im = load_image(impath)
                height = im.height
                group = mirror_y_axis(group, height)
                dfs.append(group)
            coords = pd.concat(dfs, axis=0)

        # output file format is decided and coordinates are processed, now write files
        if output is None and to_form != 'box' and to_form != 'json':
            output = open(output_path, 'w')
        if to_form == 'box' or to_form == 'json':
            output = output_path
        file_utils.write_coordinates(output, coords, format=to_form,
                                     boxsize=boxsize, image_ext=image_ext,
                                     suffix=suffix)
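# --- usage sketch (not part of the module) ----------------------------------
# A hypothetical example of driving the conversion entry point above
# programmatically: read STAR-format picks, keep particles with score >= 0,
# upscale coordinates 4x, and write a tab-delimited coordinate table. All
# values and file names are placeholders; 'coord' is assumed to be the name
# file_utils uses for the tab-delimited format.
#
#   import argparse
#
#   args = argparse.Namespace(
#       files=['picks.star'], _from='auto', to='coord',
#       output='picks.txt', verbose=1,
#       threshold=0, up_scale=4, down_scale=1,
#       boxsize=0, image_ext=None, suffix='',
#       invert_y=False, imagedir=None,
#       voltage=-1, detector_pixel_size=-1,       # negative values skip the
#       magnification=-1, amplitude_contrast=-1,  # optional metadata columns
#   )
#   main(args)
# -----------------------------------------------------------------------------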