def __init__(self, data_path, resize):
    self.dd = ifcb.DataDirectory(data_path)
    # use 299x299 for inception_v3, all other models use 244x244
    if isinstance(resize, int):
        resize = (resize, resize)
    self.resize = resize
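# Hedged usage sketch: the class name BinDataset and the data path are illustrative,
# assuming the __init__ above belongs to a dataset class wrapping an IFCB data directory.
ds = BinDataset('/data/ifcb', resize=299)  # 299 for inception_v3
assert ds.resize == (299, 299)             # a bare int is expanded to a (height, width) tuple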
def scan(self):
    for dd in self.dataset.directories.filter(kind=DataDirectory.RAW).order_by('priority'):
        if not os.path.exists(dd.path):
            continue  # skip and continue searching
        directory = ifcb.DataDirectory(dd.path)
        for b in directory:
            yield b
def get_raw_directory(self):
    if self.kind != self.RAW:
        raise ValueError('not a raw directory')
    # return the underlying ifcb.DataDirectory
    whitelist = re.split(',', self.whitelist)
    blacklist = re.split(',', self.blacklist)
    return ifcb.DataDirectory(self.path, whitelist=whitelist, blacklist=blacklist)
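# Hedged usage sketch: raw_dir_row stands for a RAW-kind DataDirectory model instance;
# get_raw_directory() applies its comma-separated whitelist/blacklist and yields pyifcb bins.
raw_dd = raw_dir_row.get_raw_directory()
for sample_bin in raw_dd:
    print(sample_bin.lid)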
def scan(self, dir):
    """scan a directory and add all bins found"""
    logging.debug('scanning {} ...'.format(dir))
    dd = ifcb.DataDirectory(dir)
    for bin in dd:
        if not self.exists(bin):
            logging.debug('adding {}'.format(bin))
            self.add_bin(bin)
        else:
            logging.debug('skipping {}, exists'.format(bin))
    logging.debug('done.')
def main(args):
    data_dir = ifcb.DataDirectory(args.input_dir)
    for sample_bin in data_dir:
        print(sample_bin)
        number_of_images = len(sample_bin.images)
        lid = sample_bin.lid
        print('{} has {} image(s)'.format(lid, number_of_images))
        with sample_bin:
            path = os.path.join(args.output_dir, lid)
            Path(path).mkdir(parents=True, exist_ok=True)
            for roi_number in sample_bin.images:
                img_path = os.path.join(path, lid + '_' + str(roi_number) + '.png')
                print(img_path)
                imageio.imwrite(img_path, sample_bin.images[roi_number])
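# A minimal, hypothetical argparse wrapper for main() above; the function reads only
# args.input_dir and args.output_dir, so those are the two arguments exposed here.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='extract IFCB ROI images as PNGs')
    parser.add_argument('input_dir', help='directory containing raw IFCB data')
    parser.add_argument('output_dir', help='directory to write per-bin image folders')
    main(parser.parse_args())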
def sync_one(self, pid):
    bin = None
    for dd in self.dataset.directories.filter(
            kind=DataDirectory.RAW).order_by('priority'):
        if not os.path.exists(dd.path):
            continue  # skip and continue searching
        directory = ifcb.DataDirectory(dd.path)
        try:
            bin = directory[pid]
        except KeyError:
            continue
        break  # found it; stop searching lower-priority directories
    if bin is None:
        return 'bin {} not found'.format(pid)
    # create instrument if necessary
    i = bin.pid.instrument
    version = bin.pid.schema_version
    instrument, created = Instrument.objects.get_or_create(
        number=i, defaults={'version': version})
    # create model object
    timestamp = bin.pid.timestamp
    b, created = Bin.objects.get_or_create(
        pid=pid,
        defaults={
            'timestamp': timestamp,
            'sample_time': timestamp,
            'instrument': instrument,
            'skip': True,  # in case accession is interrupted
        })
    if not created and self.dataset not in b.datasets.all():
        self.dataset.bins.add(b)
        return
    b2s, error = self.add_bin(bin, b)
    if error is not None:
        # there was an error. if we created a bin, delete it
        if created:
            b.delete()
        return error
    with transaction.atomic():
        if not b2s.qc_no_rois:
            b2s.skip = False
            b2s.save()
            self.dataset.bins.add(b2s)
        else:
            b2s.save()
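# Hedged usage sketch: accession stands for whatever object defines sync_one(), and the
# PID string is illustrative. sync_one() returns an error message when syncing fails.
for pid in ['D20190101T000000_IFCB101']:
    err = accession.sync_one(pid)
    if err:
        print('sync failed:', err)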
def do_run(args):
    # assert correct filter arguments
    if args.filter:
        if args.filter[0] not in ['IN', 'OUT']:
            raise argparse.ArgumentTypeError('IN|OUT must be either "IN" or "OUT"')
        if len(args.filter) < 2:
            raise argparse.ArgumentTypeError('There must be at least one KEYWORD')

    # load model
    classifier = NeustonModel.load_from_checkpoint(args.MODEL)
    seed_everything(classifier.hparams.seed)

    # ARG CORRECTIONS AND CHECKS
    if os.path.isdir(args.SRC) and not args.SRC.endswith(os.sep):
        args.SRC = args.SRC + os.sep

    # set OUTFILE defaults
    if not args.outfile:
        if args.src_type == 'bin':
            args.outfile = ['D{BIN_YEAR}/D{BIN_DATE}/{BIN_ID}_class.h5']
        if args.src_type == 'img':
            args.outfile = ['img_results.json']

    # Setup Callbacks
    plotting_callbacks = []  # TODO
    run_results_callbacks = []
    for outfile in args.outfile:
        svr = SaveTestResults(outdir=args.outdir, outfile=outfile, timestamp=args.cmd_timestamp)
        run_results_callbacks.append(svr)

    # create trainer
    trainer = Trainer(
        deterministic=True,
        gpus=len(args.gpus) if args.gpus else None,
        logger=False,
        checkpoint_callback=False,
        callbacks=run_results_callbacks,
    )

    # dataset filter if any
    filter_mode, filter_keywords = None, []
    if args.filter:
        filter_mode = args.filter[0]
        for keyword in args.filter[1:]:
            if os.path.isfile(keyword):
                with open(keyword) as f:
                    filter_keywords.extend(f.readlines())
            else:
                filter_keywords.append(keyword)

    # create dataset
    image_loaders = []

    ## BINS ##
    if args.src_type == 'bin':
        # Formatting Dataset
        if os.path.isdir(args.SRC):
            if filter_mode == 'IN':
                dd = ifcb.DataDirectory(args.SRC, whitelist=filter_keywords)
            elif filter_mode == 'OUT':
                dd = ifcb.DataDirectory(args.SRC, blacklist=filter_keywords)
            else:
                dd = ifcb.DataDirectory(args.SRC)
        elif os.path.isfile(args.SRC) and args.SRC.endswith('.txt'):  # TODO TEST: textfile bin run
            with open(args.SRC, 'r') as f:
                bins = [line.strip() for line in f]
            parent = os.path.commonpath(bins)
            dd = ifcb.DataDirectory(parent, whitelist=bins)
        else:  # single bin  # TODO TEST: single bin run
            parent = os.path.dirname(args.SRC)
            bin_id = os.path.basename(args.SRC)
            dd = ifcb.DataDirectory(parent, whitelist=[bin_id])

        error_bins = []
        if args.gobig:
            print('Loading Bins', end=' ')
        for i, bin_fileset in enumerate(dd):
            bin_fileset.pid.namespace = os.path.dirname(
                bin_fileset.fileset.basepath.replace(args.SRC, '')) + os.sep
            bin_obj = bin_fileset.pid

            # applying filter
            if args.filter:
                if filter_mode == 'IN':  # if bin does NOT match any of the keywords, skip it
                    if not any([k in str(bin_obj) for k in filter_keywords]):
                        continue
                elif filter_mode == 'OUT':  # if bin matches any of the keywords, skip it
                    if any([k in str(bin_obj) for k in filter_keywords]):
                        continue

            if not args.clobber:
                output_files = [os.path.join(args.outdir, ofile) for ofile in args.outfile]
                outfile_dict = dict(BIN_ID=bin_obj.pid, BIN_YEAR=bin_obj.year,
                                    BIN_DATE=bin_obj.yearday, INPUT_SUBDIRS=bin_obj.namespace)
                output_files = [ofile.format(**outfile_dict).replace(2 * os.sep, os.sep)
                                for ofile in output_files]
                if all([os.path.isfile(ofile) for ofile in output_files]):
                    print('{} result-file(s) already exist - skipping this bin'.format(bin_obj))
                    continue

            bin_dataset = IfcbBinDataset(bin_fileset, classifier.hparams.resize,
                                         classifier.hparams.img_norm)
            image_loader = DataLoader(bin_dataset, batch_size=args.batch_size,
                                      pin_memory=True, num_workers=args.loaders)

            # skip empty bins
            if len(image_loader) == 0:
                error_bins.append((bin_obj, AssertionError('Bin is Empty')))
                continue

            if args.gobig:
                print('.', end='', flush=True)
                image_loaders.append(image_loader)
            else:
                # Do runs one bin at a time
                try:
                    trainer.test(classifier, test_dataloaders=image_loader)
                except Exception as e:
                    error_bins.append((bin_obj, e))

        # Do Runs all at once
        if args.gobig:
            print()
            trainer.test(classifier, test_dataloaders=image_loaders)

        # Final Statements
        print('RUN IS DONE')
        if error_bins:
            print("The following bins failed; they were not processed:")
            for bin_obj, err in error_bins:
                print(bin_obj, type(err), err)

    ## IMAGES ##
    else:
        img_paths = []
        if os.path.isdir(args.SRC):
            for pardir, _, imgs in os.walk(args.SRC):
                imgs = [os.path.join(pardir, img) for img in imgs
                        if img.endswith(IMG_EXTENSIONS)]
                img_paths.extend(imgs)
        elif os.path.isfile(args.SRC) and args.SRC.endswith('.txt'):  # TODO TEST: textfile img run
            with open(args.SRC, 'r') as f:
                img_paths = f.readlines()
            img_paths = [img.strip() for img in img_paths]
            img_paths = [img for img in img_paths if img.endswith(IMG_EXTENSIONS)]
        elif args.SRC.endswith(IMG_EXTENSIONS):  # single img  # TODO TEST: single img run
            img_paths.append(args.SRC)

        # applying filter
        if args.filter:
            for img in img_paths[:]:
                if filter_mode == 'IN':  # if img does NOT match any of the keywords, skip it
                    if not any([k in img for k in filter_keywords]):
                        img_paths.remove(img)
                elif filter_mode == 'OUT':  # if img matches any of the keywords, skip it
                    if any([k in img for k in filter_keywords]):
                        img_paths.remove(img)

        assert len(img_paths) > 0, 'No images to process'

        image_dataset = ImageDataset(img_paths, resize=classifier.hparams.resize,
                                     input_src=args.SRC)
        image_loader = DataLoader(image_dataset, batch_size=args.batch_size,
                                  pin_memory=True, num_workers=args.loaders)
        trainer.test(classifier, test_dataloaders=image_loader)
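# Hedged invocation sketch: an argparse-style Namespace carrying the attributes that
# do_run() reads; every value below is illustrative, not a documented default.
from argparse import Namespace

args = Namespace(
    MODEL='trained_model.ckpt', SRC='/data/ifcb/raw/', src_type='bin',
    outdir='./run_output', outfile=[], filter=None, clobber=False, gobig=False,
    batch_size=64, loaders=4, gpus=None, cmd_timestamp=None,
)
do_run(args)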
if torch.cuda.device_count() > 1:
    print("multiple GPUs detected ({}), using all available GPUs".format(
        torch.cuda.device_count()))
    model = nn.DataParallel(model, device_ids=list(range(len(gpus))))

resize = 299 if 'inception' in str(model.__class__) else 244

## creating output directory
os.makedirs(args.outdir, exist_ok=True)

## ingesting input
if all([os.path.isdir(src) for src in args.src]):
    for src in args.src:
        if args.input_type == 'bins':
            assert '{bin}' in args.outfile
            dd = ifcb.DataDirectory(src)
            num_of_bins = len(dd)
            if args.bin_filter:
                with open(args.bin_filter) as f:
                    args.bin_filter = [line.strip() for line in f]
            for i, bin in enumerate(dd):
                bin_id = os.path.basename(str(bin))
                if args.bin_filter and bin_id not in args.bin_filter:
                    continue
                bin_dataset = IfcbBinDataset(bin, resize)
                image_loader = DataLoader(bin_dataset, batch_size=args.batch_size,
                                          pin_memory=True, num_workers=args.loaders)
                print('{:.02f}% {} images:{}, batches:{}'.format(