Example #1
    def __init__(self, data_path, resize):
        self.dd = ifcb.DataDirectory(data_path)

        # use 299x299 for inception_v3, all other models use 224x224
        if isinstance(resize, int):
            resize = (resize, resize)
        self.resize = resize
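
For context, this constructor appears to belong to a Dataset-style wrapper around an IFCB data directory. A minimal usage sketch, assuming a hypothetical enclosing class named IfcbDataset (both the class name and the path are illustrative only):

# hypothetical class name and path, for illustration only
dataset = IfcbDataset('/data/ifcb/raw', resize=299)  # inception_v3
dataset = IfcbDataset('/data/ifcb/raw', resize=224)  # an int expands to (224, 224)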
Example #2
 def scan(self):
     for dd in self.dataset.directories.filter(kind=DataDirectory.RAW).order_by('priority'):
         if not os.path.exists(dd.path):
             continue # skip and continue searching
         directory = ifcb.DataDirectory(dd.path)
         for b in directory:
             yield b
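
Because scan() is a generator that yields bins one at a time, callers can walk an arbitrarily large dataset without holding it all in memory. A hedged consumption sketch (the accession object name is hypothetical; .lid is the bin attribute used in Example #5):

# hypothetical caller object; scan() yields bins lazily
for sample_bin in accession.scan():
    print(sample_bin.lid)  # handle each bin as it is discovered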
Example #3
 def get_raw_directory(self):
     if self.kind != self.RAW:
         raise ValueError('not a raw directory')
     # return the underlying ifcb.DataDirectory
     whitelist = re.split(',', self.whitelist)
     blacklist = re.split(',', self.blacklist)
     return ifcb.DataDirectory(self.path,
                               whitelist=whitelist,
                               blacklist=blacklist)
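
One caveat: re.split(',', '') returns [''], so an empty whitelist or blacklist field yields a list containing a single empty string rather than an empty list. A defensive variant, assuming the same comma-separated storage, might drop empty entries before constructing the DataDirectory:

# sketch only: filter out empty strings produced by splitting an empty field
whitelist = [s for s in self.whitelist.split(',') if s]
blacklist = [s for s in self.blacklist.split(',') if s]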
Example #4
 def scan(self, dir):
     """scan a directory and add all bins found"""
     logging.debug('scanning {} ...'.format(dir))
     dd = ifcb.DataDirectory(dir)
     for bin in dd:
         if not self.exists(bin):
             logging.debug('adding {}'.format(bin))
             self.add_bin(bin)
         else:
             logging.debug('skipping {}, exists'.format(bin))
     logging.debug('done.')
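
This method only logs at DEBUG level, so nothing is printed unless logging is configured accordingly. A quick way to watch the scan (the scanner instance and path are hypothetical):

import logging
logging.basicConfig(level=logging.DEBUG)  # surface the debug messages above
scanner.scan('/data/ifcb/raw')            # hypothetical instance and path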
Example #5
def main(args):
    data_dir = ifcb.DataDirectory(args.input_dir)
    for sample_bin in data_dir:
        print(sample_bin)
        number_of_images = len(sample_bin.images)
        lid = sample_bin.lid
        print('{} has {} image(s)'.format(lid, number_of_images))
        with sample_bin:
            path = os.path.join(args.output_dir, lid)
            Path(path).mkdir(parents=True, exist_ok=True)
            for roi_number in sample_bin.images:
                img_path = os.path.join(path,
                                        lid + '_' + str(roi_number) + '.png')
                print(img_path)
                imageio.imwrite(img_path, sample_bin.images[roi_number])
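
This main() expects an argparse namespace with input_dir and output_dir attributes (both referenced above), and assumes os, pathlib.Path, imageio, and ifcb are imported. A minimal wiring sketch; the help strings are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='extract IFCB ROIs to PNG files')
    parser.add_argument('input_dir', help='directory containing raw IFCB data')
    parser.add_argument('output_dir', help='directory to write PNG images into')
    main(parser.parse_args())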
Example #6
 def sync_one(self, pid):
     bin = None
     for dd in self.dataset.directories.filter(
             kind=DataDirectory.RAW).order_by('priority'):
         if not os.path.exists(dd.path):
             continue  # skip and continue searching
         directory = ifcb.DataDirectory(dd.path)
         try:
             bin = directory[pid]
             break  # found in the highest-priority directory; stop searching
         except KeyError:
             continue
     if bin is None:
         return 'bin {} not found'.format(pid)
     # create instrument if necessary
     i = bin.pid.instrument
     version = bin.pid.schema_version
     instrument, created = Instrument.objects.get_or_create(
         number=i, defaults={'version': version})
     # create model object
     timestamp = bin.pid.timestamp
     b, created = Bin.objects.get_or_create(
         pid=pid,
         defaults={
             'timestamp': timestamp,
             'sample_time': timestamp,
             'instrument': instrument,
             'skip': True,  # in case accession is interrupted
         })
     if not created and self.dataset not in b.datasets.all():
         self.dataset.bins.add(b)
         return
     b2s, error = self.add_bin(bin, b)
     if error is not None:
         # there was an error. if we created a bin, delete it
         if created:
             b.delete()
         return error
     with transaction.atomic():
         if not b2s.qc_no_rois:
             b2s.skip = False
             b2s.save()
             self.dataset.bins.add(b2s)
         else:
             b2s.save()
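
sync_one() returns an error string on failure and None on success, so callers can branch on the return value. A hedged calling sketch (the accession object and the pid are hypothetical):

# hypothetical caller and pid, for illustration only
error = accession.sync_one('D20180101T000000_IFCB101')
if error is not None:
    print(error)  # e.g. 'bin D20180101T000000_IFCB101 not found'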
Example #7
def do_run(args):

    # validate filter arguments
    if args.filter:
        if args.filter[0] not in ['IN', 'OUT']:
            raise argparse.ArgumentTypeError('IN|OUT must be either "IN" or "OUT"')
        if len(args.filter) < 2:
            raise argparse.ArgumentTypeError('there must be at least one KEYWORD')

    # load model
    classifier = NeustonModel.load_from_checkpoint(args.MODEL)
    seed_everything(classifier.hparams.seed)

    # ARG CORRECTIONS AND CHECKS
    if os.path.isdir(args.SRC) and not args.SRC.endswith(os.sep):
        args.SRC = args.SRC + os.sep

    # set OUTFILE defaults
    if not args.outfile:
        if args.src_type == 'bin':
            args.outfile = ['D{BIN_YEAR}/D{BIN_DATE}/{BIN_ID}_class.h5']
        elif args.src_type == 'img':
            args.outfile = ['img_results.json']

    # Setup Callbacks
    plotting_callbacks = []  # TODO
    run_results_callbacks = []
    for outfile in args.outfile:
        svr = SaveTestResults(outdir=args.outdir,
                              outfile=outfile,
                              timestamp=args.cmd_timestamp)
        run_results_callbacks.append(svr)

    # create trainer
    trainer = Trainer(
        deterministic=True,
        gpus=len(args.gpus) if args.gpus else None,
        logger=False,
        checkpoint_callback=False,
        callbacks=run_results_callbacks,
    )

    # dataset filter if any
    filter_mode, filter_keywords = None, []
    if args.filter:
        filter_mode = args.filter[0]
        for keyword in args.filter[1:]:
            if os.path.isfile(keyword):
                with open(keyword) as f:
                    # strip newlines so keywords match bin IDs correctly
                    filter_keywords.extend(line.strip() for line in f)
            else:
                filter_keywords.append(keyword)

    # create dataset
    image_loaders = []
    if args.src_type == 'bin':
        # Formatting Dataset
        if os.path.isdir(args.SRC):
            if filter_mode == 'IN':
                dd = ifcb.DataDirectory(args.SRC, whitelist=filter_keywords)
            elif filter_mode == 'OUT':
                dd = ifcb.DataDirectory(args.SRC, blacklist=filter_keywords)
            else:
                dd = ifcb.DataDirectory(args.SRC)
        elif os.path.isfile(args.SRC) and args.SRC.endswith(
                '.txt'):  # TODO TEST: textfile bin run
            with open(args.SRC, 'r') as f:
                # strip newlines so paths work with commonpath and the whitelist
                bins = [line.strip() for line in f]
            parent = os.path.commonpath(bins)
            dd = ifcb.DataDirectory(parent, whitelist=bins)
        else:  # single bin # TODO TEST: single bin run
            parent = os.path.dirname(args.SRC)
            bin_id = os.path.basename(args.SRC)
            dd = ifcb.DataDirectory(parent, whitelist=[bin_id])

        error_bins = []

        if args.gobig:
            print('Loading Bins', end=' ')
        for i, bin_fileset in enumerate(dd):
            bin_fileset.pid.namespace = os.path.dirname(
                bin_fileset.fileset.basepath.replace(args.SRC, '')) + os.sep
            bin_obj = bin_fileset.pid
            if args.filter:  # applying filter
                if filter_mode == 'IN':  # if bin does NOT match any of the keywords, skip it
                    if not any([k in str(bin_obj) for k in filter_keywords]):
                        continue
                elif filter_mode == 'OUT':  # if bin matches any of the keywords, skip it
                    if any([k in str(bin_obj) for k in filter_keywords]):
                        continue

            if not args.clobber:
                output_files = [
                    os.path.join(args.outdir, ofile) for ofile in args.outfile
                ]
                outfile_dict = dict(BIN_ID=bin_obj.pid,
                                    BIN_YEAR=bin_obj.year,
                                    BIN_DATE=bin_obj.yearday,
                                    INPUT_SUBDIRS=bin_obj.namespace)
                output_files = [
                    ofile.format(**outfile_dict).replace(2 * os.sep, os.sep)
                    for ofile in output_files
                ]
                if all([os.path.isfile(ofile) for ofile in output_files]):
                    print(
                        '{} result-file(s) already exist - skipping this bin'.
                        format(bin_obj))
                    continue

            bin_dataset = IfcbBinDataset(bin_fileset,
                                         classifier.hparams.resize,
                                         classifier.hparams.img_norm)
            image_loader = DataLoader(bin_dataset,
                                      batch_size=args.batch_size,
                                      pin_memory=True,
                                      num_workers=args.loaders)

            # skip empty bins
            if len(image_loader) == 0:
                error_bins.append((bin_obj, AssertionError('Bin is Empty')))
                continue
            if args.gobig:
                print('.', end='', flush=True)
                image_loaders.append(image_loader)
            else:
                # Do runs one bin at a time
                try:
                    trainer.test(classifier, test_dataloaders=image_loader)
                except Exception as e:
                    error_bins.append((bin_obj, e))

        # Do Runs all at once
        if args.gobig:
            print()
            trainer.test(classifier, test_dataloaders=image_loaders)

        # Final Statements
        print('RUN IS DONE')
        if error_bins:
            print("The following bins failed; they were not processed:")
            for bin_obj, err in error_bins:
                print(bin_obj, type(err), err)

    ## IMAGES ##
    else:
        img_paths = []
        if os.path.isdir(args.SRC):
            for pardir, _, imgs in os.walk(args.SRC):
                imgs = [
                    os.path.join(pardir, img) for img in imgs
                    if img.endswith(IMG_EXTENSIONS)
                ]
                img_paths.extend(imgs)
        elif os.path.isfile(args.SRC) and args.SRC.endswith(
                '.txt'):  # TODO TEST: textfile img run
            with open(args.SRC, 'r') as f:
                img_paths = f.readlines()
                img_paths = [img.strip() for img in img_paths]
                img_paths = [
                    img for img in img_paths if img.endswith(IMG_EXTENSIONS)
                ]
        elif args.SRC.endswith(
                IMG_EXTENSIONS):  # single img # TODO TEST: single img run
            img_paths.append(args.SRC)

        # applying filter
        if args.filter:
            for img in img_paths[:]:
                if filter_mode == 'IN':  # if img does NOT match any of the keywords, skip it
                    if not any([k in img for k in filter_keywords]):
                        img_paths.remove(img)
                elif filter_mode == 'OUT':  # if img matches any of the keywords, skip it
                    if any([k in img for k in filter_keywords]):
                        img_paths.remove(img)

        assert len(img_paths) > 0, 'No images to process'
        image_dataset = ImageDataset(img_paths,
                                     resize=classifier.hparams.resize,
                                     input_src=args.SRC)
        image_loader = DataLoader(image_dataset,
                                  batch_size=args.batch_size,
                                  pin_memory=True,
                                  num_workers=args.loaders)

        trainer.test(classifier, test_dataloaders=image_loader)
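
do_run() reads many attributes off the args namespace (MODEL, SRC, src_type, filter, outfile, outdir, batch_size, loaders, gobig, clobber, gpus, cmd_timestamp). A minimal parser sketch covering them follows; the flag spellings and defaults are assumptions, not the project's actual CLI:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('MODEL', help='path to a trained checkpoint')
parser.add_argument('SRC', help='bin/image directory, .txt list, or single file')
parser.add_argument('--src-type', dest='src_type', choices=['bin', 'img'], default='bin')
parser.add_argument('--filter', nargs='+', metavar='IN|OUT KEYWORD')
parser.add_argument('--outfile', nargs='*')
parser.add_argument('--outdir', default='.')
parser.add_argument('--batch-size', dest='batch_size', type=int, default=64)
parser.add_argument('--loaders', type=int, default=4)
parser.add_argument('--gobig', action='store_true')
parser.add_argument('--clobber', action='store_true')
parser.add_argument('--gpus', nargs='*')
parser.add_argument('--cmd-timestamp', dest='cmd_timestamp')
do_run(parser.parse_args())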
Example #8
    if torch.cuda.device_count() > 1:
        print("multiple GPUs detected ({}), using all available GPUs".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(
            model, device_ids=list(range(torch.cuda.device_count())))

    resize = 299 if 'inception' in str(model.__class__) else 224

    ## creating output directory
    os.makedirs(args.outdir, exist_ok=True)

    ## ingesting input
    if all([os.path.isdir(src) for src in args.src]):
        for src in args.src:
            if args.input_type == 'bins':
                assert '{bin}' in args.outfile
                dd = ifcb.DataDirectory(src)
                num_of_bins = len(dd)
                if args.bin_filter:
                    with open(args.bin_filter) as f:
                        args.bin_filter = [line.strip() for line in f]
                for i, bin in enumerate(dd):
                    bin_id = os.path.basename(str(bin))
                    if args.bin_filter and bin_id not in args.bin_filter:
                        continue
                    bin_dataset = IfcbBinDataset(bin, resize)
                    image_loader = DataLoader(bin_dataset,
                                              batch_size=args.batch_size,
                                              pin_memory=True,
                                              num_workers=args.loaders)

                    print('{:.02f}% {} images:{}, batches:{}'.format(