def __init__(self, context):
    super(BottleneckProjectionsReader, self).__init__(context)
    self.logger = LOGGER.getChild("BottleneckProjectionsReader")
    raw_split = context.get_input_split(raw=True)
    split = OpaqueInputSplit().read(io.BytesIO(raw_split))
    jc = context.job_conf
    if common.SEED_KEY in jc:
        # https://github.com/crs4/pydoop/issues/318
        seed = jc.get_int(common.SEED_KEY)
        LOGGER.info("random seed: %r", seed)
        random.seed(seed)
    model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    self.bneck_store = BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    self.n_steps = jc.get_int(common.NUM_STEPS_KEY)
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    val_fraction = jc.get_int(common.VALIDATION_PERCENT_KEY) / 100
    # get *all* bottlenecks for this split, assuming they fit in memory
    bneck_map = self.bneck_store.get_bnecks(top_dir, posmap=split.payload)
    self.val_bneck_map, self.train_bneck_map = {}, {}
    while bneck_map:
        c, bnecks = bneck_map.popitem()
        i = round(val_fraction * len(bnecks))
        self.val_bneck_map[c] = bnecks[:i]
        self.train_bneck_map[c] = bnecks[i:]
        self.logger.info(
            "training size = %d, validation size = %d",
            len(self.train_bneck_map[c]), len(self.val_bneck_map[c])
        )
    train_bs = jc.get_int(common.TRAIN_BATCH_SIZE_KEY)
    val_bs = jc.get_int(common.VALIDATION_BATCH_SIZE_KEY)
    self.val_bs_map, self.train_bs_map = self.__map_bs(val_bs, train_bs)
    self.step_count = 0
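
# The per-class split above can be hard to read in isolation. Below is a
# minimal, self-contained sketch (with made-up data, not part of this
# package) of the same partitioning rule: the first
# round(val_fraction * n) vectors of each class go to validation, the rest
# to training.

def split_bnecks(bneck_map, val_fraction):
    """Partition each class's bottleneck list into validation and training."""
    val_map, train_map = {}, {}
    for c, bnecks in bneck_map.items():
        i = round(val_fraction * len(bnecks))
        val_map[c] = bnecks[:i]
        train_map[c] = bnecks[i:]
    return val_map, train_map

if __name__ == "__main__":
    fake_map = {"daisy": list(range(10)), "tulip": list(range(7))}
    val, train = split_bnecks(fake_map, 0.2)
    assert len(val["daisy"]) == 2 and len(train["daisy"]) == 8
    assert len(val["tulip"]) == 1 and len(train["tulip"]) == 6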
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    LOGGER.setLevel(args.log_level)
    model = models.get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)
    img_map = map_input_files(args.input)
    LOGGER.info("%d classes, %r images",
                len(img_map), [len(_) for _ in img_map.values()])
    if hdfs.path.exists(args.bnecks_dir):
        LOGGER.info("%r already exists, skipping bottleneck calculation",
                    args.bnecks_dir)
    else:
        LOGGER.info("caching bottlenecks to %r", args.bnecks_dir)
        calc_bottlenecks(model, img_map, args.bnecks_dir)
    train_bneck_map, val_bneck_map, test_bneck_map = maps = get_bneck_maps(
        model, args.bnecks_dir, args.test_percent, args.validation_percent)
    for cat, m in zip(("train", "val", "test"), maps):
        LOGGER.info("%s set: %r", cat, [len(_) for _ in m.values()])
    labels = BottleneckStore.assign_labels(args.bnecks_dir)
    n_classes = len(labels)
    for m in (img_map, *maps):
        assert len(m) == n_classes
    # train
    retrainer = tflow.Retrainer(model, n_classes, args.learning_rate)
    # open the stats file once, outside the loop: reopening it in "wt" mode
    # at every step would truncate the previously written entries
    with hdfs.open(args.train_output, "wt") as f:
        for i in range(args.num_steps):
            train_bnecks, train_gtruths = get_sample_vectors(
                train_bneck_map, args.train_batch_size, labels)
            val_bnecks, val_gtruths = get_sample_vectors(
                val_bneck_map, args.validation_batch_size, labels)
            retrainer.run_train_step(train_bnecks, train_gtruths)
            if (i % args.eval_step_interval == 0) or (i + 1 >= args.num_steps):
                train_accuracy, cross_entropy = retrainer.run_eval_step(
                    train_bnecks, train_gtruths)
                val_accuracy = retrainer.run_validation_step(
                    val_bnecks, val_gtruths)
                dump_stats(f, i, cross_entropy, train_accuracy, val_accuracy)
    retrainer.checkpoint(args.trained_model)
    # test
    test_bnecks, test_gtruths = get_sample_vectors(
        test_bneck_map, args.test_batch_size, labels)
    test_accuracy, predictions = retrainer.session.run(
        [retrainer.eval_step, retrainer.prediction],
        feed_dict={
            retrainer.bneck_input: test_bnecks,
            retrainer.ground_truth_input: test_gtruths,
        })
    print("test_accuracy: %f%%" % (100 * test_accuracy))
    retrainer.close_session()
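
# ``get_sample_vectors`` is used above to draw a random batch from a
# class -> bottlenecks map together with one-hot ground truth rows. Its
# actual implementation lives elsewhere in the package; the following is
# only a plausible sketch of those semantics, assuming ``labels`` maps
# each class name to an integer index.

import random
import numpy as np

def sample_vectors_sketch(bneck_map, batch_size, labels):
    bnecks, gtruths = [], []
    classes = list(bneck_map)
    for _ in range(batch_size):
        c = random.choice(classes)                  # pick a class at random
        bnecks.append(random.choice(bneck_map[c]))  # pick one of its vectors
        row = np.zeros(len(labels), dtype=np.float32)
        row[labels[c]] = 1.0                        # one-hot ground truth
        gtruths.append(row)
    return np.array(bnecks), np.array(gtruths)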
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
    self.n_steps = jc.get_int(common.NUM_STEPS_KEY)
    self.eval_step_interval = jc.get_int(common.EVAL_STEP_INTERVAL_KEY)
    learn_rate = jc.get_float(common.LEARNING_RATE_KEY)
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    self.labels = BottleneckStore.assign_labels(top_dir)
    self.validation_percent = jc.get_int(common.VALIDATION_PERCENT_KEY)
    model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    self.retrainer = tflow.Retrainer(model, len(self.labels), learn_rate)
    self.out_path = "%s.zip" % context.get_default_work_file()
def main(argv=None):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)
    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks",
                len(bneck_map), [len(_) for _ in bneck_map.values()])
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    self.model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    graph = self.model.load_prep()
    bneck_tensor = self.model.get_bottleneck(graph)
    del graph
    self.bneck_store = BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    # get *all* bottlenecks
    bneck_map = self.bneck_store.get_bnecks(top_dir)
    self.bnecks, self.gtruths = BottleneckStore.bnecks_map_to_vectors(
        bneck_map, BottleneckStore.assign_labels(top_dir)
    )
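
# ``BottleneckStore.bnecks_map_to_vectors`` flattens the whole map into two
# parallel sequences: one of bottleneck vectors, one of one-hot ground
# truth rows. A rough sketch of that behavior (the real method may differ
# in details such as ordering and dtype):

import numpy as np

def map_to_vectors_sketch(bneck_map, labels):
    bnecks, gtruths = [], []
    for c, vectors in bneck_map.items():
        for v in vectors:
            bnecks.append(v)
            row = np.zeros(len(labels), dtype=np.float32)
            row[labels[c]] = 1.0
            gtruths.append(row)
    return np.array(bnecks), np.array(gtruths)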
def main(argv=None):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0
    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)
    images = list_images(args.input)
    splits = common.balanced_split(images, args.num_maps)
    uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", uri)
    with hdfs.open(uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: uri,
    })
    submitter.run()
    hdfs.rmr(uri)
    shutil.rmtree(wd)
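
# ``common.balanced_split`` above divides the image list into
# ``args.num_maps`` chunks of (nearly) equal size, one per map task. One
# plausible implementation, shown here only for illustration:

def balanced_split_sketch(seq, n):
    """Split seq into n contiguous chunks whose sizes differ by at most 1."""
    q, r = divmod(len(seq), n)
    splits, start = [], 0
    for i in range(n):
        size = q + (1 if i < r else 0)  # spread the remainder over the first r chunks
        splits.append(seq[start:start + size])
        start += size
    return splits

# e.g., balanced_split_sketch(list(range(10)), 3) -> sizes [4, 3, 3]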
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    LOGGER.setLevel(args.log_level)
    if args.input_from_list:
        if len(args.input) > 1:
            raise RuntimeError(
                "with --input-from-list, specify only 1 input (file)")
        with hdfs.open(args.input[0], "rt") as f:
            args.input = [_.strip() for _ in f]
    if not args.output:
        args.output = "pydeep-%s" % uuid.uuid4()
        LOGGER.info("dumping to %s", args.output)
    model = models.get_model_info(args.architecture)
    if args.mapred:
        run_mapred(model, args.input, args.output, args.num_maps,
                   args.log_level, collate=args.collate)
    else:
        run_locally(model, args.input, args.output, collate=args.collate)
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    self.model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    model = models.get_model_info(jc[GRAPH_ARCH_KEY])
    self.projector = BottleneckProjector(model)