Example #1
def __init__(self, context):
    super(BottleneckProjectionsReader, self).__init__(context)
    self.logger = LOGGER.getChild("BottleneckProjectionsReader")
    raw_split = context.get_input_split(raw=True)
    split = OpaqueInputSplit().read(io.BytesIO(raw_split))
    jc = context.job_conf
    if common.SEED_KEY in jc:  # https://github.com/crs4/pydoop/issues/318
        seed = jc.get_int(common.SEED_KEY)
        LOGGER.info("random seed: %r", seed)
        random.seed(seed)
    model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    self.bneck_store = BottleneckStore(bneck_tensor.shape[1].value,
                                       bneck_tensor.dtype)
    self.n_steps = jc.get_int(common.NUM_STEPS_KEY)
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    val_fraction = jc.get_int(common.VALIDATION_PERCENT_KEY) / 100
    # get *all* bottlenecks for this split, assuming they fit in memory
    bneck_map = self.bneck_store.get_bnecks(top_dir, posmap=split.payload)
    self.val_bneck_map, self.train_bneck_map = {}, {}
    while bneck_map:
        c, bnecks = bneck_map.popitem()
        i = round(val_fraction * len(bnecks))
        self.val_bneck_map[c] = bnecks[:i]
        self.train_bneck_map[c] = bnecks[i:]
    self.logger.info("training size = %d, validation size = %d",
                     len(self.train_bneck_map[c]),
                     len(self.val_bneck_map[c]))
    train_bs = jc.get_int(common.TRAIN_BATCH_SIZE_KEY)
    val_bs = jc.get_int(common.VALIDATION_BATCH_SIZE_KEY)
    self.val_bs_map, self.train_bs_map = self.__map_bs(val_bs, train_bs)
    self.step_count = 0
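
The popitem loop above is just a per-class holdout split driven by VALIDATION_PERCENT_KEY. A minimal standalone sketch of the same logic, with made-up class names and a 20% validation fraction:

def split_bnecks(bneck_map, val_fraction):
    # split a {class: [bottleneck, ...]} map into validation/training maps
    val_map, train_map = {}, {}
    for c, bnecks in bneck_map.items():
        i = round(val_fraction * len(bnecks))
        val_map[c] = bnecks[:i]
        train_map[c] = bnecks[i:]
    return val_map, train_map

# toy data: 10 "bottlenecks" per class, 20% held out for validation
toy_map = {c: list(range(10)) for c in ("daisy", "tulip")}
val_map, train_map = split_bnecks(toy_map, 0.2)
assert all(len(v) == 2 for v in val_map.values())
assert all(len(t) == 8 for t in train_map.values())
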
Example #2
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    LOGGER.setLevel(args.log_level)
    model = models.get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)
    img_map = map_input_files(args.input)
    LOGGER.info("%d classes, %r images", len(img_map),
                [len(_) for _ in img_map.values()])
    if hdfs.path.exists(args.bnecks_dir):
        LOGGER.info("%r already exists, skipping bottleneck calculation",
                    args.bnecks_dir)
    else:
        LOGGER.info("caching bottlenecks to %r", args.bnecks_dir)
        calc_bottlenecks(model, img_map, args.bnecks_dir)
    train_bneck_map, val_bneck_map, test_bneck_map = maps = get_bneck_maps(
        model, args.bnecks_dir, args.test_percent, args.validation_percent)
    for cat, m in zip(("train", "val", "test"), maps):
        LOGGER.info("%s set: %r", cat, [len(_) for _ in m.values()])
    labels = BottleneckStore.assign_labels(args.bnecks_dir)
    n_classes = len(labels)
    for m in (img_map, *maps):
        assert len(m) == n_classes

    # train
    retrainer = tflow.Retrainer(model, n_classes, args.learning_rate)
    for i in range(args.num_steps):
        train_bnecks, train_gtruths = get_sample_vectors(
            train_bneck_map, args.train_batch_size, labels)
        val_bnecks, val_gtruths = get_sample_vectors(
            val_bneck_map, args.validation_batch_size, labels)
        retrainer.run_train_step(train_bnecks, train_gtruths)
        with hdfs.open(args.train_output, "wt") as f:
            if (i % args.eval_step_interval == 0) or (i + 1 >= args.num_steps):
                train_accuracy, cross_entropy = retrainer.run_eval_step(
                    train_bnecks, train_gtruths)
                val_accuracy = retrainer.run_validation_step(
                    val_bnecks, val_gtruths)
                dump_stats(f, i, cross_entropy, train_accuracy, val_accuracy)
    retrainer.checkpoint(args.trained_model)

    # test
    test_bnecks, test_gtruths = get_sample_vectors(test_bneck_map,
                                                   args.test_batch_size,
                                                   labels)
    test_accuracy, predictions = retrainer.session.run(
        [retrainer.eval_step, retrainer.prediction],
        feed_dict={
            retrainer.bneck_input: test_bnecks,
            retrainer.ground_truth_input: test_gtruths
        })
    print("test_accuracy: %f%%" % (100 * test_accuracy))
    retrainer.close_session()
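
get_sample_vectors is not shown in this listing; the following is a hypothetical sketch of what such a helper could look like, assuming each bottleneck is a 1-D numpy array and labels maps class names to integer indices (the internals and return layout are guesses based only on how the function is called above):

import random

import numpy as np

def get_sample_vectors(bneck_map, batch_size, labels):
    # hypothetical: sample a batch of bottlenecks plus one-hot ground truths
    bnecks, gtruths = [], []
    classes = list(bneck_map)
    for _ in range(batch_size):
        c = random.choice(classes)                  # pick a class at random
        bnecks.append(random.choice(bneck_map[c]))  # pick one of its bottlenecks
        one_hot = np.zeros(len(labels), dtype=np.float32)
        one_hot[labels[c]] = 1.0
        gtruths.append(one_hot)
    return np.stack(bnecks), np.stack(gtruths)
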
Example #3
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
    self.n_steps = jc.get_int(common.NUM_STEPS_KEY)
    self.eval_step_interval = jc.get_int(common.EVAL_STEP_INTERVAL_KEY)
    learn_rate = jc.get_float(common.LEARNING_RATE_KEY)
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    self.labels = BottleneckStore.assign_labels(top_dir)
    self.validation_percent = jc.get_int(common.VALIDATION_PERCENT_KEY)
    model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    self.retrainer = tflow.Retrainer(model, len(self.labels), learn_rate)
    self.out_path = "%s.zip" % context.get_default_work_file()
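
BottleneckStore.assign_labels is used here and in Examples #1 and #2 but never shown; a plausible sketch, assuming it simply maps the per-class subdirectories of top_dir to integer labels (the underscore filter, which skips work files such as the splits written in Example #4, is an assumption):

import pydoop.hdfs as hdfs

def assign_labels(top_dir):
    # hypothetical: one integer label per class subdirectory, in sorted order
    names = sorted(n for n in (hdfs.path.basename(p) for p in hdfs.ls(top_dir))
                   if not n.startswith("_"))
    return {name: i for i, name in enumerate(names)}
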
Example #4
def main(argv=None):

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)

    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks" %
                (len(bneck_map), [len(_) for _ in bneck_map.values()]))
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
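
generate_input_splits is not shown either; a rough sketch of what it might do, assuming OpaqueInputSplit and write_opaques behave as in Example #6 and that each split's payload is the per-class position map that the reader in Example #1 passes to get_bnecks as posmap:

import pydoop.hdfs as hdfs

# OpaqueInputSplit and write_opaques are assumed importable from the same
# module used in Example #6 (not shown in this listing)

def generate_input_splits(num_maps, bneck_map, splits_path):
    # hypothetical: deal each class's bottleneck positions out to num_maps splits
    posmaps = [{} for _ in range(num_maps)]
    for c, positions in bneck_map.items():
        for i in range(num_maps):
            posmaps[i][c] = positions[i::num_maps]  # round-robin share for split i
    with hdfs.open(splits_path, "wb") as f:
        write_opaques([OpaqueInputSplit(1, pm) for pm in posmaps], f)
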
Example #5
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
    top_dir = jc.get(common.BNECKS_DIR_KEY)
    self.model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    graph = self.model.load_prep()
    bneck_tensor = self.model.get_bottleneck(graph)
    del graph
    self.bneck_store = BottleneckStore(bneck_tensor.shape[1].value,
                                       bneck_tensor.dtype)
    # get *all* bottlenecks
    bneck_map = self.bneck_store.get_bnecks(top_dir)
    self.bnecks, self.gtruths = BottleneckStore.bnecks_map_to_vectors(
        bneck_map, BottleneckStore.assign_labels(top_dir))
Example #6
def main(argv=None):

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0

    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)

    images = list_images(args.input)
    splits = common.balanced_split(images, args.num_maps)
    uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", uri)
    with hdfs.open(uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: uri,
    })
    submitter.run()
    hdfs.rmr(uri)
    shutil.rmtree(wd)
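
common.balanced_split is not shown in the listing; a simple sketch of the behavior its name and usage suggest, splitting a list into num_maps contiguous chunks whose sizes differ by at most one:

def balanced_split(items, n):
    # hypothetical: n contiguous chunks, sizes differing by at most 1
    q, r = divmod(len(items), n)
    splits, start = [], 0
    for i in range(n):
        size = q + (1 if i < r else 0)
        splits.append(items[start:start + size])
        start += size
    return splits

assert balanced_split(list(range(10)), 3) == [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
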
Example #7
def main(argv=sys.argv):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    parser = make_parser()
    args = parser.parse_args(argv[1:])
    LOGGER.setLevel(args.log_level)
    if args.input_from_list:
        if len(args.input) > 1:
            raise RuntimeError(
                "with --input-from-list, specify only 1 input (file)")
        with hdfs.open(args.input[0], "rt") as f:
            args.input = [_.strip() for _ in f]
    if not args.output:
        args.output = "pydeep-%s" % uuid.uuid4()
    LOGGER.info("dumping to %s", args.output)
    model = models.get_model_info(args.architecture)
    if args.mapred:
        run_mapred(model,
                   args.input,
                   args.output,
                   args.num_maps,
                   args.log_level,
                   collate=args.collate)
    else:
        run_locally(model, args.input, args.output, collate=args.collate)
Example #8
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    self.model = models.get_model_info(jc[common.GRAPH_ARCH_KEY])
    LOGGER.setLevel(jc[common.LOG_LEVEL_KEY])
Example #9
def __init__(self, context):
    super(Mapper, self).__init__(context)
    jc = context.job_conf
    model = models.get_model_info(jc[GRAPH_ARCH_KEY])
    self.projector = BottleneckProjector(model)