Example #1
def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
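    """\
    Run a map-only Pydoop job (named "dump_weights") for the given model.

    Packs PACKAGE into a zip for distribution, writes one opaque input
    split per input dir (capped at nmaps) to HDFS, then configures and
    runs the job via PydoopSubmitter, optionally collating its output.
    Temporary files are removed on the way out.
    """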
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    if nmaps > len(input_dirs):
        nmaps = len(input_dirs)
        LOGGER.warning("Not enough input dirs, will only do %d splits", nmaps)
    splits = common.balanced_split(input_dirs, nmaps)
    splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
    with hdfs.open(splits_uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    properties = {
        common.GRAPH_ARCH_KEY: model.name,
        common.LOG_LEVEL_KEY: log_level,
        common.NUM_MAPS_KEY: nmaps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    }
    submitter.set_args(
        argparse.Namespace(
            D=list(properties.items()),
            avro_input=None,
            avro_output=None,
            cache_archive=None,
            cache_file=None,
            disable_property_name_conversion=True,
            do_not_use_java_record_reader=True,
            do_not_use_java_record_writer=True,
            entry_point="__main__",
            hadoop_conf=None,
            input=input_dirs[0],  # does it matter?
            input_format=None,
            job_conf=None,
            job_name="dump_weights",
            keep_wd=False,
            libjars=None,
            log_level=log_level,
            module=os.path.splitext(os.path.basename(__file__))[0],
            no_override_env=False,
            no_override_home=False,
            no_override_ld_path=False,
            no_override_path=False,
            no_override_pypath=False,
            num_reducers=0,
            output=output_dir,
            output_format=None,
            pretend=False,
            pstats_dir=None,
            python_program=sys.executable,
            python_zip=[zip_fn],
            set_env=None,
            upload_archive_to_cache=None,
            upload_file_to_cache=[__file__],
        ))
    submitter.run()
    hdfs.rmr(splits_uri)
    if collate:
        collate_mapred_output(output_dir)
    shutil.rmtree(wd)
Example #2
def main(argv=None):
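    """\
    Driver for the distributed training job (WORKER).

    Packs PACKAGE into a zip, builds a bottleneck map from the input dir,
    generates one input split per map task, then configures and runs the
    job via PydoopSubmitter, forwarding training parameters (learning
    rate, batch sizes, number of steps, etc.) as job properties.
    """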

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)

    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks" %
                (len(bneck_map), [len(_) for _ in bneck_map.values()]))
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
Example #3
def main(argv=None):
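    """\
    Driver for the "pteragen" data generation job.

    Uses the Java RangeInputFormat to feed key ranges to the Python
    mappers and runs map-only (no reducers); the number of records and
    the number of map tasks are forwarded as -D job properties.
    """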
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteragen'
    args.module = 'pteragen'
    args.upload_file_to_cache = ['pteragen.py', 'ioformats.py']
    args.input_format = 'it.crs4.pydoop.examples.pterasort.RangeInputFormat'
    args.do_not_use_java_record_writer = True
    # args.libjars = ['pydoop-input-formats.jar']
    add_D_arg(args, 'num_records', NUM_ROWS_KEY)
    add_D_arg(args, 'num_maps', NUM_MAPS_KEY)
    args.num_reducers = 0
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
Example #4
def main(argv=None):
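    """\
    Driver for the "pterasort" job.

    Samples the input to compute partitioner break points, ships the
    resulting file to the distributed cache along with the Python
    modules, and runs the job with Python record reader and writer.
    """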
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pterasort'
    args.module = 'pterasort'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    bp_filename = Partitioner.initialize_break_points(args.num_reducers,
                                                      args.sampled_records,
                                                      args.input,
                                                      args.num_threads)
    args.upload_file_to_cache = ['pterasort.py', 'ioformats.py', bp_filename]
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
Example #5
def main(argv=None):
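    """\
    Driver for the "pteracheck" job.

    Runs a single-reducer check over the sorted output (Python record
    reader, Java record writer), then reads part-r-00000 from HDFS and
    verifies the resulting rows with check_rows.
    """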
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
Example #6
def main(argv=None):
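    """\
    Driver for a map-only job over an image collection (WORKER).

    Packs PACKAGE into a zip, lists the input images, splits them evenly
    across the requested number of map tasks as opaque input splits on
    HDFS, then configures and runs the job via PydoopSubmitter.
    """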

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0

    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)

    images = list_images(args.input)
    splits = common.balanced_split(images, args.num_maps)
    uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", uri)
    with hdfs.open(uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: uri,
    })
    submitter.run()
    hdfs.rmr(uri)
    shutil.rmtree(wd)