def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
    wd = tempfile.mkdtemp(prefix="pydeep_")
    # zip up the package so worker nodes can import it
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    if nmaps > len(input_dirs):
        nmaps = len(input_dirs)
        LOGGER.warning("Not enough input dirs, will only do %d splits", nmaps)
    # write one opaque input split per map task
    splits = common.balanced_split(input_dirs, nmaps)
    splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
    with hdfs.open(splits_uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    properties = {
        common.GRAPH_ARCH_KEY: model.name,
        common.LOG_LEVEL_KEY: log_level,
        common.NUM_MAPS_KEY: nmaps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    }
    submitter.set_args(argparse.Namespace(
        D=list(properties.items()),
        avro_input=None,
        avro_output=None,
        cache_archive=None,
        cache_file=None,
        disable_property_name_conversion=True,
        do_not_use_java_record_reader=True,
        do_not_use_java_record_writer=True,
        entry_point="__main__",
        hadoop_conf=None,
        input=input_dirs[0],  # does it matter?
        input_format=None,
        job_conf=None,
        job_name="dump_weights",
        keep_wd=False,
        libjars=None,
        log_level=log_level,
        module=os.path.splitext(os.path.basename(__file__))[0],
        no_override_env=False,
        no_override_home=False,
        no_override_ld_path=False,
        no_override_path=False,
        no_override_pypath=False,
        num_reducers=0,
        output=output_dir,
        output_format=None,
        pretend=False,
        pstats_dir=None,
        python_program=sys.executable,
        python_zip=[zip_fn],
        set_env=None,
        upload_archive_to_cache=None,
        upload_file_to_cache=[__file__],
    ))
    submitter.run()
    hdfs.rmr(splits_uri)
    if collate:
        collate_mapred_output(output_dir)
    shutil.rmtree(wd)
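# A minimal sketch of what ``common.balanced_split`` (used above) might look
# like. This is an assumption for illustration, not the actual pydeep
# implementation: it splits ``items`` into ``n`` chunks whose sizes differ by
# at most one, so each map task gets a comparable share of the work.
def balanced_split_sketch(items, n):
    q, r = divmod(len(items), n)
    splits, start = [], 0
    for i in range(n):
        end = start + q + (1 if i < r else 0)  # first r chunks get one extra
        splits.append(items[start:end])
        start = end
    return splits

# e.g. balanced_split_sketch(list(range(7)), 3) -> [[0, 1, 2], [3, 4], [5, 6]]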
def main(argv=None):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    # zip up the package so worker nodes can import it
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)
    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks",
                len(bneck_map), [len(_) for _ in bneck_map.values()])
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
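# ``generate_input_splits`` (called above) is defined elsewhere; the sketch
# below is an assumption based on the split-writing pattern used by the other
# drivers in this repo (balanced_split + OpaqueInputSplit + write_opaques).
# The real implementation may distribute the bottlenecks differently.
def generate_input_splits_sketch(num_maps, bneck_map, splits_path):
    # flatten the {subdir: [bottleneck, ...]} map into a single list
    all_bnecks = [b for bnecks in bneck_map.values() for b in bnecks]
    splits = common.balanced_split(all_bnecks, num_maps)
    with hdfs.open(splits_path, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)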
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteragen'
    args.module = 'pteragen'
    args.upload_file_to_cache = ['pteragen.py', 'ioformats.py']
    args.input_format = 'it.crs4.pydoop.examples.pterasort.RangeInputFormat'
    args.do_not_use_java_record_writer = True
    # args.libjars = ['pydoop-input-formats.jar']
    add_D_arg(args, 'num_records', NUM_ROWS_KEY)
    add_D_arg(args, 'num_maps', NUM_MAPS_KEY)
    args.num_reducers = 0
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
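# ``add_D_arg`` is defined elsewhere; a plausible sketch, given here as an
# assumption: it forwards a parsed CLI option to Hadoop as a -D job property
# (args.D being the list of (key, value) pairs the submitter turns into -D
# properties, as seen in run_mapred above).
def add_D_arg_sketch(args, arg_name, prop_key):
    if args.D is None:
        args.D = []
    args.D.append((prop_key, getattr(args, arg_name)))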
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pterasort'
    args.module = 'pterasort'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    bp_filename = Partitioner.initialize_break_points(
        args.num_reducers, args.sampled_records, args.input, args.num_threads
    )
    args.upload_file_to_cache = ['pterasort.py', 'ioformats.py', bp_filename]
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
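# For context, the kind of total-order partitioning pterasort needs can be
# sketched with ``bisect`` over the sampled break points: keys below the
# first break point go to reducer 0, and so on. This is a generic sketch of
# the technique, not this repo's Partitioner class.
import bisect

def partition_sketch(key, break_points):
    # break_points must be sorted; len(break_points) == num_reducers - 1
    return bisect.bisect_left(break_points, key)

# e.g. with break_points [b'h', b'p']: b'cat' -> 0, b'monkey' -> 1,
# b'zebra' -> 2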
def main(argv=None):
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.run()
    # the single reducer writes one output file; read it back and check it
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    check_rows(data.split(b'\n')[:-1])
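# ``check_rows`` is defined elsewhere in pteracheck; a minimal sketch under
# the assumption that it verifies the concatenated output rows are globally
# sorted:
def check_rows_sketch(rows):
    for prev, cur in zip(rows, rows[1:]):
        assert prev <= cur, "rows out of order: %r > %r" % (prev, cur)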
def main(argv=None):
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    # zip up the package so worker nodes can import it
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0
    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)
    images = list_images(args.input)
    splits = common.balanced_split(images, args.num_maps)
    uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", uri)
    with hdfs.open(uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: uri,
    })
    submitter.run()
    hdfs.rmr(uri)
    shutil.rmtree(wd)
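# ``list_images`` comes from the worker module; as an assumption, it could be
# implemented by listing the input directory on HDFS and keeping only files
# with a known image extension:
def list_images_sketch(input_dir, extensions=(".jpg", ".jpeg", ".png")):
    return [p for p in hdfs.ls(input_dir) if p.lower().endswith(extensions)]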