# The snippets below assume the usual Pydoop helpers, e.g.
#   import pydoop.hadut as hadut
#   import pydoop.test_support as pts
# plus module-level constants (PREFIX, BASE_MR_OPTIONS, MR_*, MRLIB,
# LOCAL_*_SCRIPT, HADOOP_CONF_DIR) defined elsewhere.


def run_filter(opt, input_):
    # Map-only job (no reducers) that reads SequenceFile input and filters
    # records by the occurrence threshold passed through the job properties.
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
    options.update({
        MR_JOB_NAME: "filter",
        MR_IN_CLASS: "%s.SequenceFileInputFormat" % MRLIB,
        MR_REDUCE_TASKS: "0",
        "filter.occurrence.threshold": opt.threshold,
    })
    with open(LOCAL_FILTER_SCRIPT) as f:
        pipes_code = pts.adapt_script(f.read())
    runner.set_input(input_)
    runner.set_exe(pipes_code)
    runner.run(properties=options, hadoop_conf_dir=HADOOP_CONF_DIR)
    return runner.output
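# [Assumption] A minimal sketch of the kind of script LOCAL_FILTER_SCRIPT might
# point at: a map-only Pydoop pipes task (matching MR_REDUCE_TASKS = "0") that
# reads the occurrence threshold from the job configuration and re-emits only
# records whose count reaches it. Class name, value parsing, and JobConf access
# are illustrative; the exact deserialization of SequenceFile records and the
# pipes API flavor depend on the Pydoop version actually used.
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class FilterMapper(api.Mapper):

    def map(self, context):
        # "filter.occurrence.threshold" is the property set by run_filter above
        threshold = int(context.job_conf.get("filter.occurrence.threshold", 1))
        if int(context.value) >= threshold:
            context.emit(context.key, context.value)


if __name__ == "__main__":
    # map-only task: no reducer_class
    pp.run_task(pp.Factory(FilterMapper))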
def run_filter(opt, input_):
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
    options.update({
        # [TODO] replace student_id with your id, e.g. 2011-12345
        MR_JOB_NAME: "filter_2018-26190",
        MR_IN_CLASS: "%s.SequenceFileInputFormat" % MRLIB,
        MR_REDUCE_TASKS: "1",
    })
    with open(LOCAL_FILTER_SCRIPT) as f:
        pipes_code = pts.adapt_script(f.read())
    runner.set_input(input_)
    runner.set_exe(pipes_code)
    runner.run(properties=options, hadoop_conf_dir=HADOOP_CONF_DIR)
    return runner.output
def run_wc(opt):
    # Word-count job: uploads the local input, runs LOCAL_WC_SCRIPT with two
    # reducers, and writes uncompressed SequenceFile output.
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
    options.update({
        MR_JOB_NAME: "wordcount",
        MR_OUT_CLASS: "%s.SequenceFileOutputFormat" % MRLIB,
        MR_OUT_COMPRESS_TYPE: "NONE",
        MR_REDUCE_TASKS: "2",
    })
    with open(LOCAL_WC_SCRIPT) as f:
        pipes_code = pts.adapt_script(f.read())
    runner.set_input(opt.input, put=True)
    runner.set_exe(pipes_code)
    runner.run(properties=options, hadoop_conf_dir=HADOOP_CONF_DIR)
    return runner.output
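# [Assumption] A minimal sketch of the kind of word-count pipes script that
# LOCAL_WC_SCRIPT is expected to point at, following the standard Pydoop
# mapreduce API; the script actually shipped with the exercise may use an
# older Pydoop pipes API or different class names.
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, context):
        # one (word, 1) pair per token of the input line
        for word in context.value.split():
            context.emit(word, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        # total occurrences of each word
        context.emit(context.key, sum(context.values))


if __name__ == "__main__":
    pp.run_task(pp.Factory(Mapper, reducer_class=Reducer))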
def run_dst(opt):
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
    options.update({
        # [TODO] replace student_id with your id, e.g. 2011-12345
        MR_JOB_NAME: "dst_count_2018-26190",
        MR_OUT_CLASS: "%s.SequenceFileOutputFormat" % MRLIB,
        MR_OUT_COMPRESS_TYPE: "NONE",
        MR_REDUCE_TASKS: "2",
    })
    with open(LOCAL_DST_SCRIPT) as f:
        pipes_code = pts.adapt_script(f.read())
    runner.set_input(opt.input, put=True)
    runner.set_exe(pipes_code)
    runner.run(properties=options, hadoop_conf_dir=HADOOP_CONF_DIR)
    return runner.output
def main(argv):
    parser = make_parser()
    args = parser.parse_args(argv)
    update_conf(args)
    logger = logging.getLogger("main")
    logger.setLevel(logging.INFO)
    runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
    with open(args.pipes_exe) as f:
        pipes_code = pts.adapt_script(f.read())
    runner.set_input(args.local_input, put=True)
    runner.set_exe(pipes_code)
    runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR, logger=logger)
    res = runner.collect_output()
    if not os.getenv("DEBUG"):
        runner.clean()
    # Verify the MapReduce result against a local, in-process word count.
    local_wc = pts.LocalWordCount(args.local_input)
    logger.info(local_wc.check(res))
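# [Assumption] make_parser and update_conf are defined elsewhere; for the
# driver above, a minimal argparse-based make_parser could look like this
# (argument names match the attributes it accesses: args.pipes_exe,
# args.local_input).
import argparse


def make_parser():
    parser = argparse.ArgumentParser(
        description="Run a Pydoop pipes script and check it with a local word count"
    )
    parser.add_argument("pipes_exe", help="local path of the pipes script to run")
    parser.add_argument("local_input", help="local input to upload to HDFS")
    return parser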
def main(argv):
    logger = logging.getLogger("main")
    logger.setLevel(logging.DEBUG)
    with Timer() as total_time:
        parser = make_parser()
        args = parser.parse_args(argv)
        if args.dataset:
            print(args.dataset)
            create_dataset(logger, args.dataset)
        if args.script:
            piped_code_file = args.script
        else:
            piped_code_file = DEFAULT_SCRIPT
        if not os.path.exists(piped_code_file):
            raise IOError("script {0} not found".format(piped_code_file))
        with open(piped_code_file) as f:
            pipes_code = pts.adapt_script(f.read())
        # Dataset files are named by size (e.g. "64MB"); sort them by size.
        dataset = [d for d in os.listdir("dataset") if d.endswith("MB")]
        dataset.sort(key=lambda x: int(x.replace("MB", "")))
        logger.info(" Uploading dataset: { %s }", ', '.join(dataset))
        if not hadut.path_exists(DATASET_DIR):
            logger.info(" dataset folder created")
            hdfs.mkdir(DATASET_DIR)
        for data_filename in dataset:
            source_path = os.path.join("dataset", data_filename)
            dest_path = os.path.join(DATASET_DIR, data_filename)
            if not hadut.path_exists(dest_path):
                logger.info(" -> uploading %s...", source_path)
                hdfs.put(source_path, dest_path)
        update_conf(args)
        results = dict()
        # Run the pipes script once per dataset size, timing each run.
        for data_input in dataset:
            with Timer() as t:
                runner = hadut.PipesRunner(prefix=PREFIX, logger=logger)
                logger.info("Running the script %s with data input %s..",
                            piped_code_file, data_input)
                data_input_path = os.path.join(DATASET_DIR, data_input)
                runner.set_input(data_input_path, put=False)
                runner.set_exe(pipes_code)
                runner.run(properties=CONF, hadoop_conf_dir=HADOOP_CONF_DIR,
                           logger=logger)
                res = runner.collect_output()
                print(data_input_path)
                local_wc = pts.LocalWordCount(data_input_path)
                logger.info(local_wc.check(res))
                # print(res)
                # runner.clean()
            results[data_input] = (t.secs, t.msecs)
    print("\n\n RESULTS")
    print("=" * (len(piped_code_file) + 15))
    print(" * script: {0}".format(piped_code_file))
    print(" * mappers: {0}".format(CONF["mapred.map.tasks"]))
    print(" * reducers: {0}".format(CONF["mapred.reduce.tasks"]))
    print(" * dataset: [{0}]".format(",".join(dataset)))
    print(" * times (input -> secs):")
    for data_input in dataset:
        print(" - {0} -> {1} secs.".format(data_input, results[data_input][0]))
    print("\n => Total execution time: {0}".format(total_time.secs))
    print("=" * (len(piped_code_file) + 15))
    print("\n")
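# [Assumption] Timer is not defined in this snippet; a minimal sketch of a
# context manager exposing the .secs / .msecs attributes used above could be:
import time


class Timer(object):
    """Measure the wall-clock time spent inside a `with` block."""

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc_info):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000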