import logging logging.basicConfig(level=logging.INFO) import pydoop import pydoop.hadut as hadut import pydoop.test_support as pts CONF = { "mapreduce.job.maps": "2", "mapreduce.job.reduces": "2", # [TODO] replace student_id with your id, e.g. 2011-12345 "mapreduce.job.name": "nsf_2016-19762", } HADOOP_CONF_DIR = pydoop.hadoop_conf() PREFIX = os.getenv("PREFIX", pts.get_wd_prefix()) def update_conf(args): if args.D: for kv_pair in args.D: k, v = [_.strip() for _ in kv_pair.split("=")] CONF[k] = v def make_parser(): parser = argparse.ArgumentParser() parser.add_argument("pipes_exe", metavar="PIPES_EXE", help="python script to be run by pipes") parser.add_argument("local_input",
# Hadoop/MapReduce configuration property names (legacy "mapred.*" keys plus
# the hadoop.pipes.* record reader/writer switches).
MR_HOME_DIR = 'mapreduce.admin.user.home.dir'
PIPES_JAVA_RR = "hadoop.pipes.java.recordreader"
PIPES_JAVA_RW = "hadoop.pipes.java.recordwriter"
MR_OUT_COMPRESS_TYPE = "mapred.output.compression.type"
MR_REDUCE_TASKS = "mapred.reduce.tasks"
MR_IN_CLASS = "mapred.input.format.class"
MR_OUT_CLASS = "mapred.output.format.class"
MRLIB = "org.apache.hadoop.mapred"

# Options shared by every pipes job launched from this script: let the Java
# side handle record reading/writing, and set the task's home directory to
# the invoking user's home.
# NOTE(review): `os` is not imported anywhere visible in this chunk -- the
# import presumably lives off-view; confirm.
BASE_MR_OPTIONS = {
    PIPES_JAVA_RR: "true",
    PIPES_JAVA_RW: "true",
    MR_HOME_DIR: os.path.expanduser("~"),
}

# Working-directory prefix; overridable via the PREFIX env var.
# NOTE(review): `pts` (presumably pydoop.test_support) is not imported in
# this chunk either -- confirm against the full file.
PREFIX = os.getenv("PREFIX", pts.get_wd_prefix())


def make_parser():
    # Build the optparse parser for the word-count driver:
    #   -i  input dir/file (default: DEFAULT_INPUT -- defined off-view; confirm)
    #   -t  minimum word-occurrence threshold (int, default 10)
    # NOTE(review): `optparse` import is not visible in this chunk.
    parser = optparse.OptionParser(usage="%prog [OPTIONS]")
    parser.add_option("-i", dest="input", metavar="STRING",
                      help="input dir/file ['%default']",
                      default=DEFAULT_INPUT)
    parser.add_option("-t", type="int", dest="threshold", metavar="INT",
                      help="min word occurrence [%default]",
                      default=10)
    return parser


def run_wc(opt):
    # Launch the word-count pipes job through pydoop.hadut, starting from a
    # copy of the shared base options.
    runner = hadut.PipesRunner(prefix=PREFIX)
    options = BASE_MR_OPTIONS.copy()
    # NOTE(review): SOURCE is truncated here -- the remainder of run_wc is
    # not visible in this chunk.