Example #1
File: run.py Project: crs4/pydoop
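Runs an Avro-enabled application through HadoopSimulatorNetwork: the (avro_in, avro_out) pair selects the program to run, output schemas are set only for the Avro-typed parts of the output, and the results are deserialized and checked against the reference CSV.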
def run_network_avro(logger, avro_in='v', avro_out=None):
    try:
        program_name = AVRO_APPS[(avro_in, avro_out)]
    except KeyError:
        raise ValueError(
            "not supported: avro_in=%s, avro_out=%s" % (avro_in, avro_out)
        )
    else:
        # copy the application and its base module to the working dir,
        # then make the entry point executable
        program = os.path.join(WD, program_name)
        for name in program_name, "avro_base.py":
            shutil.copy(os.path.join(AVRO_PY_DIR, name), WD)
        os.chmod(program, os.stat(program).st_mode | stat.S_IEXEC)
    file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    # output schemas are needed only for the parts (key, value) that are Avro
    schema_k_out = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
    schema_v_out = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
    simulator = HadoopSimulatorNetwork(
        program, logger, logging.INFO, context_cls=AvroContext,
        avro_input=avro_in, avro_output=avro_out,
        avro_output_key_schema=schema_k_out,
        avro_output_value_schema=schema_v_out
    )
    with open(file_in, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
        simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if avro_out:
        # deserialize the Avro container before checking the results
        data_out_des = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
        avro_check_results.main(USERS_CSV_FN, data_out_des)
    else:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)
Example #2
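A minimal run: write the input data and the application to disk, make the program executable, then feed explicit input and output streams to the simulator.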
def main():
    program_name = './foobar'
    # write the input data and the pipes application to disk
    dump_to_disk('data.in', DATA)
    dump_to_disk(program_name, FOOBAR_PY)
    os.chmod(program_name, 0o777)  # make the application executable
    hsn = HadoopSimulatorNetwork(program=program_name, loglevel=logging.INFO)
    with open('data.in') as fin, open('data.out', 'w') as fout:
        hsn.run(fin, fout, {'a.useless.key': 'we'})
Example #3
File: run.py Project: crs4/pydoop
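Drives wordcount_minimal.py over explicit input/output streams; counters and results are checked after the run.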
def run_network_minimal(logger):
    program_name = cp_script(os.path.join(WC_DIR, 'wordcount_minimal.py'))
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    hs = HadoopSimulatorNetwork(program=program_name, logger=logger,
                                loglevel=logger.level)
    # the minimal app reads from and writes to the streams we hand it
    with open(data_in) as fin, open(data_out, 'wb') as fout:
        hs.run(fin, fout, conf)
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
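For reference, here is a minimal sketch of the kind of pipes application these runners execute, in the spirit of wordcount_minimal.py. It assumes pydoop's documented api and pipes modules; the exact entry-point convention may differ across pydoop versions.

#!/usr/bin/env python
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp


class Mapper(api.Mapper):

    def map(self, context):
        # emit a count of 1 for each word in the input line
        for w in context.value.split():
            context.emit(w, 1)


class Reducer(api.Reducer):

    def reduce(self, context):
        # sum the counts collected for each word
        context.emit(context.key, sum(context.values))


if __name__ == "__main__":
    pp.run_task(pp.Factory(Mapper, Reducer))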
Example #4
File: run.py Project: crs4/pydoop
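Drives wordcount_full.py, which manages its own record reading and writing: the simulator gets None for both streams plus a serialized input split, and the output lands in a part-r-* file named after the task partition.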
def run_network_full(logger):
    program_name = cp_script(os.path.join(WC_DIR, 'wordcount_full.py'))
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    hs = HadoopSimulatorNetwork(program=program_name, logger=logger,
                                loglevel=logger.level)
    # the full app handles its own I/O: pass None streams and an input split
    hs.run(None, None, conf, input_split=input_split)
    data_out = os.path.join(output_dir,
                            'part-r-%05d' % int(conf["mapred.task.partition"]))
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
Example #5
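Runs an Avro reader/writer application against a local users.avro file, building the input split by hand and pointing the task output dir at the current directory.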
def main():
    program_name = './avro_pyrw.py'
    data_in = './users.avro'
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    # build a serialized input split covering the whole file
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)
Example #6
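Same pattern as the previous example, but the input file comes from the command line and the output schema (stats.avsc) is copied into the working directory first.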
def main(argv):
    try:
        data_in = argv[1]
    except IndexError:
        sys.exit("Usage: python %s AVRO_FILE" % argv[0])
    # the application expects the output schema in the working dir
    shutil.copy('../schemas/stats.avsc', 'stats.avsc')
    program_name = cp_script('./avro_pyrw.py')
    path = os.path.realpath(data_in)
    length = os.stat(path).st_size
    # build a serialized input split covering the whole file
    input_split = InputSplit.to_string('file://' + path, 0, length)
    out_path = os.path.realpath('.')
    conf = {
        "mapreduce.task.partition": "0",
        "mapreduce.task.output.dir": 'file://%s' % out_path,
    }
    hsn = HadoopSimulatorNetwork(program=program_name)
    hsn.run(None, None, conf, input_split=input_split)