def test_record_reader(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(record_reader=TrivialRecordReader))
    foname = 'map_reduce.out'
    with self._mkf(foname) as fout:
        # no input stream: records come from TrivialRecordReader
        hs.run(None, fout, job_conf, 0)
    self.assertTrue(os.stat(fout.name).st_size > 0)

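# TrivialRecordReader is not shown in this excerpt. A minimal sketch,
# assuming pydoop's RecordReader interface (a constructor taking the
# context, next() returning a (key, value) pair or raising StopIteration,
# and get_progress() returning a float in [0, 1]); the records themselves
# are made up for illustration.
import pydoop.mapreduce.api as api

class TrivialRecordReader(api.RecordReader):
    def __init__(self, context):
        super(TrivialRecordReader, self).__init__(context)
        self.records = [(str(i), 'record %d' % i) for i in range(10)]
        self.n_total = len(self.records)

    def next(self):
        if not self.records:
            raise StopIteration
        return self.records.pop(0)

    def get_progress(self):
        return 1.0 - float(len(self.records)) / self.n_total
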
def test_map_only(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(), loglevel=logging.CRITICAL)
    with open(self.fname, 'r') as fin:
        with self._mkf('map_only.out', 'w') as fout:
            hs.run(fin, fout, job_conf, 0)
    self.assertTrue(os.stat(fout.name).st_size > 0)

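# TMapper, TReducer and TFactory are fixtures assumed by these tests but
# not shown here. A word-count sketch built on pydoop's public API; the
# mapper/reducer bodies are illustrative guesses, and the Factory keyword
# names for the combiner and record reader are assumptions.
import pydoop.mapreduce.api as api
import pydoop.mapreduce.pipes as pp

class TMapper(api.Mapper):
    def map(self, context):
        for word in context.value.split():
            context.emit(word, '1')

class TReducer(api.Reducer):
    def reduce(self, context):
        context.emit(context.key, str(sum(map(int, context.values))))

def TFactory(combiner=None, reducer_class=TReducer, record_reader=None):
    return pp.Factory(
        mapper_class=TMapper,
        reducer_class=reducer_class,
        combiner_class=combiner,
        record_reader_class=record_reader,
    )
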
def run_local_minimal(logger):
    hs = HadoopSimulatorLocal(factory=factory_minimal, logger=logger,
                              loglevel=logger.level)
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    # open the streams in a with block so they are closed on exit
    with open(data_in) as fin, open(data_out, 'wb') as fout:
        hs.run(fin, fout, conf)
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)

def run_local_full(logger):
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    hsl = HadoopSimulatorLocal(factory=factory_full, logger=logger,
                               loglevel=logger.level)
    # with no explicit streams, the simulator reads from the input split
    # and writes part files under the configured output dir
    hsl.run(None, None, conf, input_split=input_split, num_reducers=1)
    data_out = os.path.join(
        output_dir, 'part-r-%05d' % int(conf['mapred.task.partition'])
    )
    dump_counters(hsl, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)

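# A possible driver for the two runners above (assumed, not part of the
# original module): set up a logger and exercise both simulator modes.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('simulator_example')  # hypothetical name
    run_local_minimal(logger)
    run_local_full(logger)
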
def test_map_combiner_reduce(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(combiner=TReducer))
    foname = 'map_combiner_reduce.out'
    with open(self.fname, 'r') as fin:
        with self._mkf(foname) as fout:
            hs.run(fin, fout, job_conf, 1)
    with open(self._mkfn(foname)) as f:
        for l in f:
            k, c = l.strip().split()
            self.assertEqual(COUNTS[k], int(c))

def test_map_reduce(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(), loglevel=logging.CRITICAL)
    foname = 'map_reduce.out'
    with open(self.fname, 'r') as fin:
        with self._mkf(foname) as fout:
            hs.run(fin, fout, job_conf, 1)
    self.assertTrue(os.stat(fout.name).st_size > 0)
    with open(self._mkfn(foname)) as f:
        for l in f:
            k, c = l.strip().split()
            self.assertEqual(COUNTS[k], int(c))

def test_map_reduce_with_counters(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(reducer_class=TReducerWithCounters))
    foname = 'map_reduce.out'
    with open(self.fname, 'r') as fin:
        with self._mkf(foname) as fout:
            hs.run(fin, fout, job_conf, 1)
    self.assertTrue(os.stat(fout.name).st_size > 0)
    with open(self._mkfn(foname)) as f:
        for l in f:
            k, c = l.strip().split()
            if 'COUNTER_' in k:
                # counter keys are COUNTER_<n>, 1-based over COUNTS keys
                ck = int(k[8:]) - 1
                key = list(COUNTS)[ck]  # dict.keys()[i] is Python 2 only
                self.assertEqual(COUNTS[key], int(c))
            else:
                self.assertEqual(COUNTS[k], int(c))

def run_local_avro(logger, avro_in='v', avro_out=None):
    mapper, reducer = AVRO_MAPPERS[avro_in], AVRO_REDUCERS[avro_out]
    schema_k_out = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
    schema_v_out = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
    file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    factory = pp.Factory(mapper_class=mapper, reducer_class=reducer)
    simulator = HadoopSimulatorLocal(
        factory, logger, logging.INFO, AvroContext,
        avro_in, avro_out, schema_k_out, schema_v_out
    )
    with open(file_in, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
        simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if avro_out:
        data_out_des = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
        avro_check_results.main(USERS_CSV_FN, data_out_des)
    else:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)

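# Example invocations for run_local_avro (assumed, not from the original
# module): exercise plain text output plus the avro key/value variants.
def demo_avro_runs():  # hypothetical helper
    logger = logging.getLogger('avro_example')
    run_local_avro(logger)                               # plain text out
    run_local_avro(logger, avro_out='v')                 # avro stats values
    run_local_avro(logger, avro_in='kv', avro_out='kv')  # avro on both sides
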
def test_map_reduce_with_counters(self):
    job_conf = {'this.is.not.used': '22'}
    hs = HadoopSimulatorLocal(TFactory(reducer_class=TReducerWithCounters))
    foname = 'map_reduce.out'
    with open(self.fname, 'r') as fin:
        with self._mkf(foname) as fout:
            hs.run(fin, fout, job_conf, 1)
    with open(self._mkfn(foname)) as f:
        sum_ = 0
        counter_value = 0
        for l in f:
            k, c = l.strip().split()
            if 'COUNTER_' in k:
                # each counter line reports the running total so far
                counter_value = int(c)
                self.assertEqual(sum_, counter_value)
            else:
                sum_ += int(c)
                self.assertEqual(COUNTS[k], int(c))

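# TReducerWithCounters is not shown in this excerpt. A sketch consistent
# with the assertions in the test above, where each COUNTER_<n> output
# line carries the running total of the counts emitted so far; structure
# and names are assumptions, not the original fixture.
import pydoop.mapreduce.api as api

class TReducerWithCounters(api.Reducer):
    def __init__(self, context):
        super(TReducerWithCounters, self).__init__(context)
        self.n_keys = 0  # how many keys have been reduced
        self.total = 0   # running total of all emitted counts

    def reduce(self, context):
        s = sum(map(int, context.values))
        context.emit(context.key, str(s))
        self.total += s
        self.n_keys += 1
        # mirror the running total into the output stream in the
        # COUNTER_<n> form that the test parses back
        context.emit('COUNTER_%d' % self.n_keys, str(self.total))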