Example #1
0
 def test_record_reader(self):
     """A reader-driven run (no input stream) must produce non-empty output."""
     conf = {'this.is.not.used': '22'}
     simulator = HadoopSimulatorLocal(
         TFactory(record_reader=TrivialRecordReader))
     out_name = 'map_reduce.out'
     with self._mkf(out_name) as fout:
         # Input stream is None: the record reader supplies the records.
         simulator.run(None, fout, conf, 0)
         self.assertTrue(os.stat(fout.name).st_size > 0)
Example #2
0
 def test_map_only(self):
     """A map-only job (zero reducers) must write a non-empty output file."""
     conf = {'this.is.not.used': '22'}
     simulator = HadoopSimulatorLocal(TFactory())
     with open(self.fname, 'r') as fin, self._mkf('map_only.out') as fout:
         simulator.run(fin, fout, conf, 0)
         self.assertTrue(os.stat(fout.name).st_size > 0)
Example #3
0
 def test_map_only(self):
     """Map-only job with logging silenced; output file must be non-empty."""
     conf = {'this.is.not.used': '22'}
     simulator = HadoopSimulatorLocal(TFactory(), loglevel=logging.CRITICAL)
     with open(self.fname, 'r') as fin, \
             self._mkf('map_only.out', 'w') as fout:
         simulator.run(fin, fout, conf, 0)
         self.assertTrue(os.stat(fout.name).st_size > 0)
Example #4
0
 def test_record_reader(self):
     """A TrivialRecordReader feeds the job; the output file must grow."""
     simulator = HadoopSimulatorLocal(
         TFactory(record_reader=TrivialRecordReader))
     with self._mkf('map_reduce.out') as fout:
         # No input stream: records come from the configured reader.
         simulator.run(None, fout, {'this.is.not.used': '22'}, 0)
         self.assertTrue(os.stat(fout.name).st_size > 0)
Example #5
0
def run_local_minimal(logger):
    """Run the minimal-factory pipeline through the local simulator.

    Reads the configured input file, writes results to the output file,
    then dumps counters, verifies the results, and cleans up.

    :param logger: logger used by the simulator and by the helper routines;
        its level also sets the simulator's log level.
    """
    hs = HadoopSimulatorLocal(factory=factory_minimal, logger=logger,
                              loglevel=logger.level)
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    # Fix: the original leaked both file objects; close them deterministically.
    with open(data_in) as fin, open(data_out, 'wb') as fout:
        hs.run(fin, fout, conf)
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
Example #6
0
def run_local_minimal(logger):
    """Run the minimal-factory pipeline through the local simulator.

    Reads the configured input, writes the output, then dumps counters,
    checks the results against the input, and removes the artifacts.

    :param logger: logger shared with the simulator; its level becomes
        the simulator's ``loglevel``.
    """
    hs = HadoopSimulatorLocal(factory=factory_minimal,
                              logger=logger,
                              loglevel=logger.level)
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    # Fix: open() handles were never closed; use context managers instead.
    with open(data_in) as fin:
        with open(data_out, 'wb') as fout:
            hs.run(fin, fout, conf)
    dump_counters(hs, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
Example #7
0
def run_local_full(logger):
    """Run the full-factory pipeline via input splits and one reducer.

    The simulator reads input through the split (streams are None) and
    writes a ``part-r-NNNNN`` file under the output directory, which is
    then checked and cleaned up.
    """
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    simulator = HadoopSimulatorLocal(
        factory=factory_full, logger=logger, loglevel=logger.level)
    simulator.run(None, None, conf, input_split=input_split, num_reducers=1)
    # The reducer writes a standard Hadoop part file named by partition.
    partition = int(conf["mapred.task.partition"])
    data_out = os.path.join(output_dir, 'part-r-%05d' % partition)
    dump_counters(simulator, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
Example #8
0
def run_local_full(logger):
    """Drive a full simulator run (split-based input, single reducer)."""
    data_in, data_out, conf, input_split, output_dir = create_configuration()
    simulator = HadoopSimulatorLocal(factory=factory_full,
                                     logger=logger,
                                     loglevel=logger.level)
    # Streams are None: input is read through the provided split.
    simulator.run(None, None, conf, input_split=input_split, num_reducers=1)
    part_name = 'part-r-%05d' % int(conf["mapred.task.partition"])
    data_out = os.path.join(output_dir, part_name)
    dump_counters(simulator, logger)
    check_results(data_in, data_out, logger)
    clean_up(data_out, output_dir)
Example #9
0
 def test_map_combiner_reduce(self):
     """Full map/combine/reduce run; emitted counts must match COUNTS."""
     conf = {'this.is.not.used': '22'}
     simulator = HadoopSimulatorLocal(TFactory(combiner=TReducer))
     out_name = 'map_combiner_reduce.out'
     with open(self.fname, 'r') as fin, self._mkf(out_name) as fout:
         simulator.run(fin, fout, conf, 1)
     with open(self._mkfn(out_name)) as result:
         for line in result:
             word, count = line.strip().split()
             self.assertEqual(COUNTS[word], int(count))
 def test_map_combiner_reduce(self):
     """Using TReducer as combiner must not change the final word counts."""
     simulator = HadoopSimulatorLocal(TFactory(combiner=TReducer))
     out_name = 'map_combiner_reduce.out'
     with open(self.fname, 'r') as fin:
         with self._mkf(out_name) as fout:
             simulator.run(fin, fout, {'this.is.not.used': '22'}, 1)
     # Each output line is "<word> <count>"; verify against the oracle.
     with open(self._mkfn(out_name)) as result:
         for line in result:
             word, count = line.strip().split()
             self.assertEqual(COUNTS[word], int(count))
Example #11
0
 def test_map_reduce(self):
     """One-reducer run: output is non-empty and counts match COUNTS."""
     conf = {'this.is.not.used': '22'}
     simulator = HadoopSimulatorLocal(TFactory(), loglevel=logging.CRITICAL)
     out_name = 'map_reduce.out'
     with open(self.fname, 'r') as fin, self._mkf(out_name) as fout:
         simulator.run(fin, fout, conf, 1)
         self.assertTrue(os.stat(fout.name).st_size > 0)
     with open(self._mkfn(out_name)) as result:
         for line in result:
             word, count = line.strip().split()
             self.assertEqual(COUNTS[word], int(count))
Example #12
0
 def test_map_reduce(self):
     """Map + reduce with logging silenced; verify size and word counts."""
     simulator = HadoopSimulatorLocal(TFactory(), loglevel=logging.CRITICAL)
     out_name = 'map_reduce.out'
     with open(self.fname, 'r') as fin:
         with self._mkf(out_name) as fout:
             simulator.run(fin, fout, {'this.is.not.used': '22'}, 1)
             self.assertTrue(os.stat(fout.name).st_size > 0)
     with open(self._mkfn(out_name)) as result:
         for line in result:
             word, count = line.strip().split()
             self.assertEqual(COUNTS[word], int(count))
 def test_map_reduce_with_counters(self):
     """Map/reduce with a counter-emitting reducer.

     Output lines are either ``word count`` pairs (checked against
     COUNTS) or ``COUNTER_<n> value`` lines, where ``<n>`` is a 1-based
     index into COUNTS' key order.
     """
     job_conf = {'this.is.not.used': '22'}
     hs = HadoopSimulatorLocal(TFactory(reducer_class=TReducerWithCounters))
     foname = 'map_reduce.out'
     with open(self.fname, 'r') as fin:
         with self._mkf(foname) as fout:
             hs.run(fin, fout, job_conf, 1)
             self.assertTrue(os.stat(fout.name).st_size > 0)
     with open(self._mkfn(foname)) as f:
         for l in f:
             k, c = l.strip().split()
             if "COUNTER_" in k:
                 ck = int(k[8:]) - 1
                 # Fix: dict views are not subscriptable on Python 3;
                 # materialize the keys before indexing.
                 key = list(COUNTS.keys())[ck]
                 self.assertEqual(COUNTS[key], int(c))
             else:
                 self.assertEqual(COUNTS[k], int(c))
Example #14
0
 def test_map_reduce_with_counters(self):
     """Counter-emitting reducer: check word counts and counter values.

     ``COUNTER_<n>`` keys carry a 1-based index into COUNTS' key order;
     all other keys are words checked directly against COUNTS.
     """
     job_conf = {'this.is.not.used': '22'}
     hs = HadoopSimulatorLocal(TFactory(reducer_class=TReducerWithCounters))
     foname = 'map_reduce.out'
     with open(self.fname, 'r') as fin:
         with self._mkf(foname) as fout:
             hs.run(fin, fout, job_conf, 1)
             self.assertTrue(os.stat(fout.name).st_size > 0)
     with open(self._mkfn(foname)) as f:
         for l in f:
             k, c = l.strip().split()
             if "COUNTER_" in k:
                 ck = int(k[8:]) - 1
                 # Fix: COUNTS.keys()[ck] raises TypeError on Python 3
                 # (dict views do not support indexing); convert to list.
                 key = list(COUNTS.keys())[ck]
                 self.assertEqual(COUNTS[key], int(c))
             else:
                 self.assertEqual(COUNTS[k], int(c))
Example #15
0
def run_local_avro(logger, avro_in='v', avro_out=None):
    """Run an Avro-aware simulator job and verify its results.

    :param logger: logger handed to the simulator and helpers.
    :param avro_in: input avro mode ('k', 'v' or 'kv').
    :param avro_out: output avro mode ('k', 'v', 'kv') or None for plain
        output; selects the reducer and whether output is deserialized
        before checking.
    """
    mapper = AVRO_MAPPERS[avro_in]
    reducer = AVRO_REDUCERS[avro_out]
    # Output schemas are only needed for the avro-serialized components.
    k_schema = STATS_SCHEMA_STR if avro_out in {'k', 'kv'} else None
    v_schema = STATS_SCHEMA_STR if avro_out in {'v', 'kv'} else None
    input_fn = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    simulator = HadoopSimulatorLocal(
        pp.Factory(mapper_class=mapper, reducer_class=reducer),
        logger, logging.INFO, AvroContext,
        avro_in, avro_out, k_schema, v_schema)
    with open(input_fn, 'rb') as fin, open(DATA_OUT, 'wb') as fout:
        simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if avro_out:
        # Avro output must be deserialized before result checking.
        deserialized = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, deserialized, avro_out)
        avro_check_results.main(USERS_CSV_FN, deserialized)
    else:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)
Example #16
0
def run_local_avro(logger, avro_in='v', avro_out=None):
    """Run the local simulator with Avro input/output and check results.

    :param logger: logger for the simulator and helper routines.
    :param avro_in: avro mode of the input ('k', 'v' or 'kv').
    :param avro_out: avro mode of the output, or None for plain text.
    """
    mapper, reducer = AVRO_MAPPERS[avro_in], AVRO_REDUCERS[avro_out]
    avro_key_out = avro_out in {'k', 'kv'}
    avro_value_out = avro_out in {'v', 'kv'}
    schema_k_out = STATS_SCHEMA_STR if avro_key_out else None
    schema_v_out = STATS_SCHEMA_STR if avro_value_out else None
    file_in = USERS_PETS_FN if avro_in == 'kv' else AVRO_FN
    factory = pp.Factory(mapper_class=mapper, reducer_class=reducer)
    simulator = HadoopSimulatorLocal(factory, logger, logging.INFO,
                                     AvroContext, avro_in, avro_out,
                                     schema_k_out, schema_v_out)
    with open(file_in, 'rb') as fin:
        with open(DATA_OUT, 'wb') as fout:
            simulator.run(fin, fout, {}, num_reducers=1)
    dump_counters(simulator, logger)
    if not avro_out:
        avro_check_results.main(USERS_CSV_FN, DATA_OUT)
    else:
        data_out_des = DATA_OUT + '-des'
        avro_container_dump_results.main(DATA_OUT, data_out_des, avro_out)
        avro_check_results.main(USERS_CSV_FN, data_out_des)
Example #17
0
    def test_map_reduce_with_counters(self):
        """Each counter line must report the running total of prior counts."""
        conf = {'this.is.not.used': '22'}
        simulator = HadoopSimulatorLocal(
            TFactory(reducer_class=TReducerWithCounters))
        out_name = 'map_reduce.out'

        with open(self.fname, 'r') as fin, self._mkf(out_name) as fout:
            simulator.run(fin, fout, conf, 1)

        running_total = 0
        with open(self._mkfn(out_name)) as result:
            for line in result:
                key, value = line.strip().split()
                if "COUNTER_" in key:
                    # A counter snapshot equals the sum of counts so far.
                    self.assertEqual(running_total, int(value))
                else:
                    running_total += int(value)
                    self.assertEqual(COUNTS[key], int(value))
    def test_map_reduce_with_counters(self):
        """Counter values emitted by the reducer track the cumulative count."""
        simulator = HadoopSimulatorLocal(
            TFactory(reducer_class=TReducerWithCounters))
        out_name = 'map_reduce.out'

        with open(self.fname, 'r') as fin:
            with self._mkf(out_name) as fout:
                simulator.run(fin, fout, {'this.is.not.used': '22'}, 1)

        total = 0
        with open(self._mkfn(out_name)) as result:
            for line in result:
                key, value = line.strip().split()
                count = int(value)
                if "COUNTER_" in key:
                    # Counter lines snapshot the total accumulated so far.
                    self.assertEqual(total, count)
                else:
                    total += count
                    self.assertEqual(COUNTS[key], count)
 def test_map_only(self):
     job_conf = {'this.is.not.used': '22'}
     hs = HadoopSimulatorLocal(TFactory())
     with open(self.fname, 'r') as fin:
         with self._mkf('map_only.out') as fout:
             hs.run(fin, fout, job_conf, 0)