def test_job_conf(self):
    """Round-trip mrv1 property names through JobConf and check that
    mrv2 aliases resolve to the same values."""
    job_conf = {key: key for key in mrv1_to_mrv2}
    flat = []
    for pair in iteritems(job_conf):
        flat.extend(pair)
    jc = JobConf(flat)
    for new_key in mrv2_to_mrv1:
        self.assertEqual(jc[new_key], job_conf[mrv2_to_mrv1[new_key]])
def dump_counters(hs, logger):
    """Log every counter accumulated by ``hs``, one phase at a time."""
    counters = hs.get_counters()
    for phase in ('mapping', 'reducing'):
        logger.info("%s counters:", phase.capitalize())
        phase_counters = counters[phase]
        for group in phase_counters:
            logger.info(" Group %s", group)
            for name, value in iteritems(phase_counters[group]):
                logger.info(" %s: %s", name, value)
def test_job_conf(self):
    """A JobConf built from mrv1 keys must also answer mrv2 lookups."""
    job_conf = dict((k, k) for k in mrv1_to_mrv2)
    # flatten {k: v} into [k1, v1, k2, v2, ...] as JobConf expects
    jc = JobConf(sum([list(pair) for pair in iteritems(job_conf)], []))
    for k in mrv2_to_mrv1:
        self.assertEqual(jc[k], job_conf[mrv2_to_mrv1[k]])
def __serialize_as_needed(self, key, value):
    """Return (key, value), serializing each part with its registered
    serializer when Avro output applies to this task."""
    out_kv = {'K': key, 'V': value}
    jc = self.job_conf
    avro_output = AVRO_OUTPUT in jc
    if avro_output and (self.is_reducer() or self.__is_map_only()):
        for mode in ('K', 'V'):
            serializer = self.__serializers.get(mode)
            if serializer is not None:
                out_kv[mode] = serializer.serialize(out_kv[mode])
    return out_kv['K'], out_kv['V']
def check_counts(self, fname, exp_count):
    """Assert that the counts extracted from ``fname`` match ``exp_count``."""
    count = count_outputs(fname)
    try:
        for key, expected in iteritems(exp_count):
            self.assertTrue(key in count)
            self.assertEqual(count[key], expected)
    except AssertionError:
        # dump the actual counts to help diagnose the failure
        print(count)
        raise
def run(self):
    """Count words under ``self.input_path`` (a file or a directory),
    dropping words below ``self.min_occurrence`` when it is set."""
    wc = {}
    path = self.input_path
    if os.path.isdir(path):
        for fn in os.listdir(path):
            if fn[0] == ".":
                continue  # skip hidden entries
            self._wordcount_file(wc, fn, path)
    else:
        self._wordcount_file(wc, path)
    threshold = self.min_occurrence
    if threshold:
        wc = {w: n for w, n in iteritems(wc) if n >= threshold}
    return wc
def __run_test(self, mode, mapper_class, context_class):
    """Run the task described by ``mode``'s command file and compare the
    emitted records against ``self.records``."""
    cmd_file = self.__write_cmd_file(mode)
    factory = pp.Factory(mapper_class=mapper_class)
    pp.run_task(factory, private_encoding=False,
                context_class=context_class, cmd_file=cmd_file)
    out_records = []
    with open(cmd_file + '.out', 'rb') as f:
        adapter = BinaryDownStreamAdapter(f)
        for cmd, args in adapter:
            if cmd != adapter.OUTPUT:
                continue
            name, color = args
            out_records.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(out_records), len(self.records))
    for got, want in zip(out_records, self.records):
        for field, raw in iteritems(got):
            self.assertEqual(raw.decode('UTF-8'), want[field])
def spill_all(self):
    """Feed every buffered key/value group to the reducer, then clear
    the in-memory buffer.

    The context is temporarily patched so that ``reduce()`` reads keys
    and values from this buffer instead of the task's input stream; the
    patched attributes are restored after the loop.
    """
    self.ctx.increment_counter(self.spill_counter, 1)
    self.ctx.increment_counter(self.spilled_bytes_counter, self.used_bytes)
    ctx = self.ctx
    # detach the writer so emitted output is not auto-handled during replay
    writer = ctx.writer
    ctx.writer = None
    # disable auto-deserialize (ctx.key will be called by reduce)
    # FIXME: this might break custom Context implementations
    get_input_key = ctx.get_input_key
    ctx.get_input_key = types.MethodType(lambda self: self._key, ctx)
    for key, values in iteritems(self.data):
        ctx._key, ctx._values = key, iter(values)
        self.reducer.reduce(ctx)
    # restore the patched context attributes
    ctx.writer = writer
    ctx.get_input_key = get_input_key
    self.data.clear()
    self.used_bytes = 0
def __run_test(self, mode, mapper_class, context_class):
    """Execute the task for ``mode`` and verify the records it emits."""
    cmd_file = self.__write_cmd_file(mode)
    pp.run_task(
        pp.Factory(mapper_class=mapper_class),
        private_encoding=False,
        context_class=context_class,
        cmd_file=cmd_file,
    )
    out_fn = cmd_file + '.out'
    collected = []
    with open(out_fn, 'rb') as stream:
        down = BinaryDownStreamAdapter(stream)
        for cmd, args in down:
            if cmd == down.OUTPUT:
                name, color = args
                collected.append({'name': name, 'favorite_color': color})
    self.assertEqual(len(collected), len(self.records))
    for out_r, r in zip(collected, self.records):
        for k, v in iteritems(out_r):
            self.assertEqual(v.decode('UTF-8'), r[k])
def main(exp, res):
    """Compare computed results in ``res`` against per-key counts built
    from the expectations file ``exp``; exit with an error on mismatch."""
    expected = {}
    for line in iter_lines(exp):
        fields = line.strip().split(';')
        expected.setdefault(fields[1], Counter())[fields[2]] += 1
    computed = {}
    for line in iter_lines(res):
        parts = line.strip().split('\t')
        # NOTE(review): eval on file content is unsafe with untrusted input
        computed[parts[0]] = eval(parts[1])
    if set(computed) != set(expected):
        sys.exit("ERROR: computed keys != expected keys: %r != %r" % (sorted(computed), sorted(expected)))
    for k, v in iteritems(expected):
        if computed[k] != v:
            sys.exit("ERROR: %r: %r != %r" % (k, computed[k], dict(v)))
    print('All is ok!')
def main(exp, res):
    """Validate that the results file ``res`` matches the counts derived
    from the expectations file ``exp``."""
    expected = {}
    for raw in iter_lines(exp):
        rec = raw.strip().split(';')
        expected.setdefault(rec[1], Counter())[rec[2]] += 1
    computed = {}
    for raw in iter_lines(res):
        rec = raw.strip().split('\t')
        computed[rec[0]] = eval(rec[1])  # NOTE(review): eval is unsafe on untrusted data
    if set(computed) != set(expected):
        sys.exit("ERROR: computed keys != expected keys: %r != %r" % (
            sorted(computed), sorted(expected)))
    for k, v in iteritems(expected):
        if computed[k] != v:
            sys.exit("ERROR: %r: %r != %r" % (k, computed[k], dict(v)))
    print('All is ok!')
def get_counters(self):
    r"""Extract counters information accumulated by this simulator instance.

    Returns a nested dict of the form
    ``{phase: {group: {counter: value}}}`` where ``phase`` is
    ``'mapping'`` or ``'reducing'``.  Expected usage::

        counters = hs.get_counters()
        for phase in ['mapping', 'reducing']:
            print("{} counters:".format(phase.capitalize()))
            for group in counters[phase]:
                print("  Group {}".format(group))
                for c, v in counters[phase][group].items():
                    print("    {}: {}".format(c, v))
    """
    ctable = {'mapping': {}, 'reducing': {}}
    # self.counters appears to map keys whose first element is the phase
    # to ((group, counter_name), value) pairs — TODO confirm against the
    # counter-recording code.  The innermost setdefault keeps the first
    # value seen for a given (phase, group, counter) triple.
    for k, v in iteritems(self.counters):
        ctable.setdefault(
            k[0], {}).setdefault(v[0][0], {}).setdefault(v[0][1], v[1])
    return ctable
def write_header_down_stream(self, down_stream, authorization, job_conf):
    """Send the stream header: authorization, start message, then the
    job configuration flattened as [k1, v1, k2, v2, ...]."""
    flat_conf = []
    for k, v in iteritems(job_conf):
        flat_conf.append(k)
        flat_conf.append(v)
    self.write_authorization(down_stream, authorization)
    down_stream.send(down_stream.START_MESSAGE, 0)
    down_stream.send(down_stream.SET_JOB_CONF, *flat_conf)