def check_grep(mr_out_dir):
    """Compare collected job output with a local grep for "March".

    The expected result is built by walking the files in
    ``DEFAULT_INPUT_DIR`` in sorted name order and keeping, stripped of
    surrounding whitespace, every line that contains the substring
    ``"March"``.

    Returns True when the lines of the output collected from
    ``mr_out_dir`` equal that expected list (same lines, same order).
    """
    actual_lines = hadut.collect_output(mr_out_dir).splitlines()
    expected_lines = []
    for fname in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        path = os.path.join(DEFAULT_INPUT_DIR, fname)
        with open(path) as stream:
            for row in stream:
                if "March" in row:
                    expected_lines.append(row.strip())
    return actual_lines == expected_lines
def check_caseswitch(mr_out_dir, switch="upper"):
    """Compare collected job output with locally case-converted input.

    ``switch`` is the name of a ``str`` method (``"upper"`` by default)
    that is invoked with no arguments on the full text of each file in
    ``DEFAULT_INPUT_DIR``; files are visited in sorted name order and the
    converted texts are concatenated.

    Returns True when the output collected from ``mr_out_dir`` and the
    concatenated expected text split into the same list of lines.
    """
    actual = hadut.collect_output(mr_out_dir)
    converted = []
    for fname in sorted(os.listdir(DEFAULT_INPUT_DIR)):
        path = os.path.join(DEFAULT_INPUT_DIR, fname)
        with open(path) as stream:
            text = stream.read()
            # Look the conversion method up by name on the string itself.
            convert = getattr(text, switch)
            converted.append(convert())
    expected = "".join(converted)
    return actual.splitlines() == expected.splitlines()
def check_base_histogram(mr_out_dir):
    """Compare the collected histogram with one computed locally.

    Each line of the output collected from ``mr_out_dir`` is split on a
    tab into a key and an integer count.  The expected histogram counts
    every character found in the tenth tab-separated field of every line
    of every file under ``<THIS_DIR>/data/base_histogram_input``.

    Returns True when the two counters are equal.
    """
    observed = Counter()
    for record in hadut.collect_output(mr_out_dir).splitlines():
        key, count = record.split("\t")
        observed[key] = int(count)

    expected = Counter()
    input_dir = os.path.join(THIS_DIR, "data", "base_histogram_input")
    for fname in os.listdir(input_dir):
        with open(os.path.join(input_dir, fname)) as stream:
            for record in stream:
                fields = record.rstrip().split("\t", 10)
                # Feeding a string to Counter.update tallies it
                # character by character.
                expected.update(fields[9])
    return observed == expected
def main():
    """Run the word count and filter steps, then log the check result.

    Options come from the parser built by ``make_parser``.  The word
    count output is passed to the filter step; the filter output is
    collected and checked against a ``pts.LocalWordCount`` built from
    ``opt.input`` with ``min_occurrence=opt.threshold``.  The value
    returned by the check is logged at INFO level.
    """
    options, _args = make_parser().parse_args()

    log = logging.getLogger("main")
    log.setLevel(logging.INFO)

    log.info("running word count")
    wordcount_dir = run_wc(options)

    log.info("running filter")
    filtered_dir = run_filter(options, wordcount_dir)

    log.info("checking results")
    collected = hadut.collect_output(filtered_dir)
    reference = pts.LocalWordCount(
        options.input, min_occurrence=options.threshold
    )
    log.info(reference.check(collected))
def main():
    """Run the dst counter and top50 filter steps and save the output.

    Options come from the parser built by ``make_parser``.  The dst
    counter output is passed to the filter step; the collected filter
    output is written to ``results/result_tf.txt``.
    """
    options, _args = make_parser().parse_args()

    log = logging.getLogger("main")
    log.setLevel(logging.INFO)

    log.info("running dst counter")
    counter_dir = run_dst(options)

    log.info("running top50 filter")
    filtered_dir = run_filter(options, counter_dir)

    log.info("checking results")
    collected = hadut.collect_output(filtered_dir)
    with open("results/result_tf.txt", "w") as sink:
        sink.write(collected)
def check_wordcount(mr_out_dir, stop_words=None):
    """Check collected word count output against a local word count.

    The output collected from ``mr_out_dir`` is handed to the ``check``
    method of a ``pts.LocalWordCount`` built over ``DEFAULT_INPUT_DIR``
    with the given ``stop_words``.

    Returns True when the string returned by ``check`` starts with
    ``"OK"``.
    """
    collected = hadut.collect_output(mr_out_dir)
    reference = pts.LocalWordCount(DEFAULT_INPUT_DIR, stop_words=stop_words)
    verdict = reference.check(collected)
    # FIXME: change local_wc to raise an exception
    return verdict.startswith("OK")
def get_res(output_dir):
    """Collect the output under ``output_dir`` and parse it.

    Returns whatever ``pts.parse_mr_output`` produces for the collected
    text when called with ``vtype=int``.
    """
    collected = hadut.collect_output(output_dir)
    return pts.parse_mr_output(collected, vtype=int)