def main(argv=None):
    """Delete files that are not flagged as complete.

    Command line options are taken from ``sys.argv`` unless *argv* is
    given. Every positional argument is treated as a filename. A file
    is kept if either its companion log (``<filename>.log``, when it
    exists) or the file itself passes ``IOTools.isComplete``; otherwise
    it is removed, unless ``--dry-run`` was requested. Counts of
    checked, complete, incomplete and deleted files are logged at the
    end.
    """
    if argv is None:
        argv = sys.argv

    # build the command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])
    parser.add_option(
        "-n", "--dry-run",
        dest="dry_run",
        action="store_true",
        help="dry run, do not delete any files [%default]")
    parser.set_defaults(dry_run=False)

    # E.Start adds the common options (-h/--help, ...) and parses
    options, filenames = E.Start(parser, argv=argv)

    counter = E.Counter()
    for path in filenames:
        counter.checked += 1

        # The log file is consulted first; the file itself is only
        # inspected when the log is absent or incomplete.
        logfile = path + ".log"
        finished = (
            (os.path.exists(logfile) and IOTools.isComplete(logfile)) or
            IOTools.isComplete(path))
        if finished:
            counter.complete += 1
            continue

        counter.incomplete += 1
        E.info('deleting %s' % path)
        if not options.dry_run:
            os.unlink(path)
            counter.deleted += 1

    E.info(counter)

    # write footer and output benchmark information
    E.Stop()
def _getTrackRegex(track, suffix):
    """Return the compiled ``<track>_regex_<suffix>`` pattern or None.

    The configuration value is expanded with ``P.asList`` and the
    resulting patterns are combined into a single alternation.
    """
    patterns = PARAMS.get('%s_regex_%s' % (track, suffix), None)
    if not patterns:
        return None
    return re.compile("|".join(P.asList(patterns)))


def _readChecksumTable(filename):
    """Read a tab-separated table indexed by its first column.

    The file handle is closed even if parsing fails.
    """
    inf = IOTools.openFile(filename)
    try:
        return pandas.read_csv(inf, sep="\t", index_col=0)
    finally:
        inf.close()


def _splitByColumn(cmp_data, ref_data, column, filenames):
    """Partition *filenames* by agreement of *column* in both tables.

    Returns a tuple ``(different, same)`` of sets of index labels.
    """
    cmp_values = cmp_data[column][filenames]
    ref_values = ref_data[column][filenames]
    differs = cmp_values != ref_values
    agrees = cmp_values == ref_values
    return set(differs.index[differs]), set(agrees.index[agrees])


def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.

    For each file in *infiles* (named ``<track>.stats``) the table is
    compared against the reference table ``<track>.ref``. Files present
    in both tables are classified using the optional configuration
    values ``<track>_regex_exist`` (presence is sufficient),
    ``<track>_regex_linecount`` (column ``nlines`` must agree) and
    ``<track>_regex_md5`` (column ``md5`` must agree).

    One tab-separated row per track is written to *outfile*. The
    status is ``OK`` only if all log files matching ``<track>*.log``
    are complete and there are no missing, extra or differing files.
    File lists in the output are sorted so that the output is
    reproducible.

    Arguments
    ---------
    infiles : list
        Filenames of checksum tables ending in ``.stats``.
    outfile : string
        Filename of the summary table to write.

    Raises
    ------
    ValueError
        If no reference file exists for a track.
    '''

    header = (
        "track", "status",
        "job_finished",
        "nfiles", "nref",
        "missing", "extra",
        "different",
        "different_md5",
        "different_lines",
        "same",
        "same_md5",
        "same_lines",
        "same_exist",
        "files_missing",
        "files_extra",
        "files_different_md5",
        "files_different_lines")

    outf = IOTools.openFile(outfile, "w")
    # try/finally so that the output is closed when a track has no
    # reference data or a table cannot be parsed.
    try:
        outf.write("\t".join(header) + "\n")

        for infile in infiles:
            E.info("working on {}".format(infile))
            track = P.snip(infile, ".stats")

            # every log file of the track is checked and reported,
            # even after the first incomplete one.
            job_finished = True
            for logfile in glob.glob(track + "*.log"):
                is_complete = IOTools.isComplete(logfile)
                E.debug("logcheck: {} = {}".format(logfile, is_complete))
                job_finished = job_finished and is_complete

            reffile = track + ".ref"

            # regular expression of files to test only for existence
            regex_exist = _getTrackRegex(track, "exist")
            regex_linecount = _getTrackRegex(track, "linecount")
            regex_md5 = _getTrackRegex(track, "md5")

            if not os.path.exists(reffile):
                raise ValueError('no reference data defined for %s' % track)

            cmp_data = _readChecksumTable(infile)
            ref_data = _readChecksumTable(reffile)

            cmp_files = set(cmp_data.index)
            ref_files = set(ref_data.index)
            missing = ref_files.difference(cmp_files)
            extra = cmp_files.difference(ref_files)

            # files present in both tables and not yet classified
            different = cmp_files.intersection(ref_files)

            # remove those for which only check for existence
            if regex_exist:
                same_exist = {x for x in different if regex_exist.search(x)}
                different = different.difference(same_exist)
            else:
                same_exist = set()

            # select those for which only check for number of lines
            if regex_linecount:
                check_lines = [x for x in different
                               if regex_linecount.search(x)]
                different_lines, same_lines = _splitByColumn(
                    cmp_data, ref_data, 'nlines', check_lines)
                different = different.difference(check_lines)
            else:
                different_lines = set()
                same_lines = set()

            # remainder - check md5
            # NOTE(review): files matching none of the configured
            # patterns are not compared at all - confirm this is
            # intended.
            if regex_md5:
                check_md5 = [x for x in different if regex_md5.search(x)]
                different_md5, same_md5 = _splitByColumn(
                    cmp_data, ref_data, 'md5', check_md5)
            else:
                different_md5 = set()
                same_md5 = set()

            has_differences = (missing or extra or
                               different_md5 or different_lines)
            if job_finished and not has_differences:
                status = "OK"
            else:
                status = "FAIL"

            outf.write("\t".join(map(str, (
                track,
                status,
                job_finished,
                len(cmp_data),
                len(ref_data),
                len(missing),
                len(extra),
                len(different_md5) + len(different_lines),
                len(different_md5),
                len(different_lines),
                len(same_md5) + len(same_lines) + len(same_exist),
                len(same_md5),
                len(same_lines),
                len(same_exist),
                ",".join(sorted(missing)),
                ",".join(sorted(extra)),
                ",".join(sorted(different_md5)),
                ",".join(sorted(different_lines)),
            ))) + "\n")
    finally:
        outf.close()