def main():
    """Repartition input path lists into N output parts with optional preprocessing.

    Command line (read from ``sys.argv``)::

        -o <prefix>   output prefix (required)
        -n <nparts>   number of output parts; takes precedence over the config value
        -c <config>   configuration file; must be combined with -r
        -r <rse>      RSE name used to look up parameters in the config
        -z            forwarded to ``PartitionedList.create`` as its third argument
                      (presumably requests compressed output -- confirm)
        -q            accepted by the parser but not used here
        <file> ...    one or more input list files (required)

    When a config is given, the RSE's ``preprocess`` section may define
    ``ignore_list``, ``filter`` (regex), ``starts_with``, ``remove_prefix``,
    ``add_prefix`` and ``rewrite`` (``match``/``out``).  Each input path is
    processed in this order: ``starts_with`` check, ignore-prefix check,
    ``filter`` search, ``remove_prefix``, ``add_prefix``, ``rewrite``.

    Exits with status 2 on usage errors, and with status 1 if a path does not
    begin with ``remove_prefix`` or does not match the rewrite pattern.
    Prints ``out_lst.NWritten`` when done.
    """
    opts, args = getopt.getopt(sys.argv[1:], "n:o:c:qr:z")
    opts = dict(opts)
    if not args or "-o" not in opts:
        print(Usage)
        sys.exit(2)

    nparts = None
    out_prefix = opts["-o"]
    rewrite_match = rewrite_out = filter_in = remove_prefix = add_prefix = starts_with = None
    ignore_list = []
    if "-c" in opts:
        # -c is only meaningful together with -r; report it as a usage error
        # instead of failing with a bare KeyError.
        rse = opts.get("-r")
        if rse is None:
            print("An RSE name (-r <rse>) is required when -c <config> is used")
            print(Usage)
            sys.exit(2)
        config = Config(opts["-c"])
        preprocess = config.rse_param(rse, "preprocess")
        ignore_list = config.rse_param(rse, "ignore_list") or []
        if preprocess is not None:
            # An ignore_list inside the preprocess section overrides the RSE-level one.
            ilist = preprocess.get("ignore_list")
            if ilist is not None:
                ignore_list = ilist
            filter_in = preprocess.get("filter")
            if filter_in is not None:
                filter_in = re.compile(filter_in)
            starts_with = preprocess.get("starts_with")
            remove_prefix = preprocess.get("remove_prefix")
            add_prefix = preprocess.get("add_prefix")
            rewrite = preprocess.get("rewrite", {})
            if rewrite:
                rewrite_match = re.compile(rewrite["match"])
                rewrite_out = rewrite["out"]
        nparts = config.nparts(rse)
    zout = "-z" in opts
    if "-n" in opts:
        nparts = opts["-n"]

    # Check for a missing value *before* converting: int(None) would raise
    # TypeError and this usage message would never be shown.
    if nparts is None:
        print(
            "N parts must be specified either with -n or via the -c <config> and -r <rse>"
        )
        print(Usage)
        sys.exit(2)
    nparts = int(nparts)

    in_lst = PartitionedList.open(files=args)
    out_lst = PartitionedList.create(nparts, out_prefix, zout)

    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    ignore_prefixes = tuple(ignore_list)

    for path in in_lst:
        if starts_with and not path.startswith(starts_with):
            continue
        if path.startswith(ignore_prefixes):
            continue
        if filter_in is not None and not filter_in.search(path):
            continue
        if remove_prefix is not None:
            if not path.startswith(remove_prefix):
                sys.stderr.write(
                    f"Path {path} does not begin with prefix {remove_prefix}\n"
                )
                sys.exit(1)
            path = path[len(remove_prefix):]
        if add_prefix:
            path = add_prefix + path
        if rewrite_match is not None:
            if not rewrite_match.search(path):
                sys.stderr.write(
                    f"Path rewrite pattern did not find a match in path {path}\n"
                )
                sys.exit(1)
            path = rewrite_match.sub(rewrite_out, path)
        out_lst.add(path)
    out_lst.close()

    print(out_lst.NWritten)
Beispiel #2
0
def main():
    """Run the three-way list comparison and write the dark and missing lists.

    Command line (read from ``sys.argv``)::

        -s <stats_file>   record run statistics via ``Stats(stats_file)``
        -S <stats_key>    key under which statistics are stored (default "cmp3")
        -z                gzip the output files (".gz" is appended if missing)
        <b_prefix> <r_prefix> <a_prefix> <out_dark> <out_missing>

    The three prefixes are opened as partitioned lists and handed to
    ``cmp3_generator(a_list, r_list, b_list)``, which yields ``(tag, path)``
    pairs.  Paths tagged ``'d'`` go to the dark file, all others to the
    missing file.  Paths are written exactly as yielded; no newline is
    appended here.

    Exits with status 2 unless exactly five positional arguments are given.
    """
    import getopt

    t0 = time.time()

    opts, args = getopt.getopt(sys.argv[1:], "s:S:z")
    opts = dict(opts)

    # Exactly five positionals are unpacked below; extra arguments are a usage
    # error too, not an unpacking ValueError.
    if len(args) != 5:
        print(Usage)
        sys.exit(2)
    compress = "-z" in opts
    stats_file = opts.get("-s")
    stats_key = opts.get("-S", "cmp3")
    stats = Stats(stats_file) if stats_file else None

    b_prefix, r_prefix, a_prefix, out_dark, out_missing = args

    a_list = PartitionedList.open(a_prefix)
    r_list = PartitionedList.open(r_prefix)
    b_list = PartitionedList.open(b_prefix)

    my_stats = {
        "version": Version,
        "elapsed": None,
        "start_time": t0,
        "end_time": None,
        "missing": None,
        "dark": None,
        "missing_list_file": None,
        "dark_list_file": None,
        "b_prefix": b_prefix,
        "a_prefix": a_prefix,
        "r_prefix": r_prefix,
        "a_files": a_list.FileNames,
        "b_files": b_list.FileNames,
        "r_files": r_list.FileNames,
        "a_nfiles": a_list.NParts,
        "b_nfiles": b_list.NParts,
        "r_nfiles": r_list.NParts,
        "status": "started",
    }

    # Publish the "started" record before the comparison begins.
    if stats is not None:
        stats[stats_key] = my_stats

    if compress:
        if not out_dark.endswith(".gz"):
            out_dark += ".gz"
        if not out_missing.endswith(".gz"):
            out_missing += ".gz"
        open_out, mode = gzip.open, "wt"
    else:
        open_out, mode = open, "w"

    nm = nd = 0
    # Context managers guarantee both outputs are flushed and closed even if
    # the comparison raises part-way through.
    with open_out(out_dark, mode) as fd, open_out(out_missing, mode) as fm:
        for tag, path in cmp3_generator(a_list, r_list, b_list):
            if tag == 'd':
                fd.write(path)
                nd += 1
            else:
                fm.write(path)
                nm += 1

    print("Found %d dark and %d missing replicas" % (nd, nm))
    t1 = time.time()

    my_stats.update({
        "elapsed": t1 - t0,
        "end_time": t1,
        "missing": nm,
        "dark": nd,
        "status": "done",
        "missing_list_file": out_missing,
        "dark_list_file": out_dark,
    })

    if stats is not None:
        stats[stats_key] = my_stats

    m, s = divmod(int(t1 - t0), 60)
    print("Elapsed time: %dm%02ds" % (m, s))
Beispiel #3
0
def cmp3_parts(a_prefix, r_prefix, b_prefix):
    """Open the three partitioned lists by prefix and compare them.

    The lists are opened in the order a, r, b with ``PartitionedList.open``
    and passed, in that same order, to ``cmp3_lists``; its result is returned
    unchanged.
    """
    opened = [
        PartitionedList.open(prefix)
        for prefix in (a_prefix, r_prefix, b_prefix)
    ]
    return cmp3_lists(*opened)
Beispiel #4
0
def main():
    """Combine two path lists part-by-part with a set operation.

    Command line (read from ``sys.argv``)::

        -s <stats_file>   record run statistics via ``Stats(stats_file)``
        -S <stats_key>    key under which statistics are stored (default "join")
        -z                forwarded to ``PartitionedList.create`` for the output
        -f                treat the A, B and output specs as single files
        <op> <a_spec> <b_spec> <out_spec>

    ``op`` is one of ``and``, ``minus``, ``xor`` or ``or``.  Any other value
    matches no branch, so nothing is written to the output.  Without ``-f``
    the specs are prefixes and both inputs must have the same number of
    parts, otherwise the program exits with status 1.

    Exits with status 2 unless exactly four positional arguments are given.
    """
    import getopt

    t0 = time.time()

    opts, args = getopt.getopt(sys.argv[1:], "s:S:zf")
    opts = dict(opts)

    # Exactly four positionals are unpacked below; extra arguments are a usage
    # error too, not an unpacking ValueError.
    if len(args) != 4:
        print(Usage)
        sys.exit(2)

    stats_file = opts.get("-s")
    stats_key = opts.get("-S", "join")
    compress = "-z" in opts
    single_file = "-f" in opts

    my_stats = stats = None

    op, a_spec, b_spec, out_spec = args

    if single_file:
        a_list = PartitionedList.open(files=[a_spec])
        b_list = PartitionedList.open(files=[b_spec])
        # NOTE(review): -z is not forwarded here because create_file's
        # signature is not visible from this code -- confirm whether it
        # accepts a compression flag.
        out_list = PartitionedList.create_file(out_spec)
    else:
        a_list = PartitionedList.open(prefix=a_spec)
        b_list = PartitionedList.open(prefix=b_spec)
        if a_list.NParts != b_list.NParts:
            print("Inconsistent number of parts: %s:%d: %s:%d" %
                  (a_spec, a_list.NParts, b_spec, b_list.NParts))
            sys.exit(1)
        # Honour -z; previously the flag was parsed but silently ignored.
        out_list = PartitionedList.create(a_list.NParts, out_spec, compress)

    if stats_file is not None:
        stats = Stats(stats_file)
        my_stats = {
            "version": Version,
            "elapsed": None,
            "start_time": t0,
            "end_time": None,
            "a_list_files": 0,
            "b_list_files": 0,
            "join_list_files": 0,
            "operation": op,
            "b_prefix": b_spec,
            "a_prefix": a_spec,
            "out_prefix": out_spec,
            "a_files": a_list.FileNames,
            "b_files": b_list.FileNames,
            "out_files": out_list.FileNames,
            "nparts": a_list.NParts,
            "status": "started",
        }
        stats[stats_key] = my_stats

    n_a_files = 0
    n_b_files = 0
    n_out_files = 0

    # Corresponding parts are processed pairwise: B's part is loaded into a
    # set, A's part is streamed against it.
    for pa, pb in zip(a_list.parts(), b_list.parts()):
        b_set = set(pb)
        n_b_files += len(b_set)
        for f in pa:
            n_a_files += 1
            if op == "and":
                if f in b_set:
                    out_list.add(f)
                    n_out_files += 1
            elif op == "minus":
                if f not in b_set:
                    out_list.add(f)
                    n_out_files += 1
            elif op == "xor":
                # Entries present in both lists are dropped from b_set so
                # they are not emitted by the leftover pass below.
                if f in b_set:
                    b_set.remove(f)
                else:
                    out_list.add(f)
                    n_out_files += 1
            elif op == "or":
                # Remove from b_set to avoid emitting the entry twice.
                if f in b_set:
                    b_set.remove(f)
                out_list.add(f)
                n_out_files += 1
        if op in ("or", "xor"):
            # Whatever is left in b_set was seen only in B.
            for f in b_set:
                out_list.add(f)
                n_out_files += 1

    # NOTE(review): out_list is not explicitly closed in this function;
    # confirm that PartitionedList flushes its files on destruction.
    t1 = time.time()

    # Same condition as the "started" record above, so the two cannot disagree.
    if stats is not None:
        my_stats.update({
            "elapsed": t1 - t0,
            "end_time": t1,
            "a_list_files": n_a_files,
            "b_list_files": n_b_files,
            # Was the undefined name ``join_list_files`` (NameError).
            "join_list_files": n_out_files,
            "status": "done",
        })
        stats[stats_key] = my_stats