def __construct_tree_dict(in_file, in_id, merge_tree_dict):
    """Merge the records of one input file into ``merge_tree_dict`` in place.

    ``merge_tree_dict`` maps ``type_tag -> chromosome -> tree.RBTree``; each
    tree is keyed by position and stores a string of the form
    ``"<tag>;<info>[;<info>...]"``, one ``<info>`` entry per input that
    reported the locus.

    Args:
        in_file: Path handed to ``bp.FileParser``.
        in_id: Identifier of this input; used as the prefix of every info
            entry contributed by this file.
        merge_tree_dict: Nested dictionary to update (mutated in place).

    Returns:
        None.
    """
    in_fp = bp.FileParser(in_file)
    try:
        for in_items in in_fp.iter():
            in_fp.print_prog()
            # Drop the first three characters of the chromosome field
            # (presumably a "chr" prefix -- confirm against the input format).
            cur_chr = in_items[0][3:]
            cur_pos = int(in_items[1])
            cur_tag = in_items[-1]

            # Classify the record from substrings of its tag.
            if "V" in cur_tag:
                type_tag = "pc_ind" if "ind" in cur_tag else "pc_snv"
            elif "refhom" in cur_tag:
                type_tag = "nc_low"
            else:
                type_tag = "nc_ind" if "ind" in cur_tag else "nc_snv"

            af = round(float(in_items[5]), FLOAT_PREC)
            cur_info = f"{in_id}_{'_'.join(in_items[2:5])}_{af}_{'_'.join(in_items[6:8])}"

            chr_trees = merge_tree_dict.setdefault(type_tag, {})
            if cur_chr not in chr_trees:
                chr_trees[cur_chr] = tree.RBTree()
            pos_tree = chr_trees[cur_chr]

            prev_info = pos_tree.search(cur_pos)
            if prev_info:
                # The first ';'-separated field of a stored value is the tag
                # recorded by the first input that reported this locus.
                if prev_info.split(";")[0] != cur_tag:
                    print("WARNING: Inconsistent tag for locus")
                cur_info = f"{prev_info};{cur_info}"
            else:
                cur_info = f"{cur_tag};{cur_info}"
            pos_tree.insert(cur_pos, cur_info)
    finally:
        # Release the parser even when a malformed line raises, matching the
        # explicit close() done by the other tree builders in this file.
        in_fp.close()
Ejemplo n.º 2
0
def construct_from_bed(in_file: str):
    """Build one ``tree.RBITree`` per chromosome from a BED-like file.

    For every record the interval ``[int(field 1) + 1, int(field 2)]`` is
    inserted with the value ``"0/0"``. A warning is printed when a search
    for the interval start already returns something.

    Args:
        in_file: Path handed to ``bp.FileParser``.

    Returns:
        Dictionary mapping the chromosome field to its interval tree.
    """
    trees_by_chrom = {}
    parser = bp.FileParser(in_file)
    for fields in parser.iter():
        parser.print_prog()
        # The start column is shifted by one (presumably 0-based BED start to
        # 1-based coordinate -- confirm); the end column is used as-is.
        chrom, start, end = fields[0], int(fields[1]) + 1, int(fields[2])

        chrom_tree = trees_by_chrom.get(chrom)
        if chrom_tree is None:
            chrom_tree = trees_by_chrom[chrom] = tree.RBITree()

        hit = chrom_tree.search(start)
        if hit:
            print(f"WARNING: Overlap found: {chrom}:{start} - {hit}")
        chrom_tree.insert(start, "0/0", end)
    parser.close()
    return trees_by_chrom
Ejemplo n.º 3
0
def construct_from_vcf(in_file: str):
    """Build one ``tree.RBITree`` per chromosome from a VCF-like file.

    Each record is inserted at ``int(field 1)`` with a value made by joining
    the result of ``process_vcf_line`` with ``"-"``. A warning is printed
    when a search for that position already returns something.

    Args:
        in_file: Path handed to ``bp.FileParser``.

    Returns:
        Dictionary mapping the chromosome field to its tree.
    """
    trees_by_chrom = {}
    parser = bp.FileParser(in_file)
    for fields in parser.iter():
        parser.print_prog()
        chrom = fields[0]
        pos = int(fields[1])

        chrom_tree = trees_by_chrom.get(chrom)
        if chrom_tree is None:
            chrom_tree = trees_by_chrom[chrom] = tree.RBITree()

        existing = chrom_tree.search(pos)
        if existing:
            print(f"WARNING: Overlap found: {chrom}:{pos} - {existing}")

        info = "-".join(process_vcf_line(fields))
        chrom_tree.insert(pos, info)
    parser.close()
    return trees_by_chrom
        if ctrl_info not in tag_count_dict:
            tag_count_dict[ctrl_info] = 0
        tag_count_dict[ctrl_info] += 1
        print(
            f"{chrpos} | CurThres: {rep_test[1]:<6.3f} | Count: {rep_test[2]}")
    else:
        tag_count_dict["na"] += 1
        print(f"{chrpos} | CurThres: ??? | Count: 0")
    out_f.write(f"{chrpos[0]}\t{chrpos[1]}\t{HOLDER_STR}\n")


if __name__ == '__main__':
    # Script entry point: iterate over the reference (control) file while
    # advancing a second parser over the input file in step with it.
    # NOTE(review): the rest of the loop body is not visible in this excerpt.
    print(f"# Reference: {CTRL_FILE}")
    print(f"# Input: {IN_FILE}")
    print(f"# Reproducibility threshold: {threshold}")
    ref_fp = bp.FileParser(CTRL_FILE)
    in_fp = bp.FileParser(IN_FILE)
    out_f = open(OUT_FILE, "w")
    bp.write_vcf_hd(out_f)
    # Counter dictionary, seeded with the "na" key.
    tag_count_dict = {"na": 0}
    for ref_items in ref_fp.iter():
        if in_fp.term:
            # Input parser is exhausted: the reference record is still
            # reported unless IGNORE_NULL is set.
            if not IGNORE_NULL:
                __print_and_write(out_f, ref_items, tag_count_dict)
            continue
        # NOTE(review): the first argument is the item list yielded by iter(),
        # the second is in_fp.get_line(); confirm cmp_lines accepts both forms.
        cmp_res = bp.cmp_lines(ref_items, in_fp.get_line())
        # Advance the input parser while the comparison stays positive
        # (presumably: the input record sorts before the reference record).
        while cmp_res > 0:
            in_fp.next()
            if in_fp.term:
                break
            cmp_res = bp.cmp_lines(ref_items, in_fp.get_line())
Ejemplo n.º 5
0
    for out_id in OUT_ID_TUP:
        out_file = f"{OUT_DIR}/{IN_ID}.parse.{out_id}.txt"
        print(out_file)
        out_dict[out_id] = open(out_file, "w")
    return out_dict


if __name__ == '__main__':
    in_file_list = glob.glob(f"{IN_DIR}/*{IN_ID}.parse.txt")
    if len(in_file_list) != 1:
        print("ERROR: Invalid input file")
        sys.exit()
    else:
        print(f"** Input: {in_file_list[0]}")
    out_dict = __create_out_dict()
    in_fp = bp.FileParser(in_file_list[0])
    in_var_tup = SAMP_TO_VAR[IN_MOS] if IN_MOS in SAMP_TO_VAR else tuple()
    for in_items in in_fp.iter():
        in_fp.print_prog()
        cur_id = in_items[-1]
        if cur_id[0] == "[":
            cur_id = cur_id[2:-2].split(";")[0]
        if NAIVE:
            out_l = "{}\t{}".format("\t".join(in_items[:-1]), cur_id)
        else:
            out_l = "{}\t{}\t{}".format("\t".join(in_items[:3]),
                                        "\t".join(in_items[4:9]), cur_id)
        if not no_refhom and "refhom" in cur_id:
            out_dict["nc_low"].write(f"{out_l}\n")
        elif cur_id.split("_")[0] in in_var_tup:
            if "ind" in cur_id:
# Positional command-line arguments.
IN_DIR = sys.argv[1]  # directory that is globbed for input files
IN_ID = sys.argv[2]  # file-name prefix, or a glob pattern if it contains "*"
OUT_FILE = sys.argv[3]  # path of the output file, opened for writing


def __grab_all_input(in_dir, in_id):
    """Return the paths directly under ``in_dir`` that match ``in_id``.

    ``in_id`` is used verbatim as the glob pattern when it already contains
    a ``*``; otherwise it is treated as a prefix and ``*`` is appended.
    The order of the returned list is whatever ``glob.glob`` yields.
    """
    if "*" in in_id:
        pattern = in_id
    else:
        pattern = f"{in_id}*"
    return glob.glob("{}/{}".format(in_dir, pattern))


if __name__ == '__main__':
    # Concatenate the per-chromosome input files into one output file, in the
    # chromosome order produced by bp.iter_chrom().
    in_file_list = __grab_all_input(IN_DIR, IN_ID)
    print(f"# Inputs: {len(in_file_list)}")
    for in_file in in_file_list:
        print(in_file)
    # The context manager guarantees the output is flushed and closed even if
    # reading one of the inputs raises part-way through.
    with open(OUT_FILE, "w") as out_f:
        bp.write_vcf_hd(out_f)
        for chr_num in bp.iter_chrom():
            chr_tag = f".chr{chr_num}."
            # First listed file whose name carries this chromosome tag, if any.
            cur_file = next(
                (path for path in in_file_list if chr_tag in path), None)
            if not cur_file:
                continue
            cur_fp = bp.FileParser(cur_file)
            try:
                # Only the raw line is needed; the parsed items are ignored.
                for _ in cur_fp.iter():
                    out_f.write(f"{cur_fp.get_line().strip()}\n")
            finally:
                cur_fp.close()
    if in_items[6] != "PASS":
        return False
    cur_gt = __convert_gt(in_items[-1].split(":")[0])
    if not cur_gt or cur_gt not in VALID_GT_SET:
        return False
    alt_base_list = in_items[4].split(",")
    if "<*>" in alt_base_list:
        alt_base_list.remove("<*>")
    if len(alt_base_list) > 1:
        return False
    return True


if __name__ == '__main__':
    print(f"# Input: {IN_FILE}")
    in_fp = bp.FileParser(IN_FILE)
    pass_f = open(PASS_FILE, "w")
    fail_f = open(FAIL_FILE, "w")
    bp.write_vcf_hd(pass_f)
    bp.write_vcf_hd(fail_f)
    prev_chrpos: str = "chr1.0"
    for in_items in in_fp.iter():
        in_fp.print_prog()
        in_l = in_fp.get_line().rstrip()
        cur_chrpos: str = f"{in_items[0]}.{in_items[1]}")
        if cur_chrpos == prev_chrpos:
            print(f"WARNING: Multiallelic site: {in_l}")
            fail_f.write(f"{in_l}\n")
        elif __validate_line(in_items):
            pass_f.write(f"{in_l}\n")
        else:
Ejemplo n.º 8
0
            return None
        out_list = ref_items[:2]
        var_id = ref_gt
        var_id += "_ind" if len(ref_items[3]) != len(ref_items[4]) else "_snv"
        out_list.append(var_id)
        out_list += ref_items[3:5]
        return "{}\t{}\n".format("\t".join(out_list), HOLDER_STR)
    elif sk_items[2] == in_items[2]:
        return "{}\n".format("\t".join(ref_items))
    return None


if __name__ == '__main__':
    print(f"# SK Input: {SK_FILE}")
    print(f"# DV Input: {DV_FILE}")
    sk_fp = bp.FileParser(SK_FILE)
    dv_fp = bp.FileParser(DV_FILE)
    out_f = open(OUT_FILE, "w")
    bp.write_vcf_hd(out_f)
    for sk_items in sk_fp.iter():
        if dv_fp.term:
            break
        cmp_res = bp.cmp_lines(sk_items, dv_fp.get_items())
        while cmp_res > 0:
            dv_fp.next()
            if dv_fp.term:
                break
            cmp_res = bp.cmp_lines(sk_items, dv_fp.get_items())
        if dv_fp.term:
            break
        elif cmp_res == 0: