def __construct_tree_dict(in_file, in_id, merge_tree_dict):
    """Populate merge_tree_dict with per-type, per-chromosome RB trees of loci.

    Each record from in_file is bucketed into one of five type tags
    (pc_ind / pc_snv / nc_low / nc_ind / nc_snv) based on its last column,
    then inserted into merge_tree_dict[type_tag][chrom] keyed by position.
    Records already present at a position are merged by appending
    ";"-separated info strings.  Mutates merge_tree_dict in place.

    Args:
        in_file: path of the input file to parse with bp.FileParser.
        in_id: sample identifier prefixed onto each info string.
        merge_tree_dict: nested dict {type_tag: {chrom: tree.RBTree}} to fill.
    """
    in_fp = bp.FileParser(in_file)
    for in_items in in_fp.iter():
        in_fp.print_prog()
        cur_chr = in_items[0][3:]  # strip the leading "chr" prefix
        cur_pos = int(in_items[1])
        cur_tag = in_items[-1]
        # Classify the record: "V" in the tag marks a positive call (pc),
        # "refhom" a low-confidence reference-homozygous site, anything
        # else a negative call (nc); "ind" distinguishes indels from SNVs.
        if "V" in cur_tag:
            type_tag = "pc_ind" if "ind" in cur_tag else "pc_snv"
        elif "refhom" in cur_tag:
            type_tag = "nc_low"
        else:
            type_tag = "nc_ind" if "ind" in cur_tag else "nc_snv"
        af = round(float(in_items[5]), FLOAT_PREC)
        cur_info = f"{in_id}_{'_'.join(in_items[2:5])}_{af}_{'_'.join(in_items[6:8])}"
        if type_tag not in merge_tree_dict:
            merge_tree_dict[type_tag] = dict()
        if cur_chr not in merge_tree_dict[type_tag]:
            merge_tree_dict[type_tag][cur_chr] = tree.RBTree()
        prev_info = merge_tree_dict[type_tag][cur_chr].search(cur_pos)
        if prev_info:
            # A record already exists at this locus: warn if its leading tag
            # differs, then append the new info after the existing chain.
            if prev_info.split(";")[0] != cur_tag:
                print("WARNING: Inconsistent tag for locus")
            cur_info = f"{prev_info};{cur_info}"
        else:
            cur_info = f"{cur_tag};{cur_info}"
        merge_tree_dict[type_tag][cur_chr].insert(cur_pos, cur_info)
    # BUGFIX: close the parser, matching construct_from_bed/construct_from_vcf;
    # the original left the file handle open.
    in_fp.close()
    return
def construct_from_bed(in_file: str):
    """Build a {chrom: tree.RBITree} interval-tree map from a BED file.

    Each BED record is stored as the interval [start+1, end] (1-based,
    inclusive) with the placeholder genotype "0/0".  A hit at the same
    start position triggers a warning but the record is inserted anyway.
    """
    chrom_trees = dict()
    parser = bp.FileParser(in_file)
    for fields in parser.iter():
        parser.print_prog()
        chrom: str = fields[0]
        # BED is 0-based half-open; convert to 1-based inclusive coordinates.
        start: int = int(fields[1]) + 1
        end: int = int(fields[2])
        if chrom not in chrom_trees:
            chrom_trees[chrom] = tree.RBITree()
        hit = chrom_trees[chrom].search(start)
        if hit:
            print(f"WARNING: Overlap found: {chrom}:{start} - {hit}")
        chrom_trees[chrom].insert(start, "0/0", end)
    parser.close()
    return chrom_trees
def construct_from_vcf(in_file: str):
    """Build a {chrom: tree.RBITree} map of VCF records keyed by position.

    Each record stores the "-"-joined output of process_vcf_line.  A
    pre-existing entry at the same position triggers a warning but the
    new record is inserted anyway.
    """
    chrom_trees = dict()
    parser = bp.FileParser(in_file)
    for fields in parser.iter():
        parser.print_prog()
        chrom: str = fields[0]
        pos: int = int(fields[1])
        if chrom not in chrom_trees:
            chrom_trees[chrom] = tree.RBITree()
        hit = chrom_trees[chrom].search(pos)
        if hit:
            print(f"WARNING: Overlap found: {chrom}:{pos} - {hit}")
        info: str = "-".join(process_vcf_line(fields))
        chrom_trees[chrom].insert(pos, info)
    parser.close()
    return chrom_trees
        # NOTE(review): fragment — the function's def line and the `if`
        # paired with the `else:` below are outside this view.
        # Tally this control tag and report the reproducibility test result.
        if ctrl_info not in tag_count_dict:
            tag_count_dict[ctrl_info] = 0
        tag_count_dict[ctrl_info] += 1
        print(
            f"{chrpos} | CurThres: {rep_test[1]:<6.3f} | Count: {rep_test[2]}")
    else:
        # No reproducibility result for this locus: count it under "na".
        tag_count_dict["na"] += 1
        print(f"{chrpos} | CurThres: ??? | Count: 0")
    # Emit a placeholder output record for the locus either way.
    out_f.write(f"{chrpos[0]}\t{chrpos[1]}\t{HOLDER_STR}\n")


if __name__ == '__main__':
    print(f"# Reference: {CTRL_FILE}")
    print(f"# Input: {IN_FILE}")
    print(f"# Reproducibility threshold: {threshold}")
    ref_fp = bp.FileParser(CTRL_FILE)
    in_fp = bp.FileParser(IN_FILE)
    out_f = open(OUT_FILE, "w")
    bp.write_vcf_hd(out_f)  # write the VCF header before any records
    tag_count_dict = {"na": 0}
    for ref_items in ref_fp.iter():
        # Input exhausted: optionally emit reference-only records, then skip.
        if in_fp.term:
            if not IGNORE_NULL:
                __print_and_write(out_f, ref_items, tag_count_dict)
            continue
        # Advance the input parser until it is at or past the reference locus.
        cmp_res = bp.cmp_lines(ref_items, in_fp.get_line())
        while cmp_res > 0:
            in_fp.next()
            if in_fp.term:
                break
            cmp_res = bp.cmp_lines(ref_items, in_fp.get_line())
    # NOTE(review): fragment — the def line of __create_out_dict (named in
    # the __main__ block below) is outside this view.
    # Open one output file per ID under OUT_DIR, keyed by ID.
    for out_id in OUT_ID_TUP:
        out_file = f"{OUT_DIR}/{IN_ID}.parse.{out_id}.txt"
        print(out_file)
        out_dict[out_id] = open(out_file, "w")
    return out_dict


if __name__ == '__main__':
    in_file_list = glob.glob(f"{IN_DIR}/*{IN_ID}.parse.txt")
    # Exactly one parsed input file is expected for this sample.
    if len(in_file_list) != 1:
        print("ERROR: Invalid input file")
        sys.exit()
    else:
        print(f"** Input: {in_file_list[0]}")
    out_dict = __create_out_dict()
    in_fp = bp.FileParser(in_file_list[0])
    # Known variant prefixes for this sample, empty tuple when unmapped.
    in_var_tup = SAMP_TO_VAR[IN_MOS] if IN_MOS in SAMP_TO_VAR else tuple()
    for in_items in in_fp.iter():
        in_fp.print_prog()
        cur_id = in_items[-1]
        # IDs of the form "[ 'x;y' ]": strip the bracket/quote wrapper and
        # keep only the first ";"-separated field.
        if cur_id[0] == "[":
            cur_id = cur_id[2:-2].split(";")[0]
        if NAIVE:
            out_l = "{}\t{}".format("\t".join(in_items[:-1]), cur_id)
        else:
            # Non-naive layout skips column index 3 (items[:3] + items[4:9]).
            out_l = "{}\t{}\t{}".format("\t".join(in_items[:3]),
                                        "\t".join(in_items[4:9]), cur_id)
        if not no_refhom and "refhom" in cur_id:
            out_dict["nc_low"].write(f"{out_l}\n")
        elif cur_id.split("_")[0] in in_var_tup:
            if "ind" in cur_id:
IN_DIR = sys.argv[1]
IN_ID = sys.argv[2]
OUT_FILE = sys.argv[3]


def __grab_all_input(in_dir, in_id):
    """Return all paths in in_dir matching in_id; a trailing "*" is appended
    unless the ID already contains a wildcard."""
    pattern = in_id if "*" in in_id else f"{in_id}*"
    return glob.glob(f"{in_dir}/{pattern}")


if __name__ == '__main__':
    in_file_list = __grab_all_input(IN_DIR, IN_ID)
    print(f"# Inputs: {len(in_file_list)}")
    for in_file in in_file_list:
        print(in_file)
    out_f = open(OUT_FILE, "w")
    bp.write_vcf_hd(out_f)
    # Concatenate the per-chromosome files in canonical chromosome order.
    for chr_num in bp.iter_chrom():
        cur_file = next(
            (path for path in in_file_list if f".chr{chr_num}." in path),
            None)
        if not cur_file:
            continue
        cur_fp = bp.FileParser(cur_file)
        for _ in cur_fp.iter():
            out_f.write(f"{cur_fp.get_line().strip()}\n")
        cur_fp.close()
    out_f.close()
if in_items[6] != "PASS": return False cur_gt = __convert_gt(in_items[-1].split(":")[0]) if not cur_gt or cur_gt not in VALID_GT_SET: return False alt_base_list = in_items[4].split(",") if "<*>" in alt_base_list: alt_base_list.remove("<*>") if len(alt_base_list) > 1: return False return True if __name__ == '__main__': print(f"# Input: {IN_FILE}") in_fp = bp.FileParser(IN_FILE) pass_f = open(PASS_FILE, "w") fail_f = open(FAIL_FILE, "w") bp.write_vcf_hd(pass_f) bp.write_vcf_hd(fail_f) prev_chrpos: str = "chr1.0" for in_items in in_fp.iter(): in_fp.print_prog() in_l = in_fp.get_line().rstrip() cur_chrpos: str = f"{in_items[0]}.{in_items[1]}") if cur_chrpos == prev_chrpos: print(f"WARNING: Multiallelic site: {in_l}") fail_f.write(f"{in_l}\n") elif __validate_line(in_items): pass_f.write(f"{in_l}\n") else:
        # NOTE(review): fragment — the function head and the condition(s)
        # above this early return, plus the `if` paired with the `elif`
        # below, are outside this view.
            return None
        # Build an output record: chrom, pos, "<gt>_<ind|snv>" variant ID,
        # REF, ALT, padded with HOLDER_STR.
        out_list = ref_items[:2]
        var_id = ref_gt
        # Indel when REF and ALT lengths differ, otherwise SNV.
        var_id += "_ind" if len(ref_items[3]) != len(ref_items[4]) else "_snv"
        out_list.append(var_id)
        out_list += ref_items[3:5]
        return "{}\t{}\n".format("\t".join(out_list), HOLDER_STR)
    elif sk_items[2] == in_items[2]:
        # Matching IDs: pass the reference record through unchanged.
        return "{}\n".format("\t".join(ref_items))
    return None


if __name__ == '__main__':
    print(f"# SK Input: {SK_FILE}")
    print(f"# DV Input: {DV_FILE}")
    sk_fp = bp.FileParser(SK_FILE)
    dv_fp = bp.FileParser(DV_FILE)
    out_f = open(OUT_FILE, "w")
    bp.write_vcf_hd(out_f)  # write the VCF header before any records
    for sk_items in sk_fp.iter():
        if dv_fp.term:
            break
        # Advance the DV parser until it is at or past the current SK locus.
        cmp_res = bp.cmp_lines(sk_items, dv_fp.get_items())
        while cmp_res > 0:
            dv_fp.next()
            if dv_fp.term:
                break
            cmp_res = bp.cmp_lines(sk_items, dv_fp.get_items())
        if dv_fp.term:
            break
        elif cmp_res == 0: