def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, out_vcf,
                    pass_threshold, lowqual_threshold):
    logger = logging.getLogger(merge_post_vcfs.__name__)

    logger.info("------------------------Merge vcfs-------------------------")

    chroms_order = get_chromosomes_order(reference=ref)

    good_records = []
    # Use fileinput to stream as if the two files were concatenated
    for line in fileinput.input([no_resolve_vcf, resolved_vcf]):
        if line[0] == "#":
            continue
        chrom, pos, _, ref, alt, score, _, info, format_, gt = line.strip().split()
        good_records.append([chrom, pos, ref, alt, gt, score])

    with open(out_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in sorted(good_records,
                             key=lambda x: [chroms_order[x[0]], int(x[1])]):
            chrom, pos, ref, alt, gt, score = record
            prob = np.round(1 - (10 ** (-float(score) / 10)), 4)
            filter_ = "REJECT"
            if prob >= pass_threshold:
                filter_ = "PASS"
            elif prob >= lowqual_threshold:
                filter_ = "LowQual"
            o_f.write("\t".join([
                chrom, pos, ".", ref, alt,
                "{:.4f}".format(float(score)), filter_,
                "SCORE={:.4f}".format(prob), "GT", "0/1"
            ]) + "\n")
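# The QUAL values handled above are Phred-scaled scores Q; the line
#     prob = np.round(1 - (10 ** (-float(score) / 10)), 4)
# converts them to calling probabilities, so e.g. Q = 30 maps to prob = 0.999
# and Q ~= 3.01 to prob ~= 0.5. The resulting probability is then compared
# against pass_threshold and lowqual_threshold to choose the FILTER value
# (PASS / LowQual / REJECT).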
def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, target_vcf, out_vcf,
                    pass_threshold, lowqual_threshold):
    logger = logging.getLogger(merge_post_vcfs.__name__)

    logger.info("-----------------------------------------------------------")
    logger.info("Merge vcfs")
    logger.info("-----------------------------------------------------------")

    chroms_order = get_chromosomes_order(reference=ref)

    good_records = []
    # Use fileinput to stream as if the two files were concatenated
    for line in fileinput.input([no_resolve_vcf, resolved_vcf]):
        if line[0] == "#":
            continue
        chrom, pos, _, ref, alt, score, _, info, format_, gt = line.strip().split()
        good_records.append([chrom, pos, ref, alt, gt, score])

    with open(out_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("##NeuSomatic Version={}\n".format(__version__))
        o_f.write(
            "##INFO=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n")
        o_f.write(
            "##FILTER=<ID=PASS,Description=\"Accepted as a higher confidence somatic mutation call with probability score at least {}\">\n".format(
                pass_threshold))
        o_f.write(
            "##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation call with probability score at least {}\">\n".format(
                lowqual_threshold))
        o_f.write(
            "##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation call with probability score below {}\">\n".format(
                lowqual_threshold))
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in sorted(good_records,
                             key=lambda x: [chroms_order[x[0]], int(x[1])]):
            chrom, pos, ref, alt, gt, score = record
            prob = np.round(1 - (10 ** (-float(score) / 10)), 4)
            filter_ = "REJECT"
            if prob >= pass_threshold:
                filter_ = "PASS"
            elif prob >= lowqual_threshold:
                filter_ = "LowQual"
            o_f.write("\t".join([
                chrom, pos, ".", ref, alt,
                "{:.4f}".format(float(score)), filter_,
                "SCORE={:.4f}".format(prob), "GT", "0/1"
            ]) + "\n")
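# Minimal usage sketch for the variant above (hypothetical file names and
# thresholds; in the NeuSomatic workflow this is normally driven by the
# post-processing step rather than called directly):
#
#     merge_post_vcfs(ref="reference.fa",
#                     resolved_vcf="work/resolved.vcf",
#                     no_resolve_vcf="work/no_resolve.vcf",
#                     target_vcf="work/target.vcf",
#                     out_vcf="work/merged_preds.vcf",
#                     pass_threshold=0.7,
#                     lowqual_threshold=0.4)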
def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
                    batch_size, max_load_candidates, pass_threshold,
                    lowqual_threshold, ensemble, use_cuda):
    logger = logging.getLogger(call_neusomatic.__name__)

    logger.info("-----------------Call Somatic Mutations--------------------")
    logger.info("PyTorch Version: {}".format(torch.__version__))
    logger.info("Torchvision Version: {}".format(torchvision.__version__))

    if not use_cuda:
        torch.set_num_threads(num_threads)

    chroms_order = get_chromosomes_order(reference=ref_file)
    with pysam.FastaFile(ref_file) as rf:
        chroms = rf.references

    vartype_classes = ['DEL', 'INS', 'NONE', 'SNP']
    data_transform = transforms.Compose(
        [transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    num_channels = 119 if ensemble else 26
    net = NeuSomaticNet(num_channels)
    if use_cuda:
        logger.info("GPU calling!")
        net.cuda()
    else:
        logger.info("CPU calling!")

    if torch.cuda.device_count() > 1:
        logger.info("We use {} GPUs!".format(torch.cuda.device_count()))
        net = nn.DataParallel(net)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    logger.info("Load pretrained model from checkpoint {}".format(checkpoint))
    pretrained_dict = torch.load(checkpoint,
                                 map_location=lambda storage, loc: storage)
    pretrained_state_dict = pretrained_dict["state_dict"]
    model_tag = pretrained_dict["tag"]
    logger.info("tag: {}".format(model_tag))

    matrices_dir = "{}/matrices_{}".format(out_dir, model_tag)
    if os.path.exists(matrices_dir):
        logger.warning("Remove matrices directory: {}".format(matrices_dir))
        shutil.rmtree(matrices_dir)
    os.mkdir(matrices_dir)

    coverage_thr = pretrained_dict["coverage_thr"]

    model_dict = net.state_dict()

    # 1. filter out unnecessary keys
    # pretrained_state_dict = {
    #     k: v for k, v in pretrained_state_dict.items() if k in model_dict}
    if ("module." in list(pretrained_state_dict.keys())[0]
            and "module." not in list(model_dict.keys())[0]):
        pretrained_state_dict = {
            k.split("module.")[1]: v
            for k, v in pretrained_state_dict.items()
            if k.split("module.")[1] in model_dict}
    elif ("module." not in list(pretrained_state_dict.keys())[0]
            and "module." in list(model_dict.keys())[0]):
        pretrained_state_dict = {
            ("module." + k): v
            for k, v in pretrained_state_dict.items()
            if ("module." + k) in model_dict}
    else:
        pretrained_state_dict = {
            k: v for k, v in pretrained_state_dict.items() if k in model_dict}

    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_state_dict)
    # 3. load the new state dict
    net.load_state_dict(pretrained_state_dict)

    new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs")
    if os.path.exists(new_split_tsvs_dir):
        logger.warning(
            "Remove split candidates directory: {}".format(new_split_tsvs_dir))
        shutil.rmtree(new_split_tsvs_dir)
    os.mkdir(new_split_tsvs_dir)

    # Split very large candidate TSVs so no single file holds much more than
    # max_load_candidates/2 candidates; Ls records the candidate count per file.
    Ls = []
    candidates_tsv_ = []
    split_i = 0
    for candidate_file in candidates_tsv:
        idx = pickle.load(open(candidate_file + ".idx", "rb"))
        if len(idx) > max_load_candidates / 2:
            logger.info("Splitting {} of length {}".format(
                candidate_file, len(idx)))
            new_split_tsvs_dir_i = os.path.join(
                new_split_tsvs_dir, "split_{}".format(split_i))
            if os.path.exists(new_split_tsvs_dir_i):
                logger.warning("Remove split candidates directory: {}".format(
                    new_split_tsvs_dir_i))
                shutil.rmtree(new_split_tsvs_dir_i)
            os.mkdir(new_split_tsvs_dir_i)
            candidate_file_splits = merge_tsvs(
                input_tsvs=[candidate_file],
                out=new_split_tsvs_dir_i,
                candidates_per_tsv=max(1, max_load_candidates / 2),
                max_num_tsvs=100000,
                overwrite_merged_tsvs=True,
                keep_none_types=True)
            for candidate_file_split in candidate_file_splits:
                idx_split = pickle.load(
                    open(candidate_file_split + ".idx", "rb"))
                candidates_tsv_.append(candidate_file_split)
                Ls.append(len(idx_split) - 1)
            split_i += 1
        else:
            candidates_tsv_.append(candidate_file)
            Ls.append(len(idx) - 1)

    # Group candidate files into batches and run the network on each batch.
    current_L = 0
    candidate_files = []
    all_vcf_records = []
    all_vcf_records_none = []
    for i, (candidate_file, L) in enumerate(
            sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])):
        current_L += L
        candidate_files.append(candidate_file)
        if (current_L > max_load_candidates / 10
                or i == len(candidates_tsv_) - 1):
            logger.info("Run for candidate files: {}".format(candidate_files))
            call_set = NeuSomaticDataset(
                roots=candidate_files,
                max_load_candidates=max_load_candidates,
                transform=data_transform,
                is_test=True,
                num_threads=num_threads,
                coverage_thr=coverage_thr)
            call_loader = torch.utils.data.DataLoader(
                call_set,
                batch_size=batch_size,
                shuffle=True,
                pin_memory=True,
                num_workers=num_threads)
            current_L = 0
            candidate_files = []
            logger.info("N_dataset: {}".format(len(call_set)))
            if len(call_set) == 0:
                logger.warning(
                    "Skip {} with 0 candidates".format(candidate_file))
                continue

            final_preds_, none_preds_, true_path_ = call_variants(
                net, vartype_classes, call_loader, out_dir, model_tag,
                use_cuda)
            all_vcf_records.extend(
                pred_vcf_records(ref_file, final_preds_, true_path_, chroms,
                                 vartype_classes, num_threads))
            all_vcf_records_none.extend(
                pred_vcf_records_none(none_preds_, chroms))

    all_vcf_records = dict(all_vcf_records)
    all_vcf_records_none = dict(all_vcf_records_none)

    if os.path.exists(new_split_tsvs_dir):
        logger.warning(
            "Remove split candidates directory: {}".format(new_split_tsvs_dir))
        shutil.rmtree(new_split_tsvs_dir)

    logger.info("Prepare Output VCF")
    output_vcf = "{}/pred.vcf".format(out_dir)
    var_vcf_records = get_vcf_records(all_vcf_records)
    write_vcf(var_vcf_records, output_vcf, chroms_order, pass_threshold,
              lowqual_threshold)

    logger.info("Prepare Non-Somatics VCF")
    output_vcf_none = "{}/none.vcf".format(out_dir)
    vcf_records_none = get_vcf_records(all_vcf_records_none)
    write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold,
              lowqual_threshold)

    if os.path.exists(matrices_dir):
        logger.warning("Remove matrices directory: {}".format(matrices_dir))
        shutil.rmtree(matrices_dir)

    return output_vcf
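# Minimal usage sketch for call_neusomatic (hypothetical paths; the
# candidates_tsv files are the candidate matrices produced by the NeuSomatic
# preprocessing step, and checkpoint is a trained model file):
#
#     output_vcf = call_neusomatic(
#         candidates_tsv=["work/dataset/candidates_0.tsv"],
#         ref_file="reference.fa",
#         out_dir="work/call",
#         checkpoint="NeuSomatic.pth",
#         num_threads=4,
#         batch_size=1000,
#         max_load_candidates=1000000,
#         pass_threshold=0.7,
#         lowqual_threshold=0.4,
#         ensemble=False,
#         use_cuda=torch.cuda.is_available())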
def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv,
                 output_vcf, pass_threshold, lowqual_threshold):
    merged_vcf = pybedtools.BedTool(merged_vcf)
    candidates_vcf = pybedtools.BedTool(candidates_vcf)
    ensemble_candids_vcf = []
    if ensemble_tsv:
        # Convert the ensemble TSV into a minimal VCF carrying tumor DP/RO/AO/AF
        ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf")
        with open(ensemble_tsv) as e_f:
            with open(ensemble_candids_vcf, "w") as c_f:
                c_f.write("##fileformat=VCFv4.2\n")
                c_f.write(
                    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
                for line in e_f:
                    if "T_REF_FOR" in line:
                        header = line.strip().split()
                        chrom_id = header.index("CHROM")
                        pos_id = header.index("POS")
                        ref_id = header.index("REF")
                        alt_id = header.index("ALT")
                        dp_id = header.index("T_DP")
                        ref_fw_id = header.index("T_REF_FOR")
                        ref_rv_id = header.index("T_REF_REV")
                        alt_fw_id = header.index("T_ALT_FOR")
                        alt_rv_id = header.index("T_ALT_REV")
                        continue
                    fields = line.strip().split()
                    chrom = fields[chrom_id]
                    pos = fields[pos_id]
                    ref = fields[ref_id]
                    alt = fields[alt_id]
                    dp = int(fields[dp_id])
                    ro_fw = int(fields[ref_fw_id])
                    ro_rv = int(fields[ref_rv_id])
                    ao_fw = int(fields[alt_fw_id])
                    ao_rv = int(fields[alt_rv_id])
                    ro = ro_fw + ro_rv
                    ao = ao_fw + ao_rv
                    af = np.round(ao / float(ao + ro + 0.0001), 4)
                    c_f.write(
                        "\t".join(map(str, [chrom, pos, ".", ref, alt,
                                            ".", ".", ".", "GT:DP:RO:AO:AF",
                                            ":".join(map(str, ["0/1", dp, ro,
                                                               ao, af]))]))
                        + "\n")
        ensemble_candids_vcf = pybedtools.BedTool(ensemble_candids_vcf)

    in_candidates = merged_vcf.window(candidates_vcf, w=5)
    notin_candidates = merged_vcf.window(candidates_vcf, w=5, v=True)
    in_ensemble = merged_vcf.window(ensemble_candids_vcf, w=5)
    notin_any = notin_candidates.window(ensemble_candids_vcf, w=5, v=True)

    chroms_order = get_chromosomes_order(reference=reference)
    with pysam.FastaFile(reference) as rf:
        chroms = rf.references

    scores = {}
    tags_info = {}
    for s_e, dd in [0, in_candidates], [1, in_ensemble]:
        for x in dd:
            tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
            scores[tag] = [x[5], x[6], x[7], x[9]]
            if tag not in tags_info:
                tags_info[tag] = []
            info = x[19].split(":")
            dp, ro, ao = list(map(int, info[1:4]))
            af = float(info[4])
            is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14]
            is_same_type = np.sign(
                len(x[3]) - len(x[13])) == np.sign(len(x[4]) - len(x[14]))
            dist = abs(int(x[1]) - int(x[11]))
            len_diff = abs(
                (len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14])))
            tags_info[tag].append(
                [~is_same, ~is_same_type, dist, len_diff, s_e,
                 dp, ro, ao, af])
    fina_info_tag = {}
    for tag, hits in tags_info.items():
        hits = sorted(hits, key=lambda x: x[0:5])
        fina_info_tag[tag] = hits[0][5:]

    for x in notin_any:
        tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
        fina_info_tag[tag] = [0, 0, 0, 0]
        scores[tag] = [x[5], x[6], x[7], x[9]]

    tags = sorted(fina_info_tag.keys(),
                  key=lambda x: list(map(int, x.split("-")[0:2])))
    with open(output_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("##NeuSomatic Version={}\n".format(__version__))
        o_f.write(
            "##INFO=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n")
        o_f.write(
            "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n")
o_f.write("##FILTER=<ID=PASS,Description=\"Accept as a higher confidence somatic mutation calls with probability score value at least {}\">\n".format( pass_threshold)) o_f.write("##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation calls with probability score value at least {}\">\n".format( lowqual_threshold)) o_f.write("##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation with probability score value below {}\">\n".format( lowqual_threshold)) o_f.write( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n") o_f.write( "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n") o_f.write( "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n") o_f.write( "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n") o_f.write( "##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n") o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for tag in tags: chrom_id, pos, ref, alt = tag.split("-") qual, filter_, score, gt = scores[tag] dp, ro, ao, af = fina_info_tag[tag] info_field = "{};DP={};RO={};AO={};AF={}".format( score, dp, ro, ao, af) gt_field = "{}:{}:{}:{}:{}".format(gt, dp, ro, ao, af) o_f.write("\t".join(map(str, [chroms[int(chrom_id)], str( pos), ".", ref, alt, qual, filter_, info_field, "GT:DP:RO:AO:AF", gt_field])) + "\n")
def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file,
                     target_bed_file, num_threads):
    logger = logging.getLogger(resolve_variants.__name__)

    logger.info("-------Resolve variants (e.g. exact INDEL sequences)-------")

    variants = {}
    with open(target_vcf_file) as tv_f:
        for line in tv_f:
            if line[0] == "#":
                continue
            fields = line.strip().split()
            id_ = int(fields[2])
            if len(fields[4]) < len(fields[3]):
                vartype = "DEL"
            elif len(fields[4]) > len(fields[3]):
                vartype = "INS"
            else:
                vartype = "SNP"
            if id_ not in variants:
                variants[id_] = []
            variants[id_].append(fields + [vartype])

    target_bed = pybedtools.BedTool(target_bed_file)
    map_args = []
    for tb in target_bed:
        chrom, start, end, id_ = tb[0:4]
        id_ = int(id_)
        map_args.append([chrom, start, end, variants[id_],
                         input_bam, reference])

    pool = multiprocessing.Pool(num_threads)
    try:
        out_variants_list = pool.map_async(
            find_resolved_variants, map_args).get()
        pool.close()
    except Exception as inst:
        logger.error(inst)
        pool.close()
        traceback.print_exc()
        raise Exception

    for o in out_variants_list:
        if o is None:
            raise Exception("resolve_variants failed!")

    out_variants = [x for xs in out_variants_list for x in xs]
    chroms_order = get_chromosomes_order(bam=input_bam)

    out_variants = sorted(out_variants,
                          key=lambda x: [chroms_order[x[0]], int(x[1])])
    with open(resolved_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for chrom, pos, ref, alt, gt, phred_score in out_variants:
            if ref != alt:
                phred_score = float(phred_score)
                prob = np.round(1 - (10 ** (-phred_score / 10)), 4)
                o_f.write("\t".join([chrom, str(pos), ".", ref, alt,
                                     "{:.4f}".format(np.round(phred_score, 4)),
                                     ".", "SCORE={:.4f}".format(prob),
                                     "GT", gt]) + "\n")
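# Usage sketch for resolve_variants (hypothetical paths; target_vcf_file and
# target_bed_file carry the variants flagged for resolution, with the VCF ID
# column matching the 4th column of the BED):
#
#     resolve_variants(input_bam="tumor.bam",
#                      resolved_vcf="work/resolved.vcf",
#                      reference="reference.fa",
#                      target_vcf_file="work/targets.vcf",
#                      target_bed_file="work/targets.bed",
#                      num_threads=4)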
def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf):
    logger = logging.getLogger(resolve_scores.__name__)
    logger.info("-----Resolve Prediction Scores for Realigned Variants------")

    ra_out = pybedtools.BedTool(ra_vcf)
    ra_target = pybedtools.BedTool(target_vcf)

    final_intervals = []
    for interval in ra_out.window(ra_target, w=5, v=True):
        interval[5] = "0.5"
        final_intervals.append(interval)

    intervals_dict = {}
    for interval in ra_out.window(ra_target, w=5):
        id_ = "{}-{}-{}-{}".format(interval[0], interval[1],
                                   interval[3], interval[4])
        if id_ not in intervals_dict:
            intervals_dict[id_] = []
        intervals_dict[id_].append(interval)

    for id_, intervals in intervals_dict.items():
        if len(intervals) == 1:
            score = intervals[0][15]
            interval = intervals[0][:10]
            interval[5] = score
            interval[7] = "SCORE={:.4f}".format(
                np.round(1 - (10 ** (-float(score) / 10)), 4))
        else:
            len_ = (len(intervals[0][4]) - len(intervals[0][3]))
            pos_ = int(intervals[0][1])
            len_diff = list(
                map(lambda x: abs((len(x[14]) - len(x[13])) - len_),
                    intervals))
            min_len_diff = min(len_diff)
            intervals = list(
                filter(lambda x: abs(
                    (len(x[14]) - len(x[13])) - len_) == min_len_diff,
                    intervals))
            pos_diff = list(
                map(lambda x: abs(int(x[11]) - pos_), intervals))
            min_pos_diff = min(pos_diff)
            intervals = list(
                filter(lambda x: abs(int(x[11]) - pos_) == min_pos_diff,
                       intervals))
            score = "{:.4f}".format(
                np.round(max(map(lambda x: float(x[15]), intervals)), 4))
            interval = intervals[0][:10]
            interval[5] = score
            interval[7] = "SCORE={:.4f}".format(
                np.round(1 - (10 ** (-float(score) / 10)), 4))
        final_intervals.append(interval)

    chroms_order = get_chromosomes_order(bam=input_bam)
    out_variants = sorted(final_intervals,
                          key=lambda x: [chroms_order[x[0]], int(x[1])])

    with open(output_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for var in out_variants:
            o_f.write("\t".join(var) + "\n")
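# Usage sketch for resolve_scores (hypothetical paths; ra_vcf holds realigned
# calls whose QUAL and SCORE are re-derived here from the nearest matching call
# in target_vcf, falling back to QUAL 0.5 when nothing lies within the 5 bp
# window):
#
#     resolve_scores(input_bam="tumor.bam",
#                    ra_vcf="work/realigned.vcf",
#                    target_vcf="work/merged_preds.vcf",
#                    output_vcf="work/realigned_rescored.vcf")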