Code example #1
import fileinput
import logging

import numpy as np

# get_chromosomes_order is a project helper defined elsewhere in the repo
def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, out_vcf, pass_threshold,
                    lowqual_threshold):

    logger = logging.getLogger(merge_post_vcfs.__name__)

    logger.info("------------------------Merge vcfs-------------------------")

    chroms_order = get_chromosomes_order(reference=ref)

    good_records = []
    # Use fileinput to stream as if the two files were concatenated
    for line in fileinput.input([no_resolve_vcf, resolved_vcf]):
        if line[0] == "#":
            continue
        # use ref_ for the REF allele so the ref (reference path) parameter
        # is not shadowed
        chrom, pos, _, ref_, alt, score, _, info, format_, gt = \
            line.strip().split()
        good_records.append([chrom, pos, ref_, alt, gt, score])

    with open(out_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in sorted(good_records,
                             key=lambda x: [chroms_order[x[0]], int(x[1])]):
            chrom, pos, ref, alt, gt, score = record
            prob = np.round(1 - (10**(-float(score) / 10)), 4)
            filter_ = "REJECT"
            if prob >= pass_threshold:
                filter_ = "PASS"
            elif prob >= lowqual_threshold:
                filter_ = "LowQual"
            o_f.write("\t".join([
                chrom, pos, ".", ref, alt, "{:.4f}".format(float(score)),
                filter_, "SCORE={:.4f}".format(prob), "GT", "0/1"
            ]) + "\n")
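The FILTER assignment above converts the Phred-scaled QUAL score to a probability via prob = 1 - 10^(-score/10). A minimal standalone sketch of that conversion; the threshold defaults here are illustrative, not taken from the snippet:

import numpy as np

def score_to_filter(score, pass_threshold=0.7, lowqual_threshold=0.4):
    # Phred-scaled score -> probability of being a true call
    prob = np.round(1 - (10 ** (-float(score) / 10)), 4)
    if prob >= pass_threshold:
        return prob, "PASS"
    elif prob >= lowqual_threshold:
        return prob, "LowQual"
    return prob, "REJECT"

print(score_to_filter(20))  # (0.99, 'PASS'): Phred 20 means a 1% error rate
print(score_to_filter(1))   # (0.2057, 'REJECT')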
Code example #2
def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, target_vcf, out_vcf,
                    pass_threshold, lowqual_threshold):

    logger = logging.getLogger(merge_post_vcfs.__name__)

    logger.info("-----------------------------------------------------------")
    logger.info("Merge vcfs")
    logger.info("-----------------------------------------------------------")

    chroms_order = get_chromosomes_order(reference=ref)

    good_records = []
    # Use fileinput to stream as if the two files were concatenated
    for line in fileinput.input([no_resolve_vcf, resolved_vcf]):
        if line[0] == "#":
            continue
        # use ref_ for the REF allele so the ref (reference path) parameter
        # is not shadowed
        chrom, pos, _, ref_, alt, score, _, info, format_, gt = \
            line.strip().split()
        good_records.append([chrom, pos, ref_, alt, gt, score])

    with open(out_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("##NeuSomatic Version={}\n".format(__version__))
        o_f.write(
            "##INFO=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n"
        )
        o_f.write(
            "##FILTER=<ID=PASS,Description=\"Accept as a higher confidence somatic mutation calls with probability score value at least {}\">\n"
            .format(pass_threshold))
        o_f.write(
            "##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation calls with probability score value at least {}\">\n"
            .format(lowqual_threshold))
        o_f.write(
            "##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation with probability score value below {}\">\n"
            .format(lowqual_threshold))
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for record in sorted(good_records,
                             key=lambda x: [chroms_order[x[0]], int(x[1])]):
            chrom, pos, ref, alt, gt, score = record
            prob = np.round(1 - (10**(-float(score) / 10)), 4)
            filter_ = "REJECT"
            if prob >= pass_threshold:
                filter_ = "PASS"
            elif prob >= lowqual_threshold:
                filter_ = "LowQual"
            o_f.write("\t".join([
                chrom, pos, ".", ref, alt, "{:.4f}".format(float(score)),
                filter_, "SCORE={:.4f}".format(prob), "GT", "0/1"
            ]) + "\n")
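A hypothetical invocation of the variant above; all paths are placeholders and the two thresholds are probabilities in [0, 1]:

merge_post_vcfs(ref="GRCh38.fa",                  # placeholder paths
                resolved_vcf="work/resolved.vcf",
                no_resolve_vcf="work/no_resolve.vcf",
                target_vcf="work/target.vcf",
                out_vcf="work/merged.vcf",
                pass_threshold=0.7,
                lowqual_threshold=0.4)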
Code example #3
def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
                    batch_size, max_load_candidates, pass_threshold,
                    lowqual_threshold, ensemble, use_cuda):
    logger = logging.getLogger(call_neusomatic.__name__)

    logger.info("-----------------Call Somatic Mutations--------------------")

    logger.info("PyTorch Version: {}".format(torch.__version__))
    logger.info("Torchvision Version: {}".format(torchvision.__version__))
    if not use_cuda:
        torch.set_num_threads(num_threads)

    chroms_order = get_chromosomes_order(reference=ref_file)
    with pysam.FastaFile(ref_file) as rf:
        chroms = rf.references

    vartype_classes = ['DEL', 'INS', 'NONE', 'SNP']
    data_transform = transforms.Compose(
        [transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    num_channels = 119 if ensemble else 26
    net = NeuSomaticNet(num_channels)
    if use_cuda:
        logger.info("GPU calling!")
        net.cuda()
    else:
        logger.info("CPU calling!")

    if use_cuda and torch.cuda.device_count() > 1:
        logger.info("We use {} GPUs!".format(torch.cuda.device_count()))
        net = nn.DataParallel(net)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    logger.info("Load pretrained model from checkpoint {}".format(checkpoint))
    pretrained_dict = torch.load(checkpoint,
                                 map_location=lambda storage, loc: storage)
    pretrained_state_dict = pretrained_dict["state_dict"]
    model_tag = pretrained_dict["tag"]
    logger.info("tag: {}".format(model_tag))

    matrices_dir = "{}/matrices_{}".format(out_dir, model_tag)
    if os.path.exists(matrices_dir):
        logger.warning("Remove matrices directory: {}".format(matrices_dir))
        shutil.rmtree(matrices_dir)
    os.mkdir(matrices_dir)
    coverage_thr = pretrained_dict["coverage_thr"]

    model_dict = net.state_dict()

    # 1. align "module." prefixes and filter out unnecessary keys
    if "module." in list(
            pretrained_state_dict.keys())[0] and "module." not in list(
                model_dict.keys())[0]:
        pretrained_state_dict = {
            k.split("module.")[1]: v
            for k, v in pretrained_state_dict.items()
            if k.split("module.")[1] in model_dict
        }
    elif "module." not in list(
            pretrained_state_dict.keys())[0] and "module." in list(
                model_dict.keys())[0]:
        pretrained_state_dict = {("module." + k): v
                                 for k, v in pretrained_state_dict.items()
                                 if ("module." + k) in model_dict}
    else:
        pretrained_state_dict = {
            k: v
            for k, v in pretrained_state_dict.items() if k in model_dict
        }

    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_state_dict)
    # 3. load the new state dict
    net.load_state_dict(model_dict)

    new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs")
    if os.path.exists(new_split_tsvs_dir):
        logger.warning(
            "Remove split candidates directory: {}".format(new_split_tsvs_dir))
        shutil.rmtree(new_split_tsvs_dir)
    os.mkdir(new_split_tsvs_dir)
    Ls = []
    candidates_tsv_ = []
    split_i = 0
    for candidate_file in candidates_tsv:
        idx = pickle.load(open(candidate_file + ".idx", "rb"))
        if len(idx) > max_load_candidates / 2:
            logger.info("Splitting {} of lenght {}".format(
                candidate_file, len(idx)))
            new_split_tsvs_dir_i = os.path.join(new_split_tsvs_dir,
                                                "split_{}".format(split_i))
            if os.path.exists(new_split_tsvs_dir_i):
                logger.warning("Remove split candidates directory: {}".format(
                    new_split_tsvs_dir_i))
                shutil.rmtree(new_split_tsvs_dir_i)
            os.mkdir(new_split_tsvs_dir_i)
            candidate_file_splits = merge_tsvs(input_tsvs=[candidate_file],
                                               out=new_split_tsvs_dir_i,
                                               candidates_per_tsv=max(
                                                   1, max_load_candidates // 2),
                                               max_num_tsvs=100000,
                                               overwrite_merged_tsvs=True,
                                               keep_none_types=True)
            for candidate_file_split in candidate_file_splits:
                idx_split = pickle.load(
                    open(candidate_file_split + ".idx", "rb"))
                candidates_tsv_.append(candidate_file_split)
                Ls.append(len(idx_split) - 1)
            split_i += 1
        else:
            candidates_tsv_.append(candidate_file)
            Ls.append(len(idx) - 1)

    current_L = 0
    candidate_files = []
    all_vcf_records = []
    all_vcf_records_none = []
    for i, (candidate_file, L) in enumerate(
            sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])):
        current_L += L
        candidate_files.append(candidate_file)
        if current_L > max_load_candidates / 10 or i == len(
                candidates_tsv_) - 1:
            logger.info("Run for candidate files: {}".format(candidate_files))
            call_set = NeuSomaticDataset(
                roots=candidate_files,
                max_load_candidates=max_load_candidates,
                transform=data_transform,
                is_test=True,
                num_threads=num_threads,
                coverage_thr=coverage_thr)
            call_loader = torch.utils.data.DataLoader(call_set,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      pin_memory=True,
                                                      num_workers=num_threads)

            current_L = 0
            candidate_files = []

            logger.info("N_dataset: {}".format(len(call_set)))
            if len(call_set) == 0:
                logger.warning(
                    "Skip {} with 0 candidates".format(candidate_file))
                continue

            final_preds_, none_preds_, true_path_ = call_variants(
                net, vartype_classes, call_loader, out_dir, model_tag,
                use_cuda)
            all_vcf_records.extend(
                pred_vcf_records(ref_file, final_preds_, true_path_, chroms,
                                 vartype_classes, num_threads))
            all_vcf_records_none.extend(
                pred_vcf_records_none(none_preds_, chroms))

    all_vcf_records = dict(all_vcf_records)
    all_vcf_records_none = dict(all_vcf_records_none)

    if os.path.exists(new_split_tsvs_dir):
        logger.warning(
            "Remove split candidates directory: {}".format(new_split_tsvs_dir))
        shutil.rmtree(new_split_tsvs_dir)

    logger.info("Prepare Output VCF")
    output_vcf = "{}/pred.vcf".format(out_dir)
    var_vcf_records = get_vcf_records(all_vcf_records)
    write_vcf(var_vcf_records, output_vcf, chroms_order, pass_threshold,
              lowqual_threshold)

    logger.info("Prepare Non-Somatics VCF")
    output_vcf_none = "{}/none.vcf".format(out_dir)
    vcf_records_none = get_vcf_records(all_vcf_records_none)
    write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold,
              lowqual_threshold)

    if os.path.exists(matrices_dir):
        logger.warning("Remove matrices directory: {}".format(matrices_dir))
        shutil.rmtree(matrices_dir)
    return output_vcf
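The checkpoint-loading block in this function reconciles key names between checkpoints saved from an nn.DataParallel-wrapped model (keys prefixed with "module.") and a plain model. A self-contained sketch of the same normalization, independent of NeuSomatic:

import torch.nn as nn

def normalize_state_dict(pretrained, model_dict):
    # Strip or add the "module." prefix so checkpoint keys match the
    # target model, then keep only keys the model actually has.
    pre_key = next(iter(pretrained))
    mod_key = next(iter(model_dict))
    if pre_key.startswith("module.") and not mod_key.startswith("module."):
        pretrained = {k[len("module."):]: v for k, v in pretrained.items()}
    elif not pre_key.startswith("module.") and mod_key.startswith("module."):
        pretrained = {"module." + k: v for k, v in pretrained.items()}
    return {k: v for k, v in pretrained.items() if k in model_dict}

net = nn.Linear(4, 2)
wrapped = nn.DataParallel(nn.Linear(4, 2))  # keys become "module.weight", ...
net.load_state_dict(normalize_state_dict(wrapped.state_dict(),
                                         net.state_dict()))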
Code example #4
File: postprocess.py Project: zorrodong/neusomatic
def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv,
                 output_vcf, pass_threshold, lowqual_threshold):
    merged_vcf = pybedtools.BedTool(merged_vcf)
    candidates_vcf = pybedtools.BedTool(candidates_vcf)
    ensemble_candids_vcf = []
    if ensemble_tsv:
        ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf")
        with open(ensemble_tsv) as e_f:
            with open(ensemble_candids_vcf, "w") as c_f:
                c_f.write("##fileformat=VCFv4.2\n")
                c_f.write(
                    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
                for line in e_f:
                    if "T_REF_FOR" in line:
                        header = line.strip().split()
                        chrom_id = header.index("CHROM")
                        pos_id = header.index("POS")
                        ref_id = header.index("REF")
                        alt_id = header.index("ALT")
                        dp_id = header.index("T_DP")
                        ref_fw_id = header.index("T_REF_FOR")
                        ref_rv_id = header.index("T_REF_REV")
                        alt_fw_id = header.index("T_ALT_FOR")
                        alt_rv_id = header.index("T_ALT_REV")
                        continue
                    fields = line.strip().split()
                    chrom = fields[chrom_id]
                    pos = fields[pos_id]
                    ref = fields[ref_id]
                    alt = fields[alt_id]
                    dp = int(fields[dp_id])
                    ro_fw = int(fields[ref_fw_id])
                    ro_rv = int(fields[ref_rv_id])
                    ao_fw = int(fields[alt_fw_id])
                    ao_rv = int(fields[alt_rv_id])
                    ro = ro_fw + ro_rv
                    ao = ao_fw + ao_rv
                    af = np.round(ao / float(ao + ro + 0.0001), 4)
                    c_f.write("\t".join(map(str, [
                        chrom, pos, ".", ref, alt, ".", ".", ".",
                        "GT:DP:RO:AO:AF",
                        ":".join(map(str, ["0/1", dp, ro, ao, af]))
                    ])) + "\n")

    ensemble_candids_vcf = pybedtools.BedTool(ensemble_candids_vcf)
    in_candidates = merged_vcf.window(candidates_vcf, w=5)
    notin_candidates = merged_vcf.window(candidates_vcf, w=5, v=True)
    in_ensemble = merged_vcf.window(ensemble_candids_vcf, w=5)
    notin_any = notin_candidates.window(ensemble_candids_vcf, w=5, v=True)
    chroms_order = get_chromosomes_order(reference=reference)
    with pysam.FastaFile(reference) as rf:
        chroms = rf.references

    scores = {}
    tags_info = {}
    for s_e, dd in [(0, in_candidates), (1, in_ensemble)]:
        for x in dd:
            tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
            scores[tag] = [x[5], x[6], x[7], x[9]]
            if tag not in tags_info:
                tags_info[tag] = []
            info = x[19].split(":")
            dp, ro, ao = list(map(int, info[1:4]))
            af = float(info[4])
            is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14]
            is_same_type = np.sign(
                len(x[3]) - len(x[13])) == np.sign(len(x[4]) - len(x[14]))
            dist = abs(int(x[1]) - int(x[11]))
            len_diff = abs((len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14])))
            tags_info[tag].append(
                [not is_same, not is_same_type, dist, len_diff, s_e,
                 dp, ro, ao, af])
    final_info_tag = {}
    for tag, hits in tags_info.items():
        hits = sorted(hits, key=lambda x: x[0:5])
        final_info_tag[tag] = hits[0][5:]

    for x in notin_any:
        tag = "-".join([str(chroms_order[x[0]]), x[1], x[3], x[4]])
        final_info_tag[tag] = [0, 0, 0, 0]
        scores[tag] = [x[5], x[6], x[7], x[9]]

    tags = sorted(final_info_tag.keys(),
                  key=lambda x: list(map(int, x.split("-")[0:2])))
    with open(output_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("##NeuSomatic Version={}\n".format(__version__))
        o_f.write(
            "##INFO=<ID=SCORE,Number=1,Type=Float,Description=\"Prediction probability score\">\n")
        o_f.write(
            "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n")
        o_f.write(
            "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n")
        o_f.write("##FILTER=<ID=PASS,Description=\"Accept as a higher confidence somatic mutation calls with probability score value at least {}\">\n".format(
            pass_threshold))
        o_f.write("##FILTER=<ID=LowQual,Description=\"Less confident somatic mutation calls with probability score value at least {}\">\n".format(
            lowqual_threshold))
        o_f.write("##FILTER=<ID=REJECT,Description=\"Rejected as a confident somatic mutation with probability score value below {}\">\n".format(
            lowqual_threshold))
        o_f.write(
            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
        o_f.write(
            "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth in the tumor\">\n")
        o_f.write(
            "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count in the tumor\">\n")
        o_f.write(
            "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count in the tumor\">\n")
        o_f.write(
            "##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele fractions of alternate alleles in the tumor\">\n")
        o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for tag in tags:
            chrom_id, pos, ref, alt = tag.split("-")
            qual, filter_, score, gt = scores[tag]
            dp, ro, ao, af = final_info_tag[tag]
            info_field = "{};DP={};RO={};AO={};AF={}".format(
                score, dp, ro, ao, af)
            gt_field = "{}:{}:{}:{}:{}".format(gt, dp, ro, ao, af)
            o_f.write("\t".join(map(str, [
                chroms[int(chrom_id)], pos, ".", ref, alt, qual, filter_,
                info_field, "GT:DP:RO:AO:AF", gt_field
            ])) + "\n")
Code example #5
File: resolve_variants.py Project: sll513/neusomatic
def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file,
                     target_bed_file, num_threads):
    logger = logging.getLogger(resolve_variants.__name__)

    logger.info("-------Resolve variants (e.g. exact INDEL sequences)-------")

    variants = {}
    with open(target_vcf_file) as tv_f:
        for line in tv_f:
            if line[0] == "#":
                continue
            fields = line.strip().split()
            id_ = int(fields[2])
            if len(fields[4]) < len(fields[3]):
                vartype = "DEL"
            elif len(fields[4]) > len(fields[3]):
                vartype = "INS"
            else:
                vartype = "SNP"
            if id_ not in variants:
                variants[id_] = []
            variants[id_].append(fields + [vartype])
    target_bed = pybedtools.BedTool(target_bed_file)
    map_args = []
    for tb in target_bed:
        chrom, start, end, id_ = tb[0:4]
        id_ = int(id_)
        map_args.append([chrom, start, end, variants[id_],
                         input_bam, reference])

    pool = multiprocessing.Pool(num_threads)
    try:
        out_variants_list = pool.map_async(
            find_resolved_variants, map_args).get()
        pool.close()
    except Exception as inst:
        logger.error(inst)
        pool.close()
        traceback.print_exc()
        raise

    for o in out_variants_list:
        if o is None:
            raise Exception("resolve_variants failed!")

    out_variants = [x for xs in out_variants_list for x in xs]
    chroms_order = get_chromosomes_order(bam=input_bam)

    out_variants = sorted(out_variants, key=lambda x: [
                          chroms_order[x[0]], int(x[1])])
    with open(resolved_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for chrom, pos, ref, alt, gt, phred_score in out_variants:
            if ref != alt:
                phred_score = float(phred_score)
                prob = np.round(1 - (10**(-phred_score / 10)), 4)
                o_f.write("\t".join([chrom, str(pos), ".", ref,
                                     alt, "{:.4f}".format(
                                         np.round(phred_score, 4)),
                                     ".", "SCORE={:.4f}".format(prob), "GT", gt]) + "\n")
Code example #6
def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf):
    logger = logging.getLogger(resolve_scores.__name__)

    logger.info("-----Resolve Prediction Scores for Realigned Variants------")

    ra_out = pybedtools.BedTool(ra_vcf)
    ra_target = pybedtools.BedTool(target_vcf)

    final_intervals = []
    # Realigned calls with no target within 5 bp get a neutral QUAL of 0.5
    for interval in ra_out.window(ra_target, w=5, v=True):
        interval[5] = "0.5"
        final_intervals.append(interval)

    intervals_dict = {}
    for interval in ra_out.window(ra_target, w=5):
        id_ = "{}-{}-{}-{}".format(interval[0], interval[1], interval[3],
                                   interval[4])
        if id_ not in intervals_dict:
            intervals_dict[id_] = []
        intervals_dict[id_].append(interval)

    for id_, intervals in intervals_dict.items():
        if len(intervals) == 1:
            score = intervals[0][15]
            interval = intervals[0][:10]
            interval[5] = score
            interval[7] = "SCORE={:.4f}".format(
                np.round(1 - (10**(-float(score) / 10)), 4))
        else:
            # Multiple nearby targets: prefer the closest indel-length
            # change, then the closest position, then the highest score.
            len_ = len(intervals[0][4]) - len(intervals[0][3])
            pos_ = int(intervals[0][1])
            len_diff = list(
                map(lambda x: abs((len(x[14]) - len(x[13])) - len_),
                    intervals))
            min_len_diff = min(len_diff)
            intervals = list(
                filter(
                    lambda x: abs(
                        (len(x[14]) - len(x[13])) - len_) == min_len_diff,
                    intervals))
            pos_diff = list(map(lambda x: abs(int(x[11]) - pos_), intervals))
            min_pos_diff = min(pos_diff)
            intervals = list(
                filter(lambda x: abs(int(x[11]) - pos_) == min_pos_diff,
                       intervals))
            score = "{:.4f}".format(
                np.round(max(map(lambda x: float(x[15]), intervals)), 4))
            interval = intervals[0][:10]
            interval[5] = score
            interval[7] = "SCORE={:.4f}".format(
                np.round(1 - (10**(-float(score) / 10)), 4))
        final_intervals.append(interval)

    chroms_order = get_chromosomes_order(bam=input_bam)

    out_variants = sorted(
        final_intervals,
        key=lambda x: [chroms_order[x[0]], int(x[1])])
    with open(output_vcf, "w") as o_f:
        o_f.write("##fileformat=VCFv4.2\n")
        o_f.write(
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
        for var in out_variants:
            o_f.write("\t".join(var) + "\n")