Python VariantFile.new_record Beispiele

Programmiersprache: Python

Namespace / Paketname: pysam

Klasse / Typ: VariantFile

Methode / Funktion: new_record

Beispiele auf hotexamples.com: 12

Python VariantFile.new_record – 12 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für pysam.VariantFile.new_record, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

VariantFile(30)

close(30)

fetch(30)

write(30)

new_record(12)

subset_samples(4)

readline(2)

seek(2)

add_line(1)

items(1)

Beispiel #1

Datei anzeigen

Datei: VcfWriter.py Projekt: mfallahi/pepper

class VCFWriter:
    """Write called variants to a VCF file through a ``VariantFile``."""

    def __init__(self, reference_file_path, contigs, sample_name, output_dir,
                 filename):
        """Create the output VCF at ``output_dir + filename + '.vcf'``.

        :param reference_file_path: Path handed to ``PEPPER_HP.FASTA_handler``.
        :param contigs: Contig names to declare in the header.
        :param sample_name: Sample name added to the header.
        :param output_dir: Prefix of the output path (concatenated as-is).
        :param filename: Output file name without the ``.vcf`` suffix.
        """
        self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
        self.contigs = contigs
        header = self.get_vcf_header(sample_name, contigs)

        output_path = output_dir + filename + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_records(self, called_variant):
        """Write one record for a single called variant.

        :param called_variant: Six-element sequence
            ``(contig, ref_start, ref_end, ref_seq, alleles, genotype)``.
        """
        contig, ref_start, ref_end, ref_seq, alt_alleles, genotype = \
            called_variant
        # The reference sequence always comes first in the allele tuple.
        all_alleles = (ref_seq,) + tuple(alt_alleles)

        record_fields = dict(contig=str(contig),
                             start=ref_start,
                             stop=ref_end,
                             id='.',
                             qual=60,
                             filter='PASS',
                             alleles=all_alleles,
                             GT=genotype,
                             GQ=60)
        self.vcf_file.write(self.vcf_file.new_record(**record_fields))

    def get_vcf_header(self, sample_name, contigs):
        """Build the VCF header: FILTER and FORMAT lines, contigs, sample.

        Only reference sequences whose name is in ``contigs`` are declared.

        :param sample_name: Sample name added to the header.
        :param contigs: Collection of contig names to keep.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id),
                                   ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type),
                                   ('Description', description)])

        for name in self.fasta_handler.get_chromosome_names():
            if name in contigs:
                length = self.fasta_handler.get_chromosome_sequence_length(
                    name)
                header.contigs.add(name, length=length)

        header.add_sample(sample_name)

        return header

Beispiel #2

Datei anzeigen

Datei: VcfWriter.py Projekt: kishwarshafin/jarvis

class VCFWriter:
    """Write called variants to a timestamped VCF named after a BAM file."""

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``output_dir + <bam name> + '_' + <timestamp> + '.vcf'``.

        :param bam_file_path: Path handed to ``BamHandler``; its last path
            component up to the first dot names the output file.
        :param sample_name: Sample name added to the header.
        :param output_dir: Prefix of the output path (concatenated as-is).
        """
        self.bam_handler = BamHandler(bam_file_path)
        base_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        header = self.get_vcf_header(sample_name)
        timestamp = time.strftime("%m%d%Y_%H%M%S")

        output_path = output_dir + base_name + '_' + timestamp + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_records(self, called_variants):
        """Write one record per variant with fixed QUAL/GQ of 60 and PASS.

        :param called_variants: Iterable of objects exposing ``ref``,
            ``alternate_alleles``, ``chromosome_name``, ``pos_start``,
            ``pos_end`` and ``genotype``.
        """
        for variant in called_variants:
            # The reference allele always comes first in the allele tuple.
            all_alleles = (variant.ref,) + tuple(variant.alternate_alleles)
            record_fields = dict(contig=str(variant.chromosome_name),
                                 start=variant.pos_start,
                                 stop=variant.pos_end,
                                 id='.',
                                 qual=60,
                                 filter='PASS',
                                 alleles=all_alleles,
                                 GT=variant.genotype,
                                 GQ=60)
            self.vcf_file.write(self.vcf_file.new_record(**record_fields))

    def get_vcf_header(self, sample_name):
        """Build the VCF header: FILTER and FORMAT lines, contigs, sample.

        Contig lines are taken from the ``SN``/``LN`` fields of the entries
        returned by the BAM handler's ``get_header_sq()``.

        :param sample_name: Sample name added to the header.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id),
                                   ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type),
                                   ('Description', description)])

        for sq in self.bam_handler.get_header_sq():
            contig_name = sq['SN']
            contig_length = sq['LN']
            header.add_meta(key='contig',
                            items=[('ID', contig_name),
                                   ('length', contig_length)])

        header.add_sample(sample_name)

        return header

Beispiel #3

Datei anzeigen

def create_sample_format_from_info_lofreq(sample,
                                          input_name,
                                          output_name,
                                          skip_gt=False):
    """Copy a VCF while adding one sample whose FORMAT values come from INFO.

    The FORMAT fields ``AF``, ``AD``, ``DP``, ``DP4`` and ``GT`` are declared
    on the input header, ``sample`` is added to it, and every input record is
    re-emitted with a sample column filled from the record's INFO fields.

    Both files are opened as context managers so the output is flushed and
    closed even if a record fails to convert.

    :param sample: Name of the sample column to add.
    :param input_name: Path of the VCF to read.
    :param output_name: Path of the VCF to write.
    :param skip_gt: Accepted for interface compatibility; not used by this
        function.
    """
    with VariantFile(input_name, 'r') as input_vcf:
        header = input_vcf.header
        header.formats.add("AF",
                           number=1,
                           type='Float',
                           description="Allele Frequency")
        header.formats.add(
            "AD",
            number=".",
            type='String',
            description=
            "Allelic sample depths for the ref and alt alleles in the order listed"
        )
        header.formats.add(
            "DP",
            number=1,
            type='Integer',
            description=
            "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
        )
        header.formats.add(
            "DP4",
            number=4,
            type='Integer',
            description=
            "Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"
        )
        header.formats.add("GT",
                           number=".",
                           type="String",
                           description="Genotype")

        header.add_sample(sample)

        with VariantFile(output_name, 'w', header=header) as output_vcf:
            for record in input_vcf:
                ad = record.info["AD"]
                af = record.info["AF"]
                dp = record.info["DP"]
                # NOTE(review): GT is filled with the allele strings
                # (first ALT, then REF) rather than allele indices -
                # confirm this is what downstream consumers expect.
                fields = {
                    "AF": af,
                    "DP4": record.info["DP4"],
                    "DP": dp,
                    "AD": ad,
                    "GT": (record.alleles[1], record.alleles[0])
                }
                new_record = output_vcf.new_record(record.chrom, record.start,
                                                   record.stop, record.alleles,
                                                   record.id, record.qual,
                                                   record.filter, record.info,
                                                   [fields])
                output_vcf.write(new_record)

Beispiel #4

Datei anzeigen

def run_process(opts, mutect2_vcf, pindel_vcf):
    """Merge Pindel records into a MuTect2 VCF stream and write the result.

    INFO definitions that exist only in the Pindel header are added to the
    MuTect2 header, which becomes the output header. Pindel records are then
    interleaved with the MuTect2 records: before each MuTect2 record is
    written, pending Pindel records that precede it are written, and a Pindel
    record with the same chrom/pos/alts is dropped as a duplicate.

    :param opts: Options object; ``opts.output`` is the output path, or a
        falsy value to write to standard output (``'-'``).
    :param mutect2_vcf: Path of the MuTect2 VCF.
    :param pindel_vcf: Path of the Pindel VCF.
    """
    outputvcf = opts.output

    # Open VCF
    mutect2 = VariantFile(mutect2_vcf)
    pindel = VariantFile(pindel_vcf)

    # Add pindel header to new header
    new_header = mutect2.header
    new_header_keys = set(new_header.info.keys())
    for info_meta in pindel.header.info.values():
        if info_meta.name not in new_header_keys:
            new_header.info.add(info_meta.name, info_meta.number,
                                info_meta.type, info_meta.description)

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w',
                          header=new_header)

    # Re-create every Pindel record against the output header, deriving the
    # sample AF and the INFO DP from the first two AD values.
    pindel_record_list = list()
    for p in pindel.fetch():
        tmp = vcf_out.new_record()
        tmp.chrom = p.chrom
        tmp.pos = p.pos
        tmp.ref = p.ref
        tmp.alts = p.alts
        for key in p.info.keys():
            tmp.info[key] = p.info[key]
        for key in p.format.keys():
            tmp.samples[0][key] = p.samples[0][key]
        ref_depth = tmp.samples[0]["AD"][0]
        alt_depth = tmp.samples[0]["AD"][1]
        tmp.samples[0]["AF"] = float(alt_depth) / float(ref_depth + alt_depth)
        tmp.info["DP"] = ref_depth + alt_depth
        pindel_record_list.append(tmp)

    for record in mutect2.fetch():
        chrom = record.chrom
        pos = record.pos
        alts = record.alts

        # Build the list of still-pending Pindel records instead of deleting
        # from the list while enumerating it: in-place deletion shifts the
        # remaining items and makes the loop skip the record that follows
        # every deletion.
        still_pending = []
        for i, record2 in enumerate(pindel_record_list):
            # NOTE(review): int() fails for non-numeric contigs such as
            # "chrX" - confirm the inputs only contain numbered chromosomes.
            oldchrom = int(record2.chrom.replace("chr", ""))
            if record2.chrom == chrom and record2.pos == pos and record2.alts == alts:
                # Same call as the MuTect2 record: drop the Pindel copy.
                continue
            elif record2.chrom == chrom and record2.pos > pos:
                # This record and everything after it stay pending.
                still_pending.extend(pindel_record_list[i:])
                break
            elif record2.chrom == chrom and record2.pos < pos:
                vcf_out.write(record2)
            elif oldchrom < int(chrom.replace("chr", "")):
                vcf_out.write(record2)
            else:
                still_pending.append(record2)
        pindel_record_list = still_pending

        vcf_out.write(record)

    # NOTE(review): Pindel records still pending here (located after the last
    # MuTect2 record) are not written, as in the original - confirm intent.
    vcf_out.close()
    pindel.close()
    mutect2.close()

Beispiel #5

Datei anzeigen

Datei: dtoxog_maf_to_vcf.py Projekt: NCI-GDC/variant-filtration-tool

def build_new_record(maf: Dict[str, str], vcf: VariantFile,
                     tag: str) -> VariantRecord:
    """
    Create a minimal VCF record from one MAF row.

    The record starts at ``Start_position - 1`` and spans the length of the
    reference allele.

    :param maf: The MAF record as a dictionary.
    :param vcf: The VariantFile object used to create the record.
    :param tag: The FILTER tag to use.
    :return: The newly created record.
    """
    ref_allele = maf["Reference_Allele"]
    alt_allele = maf["Tumor_Seq_Allele1"]
    contig = str(maf["Chromosome"])
    start = int(maf["Start_position"]) - 1
    stop = len(maf["Reference_Allele"]) + int(maf["Start_position"]) - 1
    return vcf.new_record(
        contig=contig,
        start=start,
        stop=stop,
        filter=(tag, ),
        alleles=(ref_allele, alt_allele),
    )

Beispiel #6

Datei anzeigen

Datei: generate_telomere_edits.py Projekt: kishwarshafin/T2T_polishing_scripts

def telomere_pruning(telomere_depth_bed, telomere_annotation, fasta,
                     small_variant_vcf, output_vcf, min_depth, min_gq,
                     min_vaf):
    """
    Find telomere ends to delete and select small variants inside telomeres.

    Deletion records for under-covered telomere ends and the small variants
    that are kept are collected, sorted by (contig, start, stop) and written
    to ``output_vcf``. Every edited interval is also written to
    ``CHM13_v1_telomere_edit_regions.bed`` in the working directory.

    :param telomere_depth_bed: Tab-separated file with contig, position and
        depth per line; positions are shifted by -1 when loaded.
    :param telomere_annotation: Tab-separated file with contig, start, end
        and contig length per line.
    :param fasta: Path of the assembly FASTA used to fetch reference alleles.
    :param small_variant_vcf: Path of the small-variant VCF; its header is
        reused for the output file.
    :param output_vcf: Path of the VCF to write.
    :param min_depth: Depth at which scanning inward from a contig end stops.
    :param min_gq: Minimum record QUAL for alleles that have no effect on the
        canonical k-mer.
    :param min_vaf: Minimum VAF for alleles that have no effect on the
        canonical k-mer.
    :return: None
    """
    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: READING DEPTH BED FILE " + "\n")
    sys.stderr.flush()

    assembly_fasta_file = FastaFile(fasta)

    # dictionary to keep track of depth at each position of the telomere
    position_wise_depth = {}

    # outputs regions that are going to be edited
    telomere_edit_regions = open("CHM13_v1_telomere_edit_regions.bed", "w")

    small_variant_vcf_file = VariantFile(small_variant_vcf)
    output_vcf_file = VariantFile(output_vcf,
                                  'w',
                                  header=small_variant_vcf_file.header)

    # populate the position dictionary
    with open(telomere_depth_bed, "r") as depth_bed_file:
        for bed_record in depth_bed_file:
            contig, position, depth = bed_record.rstrip().split("\t")
            # the bedfile has an offset of 1
            position_wise_depth[(contig, int(position) - 1)] = int(depth)

    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: DEPTH BED LOADED. " + "\n")
    sys.stderr.flush()

    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: READING ANNOTATION BED. " + "\n")
    sys.stderr.flush()

    # The factory must be ``list`` itself so that contigs missing from the
    # annotation yield an empty list; ``lambda: list`` would hand back the
    # list *type* and break the region scans further down.
    regions_of_deletion = defaultdict(list)
    telomere_regions = defaultdict(list)
    contig_length_dict = {}

    with open(telomere_annotation, "r") as telomere_annotation_bed_file:
        annotation_records = telomere_annotation_bed_file.readlines()

    all_vcf_records = []

    for bed_record in annotation_records:
        contig, start_pos, end_pos, contig_length = bed_record.rstrip().split(
            "\t")
        contig_length_dict[contig] = contig_length

        start_pos = int(start_pos)
        end_pos = int(end_pos)
        contig_length = int(contig_length)

        telomere_regions[contig].append((start_pos, end_pos))

        if start_pos == 0:
            # this is the left side of the telomere, so scan left to right
            if (contig, 0) in position_wise_depth and position_wise_depth[
                    (contig, 0)] >= min_depth:
                # it has full coverage, so simply do nothing.
                continue
            # otherwise scan to the point we hit min_depth
            record_start_pos = 0
            current_position = 1

            while True:
                current_depth = position_wise_depth.get(
                    (contig, current_position), 0)

                # ``>=`` (not ``==``) so an interval that ends before the
                # first scanned position cannot loop forever.
                if current_depth >= min_depth or current_position >= end_pos:
                    break
                current_position += 1
            record_end_position = current_position
            length_of_record = record_end_position - record_start_pos + 1
            # pad the reference allele by one place
            reference_allele = assembly_fasta_file.fetch(
                reference=contig,
                start=record_start_pos,
                end=record_end_position + 1)
            # alternate allele is the last base of the reference allele
            alternate_allele = reference_allele[-1]

            sys.stderr.write("[" +
                             datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                             "] INFO: PRUNING: " + contig + " " +
                             str(record_start_pos) + " " +
                             str(record_end_position) + " " +
                             str(length_of_record) + "\n")
            sys.stderr.flush()

            regions_of_deletion[contig].append(
                (record_start_pos, record_end_position))
            telomere_edit_regions.write(contig + "\t" + str(record_start_pos) +
                                        "\t" + str(record_end_position) + "\n")

            # write this deletion to the VCF file
            alleles = [reference_allele, alternate_allele]
            vcf_record = output_vcf_file.new_record(contig=contig,
                                                    start=record_start_pos,
                                                    stop=record_end_position +
                                                    1,
                                                    id='.',
                                                    qual=60,
                                                    filter='PASS',
                                                    alleles=alleles,
                                                    GT=[1, 1],
                                                    GQ=60,
                                                    VAF=[1.0])
            all_vcf_records.append((vcf_record.contig, vcf_record.start,
                                    vcf_record.stop, vcf_record))
        elif end_pos == contig_length:
            # this is the right side of the telomere, so scan right to left
            if (contig, end_pos) in position_wise_depth and \
                    position_wise_depth[(contig, end_pos)] >= min_depth:
                # it has full coverage, so simply do nothing.
                continue

            record_end_position = end_pos
            current_position = end_pos - 1
            while True:
                current_depth = position_wise_depth.get(
                    (contig, current_position), 0)

                # ``<=`` (not ``==``) so an empty interval cannot loop
                # forever.
                if current_depth >= min_depth or current_position <= start_pos:
                    break
                current_position -= 1
            record_start_pos = current_position
            length_of_record = record_end_position - record_start_pos + 1

            # pad the reference allele by one place
            reference_allele = assembly_fasta_file.fetch(
                reference=contig,
                start=record_start_pos - 1,
                end=record_end_position)
            # alternate allele is the first base of the reference allele
            alternate_allele = reference_allele[0]
            sys.stderr.write("[" +
                             datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                             "] INFO: PRUNING: " + contig + " " +
                             str(record_start_pos) + " " +
                             str(record_end_position) + " " +
                             str(length_of_record) + "\n")
            sys.stderr.flush()

            regions_of_deletion[contig].append(
                (record_start_pos, record_end_position))
            telomere_edit_regions.write(contig + "\t" + str(record_start_pos) +
                                        "\t" + str(record_end_position) + "\n")

            # write this deletion to the VCF file
            alleles = [reference_allele, alternate_allele]
            vcf_record = output_vcf_file.new_record(contig=contig,
                                                    start=record_start_pos - 1,
                                                    stop=record_end_position,
                                                    id='.',
                                                    qual=60,
                                                    filter='PASS',
                                                    alleles=alleles,
                                                    GT=[1, 1],
                                                    GQ=60,
                                                    VAF=[1.0])
            all_vcf_records.append((vcf_record.contig, vcf_record.start,
                                    vcf_record.stop, vcf_record))

    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: READING SMALL VARIANT VCF. " + "\n")
    sys.stderr.flush()

    # filter the small variants
    for rec in small_variant_vcf_file.fetch():
        # this small variant overlaps with a region we are going to delete.
        if any(region_start <= rec.pos <= region_end
               for region_start, region_end in regions_of_deletion[rec.chrom]):
            continue

        # small variant is outside telomere region
        if not any(region_start <= rec.pos <= region_end
                   for region_start, region_end in telomere_regions[rec.chrom]):
            continue

        # The VAF list of the last sample is the one used below.
        sample_vafs = []
        for sample in rec.samples:
            sample_vafs = rec.samples[sample]['VAF']

        selected_alleles = [rec.alleles[0]]
        selected_allele_vaf = []

        # Records in the first 10 kb are checked against "CCCTAA", all others
        # against "GGGTTA".
        canonical_kmer = "CCCTAA" if rec.pos < 10000 else "GGGTTA"
        contig_length = int(contig_length_dict[rec.contig])

        for i, alt_allele in enumerate(rec.alts):
            restoring_canonical = is_restoring_canonical_kmer(
                rec.contig, rec.start, rec.stop, rec.alleles[0], alt_allele,
                canonical_kmer, assembly_fasta_file, contig_length)

            # restoring canonical: 0 is no change, 1 is positive change, -1 means it's moving away from canonical
            if restoring_canonical == 1:
                selected_alleles.append(alt_allele)
                selected_allele_vaf.append(sample_vafs[i])
            elif restoring_canonical == 0:
                # meaning this allele has no effect on canonical k-mer restoration, so we simply fall back to set thresholds.
                if sample_vafs[i] >= min_vaf and rec.qual >= min_gq:
                    selected_alleles.append(alt_allele)
                    selected_allele_vaf.append(sample_vafs[i])

        # no allele passed the thresholds or is restoring canonical k-mer
        if len(selected_alleles) == 1:
            continue

        vcf_record = output_vcf_file.new_record(contig=rec.contig,
                                                start=rec.start,
                                                stop=rec.stop,
                                                id='.',
                                                qual=rec.qual,
                                                filter='PASS',
                                                alleles=selected_alleles,
                                                GT=[1, 1],
                                                GQ=rec.qual,
                                                VAF=selected_allele_vaf)

        all_vcf_records.append(
            (vcf_record.contig, vcf_record.start, vcf_record.stop, vcf_record))

        telomere_edit_regions.write(rec.contig + "\t" + str(rec.start) + "\t" +
                                    str(rec.stop) + "\n")

    all_vcf_records = sorted(all_vcf_records, key=lambda x: (x[0], x[1], x[2]))
    for contig, start, stop, record in all_vcf_records:
        output_vcf_file.write(record)

    # Close every handle so the VCF and BED outputs are flushed to disk.
    output_vcf_file.close()
    small_variant_vcf_file.close()
    telomere_edit_regions.close()
    assembly_fasta_file.close()

Beispiel #7

Datei anzeigen

Datei: merge_vcf_test.py Projekt: mfallahi/pepper

def merg_vcf(h1_vcf, h2_vcf, output_dir, merge_genotype):
    """Merge the PASS records of two VCFs into ``merged_file.vcf``.

    Records are grouped by (chrom, pos). A position with a single record is
    copied unchanged. A position with several records is collapsed into one
    record anchored on the record with the longest REF; every ALT is extended
    with the REF suffix it lacks so all alleles share the same reference span.
    QUAL and GQ of the merged record are the minimum over the merged records.

    :param h1_vcf: Path of the first VCF; its header is used for the output.
    :param h2_vcf: Path of the second VCF.
    :param output_dir: Prefix the output file name is appended to (as-is).
    :param merge_genotype: When true and only one ALT remains, the genotype
        is ``[1, 1]`` if exactly two non-reference genotypes were seen and
        ``[0, 1]`` otherwise; when false the first non-reference genotype is
        reused.
    """
    vcf_positional_dict = defaultdict(lambda: defaultdict(list))

    # Context managers guarantee that the merged output is flushed and every
    # handle is closed.
    with VariantFile(h1_vcf) as vcf_in1, \
            VariantFile(output_dir + 'merged_file.vcf',
                        'w',
                        header=vcf_in1.header) as vcf_out, \
            VariantFile(h2_vcf) as vcf_in2:

        for vcf_in in (vcf_in1, vcf_in2):
            for rec in vcf_in.fetch():
                if 'PASS' in rec.filter.keys():
                    vcf_positional_dict[rec.chrom][rec.pos].append(rec)

        for chrom in vcf_positional_dict:
            for pos in sorted(vcf_positional_dict[chrom]):
                variants = vcf_positional_dict[chrom][pos]

                # a single record needs no merging
                if len(variants) == 1:
                    vcf_out.write(variants[0])
                    continue

                # anchor on the first record that has the longest REF
                longest_var = max(variants, key=lambda var: len(var.ref))
                longest_ref = longest_var.ref

                alleles = [longest_ref]
                gq = -1.0
                qual = -1.0
                gts = []
                for var in variants:
                    for sample in var.samples:
                        sample_gq = var.samples[sample]['GQ']
                        if gq < 0:
                            gq = sample_gq
                        gq = min(gq, sample_gq)

                        sample_gt = var.samples[sample]['GT']
                        # Genotypes are returned as tuples, so comparing
                        # with the list [0, 0] is always unequal; normalise
                        # to a tuple to really skip hom-ref calls.
                        if tuple(sample_gt) != (0, 0):
                            gts.append(sample_gt)

                    if qual < 0:
                        qual = var.qual
                    qual = min(qual, var.qual)

                    # pad every ALT up to the span of the longest REF
                    ref_suffix = longest_ref[len(var.ref):]
                    for alt in var.alts:
                        padded_alt = alt + ref_suffix
                        if padded_alt not in alleles and len(padded_alt) > 0:
                            alleles.append(padded_alt)

                if len(alleles) == 2:
                    if merge_genotype:
                        genotype = [1, 1] if len(gts) == 2 else [0, 1]
                    else:
                        # If every call was hom-ref there is nothing to
                        # reuse; keep the hom-ref genotype instead of
                        # failing on an empty list.
                        genotype = gts[0] if gts else (0, 0)
                else:
                    genotype = [1, 2]

                vcf_record = vcf_out.new_record(contig=longest_var.contig,
                                                start=longest_var.start,
                                                stop=longest_var.stop,
                                                id=longest_var.id,
                                                qual=qual,
                                                filter=longest_var.filter,
                                                alleles=alleles,
                                                GT=genotype,
                                                GQ=gq)
                vcf_out.write(vcf_record)

Beispiel #8

Datei anzeigen

class VCFWriter:
    """Write per-position candidate variants to a timestamped VCF file."""

    def __init__(self, reference_path, sample_name, output_dir, contigs):
        """Open ``output_dir + 'CANDIDATES_PEPPER_' + <timestamp> + '.vcf'``.

        :param reference_path: Path handed to ``PEPPER_SNP.FASTA_handler``.
        :param sample_name: Sample name added to the header.
        :param output_dir: Prefix of the output path (concatenated as-is).
        :param contigs: Contig names to declare in the header.
        """
        self.fasta_handler = PEPPER_SNP.FASTA_handler(reference_path)
        vcf_header = self.get_vcf_header(sample_name, contigs)
        time_str = time.strftime("%m%d%Y_%H%M%S")

        self.vcf_file = VariantFile(output_dir + "CANDIDATES_PEPPER" + '_' +
                                    time_str + '.vcf',
                                    'w',
                                    header=vcf_header)

    def get_genotype(self, ref, alt1, alt2):
        """Turn two predicted alleles into ``(ref, alt_alleles, genotype)``.

        An allele equal to ``ref`` or to ``'*'`` counts as reference. The
        genotype indexes into ``[ref] + alt_alleles``.

        :param ref: Reference base.
        :param alt1: First predicted allele.
        :param alt2: Second predicted allele.
        :return: Tuple ``(ref, list_of_alts, [gt_a, gt_b])``.
        """
        alt1_gt = 0 if (ref == alt1 or alt1 == '*') else 1
        alt2_gt = 0 if (ref == alt2 or alt2 == '*') else 2

        if alt1 == alt2:
            alt2_gt = alt1_gt
        gt = sorted([alt1_gt, alt2_gt])

        if gt == [0, 0]:
            return ref, [], [0, 0]
        if gt == [0, 1]:
            return ref, [alt1], [0, 1]
        if gt == [1, 1]:
            return ref, [alt1], [1, 1]
        if gt == [0, 2]:
            return ref, [alt2], [0, 1]
        if gt == [2, 2]:
            return ref, [alt2], [1, 1]
        if gt == [1, 2]:
            return ref, [alt1, alt2], [1, 2]

        # Fallthrough kept from the original; the cases above cover every
        # combination the assignments at the top can produce.
        return gt

    def get_alleles(self, ref_base, alt_predictions):
        """Collect the distinct non-reference alleles of each haplotype.

        :param ref_base: Reference base at the position.
        :param alt_predictions: Iterable of ``(alt1, alt2)`` pairs.
        :return: Two lists of distinct alleles, one per haplotype, sorted so
            the result does not depend on set iteration order.
        """
        alts1 = set()
        alts2 = set()
        for alt1, alt2 in alt_predictions:
            if alt1 != '*' and alt1 != ref_base:
                alts1.add(alt1)
            if alt2 != '*' and alt2 != ref_base:
                alts2.add(alt2)

        return sorted(alts1), sorted(alts2)

    def write_vcf_records(self, chromosome_name, called_variants,
                          reference_dict, positions):
        """Write one record for every position that has a non-reference call.

        Positions whose reference base is ``n``/``N`` and positions whose
        genotype is ``[0, 0]`` are skipped.

        :param chromosome_name: Contig name for every record.
        :param called_variants: Mapping position -> iterable of
            ``(alt1, alt2)`` predictions.
        :param reference_dict: Mapping position -> reference base.
        :param positions: Positions to process; visited in sorted order.
        """
        for pos in sorted(positions):
            ref_base = reference_dict[pos]

            if ref_base == 'n' or ref_base == 'N':
                continue

            alts1, alts2 = self.get_alleles(ref_base, called_variants[pos])
            alt1 = alts1[0] if alts1 else ref_base
            alt2 = alts2[0] if alts2 else ref_base

            ref, alt_alleles, gt = self.get_genotype(ref_base, alt1, alt2)

            if gt == [0, 0]:
                continue
            # add the extra alleles that were not used for the genotype
            alt_alleles.extend(alts1[1:])
            alt_alleles.extend(alts2[1:])

            # De-duplicate while keeping first-seen order: the genotype
            # indices refer to positions in this tuple, so an unordered set
            # could silently point GT at the wrong allele.
            alleles = (ref,) + tuple(dict.fromkeys(alt_alleles))
            vcf_record = self.vcf_file.new_record(contig=str(chromosome_name),
                                                  start=pos,
                                                  stop=pos + 1,
                                                  id='.',
                                                  qual=60,
                                                  filter='PASS',
                                                  alleles=alleles,
                                                  GT=gt,
                                                  GQ=60)
            self.vcf_file.write(vcf_record)

    def get_vcf_header(self, sample_name, contigs):
        """Build the VCF header: FILTER and FORMAT lines, contigs, sample.

        Only reference sequences whose name is in ``contigs`` are declared.

        :param sample_name: Sample name added to the header.
        :param contigs: Collection of contig names to keep.
        :return: The populated ``VariantHeader``.
        """
        header = VariantHeader()

        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            header.add_meta(key='FILTER',
                            items=[('ID', filter_id),
                                   ('Description', description)])

        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            header.add_meta(key='FORMAT',
                            items=[('ID', format_id), ('Number', 1),
                                   ('Type', value_type),
                                   ('Description', description)])

        for sq in self.fasta_handler.get_chromosome_names():
            if sq not in contigs:
                continue
            ln = self.fasta_handler.get_chromosome_sequence_length(sq)
            header.contigs.add(sq, length=ln)

        header.add_sample(sample_name)

        return header

Beispiel #9

Datei anzeigen

Datei: vcf_writer.py Projekt: MiaAltieri/friday

class VCFWriter:
    """Convert per-site genotype predictions into records of an output VCF.

    The output file is opened on construction. Its header declares one contig
    per sequence entry reported by ``BamHandler`` for the given BAM file.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``<output_dir><bam name>_<timestamp>.vcf`` for writing.

        Args:
            bam_file_path: Path of the BAM file. The part of its base name
                before the first ``.`` becomes the VCF file-name prefix.
            sample_name: Name of the sample column added to the header.
            output_dir: Prefix prepended verbatim to the file name, so it has
                to end with a path separator.
        """
        self.bam_handler = BamHandler(bam_file_path)
        bam_file_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        vcf_header = self.get_vcf_header(sample_name)
        time_str = time.strftime("%m%d%Y_%H%M%S")

        self.vcf_file = VariantFile(output_dir + bam_file_name + '_' +
                                    time_str + '.vcf',
                                    'w',
                                    header=vcf_header)

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype,
                         qual, gq, rec_filter):
        """Create one record and write it to the output file.

        Args:
            chrm: Contig name (converted with ``str``).
            st_pos: Record start, converted with ``int``.
            end_pos: Record end; the record's ``stop`` is ``int(end_pos) + 1``.
            ref: Reference allele.
            alts: Iterable of alternate alleles, appended after ``ref``.
            genotype: Genotype string such as ``'0/1'``.
            qual: Value of the QUAL column.
            gq: Value of the GQ sample field.
            rec_filter: Value of the FILTER column.
        """
        alleles = tuple([ref]) + tuple(alts)
        genotype = self.get_genotype_tuple(genotype)
        end_pos = int(end_pos) + 1
        st_pos = int(st_pos)

        vcf_record = self.vcf_file.new_record(contig=str(chrm),
                                              start=st_pos,
                                              stop=end_pos,
                                              id='.',
                                              qual=qual,
                                              filter=rec_filter,
                                              alleles=alleles,
                                              GT=genotype,
                                              GQ=gq)
        self.vcf_file.write(vcf_record)

    @staticmethod
    def prediction_label_to_allele(label):
        """Return the two allele indices (as strings) for a class label 0-5.

        Raises:
            KeyError: If ``label`` is not one of the six known labels.
        """
        label_to_allele = {
            0: ['0', '0'],
            1: ['0', '1'],
            2: ['1', '1'],
            3: ['0', '2'],
            4: ['2', '2'],
            5: ['1', '2']
        }
        return label_to_allele[label]

    @staticmethod
    def _to_phred(probability):
        """Phred-scale ``1 - probability``, capped at 60, rounded up to 2 decimals."""
        error = 1 - probability
        # Errors below 1e-7 are clamped so log10 is never taken of ~0.
        phred = min(60, -10 * np.log10(error) if error >= 0.0000001 else 60)
        return math.ceil(phred * 100.0) / 100.0

    @staticmethod
    def get_qual_and_gq(probabilities, predicted_class):
        """Return ``(phred_qual, phred_gq)`` for a class-probability vector.

        QUAL is derived from ``1 - probabilities[0]`` and GQ from the
        probability of ``predicted_class``.
        """
        phred_qual = VCFWriter._to_phred(1.0 - probabilities[0])
        phred_gq = VCFWriter._to_phred(probabilities[predicted_class])
        return phred_qual, phred_gq

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """Combine two typed alternate alleles into ``(ref, alt1, alt2)``.

        Args:
            alts: Sequence whose first two items are ``(allele, type)`` pairs.
            ref: Reference allele of the site.

        Returns:
            A 3-tuple. When a deletion is involved the first element is the
            deletion allele and ``ref`` moves to an alternate slot; otherwise
            the tuple is ``(ref, alt1, alt2)``.
        """
        type1, type2 = alts[0][1], alts[1][1]
        alt1, alt2 = alts[0][0], alts[1][0]
        if type1 == DEL_TYPE and type2 == DEL_TYPE:
            # The longer deletion allele goes first.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            else:
                return alt1, ref, alt1[0] + alt1[len(alt2):]
        elif type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2
        elif type1 == DEL_TYPE or type2 == DEL_TYPE:
            if type1 == DEL_TYPE and type2 == IN_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            elif type1 == IN_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            elif type1 == DEL_TYPE and type2 == SNP_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            elif type1 == SNP_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            elif type1 == DEL_TYPE:
                return alt1, ref, alt2
            elif type2 == DEL_TYPE:
                return alt2, alt1, ref
        else:
            return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """Return ``(ref, alt, '.')`` for a site with one alternate allele.

        ``alts[0]`` is an ``(allele, type)`` pair; for a deletion the allele
        and ``ref`` swap places.
        """
        alt1, alt_type = alts[0]
        if alt_type == DEL_TYPE:
            return alt1, ref, '.'

        return ref, alt1, '.'

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert a ``'/'``-separated genotype string to a tuple of ints."""
        split_values = genotype.split('/')
        split_values = [int(x) for x in split_values]
        return tuple(split_values)

    @staticmethod
    def _mean_class_probabilities(predictions):
        """Average the probability vectors of ``(label, probability)`` pairs.

        Returns ``[0.0, 0.0, 0.0]`` when ``predictions`` is empty or None.
        """
        totals = [0.0, 0.0, 0.0]
        if not predictions:
            return totals
        count = 0
        for _label, probability in predictions:
            count += 1
            for j, prob_value in enumerate(probability):
                totals[j] += prob_value
        return [total / count for total in totals]

    @staticmethod
    def process_prediction(pos, prediction_alt1, prediction_alt2):
        """Pick the most likely genotype from the predictions of two alts.

        Args:
            pos: Position of the site (not used in the computation).
            prediction_alt1: ``(label, probability)`` pairs for the first
                alternate allele; each ``probability`` holds the three class
                probabilities. May be empty.
            prediction_alt2: Same for the second alternate allele.

        Returns:
            ``(genotype, phred_qual, phred_gq)`` where ``genotype`` is a
            string such as ``'0/1'``.
        """
        # Each allele is averaged over its own predictions. The earlier code
        # re-divided the alt1 averages here and left alt2 as a raw sum.
        alt1_probability = VCFWriter._mean_class_probabilities(prediction_alt1)
        alt2_probability = VCFWriter._mean_class_probabilities(prediction_alt2)

        # Scores in label order: 0/0, 0/1, 1/1, 0/2, 2/2, 1/2.
        p00 = min(alt1_probability[0], alt2_probability[0])
        p01 = alt1_probability[1]
        p11 = alt1_probability[2]
        p02 = alt2_probability[1]
        p22 = alt2_probability[2]
        p12 = min(max(alt1_probability[1], alt1_probability[2]),
                  max(alt2_probability[1], alt2_probability[2]))

        prob_list = [p00, p01, p11, p02, p22, p12]
        sum_probs = sum(prob_list)
        prob_list = [(float(i) / sum_probs) if sum_probs else 0
                     for i in prob_list]

        # Arg-max over the normalised scores; on a tie the later label wins.
        gq, index = 0, 0
        for i, prob in enumerate(prob_list):
            if gq <= prob and prob > 0:
                index = i
                gq = prob

        genotype = VCFWriter.prediction_label_to_allele(index)
        genotype = genotype[0] + '/' + genotype[1]

        qual = sum(prob_list) - prob_list[0]
        phred_qual = VCFWriter._to_phred(qual)
        phred_gq = VCFWriter._to_phred(gq)

        return genotype, phred_qual, phred_gq

    @staticmethod
    def get_proper_alleles(positional_record, genotype):
        """Select the alleles that the genotype actually refers to.

        Args:
            positional_record: Object exposing ``ref``, ``alt1``,
                ``alt1_type``, ``alt2`` and ``alt2_type``.
            genotype: Genotype string over the allele indices 0, 1 and 2.

        Returns:
            ``(ref, [alt1, alt2], genotype)`` with the genotype rewritten to
            ``'0/1'`` / ``'1/1'`` when only the second alt is used.
        """
        alts = [(positional_record.alt1, positional_record.alt1_type),
                (positional_record.alt2, positional_record.alt2_type)]

        gts = genotype.split('/')
        refined_alt = []

        # NOTE(review): for '0/0' the '.' placeholder reaches solve_single_alt,
        # which unpacks alts[0] as an (allele, type) pair and fails on a
        # one-character string. Presumably callers filter such sites first;
        # confirm.
        if gts[0] == '0' and gts[1] == '0':
            refined_alt.append('.')
        if gts[0] == '1' or gts[1] == '1':
            refined_alt.append(alts[0])
        if gts[0] == '2' or gts[1] == '2':
            # `alts` always has two entries here, so only the first branch
            # can be taken.
            if len(alts) > 1:
                refined_alt.append(alts[1])
            elif genotype == '0/2':
                refined_alt.append(alts[0])
                genotype = '0/1'
            elif genotype == '2/2':
                refined_alt.append(alts[0])
                genotype = '1/1'
            elif genotype == '1/2':
                genotype = '0/1'

        if len(refined_alt) == 1:
            ref, alt1, alt2 = VCFWriter.solve_single_alt(
                refined_alt, positional_record.ref)
        else:
            ref, alt1, alt2 = VCFWriter.solve_multiple_alts(
                refined_alt, positional_record.ref)

        refined_alt = [alt1, alt2]
        refined_gt = genotype
        if genotype == '0/2':
            refined_gt = '0/1'
        if genotype == '2/2':
            refined_gt = '1/1'

        return ref, refined_alt, refined_gt

    @staticmethod
    def get_filter(record, last_end):
        """Return the FILTER value for an 8-field record.

        Checks are applied in order: start before ``last_end``
        (``'conflictPos'``), genotype ``'0/0'`` (``'refCall'``), negative QUAL
        (``'lowQUAL'``), negative GQ (``'lowGQ'``), otherwise ``'PASS'``.
        """
        chrm, st_pos, end_pos, ref, alt_field, genotype, phred_qual, phred_gq = record
        if st_pos < last_end:
            return 'conflictPos'
        if genotype == '0/0':
            return 'refCall'
        if phred_qual < 0:
            return 'lowQUAL'
        if phred_gq < 0:
            return 'lowGQ'
        return 'PASS'

    def get_vcf_header(self, sample_name):
        """Build the VCF header: FILTER and FORMAT lines, contigs, one sample.

        Contig lines are taken from ``self.bam_handler.get_header_sq()``,
        using each entry's ``'SN'`` and ``'LN'`` values.
        """
        header = VariantHeader()
        items = [('ID', "PASS"), ('Description', "All filters passed")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "refCall"), ('Description', "Call is homozygous")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowGQ"), ('Description', "Low genotype quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowQUAL"),
                 ('Description', "Low variant call quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "conflictPos"), ('Description', "Overlapping record")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "GT"), ('Number', 1), ('Type', 'String'),
                 ('Description', "Genotype")]
        header.add_meta(key='FORMAT', items=items)
        items = [('ID', "GQ"), ('Number', 1), ('Type', 'Float'),
                 ('Description', "Genotype Quality")]
        header.add_meta(key='FORMAT', items=items)
        bam_sqs = self.bam_handler.get_header_sq()
        for sq in bam_sqs:
            contig_id = sq['SN']
            contig_length = sq['LN']
            items = [('ID', contig_id), ('length', contig_length)]
            header.add_meta(key='contig', items=items)

        header.add_sample(sample_name)

        return header

Beispiel #10

Datei anzeigen

Datei: VcfWriter.py Projekt: CGL-Deeplearning/FRIDAY

class VCFWriter:
    """Convert per-site genotype predictions into records of an output VCF.

    The output file is opened on construction. Its header declares one contig
    per sequence entry reported by ``BamHandler`` for the given BAM file.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``<output_dir><bam name>_<timestamp>.vcf`` for writing.

        Args:
            bam_file_path: Path of the BAM file. The part of its base name
                before the first ``.`` becomes the VCF file-name prefix.
            sample_name: Name of the sample column added to the header.
            output_dir: Prefix prepended verbatim to the file name, so it has
                to end with a path separator.
        """
        self.bam_handler = BamHandler(bam_file_path)
        bam_file_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        vcf_header = self.get_vcf_header(sample_name)
        time_str = time.strftime("%m%d%Y_%H%M%S")

        self.vcf_file = VariantFile(output_dir + bam_file_name + '_' +
                                    time_str + '.vcf',
                                    'w',
                                    header=vcf_header)

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype,
                         qual, gq, rec_filter):
        """Create one record and write it to the output file.

        Args:
            chrm: Contig name (converted with ``str``).
            st_pos: Record start, converted with ``int``.
            end_pos: Record end; the record's ``stop`` is ``int(end_pos) + 1``.
            ref: Reference allele.
            alts: Iterable of alternate alleles, appended after ``ref``.
            genotype: Genotype string such as ``'0/1'``.
            qual: Value of the QUAL column.
            gq: Value of the GQ sample field.
            rec_filter: Value of the FILTER column.
        """
        alleles = tuple([ref]) + tuple(alts)
        genotype = self.get_genotype_tuple(genotype)
        end_pos = int(end_pos) + 1
        st_pos = int(st_pos)
        vcf_record = self.vcf_file.new_record(contig=str(chrm),
                                              start=st_pos,
                                              stop=end_pos,
                                              id='.',
                                              qual=qual,
                                              filter=rec_filter,
                                              alleles=alleles,
                                              GT=genotype,
                                              GQ=gq)
        self.vcf_file.write(vcf_record)

    @staticmethod
    def prediction_label_to_allele(label):
        """Return the two allele indices (as strings) for a class label 0-5.

        Raises:
            KeyError: If ``label`` is not one of the six known labels.
        """
        label_to_allele = {
            0: ['0', '0'],
            1: ['0', '1'],
            2: ['1', '1'],
            3: ['0', '2'],
            4: ['2', '2'],
            5: ['1', '2']
        }
        return label_to_allele[label]

    @staticmethod
    def _to_phred(probability):
        """Phred-scale ``1 - probability``, capped at 60, rounded up to 2 decimals."""
        error = 1 - probability
        # Errors below 1e-7 are clamped so log10 is never taken of ~0.
        phred = min(60, -10 * np.log10(error) if error >= 0.0000001 else 60)
        return math.ceil(phred * 100.0) / 100.0

    @staticmethod
    def get_qual_and_gq(probabilities, predicted_class):
        """Return ``(phred_qual, phred_gq)`` for a class-probability vector.

        QUAL is derived from ``1 - probabilities[0]`` and GQ from the
        probability of ``predicted_class``.
        """
        phred_qual = VCFWriter._to_phred(1.0 - probabilities[0])
        phred_gq = VCFWriter._to_phred(probabilities[predicted_class])
        return phred_qual, phred_gq

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """Combine two typed alternate alleles into ``(ref, alt1, alt2)``.

        Args:
            alts: Sequence whose first two items are ``(allele, type)`` pairs.
            ref: Reference allele of the site.

        Returns:
            A 3-tuple. When a deletion is involved the first element is the
            deletion allele and ``ref`` moves to an alternate slot; otherwise
            the tuple is ``(ref, alt1, alt2)``.
        """
        type1, type2 = alts[0][1], alts[1][1]
        alt1, alt2 = alts[0][0], alts[1][0]
        if type1 == DEL_TYPE and type2 == DEL_TYPE:
            # The longer deletion allele goes first.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            else:
                return alt1, ref, alt1[0] + alt1[len(alt2):]
        elif type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2
        elif type1 == DEL_TYPE or type2 == DEL_TYPE:
            if type1 == DEL_TYPE and type2 == IN_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            elif type1 == IN_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            elif type1 == DEL_TYPE and type2 == SNP_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            elif type1 == SNP_TYPE and type2 == DEL_TYPE:
                return alt2, alt1 + alt2[1:], ref
            elif type1 == DEL_TYPE:
                return alt1, ref, alt2
            elif type2 == DEL_TYPE:
                return alt2, alt1, ref
        else:
            return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """Return ``(ref, alt, '.')`` for a site with one alternate allele.

        ``alts[0]`` is an ``(allele, type)`` pair; for a deletion the allele
        and ``ref`` swap places.
        """
        alt1, alt_type = alts[0]
        if alt_type == DEL_TYPE:
            return alt1, ref, '.'

        return ref, alt1, '.'

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert a ``'/'``-separated genotype string to a tuple of ints."""
        split_values = genotype.split('/')
        split_values = [int(x) for x in split_values]
        return tuple(split_values)

    @staticmethod
    def process_prediction(pos, predictions):
        """Derive genotype, QUAL and GQ from all predictions made for a site.

        The genotype comes from the most frequent predicted label. The
        probabilities are the per-class minimum over all predictions,
        normalised to sum to one.

        Args:
            pos: Position of the site; only used in the error message.
            predictions: Non-empty sequence of ``(label, probabilities)``
                pairs.

        Returns:
            ``(genotype, qual, gq)`` with ``genotype`` a string like ``'0/1'``.

        Raises:
            SystemExit: If the per-class minimum probabilities sum to zero or
                less. The message is carried by the exception, so an uncaught
                exit reports it on stderr with a non-zero status instead of
                printing to stdout and exiting with status 0.
        """
        # Majority vote over the predicted labels.
        list_prediction_labels = [label for label, probs in predictions]
        predicted_class = max(set(list_prediction_labels),
                              key=list_prediction_labels.count)

        genotype = VCFWriter.prediction_label_to_allele(predicted_class)
        genotype = genotype[0] + '/' + genotype[1]

        # Per class, keep the smallest probability seen in any prediction.
        list_prediction_probabilities = [probs for label, probs in predictions]
        num_classes = len(list_prediction_probabilities[0])
        min_probs_for_each_class = [
            min(l[i] for l in list_prediction_probabilities)
            for i in range(num_classes)
        ]

        sum_of_probs = sum(min_probs_for_each_class)
        if sum_of_probs <= 0:
            raise SystemExit("SUM ZERO ENCOUNTERED IN: %s %s" %
                             (pos, predictions))
        probabilities = [
            float(i) / sum_of_probs for i in min_probs_for_each_class
        ]

        qual, gq = VCFWriter.get_qual_and_gq(probabilities, predicted_class)

        return genotype, qual, gq

    @staticmethod
    def get_proper_alleles(record):
        """Select the alleles that the record's genotype actually refers to.

        Args:
            record: ``(ref, alt_field, genotype, phred_qual, phred_gq)`` where
                ``alt_field`` holds ``(allele, type)`` pairs.

        Returns:
            ``(ref, [alt1, alt2], phred_qual, phred_gq, genotype)`` with the
            genotype rewritten to ``'0/1'`` / ``'1/1'`` when only one alt
            remains.
        """
        ref, alt_field, genotype, phred_qual, phred_gq = record

        gts = genotype.split('/')
        refined_alt = []

        # NOTE(review): for '0/0' the '.' placeholder reaches solve_single_alt,
        # which unpacks alts[0] as an (allele, type) pair and fails on a
        # one-character string. Presumably callers filter such sites first;
        # confirm.
        if gts[0] == '0' and gts[1] == '0':
            refined_alt.append('.')
        if gts[0] == '1' or gts[1] == '1':
            refined_alt.append(alt_field[0])
        if gts[0] == '2' or gts[1] == '2':
            if len(alt_field) > 1:
                refined_alt.append(alt_field[1])
            # With a single alt available, an index of 2 is mapped onto it.
            elif genotype == '0/2':
                refined_alt.append(alt_field[0])
                genotype = '0/1'
            elif genotype == '2/2':
                refined_alt.append(alt_field[0])
                genotype = '1/1'
            elif genotype == '1/2':
                genotype = '0/1'

        if len(refined_alt) == 1:
            ref, alt1, alt2 = VCFWriter.solve_single_alt(refined_alt, ref)
        else:
            ref, alt1, alt2 = VCFWriter.solve_multiple_alts(refined_alt, ref)

        refined_alt = [alt1, alt2]
        refined_gt = genotype
        if genotype == '0/2':
            refined_gt = '0/1'
        if genotype == '2/2':
            refined_gt = '1/1'

        record = ref, refined_alt, phred_qual, phred_gq, refined_gt

        return record

    @staticmethod
    def get_filter(record, last_end):
        """Return the FILTER value for an 8-field record.

        Checks are applied in order: start before ``last_end``
        (``'conflictPos'``), genotype ``'0/0'`` (``'refCall'``), negative QUAL
        (``'lowQUAL'``), negative GQ (``'lowGQ'``), otherwise ``'PASS'``.
        """
        chrm, st_pos, end_pos, ref, alt_field, genotype, phred_qual, phred_gq = record
        if st_pos < last_end:
            return 'conflictPos'
        if genotype == '0/0':
            return 'refCall'
        if phred_qual < 0:
            return 'lowQUAL'
        if phred_gq < 0:
            return 'lowGQ'
        return 'PASS'

    def get_vcf_header(self, sample_name):
        """Build the VCF header: FILTER and FORMAT lines, contigs, one sample.

        Contig lines are taken from ``self.bam_handler.get_header_sq()``,
        using each entry's ``'SN'`` and ``'LN'`` values.
        """
        header = VariantHeader()
        items = [('ID', "PASS"), ('Description', "All filters passed")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "refCall"), ('Description', "Call is homozygous")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowGQ"), ('Description', "Low genotype quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowQUAL"),
                 ('Description', "Low variant call quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "conflictPos"), ('Description', "Overlapping record")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "GT"), ('Number', 1), ('Type', 'String'),
                 ('Description', "Genotype")]
        header.add_meta(key='FORMAT', items=items)
        items = [('ID', "GQ"), ('Number', 1), ('Type', 'Float'),
                 ('Description', "Genotype Quality")]
        header.add_meta(key='FORMAT', items=items)
        bam_sqs = self.bam_handler.get_header_sq()
        for sq in bam_sqs:
            contig_id = sq['SN']
            contig_length = sq['LN']
            items = [('ID', contig_id), ('length', contig_length)]
            header.add_meta(key='contig', items=items)

        header.add_sample(sample_name)

        return header

Beispiel #11

Datei anzeigen

def _count_pair_support(m2bam, v):
    """Count the reads supporting one merged two-base candidate.

    The totals are stored on ``v`` under ``ref_cnt``, ``alt_cnt``,
    ``alt_first_cnt``, ``alt_second_cnt``, ``f1r2``, ``f2r1`` and ``dp``.
    """
    # Positions minus one, as passed to fetch (presumably converting the
    # 1-based VCF position to pysam's 0-based coordinates).
    first_pos = v["start_pos"] - 1
    second_pos = v["end_pos"] - 1

    ref_read_cnt = 0
    alt_read_cnt = 0
    alt_first_cnt = 0
    alt_second_cnt = 0
    f1r2_ref_cnt = 0
    f2r1_ref_cnt = 0
    f1r2_alt_cnt = 0
    f2r1_alt_cnt = 0
    dp = 0

    for read in m2bam.fetch(v["chrom"], first_pos, v["end_pos"]):
        if (read.is_secondary or read.is_supplementary or read.is_unmapped
                or read.is_duplicate):
            continue

        # full_length=True yields one entry per query base (None for bases
        # without a reference position), so a list index is also a valid
        # index into query_sequence. The default list drops soft-clipped and
        # inserted bases and would shift the index on such reads.
        query_position_list = read.get_reference_positions(full_length=True)
        try:
            q_start_index = query_position_list.index(first_pos)
            q_end_index = query_position_list.index(second_pos)
            query_seq = (read.query_sequence[q_start_index] +
                         read.query_sequence[q_end_index])
        except (ValueError, TypeError, IndexError):
            # The read does not cover both positions or stores no sequence.
            continue

        if query_seq == v["ref"]:
            ref_read_cnt += 1
            if read.is_read1:
                f1r2_ref_cnt += 1
            elif read.is_read2:
                f2r1_ref_cnt += 1
        elif query_seq == v["alt"]:
            alt_read_cnt += 1
            if read.is_read1:
                f1r2_alt_cnt += 1
            elif read.is_read2:
                f2r1_alt_cnt += 1
        elif query_seq[0] != v["ref"][0] and query_seq[1] == v["ref"][1]:
            # Only the first base differs from the reference.
            alt_first_cnt += 1
        elif query_seq[0] == v["ref"][0] and query_seq[1] != v["ref"][1]:
            # Only the second base differs from the reference.
            alt_second_cnt += 1
        dp += 1

    v["ref_cnt"] = ref_read_cnt
    v["alt_cnt"] = alt_read_cnt
    v["alt_first_cnt"] = alt_first_cnt
    v["alt_second_cnt"] = alt_second_cnt
    v["f1r2"] = (f1r2_ref_cnt, f1r2_alt_cnt)
    v["f2r1"] = (f2r1_ref_cnt, f2r1_alt_cnt)
    v["dp"] = dp


def run_process(opts, mutect2_vcf, mutect2_bam):
    """Merge adjacent single-base variants that occur on the same reads.

    Records on the same contig at consecutive positions, each with a
    one-base REF, are paired into a two-base candidate. Reads from the BAM
    are counted per candidate. A merged record is written for every candidate
    with at least one read carrying both alternate bases, and an original
    record is dropped when no read carries its change alone.

    Args:
        opts: Object whose ``output`` attribute names the output VCF; a falsy
            value writes to ``'-'``.
        mutect2_vcf: Path of the input VCF.
        mutect2_bam: Path of the BAM file the reads are fetched from.
    """
    outputvcf = opts.output

    with VariantFile(mutect2_vcf) as m2vcf, \
            AlignmentFile(mutect2_bam, 'rb') as m2bam:
        # Pass 1: collect candidate pairs of adjacent single-base records.
        old_chrom = ''
        old_pos = -1
        old_ref = ''
        old_alts = ()
        variants_list = []
        for record in m2vcf.fetch():
            chrom = record.chrom
            pos = record.pos
            ref = record.ref
            alts = record.alts

            if (chrom == old_chrom and pos == old_pos + 1
                    and len(old_ref) == 1 and len(ref) == 1
                    and len(alts) == 1):
                variants_list.append({
                    "chrom": chrom,
                    "start_pos": old_pos,
                    "end_pos": pos,
                    "ref": old_ref + ref,
                    "alt": old_alts[0] + alts[0]
                })
            old_chrom = chrom
            old_pos = pos
            old_ref = ref
            old_alts = alts

        # Pass 2: read support for every candidate.
        for v in variants_list:
            _count_pair_support(m2bam, v)

        # Candidates grouped by the site of their first record, so the passes
        # below do one lookup per record instead of scanning the whole list.
        candidates_by_site = {}
        for v in variants_list:
            site = (v["chrom"], v["start_pos"])
            candidates_by_site.setdefault(site, []).append(v)

        # Pass 3: decide for each input record whether it is kept.
        m2vcf_flag = []
        second_flag = True  # False when the next record must be dropped
        for record in m2vcf.fetch():
            m2vcf_flag.append(second_flag)
            second_flag = True

            site = (record.chrom, record.pos)
            for v in candidates_by_site.get(site, ()):
                if v["alt_cnt"] == 0:
                    continue
                if v["alt_first_cnt"] == 0:
                    # No read carries the first change on its own.
                    m2vcf_flag[-1] = False
                if v["alt_second_cnt"] == 0:
                    # No read carries the second change on its own.
                    second_flag = False

        # Pass 4: write kept records and the merged records. The MDV format
        # line is added to the input file's own header object.
        new_header = m2vcf.header
        new_header.formats.add("MDV", "1", "Integer", "Merged Di-Allelic Variant : Backed Phased variant that was splited snp before")
        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=new_header) as vcf_out:
            for m2vcf_index, record in enumerate(m2vcf.fetch()):
                if m2vcf_flag[m2vcf_index]:
                    vcf_out.write(record)

                site = (record.chrom, record.pos)
                for v in candidates_by_site.get(site, ()):
                    if v["alt_cnt"] == 0:
                        continue
                    record2 = vcf_out.new_record()
                    record2.chrom = v["chrom"]
                    record2.pos = v["start_pos"]
                    record2.ref = v["ref"]
                    record2.alts = (v["alt"],)
                    record2.info["DP"] = v["dp"]
                    if "F1R2" in record2.samples[0]:
                        record2.samples[0]["F1R2"] = v["f1r2"]
                        record2.samples[0]["F2R1"] = v["f2r1"]
                    record2.samples[0]["AD"] = (v["ref_cnt"], v["alt_cnt"])
                    record2.samples[0]["DP"] = v["dp"]
                    # alt_cnt != 0 implies dp >= 1, so this cannot divide by 0.
                    record2.samples[0]["AF"] = float(v["alt_cnt"]) / float(v["dp"])
                    # NOTE(review): GT is set from strings and MDV (declared
                    # Integer) from a bool; confirm pysam accepts both.
                    record2.samples[0]["GT"] = ("0", "0")
                    record2.samples[0]["MDV"] = True
                    vcf_out.write(record2)

Beispiel #12

Datei anzeigen

class VCFWriter:
    """Resolve genotype calls from candidate records and write them as VCF.

    The output file is opened on construction. Its header declares one contig
    per sequence entry reported by ``BamHandler`` for the given BAM file.
    """

    def __init__(self, bam_file_path, sample_name, output_dir):
        """Open ``<output_dir><bam name>_<timestamp>.vcf`` for writing.

        Args:
            bam_file_path: Path of the BAM file. The part of its base name
                before the first ``.`` becomes the VCF file-name prefix.
            sample_name: Name of the sample column added to the header.
            output_dir: Prefix prepended verbatim to the file name.
        """
        self.bam_handler = BamHandler(bam_file_path)
        name_prefix = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        header = self.get_vcf_header(sample_name)
        stamp = time.strftime("%m%d%Y_%H%M%S")
        output_path = output_dir + name_prefix + '_' + stamp + '.vcf'
        self.vcf_file = VariantFile(output_path, 'w', header=header)

    def write_vcf_record(self, chrm, st_pos, end_pos, ref, alts, genotype,
                         qual, gq, rec_filter):
        """Create one record and write it to the output file.

        ``ref`` followed by ``alts`` forms the allele tuple, the genotype
        string is turned into a tuple of ints, and the record spans
        ``int(st_pos)`` to ``int(end_pos) + 1``.
        """
        allele_tuple = (ref,) + tuple(alts)
        gt_tuple = self.get_genotype_tuple(genotype)
        stop = int(end_pos) + 1
        start = int(st_pos)
        new_record = self.vcf_file.new_record(
            contig=chrm,
            start=start,
            stop=stop,
            id='.',
            qual=qual,
            filter=rec_filter,
            alleles=allele_tuple,
            GT=gt_tuple,
            GQ=gq,
        )
        self.vcf_file.write(new_record)

    @staticmethod
    def solve_multiple_alts(alts, ref):
        """Combine two typed alternate alleles into ``(ref, alt1, alt2)``.

        ``alts[0]`` and ``alts[1]`` are ``(allele, type)`` pairs. When a
        deletion is involved the deletion allele becomes the first element
        and ``ref`` moves to an alternate slot.
        """
        first, second = alts[0], alts[1]
        type1, type2 = first[1], second[1]
        alt1, alt2 = first[0], second[0]

        first_is_del = type1 == DEL_TYPE
        second_is_del = type2 == DEL_TYPE

        if first_is_del and second_is_del:
            # The longer deletion allele goes first.
            if len(alt2) > len(alt1):
                return alt2, ref, alt2[0] + alt2[len(alt1):]
            return alt1, ref, alt1[0] + alt1[len(alt2):]

        if type1 == IN_TYPE and type2 == IN_TYPE:
            return ref, alt1, alt2

        if first_is_del:
            if type2 == IN_TYPE or type2 == SNP_TYPE:
                return alt1, ref, alt2 + alt1[1:]
            return alt1, ref, alt2

        if second_is_del:
            if type1 == IN_TYPE or type1 == SNP_TYPE:
                return alt2, alt1 + alt2[1:], ref
            return alt2, alt1, ref

        return ref, alt1, alt2

    @staticmethod
    def solve_single_alt(alts, ref):
        """Return ``(ref, alt, '.')`` for one ``(allele, type)`` pair.

        For a deletion the allele and ``ref`` swap places.
        """
        allele, allele_type = alts
        if allele_type != DEL_TYPE:
            return ref, allele, '.'
        return allele, ref, '.'

    @staticmethod
    def get_genotype_tuple(genotype):
        """Convert a ``'/'``-separated genotype string to a tuple of ints."""
        return tuple([int(part) for part in genotype.split('/')])

    @staticmethod
    def _phred_capped(probability):
        """Phred-scale ``1 - probability``, capped at 60, rounded up to 2 decimals."""
        phred = min(
            60, -10 * np.log10(1 - probability)
            if 1 - probability >= 0.0000001 else 60)
        return math.ceil(phred * 100.0) / 100.0

    @staticmethod
    def get_genotype_for_multiple_allele(records):
        """Resolve the genotype of a site that has two alternate alleles.

        Each record is indexable: fields 0-3 are contig, start, end and ref,
        fields 4 and 5 the two alts (``'.'`` when absent), field 6 the type
        of the first alt, and fields from 8 on the class probabilities. A
        record with both alts set supplies the joint probabilities, the
        others the per-allele ones.

        Returns:
            ``(chrm, st_pos, end_pos, ref, [alt1, alt2], genotype,
            phred_qual, phred_gq)``.
        """
        chrm, st_pos, end_pos, ref = '', 0, 0, '.'
        rec_alt1 = rec_alt2 = '.'
        alt_probs = defaultdict(list)
        alt_with_types = []

        for rec in records:
            chrm, st_pos, end_pos, ref = rec[0], rec[1], rec[2], rec[3]
            first_alt, second_alt = rec[4], rec[5]
            if first_alt == '.' or second_alt == '.':
                alt_probs[first_alt] = (rec[8:])
                alt_with_types.append((first_alt, rec[6]))
            else:
                rec_alt1, rec_alt2 = first_alt, second_alt
                alt_probs['both'] = (rec[8:])

        probs_1 = alt_probs[rec_alt1]
        probs_2 = alt_probs[rec_alt2]
        probs_both = alt_probs['both']

        # Scores in the order of `genotype_list` below.
        raw_scores = [
            min(probs_1[0], probs_2[0], probs_both[0]),
            min(probs_1[1], probs_both[1]),
            min(probs_1[2], probs_both[2]),
            min(probs_2[1], probs_both[1]),
            min(probs_2[2], probs_both[2]),
            min(max(probs_1[1], probs_1[2]),
                max(probs_2[1], probs_2[2]),
                max(probs_both[1], probs_both[2])),
        ]
        total = sum(raw_scores)
        prob_list = [(float(score) / total) if total else 0
                     for score in raw_scores]

        genotype_list = ['0/0', '0/1', '1/1', '0/2', '2/2', '1/2']
        # Arg-max over the normalised scores; on a tie the later entry wins.
        gq, index = 0, 0
        for position, value in enumerate(prob_list):
            if value > 0 and gq <= value:
                index, gq = position, value
        qual = sum(prob_list) - prob_list[0]

        if index == 5:
            ref, rec_alt1, rec_alt2 = VCFWriter.solve_multiple_alts(
                alt_with_types, ref)
        elif index <= 2:
            ref, rec_alt1, rec_alt2 = VCFWriter.solve_single_alt(
                alt_with_types[0], ref)
        else:
            # Only the second alt is called; it stays in the second slot.
            ref, rec_alt2, rec_alt1 = VCFWriter.solve_single_alt(
                alt_with_types[1], ref)

        phred_qual = VCFWriter._phred_capped(qual)
        phred_gq = VCFWriter._phred_capped(gq)

        return (chrm, st_pos, end_pos, ref, [rec_alt1, rec_alt2],
                genotype_list[index], phred_qual, phred_gq)

    @staticmethod
    def get_genotype_for_single_allele(records):
        """Resolve the genotype of a site from its first record.

        Only the first item of ``records`` is used; nothing is returned when
        ``records`` is empty. Fields 8-10 of the record are the probabilities
        of ``0/0``, ``0/1`` and ``1/1``.

        Returns:
            ``(chrm, st_pos, end_pos, ref, [alt1, alt2], genotype,
            phred_qual, phred_gq)``.
        """
        for rec in records:
            class_probs = [rec[8], rec[9], rec[10]]
            genotype_names = ['0/0', '0/1', '1/1']
            # Largest probability; ties go to the higher class index.
            gq, index = max((value, position)
                            for position, value in enumerate(class_probs))
            qual = sum(class_probs) - class_probs[0]
            site_ref = rec[3]
            typed_alt = (rec[4], rec[6])
            site_ref, alt1, alt2 = VCFWriter.solve_single_alt(
                typed_alt, site_ref)
            phred_qual = VCFWriter._phred_capped(qual)
            phred_gq = VCFWriter._phred_capped(gq)

            return (rec[0], rec[1], rec[2], site_ref, [alt1, alt2],
                    genotype_names[index], phred_qual, phred_gq)

    @staticmethod
    def get_proper_alleles(record):
        """Keep only the alts the genotype refers to and recompute the end.

        Genotypes ``'0/2'`` and ``'2/2'`` are rewritten to ``'0/1'`` and
        ``'1/1'``, and ``end_pos`` becomes ``st_pos + len(ref) - 1``.
        """
        chrm, st_pos, _old_end, ref, alt_field, genotype, phred_qual, phred_gq = record
        gts = genotype.split('/')

        kept_alts = []
        if gts[0] == '1' or gts[1] == '1':
            kept_alts.append(alt_field[0])
        if gts[0] == '2' or gts[1] == '2':
            kept_alts.append(alt_field[1])
        if gts[0] == '0' and gts[1] == '0':
            kept_alts.append('.')

        renumbered = {'0/2': '0/1', '2/2': '1/1'}
        final_gt = renumbered.get(genotype, genotype)

        new_end = st_pos + len(ref) - 1
        return (chrm, st_pos, new_end, ref, kept_alts, final_gt, phred_qual,
                phred_gq)

    @staticmethod
    def get_filter(record, last_end):
        """Return the FILTER value for an 8-field record.

        The first matching rule wins: start not after ``last_end``, genotype
        ``'0/0'``, QUAL at most 1, GQ at most 1; otherwise ``'PASS'``.
        """
        _chrm, st_pos, _end, _ref, _alts, genotype, phred_qual, phred_gq = record
        if st_pos <= last_end:
            verdict = 'conflictPos'
        elif genotype == '0/0':
            verdict = 'refCall'
        elif phred_qual <= 1:
            verdict = 'lowQUAL'
        elif phred_gq <= 1:
            verdict = 'lowGQ'
        else:
            verdict = 'PASS'
        return verdict

    def get_vcf_header(self, sample_name):
        """Build the VCF header: FILTER and FORMAT lines, contigs, one sample.

        Contig lines are taken from ``self.bam_handler.get_header_sq()``,
        using each entry's ``'SN'`` and ``'LN'`` values.
        """
        vcf_header = VariantHeader()

        # (ID, Description) of every FILTER line, in the order they are added.
        filter_lines = (
            ("PASS", "All filters passed"),
            ("refCall", "Call is homozygous"),
            ("lowGQ", "Low genotype quality"),
            ("lowQUAL", "Low variant call quality"),
            ("conflictPos", "Overlapping record"),
        )
        for filter_id, description in filter_lines:
            vcf_header.add_meta(key='FILTER',
                                items=[('ID', filter_id),
                                       ('Description', description)])

        # (ID, Type, Description) of every FORMAT line; Number is always 1.
        format_lines = (
            ("GT", 'String', "Genotype"),
            ("GQ", 'Float', "Genotype Quality"),
        )
        for format_id, value_type, description in format_lines:
            vcf_header.add_meta(key='FORMAT',
                                items=[('ID', format_id), ('Number', 1),
                                       ('Type', value_type),
                                       ('Description', description)])

        for entry in self.bam_handler.get_header_sq():
            contig_name = entry['SN']
            contig_length = entry['LN']
            vcf_header.add_meta(key='contig',
                                items=[('ID', contig_name),
                                       ('length', contig_length)])

        vcf_header.add_sample(sample_name)

        return vcf_header