Code example #1
def main(bedfile, output_file, genome_file):
    """
    Construct a VCF file from scratch using pysam, from a linkedSV
    bedpe input file describing the variants
    """
    contigs = get_contigs(genome_file)
    tmp_vcf = vcf_from_scratch(output_file, contigs)
    vcf_in = VariantFile(tmp_vcf)

    ashkenazim_son = "HG002"

    vcf_in.header.info.add('END',
                           number=1,
                           type='Integer',
                           description="End position of the variant "
                           "described in this record")
    vcf_in.header.info.add('SVTYPE',
                           number=1,
                           type='String',
                           description="Type of structural variant")
    vcf_in.header.info.add('SVMETHOD',
                           number=1,
                           type='String',
                           description="SV detection method")
    vcf_in.header.info.add(
        'NUM_FRAGMENT_SUPPORT',
        number=1,
        type='Integer',
        description="Number of fragments supporting the variant")
    vcf_in.header.info.add(
        'NUM_READ_PAIR',
        number=1,
        type='Integer',
        description="Number of read pairs supporting the variant")
    vcf_in.header.info.add('ENDPOINT1',
                           number=1,
                           type='String',
                           description="?")
    vcf_in.header.info.add('ENDPOINT2',
                           number=1,
                           type='String',
                           description="?")
    vcf_in.header.info.add(
        'BARCODES',
        number=1,
        type='String',
        description="List of molecules barcodes supporting the variant")
    vcf_in.header.formats.add('GT',
                              number=1,
                              type='String',
                              description="Genotype")
    vcf_in.header.add_sample(ashkenazim_son)

    records = reformat_bedpe2vcfrecords(bedfile, vcf_in.header)

    vcf_out = VariantFile(output_file, 'w', header=vcf_in.header)
    for rec in records:
        vcf_out.write(rec)

    os.remove(tmp_vcf)
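
The helpers get_contigs and vcf_from_scratch are not part of the snippet. A minimal sketch of what they might look like, assuming genome_file is a .fai-style index (contig name and length in the first two columns) and the temporary VCF only needs contig lines, is shown below; the real linkedSV helpers may differ.

# Hypothetical versions of get_contigs() and vcf_from_scratch(), inferred from how they are used above.
from pysam import VariantFile, VariantHeader


def get_contigs(genome_file):
    """Read contig names and lengths from a .fai-style index (name<TAB>length<TAB>...)."""
    contigs = {}
    with open(genome_file) as fh:
        for line in fh:
            fields = line.split("\t")
            contigs[fields[0]] = int(fields[1])
    return contigs


def vcf_from_scratch(output_file, contigs):
    """Write an empty VCF carrying only the contig lines and return its path."""
    tmp_vcf = output_file + ".tmp.vcf"
    header = VariantHeader()
    for name, length in contigs.items():
        header.contigs.add(name, length=length)
    with VariantFile(tmp_vcf, "w", header=header):
        pass
    return tmp_vcf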
Code example #2
def file_process(fname):
    try:
        cpath = fname.rstrip('\n')
        sys.stderr.write("Processing " + cpath + "\n")
        sys.stderr.flush()
        in_vcf = VariantFile(cpath)
        # pdb.set_trace()
        for cat in tbl_dict:
            for key in tbl_dict[cat]:
                getattr(in_vcf.header, cat)[key].remove_header()
                in_vcf.header.add_meta(cat_dict[cat],
                                       items=[('ID', key),
                                              ('Number',
                                               getattr(good_boy.header,
                                                       cat)[key].number),
                                              ('Type',
                                               getattr(good_boy.header,
                                                       cat)[key].type),
                                              ('Description',
                                               getattr(good_boy.header,
                                                       cat)[key].description)])
        # pdb.set_trace()
        out_vcf = VariantFile("-", 'w', header=in_vcf.header)
        for rec in in_vcf.fetch():
            out_vcf.write(rec)
        out_vcf.close()
    except Exception as e:
        sys.stderr.write(str(e) + "\n failed to process " + cpath + "\n")
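
tbl_dict, cat_dict and good_boy are module-level objects the snippet takes for granted. Judging from how they are used (look up a trusted Number/Type/Description and re-declare the corresponding header line), they could be set up roughly like this; the IDs and file name are placeholders, not from the original script.

# Hypothetical module-level setup for file_process(); keys and file names are illustrative only.
from pysam import VariantFile

good_boy = VariantFile("trusted_header_definitions.vcf.gz")  # VCF whose header metadata is taken as correct
cat_dict = {"info": "INFO", "formats": "FORMAT"}             # header attribute name -> meta key
tbl_dict = {"info": ["DP", "MQ"], "formats": ["AD"]}         # IDs to re-declare, per category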
Code example #3
def run_process(opts, inputvcf):
    outputvcf = opts.output

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add FORMAT to Header
    vcf_in.header.formats.add("AF", "A", "Float",
                              "Allele fractions of alternate alleles")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        vaf_list = list()
        for n, alt in enumerate(alts):
            # Compute the VAF of this alternate allele from AD
            # (AD[0] is the reference depth, AD[n + 1] the depth of allele n)
            ad = record.samples[0]['AD']
            tmp_vaf = float(ad[n + 1]) / float(ad[0] + ad[n + 1])
            vaf_list.append(tmp_vaf)

        if vaf_list:
            record.samples[0]["AF"] = tuple(vaf_list)

        # Write VCF
        vcf_out.write(record)
Code example #4
def filter_variants(vcf, read_ratio, depth, output):
    """
    Soft filter all variants with suspicious read ratio and insufficient read-depth
    """
    vcf_in = VariantFile(vcf)
    new_header = vcf_in.header
    new_header.filters.add(f"AR{read_ratio}", None, None,
                           f"Ratio of ref/alt reads lower than {read_ratio}")
    new_header.filters.add(f"DP{depth}", None, None, f"DP is lower than {depth}x")
    vcf_out = VariantFile(output, "w", header=new_header)

    for record in vcf_in.fetch():
        ad = record.samples[0]["AD"]
        #  No multiallelic split

        if record.info["DP"] < depth:
            record.filter.add("DP100")

        elif len(ad) == 2:
            n_ref, n_alt = ad
            if n_alt / (n_ref + n_alt) < read_ratio:
                record.filter.add(f"AR{read_ratio}")
            else:
                record.filter.add("PASS")
        vcf_out.write(record)
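
Because the filter IDs embed the thresholds, the same function works for any cut-off. A typical call might look like the following (file names and values are illustrative):

# Soft-filter a VCF at 30% alt-read ratio and 100x depth (hypothetical paths).
filter_variants("sample.vcf.gz", read_ratio=0.3, depth=100, output="sample.softfiltered.vcf.gz")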
Code example #5
def _dump_rebased_vcf(records: List[VariantRecord],
                      disco_paths: DiscoverPaths):
    template_vcf = VariantFile(disco_paths.discov_vcf)
    output_vcf = VariantFile(disco_paths.final_vcf,
                             "w",
                             header=template_vcf.header)
    for record in records:
        output_vcf.write(record)
Code example #6
File: VcfWriter.py  Project: mfallahi/pepper
class VCFWriter:
    def __init__(self, reference_file_path, contigs, sample_name, output_dir,
                 filename):
        self.fasta_handler = PEPPER_HP.FASTA_handler(reference_file_path)
        self.contigs = contigs
        vcf_header = self.get_vcf_header(sample_name, contigs)

        self.vcf_file = VariantFile(output_dir + filename + '.vcf',
                                    'w',
                                    header=vcf_header)

    def write_vcf_records(self, called_variant):
        contig, ref_start, ref_end, ref_seq, alleles, genotype = called_variant
        alleles = tuple([ref_seq]) + tuple(alleles)

        vcf_record = self.vcf_file.new_record(contig=str(contig),
                                              start=ref_start,
                                              stop=ref_end,
                                              id='.',
                                              qual=60,
                                              filter='PASS',
                                              alleles=alleles,
                                              GT=genotype,
                                              GQ=60)

        self.vcf_file.write(vcf_record)

    def get_vcf_header(self, sample_name, contigs):
        header = VariantHeader()
        items = [('ID', "PASS"), ('Description', "All filters passed")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "refCall"), ('Description', "Call is homozygous")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowGQ"), ('Description', "Low genotype quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowQUAL"),
                 ('Description', "Low variant call quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "conflictPos"), ('Description', "Overlapping record")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "GT"), ('Number', 1), ('Type', 'String'),
                 ('Description', "Genotype")]
        header.add_meta(key='FORMAT', items=items)
        items = [('ID', "GQ"), ('Number', 1), ('Type', 'Float'),
                 ('Description', "Genotype Quality")]
        header.add_meta(key='FORMAT', items=items)

        sqs = self.fasta_handler.get_chromosome_names()
        for sq in sqs:
            if sq not in contigs:
                continue
            sq_id = sq
            ln = self.fasta_handler.get_chromosome_sequence_length(sq)
            header.contigs.add(sq_id, length=ln)

        header.add_sample(sample_name)

        return header
Code example #7
File: VcfWriter.py  Project: kishwarshafin/jarvis
class VCFWriter:
    def __init__(self, bam_file_path, sample_name, output_dir):
        self.bam_handler = BamHandler(bam_file_path)
        bam_file_name = bam_file_path.rstrip().split('/')[-1].split('.')[0]
        vcf_header = self.get_vcf_header(sample_name)
        time_str = time.strftime("%m%d%Y_%H%M%S")

        self.vcf_file = VariantFile(output_dir + bam_file_name + '_' +
                                    time_str + '.vcf',
                                    'w',
                                    header=vcf_header)

    def write_vcf_records(self, called_variants):
        for variant in called_variants:
            alleles = tuple([variant.ref]) + tuple(variant.alternate_alleles)
            # print(str(chrm), st_pos, end_pos, qual, rec_filter, alleles, genotype, gq)
            vcf_record = self.vcf_file.new_record(contig=str(
                variant.chromosome_name),
                                                  start=variant.pos_start,
                                                  stop=variant.pos_end,
                                                  id='.',
                                                  qual=60,
                                                  filter='PASS',
                                                  alleles=alleles,
                                                  GT=variant.genotype,
                                                  GQ=60)
            self.vcf_file.write(vcf_record)

    def get_vcf_header(self, sample_name):
        header = VariantHeader()
        items = [('ID', "PASS"), ('Description', "All filters passed")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "refCall"), ('Description', "Call is homozygous")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowGQ"), ('Description', "Low genotype quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "lowQUAL"),
                 ('Description', "Low variant call quality")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "conflictPos"), ('Description', "Overlapping record")]
        header.add_meta(key='FILTER', items=items)
        items = [('ID', "GT"), ('Number', 1), ('Type', 'String'),
                 ('Description', "Genotype")]
        header.add_meta(key='FORMAT', items=items)
        items = [('ID', "GQ"), ('Number', 1), ('Type', 'Float'),
                 ('Description', "Genotype Quality")]
        header.add_meta(key='FORMAT', items=items)
        bam_sqs = self.bam_handler.get_header_sq()
        for sq in bam_sqs:
            id = sq['SN']
            ln = sq['LN']
            items = [('ID', id), ('length', ln)]
            header.add_meta(key='contig', items=items)

        header.add_sample(sample_name)

        return header
Code example #8
def convert_vcffile(filename, outfile_name, source):

    vcf_in = VariantFile(filename)
    vcf_out = VariantFile(outfile_name, "w", header=vcf_in.header)

    for rec in vcf_in.fetch():
        if source == "svaba":
            rec = convert_indel(rec)
        for conv_rec in convert_to_bnd(rec, source):
            vcf_out.write(conv_rec)
Code example #9
File: FilterVCF.py  Project: hobrien/LabNotes
def main(argv):
    bcf_in = VariantFile(argv[0])  # auto-detect input format
    bcf_out = VariantFile(argv[0] + '.filtered.vcf.gz', 'w', header=bcf_in.header)
    for site in bcf_in.fetch():
        keep_site = 0 # default option is to remove SNP
        for sample, rec in site.samples.items():
            if max(rec.get('GP')[1:]) > 0.9:
                keep_site = 1 # do not remove SNP if either het or non-ref hom is greater than .9 for any sample
        if keep_site:
            bcf_out.write(site)
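
The snippet is written as a script; the entry point is not shown, but it presumably just forwards the command-line arguments, for example:

import sys

if __name__ == '__main__':
    main(sys.argv[1:])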
Code example #10
def create_sample_format_from_info_lofreq(sample,
                                          input_name,
                                          output_name,
                                          skip_gt=False):
    input_vcf = VariantFile(input_name, 'r')
    input_vcf.header.formats.add("AF",
                                 number=1,
                                 type='Float',
                                 description="Allele Frequency")
    input_vcf.header.formats.add(
        "AD",
        number=".",
        type='String',
        description=
        "Allelic sample depths for the ref and alt alleles in the order listed"
    )
    input_vcf.header.formats.add(
        "DP",
        number=1,
        type='Integer',
        description=
        "Approximate read depth (reads with MQ=255 or with bad mates are filtered)"
    )
    input_vcf.header.formats.add(
        "DP4",
        number=4,
        type='Integer',
        description=
        "Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"
    )
    input_vcf.header.formats.add("GT",
                                 number=".",
                                 type="String",
                                 description="Genotype")

    input_vcf.header.add_sample(sample)
    output_vcf = VariantFile(output_name, 'w', header=input_vcf.header)
    for record in input_vcf:
        ad = record.info["AD"]
        af = record.info["AF"]
        dp = record.info["DP"]
        fields = {
            "AF": af,
            "DP4": record.info["DP4"],
            "DP": dp,
            "AD": ad,
            "GT": (record.alleles[1], record.alleles[0])
        }
        new_record = output_vcf.new_record(record.chrom, record.start,
                                           record.stop, record.alleles,
                                           record.id, record.qual,
                                           record.filter, record.info,
                                           [fields])
        output_vcf.write(new_record)
Code example #11
def decompose_multiallelic_record(in_vcf, out_vcf):
    """Break records with multiple ALT alleles into multiple records."""
    i_vcf = VariantFile(in_vcf, "r")
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=i_vcf.header)

    for record in i_vcf:
        # Only Mutect puts multiple ALTs in one record
        number_events = len(record.alts)
        # Temporary fix due to segfault
        # see https://github.com/leukgen/click_mergevcfs/issues/2
        if number_events >= 8:
            continue
        elif number_events > 1:
            click.echo("file={},pos={}".format(in_vcf, record.pos))
            for i in range(0, number_events):
                new_rec = record.copy()
                new_rec.alts = tuple([record.alts[i]])
                # Multiallelic-site GTs are e.g. 0/1/2, which causes errors later;
                # they need to be changed to ./.
                genotypes = list(record.samples)
                for g in genotypes:
                    # Overwrite GT
                    new_rec.samples[g]["GT"] = (None, None)
                    # Use none_if_tuple_out_of_idx because
                    # record.samples[g]['AD'] would sometimes return
                    # a tuple of (None,)
                    if "AD" in list(record.samples[g]):
                        new_rec.samples[g]["AD"] = (
                            record.samples[g]["AD"][0],
                            none_if_tuple_out_of_idx(t=record.samples[g]["AD"],
                                                     index=i + 1),
                        )
                    if "AF" in list(record.samples[g]):
                        new_rec.samples[g]["AF"] = none_if_tuple_out_of_idx(
                            t=record.samples[g]["AF"], index=i)
                    if "F1R2" in list(record.samples[g]):
                        new_rec.samples[g]["F1R2"] = (
                            record.samples[g]["F1R2"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F1R2"], index=i + 1),
                        )
                    if "F2R1" in list(record.samples[g]):
                        new_rec.samples[g]["F2R1"] = (
                            record.samples[g]["F2R1"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F2R1"], index=i + 1),
                        )
                o_vcf.write(new_rec)
        else:
            o_vcf.write(record)

    o_vcf.close()
    subprocess.check_call(["bgzip", "-f", raw_out])
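
none_if_tuple_out_of_idx is not shown. Based on the comment about AD sometimes coming back as (None,), a plausible sketch is:

# Hypothetical helper assumed by decompose_multiallelic_record(); inferred from its usage above.
def none_if_tuple_out_of_idx(t, index):
    """Return t[index], or None when the tuple is missing or shorter than expected."""
    return t[index] if t is not None and index < len(t) else None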
Code example #12
def vcf_merge_vcfs(in_vcf1, in_vcf2, happy_vcf, output_vcf):
    """
    Merge two vcf files
    :param in_vcf1: Input VCF file 1
    :param in_vcf2: Input VCF file 2
    :param happy_vcf: Hap.py input file
    :param output_vcf: Output VCF file
    :return:
    """
    happy_vcf_file = VariantFile(happy_vcf)

    # counter to keep track of true positive cases
    true_positive_positions = defaultdict(list)

    # filter the file
    for rec in happy_vcf_file.fetch():
        for sample in rec.samples:
            sample_bd = rec.samples[sample]['BD']
            if sample_bd == 'TP':
                # record a true positive case
                true_positive_positions[rec.contig].append(rec.pos)

    # read the two input files
    vcf1_vcf_file = VariantFile(in_vcf1)
    vcf2_vcf_file = VariantFile(in_vcf2)

    # for VCF1 we add all of the records.
    merged_records = []
    position_dict = set()
    for rec in vcf1_vcf_file.fetch():
        position_dict.add((rec.contig, rec.pos))
        merged_records.append((rec.contig, rec.pos, rec))

    # for VCF2 we add records that are not true positives
    for rec in vcf2_vcf_file.fetch():
        if rec.pos not in true_positive_positions[rec.contig]:
            if (rec.contig, rec.pos) not in position_dict:
                merged_records.append((rec.contig, rec.pos, rec))

    # sort the records
    merged_records.sort(key=operator.itemgetter(0, 1))

    # output the file
    vcf_out = VariantFile(output_vcf, 'w', header=vcf1_vcf_file.header)

    # write the VCF
    for contig, pos, rec in merged_records:
        vcf_out.write(rec)

    # process completed
    sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') +
                     "] INFO: PROCESS FINISHED " + "\n")
    sys.stderr.flush()
Code example #13
def add_contigs_to_header(input_name, output_name, contig_file, assembly):
    from src.lib.data.files.reference import InfoImporter
    info = InfoImporter(contig_file)
    input_vcf = VariantFile(input_name, 'r')
    for key in info:
        input_vcf.header.contigs.add(key,
                                     length=info[key]['length'],
                                     assembly=assembly)
    output_vcf = VariantFile(output_name, 'w', header=input_vcf.header)
    #output_vcf.header.info.add("AD", number=".", type='Integer', description="Allelic depths for the ref and alt alleles in the order listed")
    for record in input_vcf:
        output_vcf.write(record)
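
InfoImporter comes from the project's own src.lib.data.files.reference module. A minimal stand-in that is consistent with how it is indexed here, assuming a simple tab-separated "name length" contig file, could be:

# Minimal stand-in for InfoImporter, for illustration only; the real class may parse a richer format.
class InfoImporter(dict):
    def __init__(self, contig_file):
        super().__init__()
        with open(contig_file) as fh:
            for line in fh:
                name, length = line.split()[:2]
                self[name] = {"length": int(length)}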
Code example #14
def get_filtered_phased_het_trio_variants(trio_vcf, trio_filtered_het_phased_vcf, sample_name):
    vcf_in = VariantFile(trio_vcf)
    vcf_in.subset_samples([sample_name])
    vcf_out = VariantFile(trio_filtered_het_phased_vcf, 'w', header=vcf_in.header)
    
    for rec in vcf_in.fetch():
        if rec.filter.keys()[0] == 'PASS':
            rec_sample = rec.samples[0]
            if rec_sample.phased and rec_sample['GT'][0] != rec_sample['GT'][1]:
                rec.samples[0].update({'PS': 1})
                vcf_out.write(rec)
    return 0
Code example #15
def filter_somatic(in_vcf_path, out_vcf_path):
    in_vcf = VariantFile(in_vcf_path)
    out_vcf = VariantFile(out_vcf_path, 'w', header=in_vcf.header)
    num_skipped_records = 0
    for rec in in_vcf:
        if is_somatic(rec):
            try:
                out_vcf.write(rec)
            except OSError:
                num_skipped_records += 1
    print("Skipped " + str(num_skipped_records) + " bad records")
    in_vcf.close()
    out_vcf.close()
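
is_somatic is defined elsewhere and its exact criterion depends on the variant caller. One plausible sketch, assuming the caller marks somatic calls with a SOMATIC annotation, is:

# Hypothetical predicate assumed by filter_somatic(); the real test depends on the caller's annotations.
def is_somatic(rec):
    return rec.info.get("SOMATIC", False) or "SOMATIC" in rec.filter.keys()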
Code example #16
def main():
    vcf = VariantFile(snakemake.input.vcf)
    outlier_table = pd.read_table(snakemake.input.outliers)
    filtered = VariantFile(snakemake.output[0], mode='w', header=vcf.header)

    outliers = defaultdict(list)
    for idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    for record in remove_outliers(vcf, outliers):
        filtered.write(record)

    filtered.close()
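
remove_outliers is shared with the argparse version of this script in code example #25, but it is not shown here. From the way outliers is keyed by SV type, a sketch might look like the following; this is inferred from usage, not taken from the original pipeline.

# Hypothetical sketch of remove_outliers(); names and logic are assumptions.
def remove_outliers(vcf, outliers):
    for record in vcf:
        bad = set(outliers.get(record.info.get("SVTYPE"), []))
        # Null the genotypes of outlier samples for this SV type
        for sample in bad:
            if sample in record.samples:
                record.samples[sample]["GT"] = (None, None)
        # Keep the record only if a non-outlier sample still carries the variant
        carriers = [s for s in record.samples
                    if s not in bad and any(a for a in (record.samples[s]["GT"] or ()) if a)]
        if carriers:
            yield record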
Code example #17
def filter_bcf_file(self, bcf_file):
    bcf_in = VariantFile(bcf_file, 'rb')
    bcf_out = VariantFile("%s.target.vcf" % bcf_file[:-4], 'w', header=bcf_in.header)
    for rec in bcf_in.fetch():
        if rec.contig == self.contig_id:
            if self.contig_start == False and self.contig_end == False:
                pass
            else:
                if self.contig_start <= rec.pos <= self.contig_end:
                    bcf_out.write(rec)
    bcf_in.close()
    bcf_out.close()
Code example #18
def subset_by_callers(in_file, callers):
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(".gz", ""), "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        reader = VariantFile(in_file)
        writer = VariantFile(out_file, "w", header=reader.header)
        count = 0
        for rec in reader:
            cur_callers = set(rec.info["set"].split("-"))
            if len(cur_callers & want_callers) > 0:
                count += 1
                writer.write(rec)
        print(callers, count)
    return vcfutils.bgzip_and_index(out_file, {})
Code example #19
def subset_by_callers(in_file, callers):
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(
        ".gz", ""), "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        reader = VariantFile(in_file)
        writer = VariantFile(out_file, "w", header=reader.header)
        count = 0
        for rec in reader:
            cur_callers = set(rec.info["set"].split("-"))
            if len(cur_callers & want_callers) > 0:
                count += 1
                writer.write(rec)
        print(callers, count)
    return vcfutils.bgzip_and_index(out_file, {})
Code example #20
def add_AD_field_using_DP4(input_name, output_name):
    input_vcf = VariantFile(input_name, 'r')
    input_vcf.header.info.add(
        "AD",
        number=".",
        type='String',
        description=
        "Allelic depths for the ref and alt alleles in the order listed")
    output_vcf = VariantFile(output_name, 'w', header=input_vcf.header)
    #output_vcf.header.info.add("AD", number=".", type='Integer', description="Allelic depths for the ref and alt alleles in the order listed")
    for record in input_vcf:
        ref_pos, ref_neg, var_pos, var_neg = record.info['DP4']
        new_record = record.copy()
        new_record.info["AD"] = "{},{}".format(ref_pos + ref_neg,
                                               var_pos + var_neg)
        output_vcf.write(new_record)
Code example #21
def main(bedfile, output_file, genome_file):
    """
        Constructing a vcf file from scratch using the linkedSV
        bedpe inputfile that describes the variants
    """
    contigs = get_contigs(genome_file)
    tmp_vcf = vcf_from_scratch(output_file, contigs)
    vcf_in = VariantFile(tmp_vcf)

    ashkenazim_son = "HG002"

    vcf_in.header.info.add('END',
                           number=1,
                           type='Integer',
                           description="End position of the variant "
                           "described in this record")
    vcf_in.header.info.add('SVLEN',
                           number=1,
                           type='Integer',
                           description="Length of the variant "
                           "described in this record")
    vcf_in.header.info.add('SVTYPE',
                           number=1,
                           type='String',
                           description="Type of structural variant")
    vcf_in.header.info.add('SVMETHOD',
                           number=1,
                           type='String',
                           description="SV detection method")
    vcf_in.header.filters.add('FAIL',
                              number=None,
                              type=None,
                              description="Fail to pass filtering")
    vcf_in.header.formats.add('GT',
                              number=1,
                              type='String',
                              description="Genotype")
    vcf_in.header.add_sample(ashkenazim_son)

    records = reformat_bedpe2vcfrecords(bedfile, vcf_in.header)

    vcf_out = VariantFile(output_file, 'w', header=vcf_in.header)
    for rec in records:
        vcf_out.write(rec)

    os.remove(tmp_vcf)
Code example #22
def prepare_octopus_vcf_for_rtg(octopus_vcf, tumour_sample, out_vcf_name):
    """"
    Octopus reports non-diploid genotypes for somatic variants.
    """
    in_vcf = VariantFile(octopus_vcf)
    out_vcf = VariantFile(out_vcf_name, 'w', header=in_vcf.header)
    n_failed = 0
    for record in in_vcf:
        old_gt = record.samples[tumour_sample]['GT']
        assert (len(old_gt) > 1)
        somatic_allele = next(a for a in reversed(list(old_gt))
                              if a is not None and a > 0)
        record.samples[tumour_sample]['GT'] = (old_gt[0], somatic_allele)
        try:
            out_vcf.write(record)
        except OSError:
            n_failed += 1
    out_vcf.close()
    index(out_vcf_name)
Code example #23
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str,
                      output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records; bgzipped and tabix-indexed if the name ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    # setup
    total = 0
    written = 0
    tag = "oxog"

    # header
    header = generate_header(reference_fa, tag)

    # Writer
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=header)

    # Process
    try:
        with open(input_maf, "rt") as fh:
            for record in maf_generator(fh):
                total += 1
                if record["oxoGCut"] == "1":
                    new_vcf_record = build_new_record(record, writer, tag)
                    writer.write(new_vcf_record)
                    written += 1

    finally:
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tbx = tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
Code example #24
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variants, with the aim
    of reducing ambiguity about confident variants in the merged vcf.
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)

    i_vcf = VariantFile(in_vcf, "rb")
    new_header = i_vcf.header.copy()
    try:
        new_header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
        i_vcf.header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
    except ValueError:
        pass

    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=new_header)

    for record in i_vcf:
        new_rec = record.copy()
        filters = list(record.filter)
        if filters and filters[0] == "PASS":
            new_rec.info["PASSED_{}".format(caller)] = 1
        o_vcf.write(new_rec)

    o_vcf.close()

    subprocess.check_call(["bgzip", "-f", raw_out])
Code example #25
def main():
    parser = argparse.ArgumentParser("find_outliers.py")
    parser.add_argument("input", type=str, help="list of samples names")
    parser.add_argument("output", type=str, help="list of samples names")
    parser.add_argument("outliers", type=str, help="list of samples names")

    args = parser.parse_args()

    #vcf = VariantFile(snakemake.input.vcf)
    vcf = VariantFile(args.input)

    outlier_table = pd.read_table(args.outliers)
    filtered = VariantFile(args.output, mode='w', header=vcf.header)

    outliers = defaultdict(list)
    for idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    for record in remove_outliers(vcf, outliers):
        filtered.write(record)

    filtered.close()
Code example #26
def run_process(opts, inputvcf):
    outputvcf = opts.output

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    vcf_in.header.info.add("HGVS_p", ".", "String",
                           "HGVS.p Information (Single Character Amino Acid)")
    vcf_in.header.info.add("variant_type", ".", "String",
                           "Variant Type for Tiering System")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        new_hgvsp = []
        if "ANN" in record.info:
            # Get HGVS.p
            anns = record.info["ANN"]
            for annstring in anns:
                ann = annstring.split("|")
                #ann[6] = ann[6].split(".")[0]
                #print annstring
                #print "|".join(ann)
                new_hgvsp_tmp = convert_hgvsp(ann[10])
                if new_hgvsp_tmp == "" or new_hgvsp_tmp == None:
                    new_hgvsp_tmp = '.'
                new_hgvsp.append(new_hgvsp_tmp)

            new_hgvsp_string = ",".join(new_hgvsp)
            record.info["HGVS_p"] = new_hgvsp_string

        # Write VCF
        vcf_out.write(record)
Code example #27
def write_rephased_tenx_vcf(tenx_vcf, tenx_records, tenx_phase_sets, threshold,
                            workdir):
    """ Writes new 10X VCF file and switches genotypes if logratios above /
    below threshold """
    basename = os.path.basename(tenx_vcf)
    if basename.endswith('.vcf'):
        offset = -4
    elif basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return
    tenx_rephased_vcf = workdir + '/' + basename[:offset] + '.filtered.het.rephased.vcf'

    vcf_in = VariantFile(tenx_vcf)
    vcf_out = VariantFile(tenx_rephased_vcf, 'w', header=vcf_in.header)

    for ps_id in tenx_phase_sets:
        if tenx_phase_sets[ps_id].rephased:
            chrom = tenx_phase_sets[ps_id].chrom
            if tenx_phase_sets[ps_id].log2ratio >= threshold:
                for pos in tenx_phase_sets[ps_id].positions:
                    tenx_records[chrom + ':' + str(pos)].samples[0]['PS'] = 1
                    vcf_out.write(tenx_records[chrom + ':' + str(pos)])
            elif tenx_phase_sets[ps_id].log2ratio <= -threshold:
                for pos in tenx_phase_sets[ps_id].positions:
                    tenx_records[chrom + ':' + str(pos)].samples[0]['PS'] = 1
                    GT_swapped = (tenx_records[chrom + ':' +
                                               str(pos)].samples[0]['GT'][1],
                                  tenx_records[chrom + ':' +
                                               str(pos)].samples[0]['GT'][0])
                    tenx_records[chrom + ':' +
                                 str(pos)].samples[0]['GT'] = GT_swapped
                    tenx_records[chrom + ':' +
                                 str(pos)].samples[0].phased = True
                    vcf_out.write(tenx_records[chrom + ':' + str(pos)])
    return tenx_rephased_vcf
Code example #28
def run_process(opts, mutect2_vcf, pindel_vcf):
    outputvcf = opts.output

    # Open VCF
    mutect2 = VariantFile(mutect2_vcf)
    pindel = VariantFile(pindel_vcf)

    # Add pindel header to new header
    new_header = mutect2.header
    new_header_keys = new_header.info.keys()
    for item in pindel.header.info.items():
        if item[1].name in new_header_keys:
            continue
        else:
            new_header.info.add(item[1].name, item[1].number, item[1].type, item[1].description)

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=new_header)

    pindel_record_list = list()
    for p in pindel.fetch():
        tmp = vcf_out.new_record()
        tmp.chrom = p.chrom
        tmp.pos = p.pos
        tmp.ref = p.ref
        tmp.alts = p.alts
        for key in p.info.keys():
            tmp.info[key] = p.info[key]
        for key in p.format.keys():
            tmp.samples[0][key] = p.samples[0][key]
        tmp.samples[0]["AF"] = float(tmp.samples[0]["AD"][1]) / float(tmp.samples[0]["AD"][0] + tmp.samples[0]["AD"][1])
        tmp.info["DP"] = tmp.samples[0]["AD"][0] + tmp.samples[0]["AD"][1]
        pindel_record_list.append(tmp)

    oldchrom = 1
    for record in mutect2.fetch():
        chrom = record.chrom
        pos = record.pos
        alts = record.alts

        for i,record2 in enumerate(pindel_record_list):
            oldchrom = int(record2.chrom.replace("chr",""))
            if record2.chrom == chrom and record2.pos == pos and record2.alts == alts:
                del(pindel_record_list[i])
            elif record2.chrom == chrom and record2.pos > pos:
                break
            elif record2.chrom == chrom and record2.pos < pos:
                vcf_out.write(record2)
                del(pindel_record_list[i])
            elif oldchrom < int(chrom.replace("chr","")):
                vcf_out.write(record2)
                del(pindel_record_list[i])

        vcf_out.write(record)
Code example #29
def run_process(opts, inputvcf):
    outputvcf = opts.output
    popfreq = float(opts.popfreq)

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    vcf_in.header.info.add("ngb_popmaf_snp_db_cnt",".","Integer","Population Database Count above setting MAF")
    vcf_in.header.info.add("ngb_popmaf_snp_db_list",".","String","Population Database List above setting MAF")
    vcf_in.header.info.add("ngb_popmaf_snp_db_eastasian",".","String","East Asian Exist Flag above setting MAF")
    vcf_in.header.info.add("ngb_popmaf_snp_db_korean",".","String","Korean Exist Flag above setting MAF")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=vcf_in.header)

    for record in vcf_in.fetch():
        record_data = OrderedDict()
        record_value = list()

        # Check Population MAF
        for key in freq_check_list:
            try:
                value = record.info[key]
                if type(value) == list or type(value) == tuple:
                    value2 = float(value[0])
                else:
                    value2 = float(value)
                if value2 >= popfreq:
                    record_data[key] = value2
            except:
                continue

        # Check ESP6500
        try:
            value_list = record.info['esp6500_MAF']
            if float(value_list[2]) / 100 >= popfreq:
                record_data['esp6500_MAF_ALL'] = float(value_list[2]) / 100
            if float(value_list[1]) / 100 >= popfreq:
                record_data['esp6500_MAF_AA'] = float(value_list[1]) / 100
            if float(value_list[0]) / 100 >= popfreq:
                record_data['esp6500_MAF_EA'] = float(value_list[0]) / 100
        except:
            pass

        for key in record_data:
            record_value.append(key)
        filtered_db_list = '|'.join(record_value)

        if filtered_db_list == '':
            filtered_db_list = '.'

        record.info['ngb_popmaf_snp_db_list'] = filtered_db_list
        record.info['ngb_popmaf_snp_db_cnt'] = len(record_data)
        if "EAS" in filtered_db_list:
            record.info['ngb_popmaf_snp_db_eastasian'] = 'Y'
        else:
            record.info['ngb_popmaf_snp_db_eastasian'] = 'N'
        if ("KRGDB" in filtered_db_list) or ("KoEXID" in filtered_db_list):
            record.info['ngb_popmaf_snp_db_korean'] = 'Y'
        else:
            record.info['ngb_popmaf_snp_db_korean'] = 'N'

        # Write VCF
        vcf_out.write(record)
Code example #30
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template = pkg_resources.resource_filename('svtools',
                                               'data/standard_template.vcf')
    template = VariantFile(template)
    vcf = VariantFile(args.vcf)

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add samples from VCF being standardized
    header = template.header
    for sample in vcf.header.samples:
        header.add_sample(sample)

    # Tag source in header
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout)
    idx = 1
    for record in standardizer.standardize_vcf():
        if any_called(record) or args.include_reference_sites:
            if args.prefix is not None:
                record.id = '{0}_{1}'.format(args.prefix, idx)
                idx += 1

            fout.write(record)

    #  for std_rec in standardize_vcf(vcf, fout):
    #  fout.write(std_rec)

    fout.close()
    vcf.close()
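
any_called is not included in the snippet. Matching the --include-reference-sites help text, a sketch could be:

# Hypothetical sketch of any_called(): True when any sample genotype contains a non-reference allele.
def any_called(record):
    for sample in record.samples.values():
        if any(allele not in (None, 0) for allele in sample.get("GT", ())):
            return True
    return False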
Code example #31
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs',
                        type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size',
                        type=int,
                        default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites',
                        action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')
    parser.add_argument('--sample-names',
                        type=str,
                        default=None,
                        help='Comma-delimited list of sample names to use in '
                        'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        for line in args.contigs:
            contig, length = line.split()[:2]
            header.add_line(contig_line.format(**locals()))
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename('svtk',
                                                   'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Parse new sample names if provided
    if args.sample_names:
        sample_names_list = args.sample_names.split(',')
    else:
        sample_names_list = vcf.header.samples

    # Tag source in header
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout,
                                          sample_names_list, args.prefix,
                                          args.min_size,
                                          args.include_reference_sites,
                                          args.call_null_sites)

    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()