コード例 #1
0
def annotate_false_negs(folder):
    """
    Get information for any false negative results - returns basic variant info plus quality and genotype.

    The records of ``<folder>/0000.vcf`` are read once. The record count and a short status line are
    printed before the per-variant dictionaries are built.

    :param folder: Folder containing output from bcftools isec
    :return: list of variant dictionaries (keys: chrom, pos, ref, alt, QUAL, GT) for the false negatives;
             empty when the file holds no records
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    try:
        # Materialise the records once instead of scanning the file a second time just to count them.
        records = list(false_negs.fetch())
        print(len(records))

        if not records:
            print('no false negatives')
            return []

        print('false negatives')
        variants = []
        for rec in records:
            variants.append({
                'chrom': rec.contig,
                'pos': int(rec.pos),
                'ref': rec.alleles[0],
                # Only the first alternate allele is reported.
                'alt': rec.alleles[1],
                'QUAL': rec.qual,
                # The genotype is always read from the sample column named 'INTEGRATION'.
                'GT': rec.samples['INTEGRATION']['GT'],
            })
        return variants
    finally:
        false_negs.close()
コード例 #2
0
def gen_report(vcf, sample, ref_flag):
    """
    Write a tab-separated report ``<sample>.germline.vep91.xls`` from a VEP-annotated VCF.

    The positions of the wanted annotation sub-fields are looked up in the 'ANN' INFO header description,
    then every record is handed to ``output_highest_impact`` together with its split annotations.

    :param vcf: path of the annotated VCF to read
    :param sample: prefix used for the output file name
    :param ref_flag: 'n' to skip building an index, otherwise a value passed to ``create_index``
    :return: 0 once the report has been written
    """
    vcf_in = VariantFile(vcf)
    # Maps each wanted ANN sub-field to its column index; CADD_PHRED collects every index it appears at.
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'HGVSg': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'BIOTYPE': 0, 'SIFT': 0, 'Existing_variation': 0, 'VARIANT_CLASS': 0,
               'gnomAD_AF': 0, 'CLIN_SIG': 0, 'CADD_PHRED': []}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    for i, field in enumerate(desc_string.split('|')):
        if field not in desired:
            continue
        if field == 'CADD_PHRED':
            desired[field].append(i)
        else:
            desired[field] = i

    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)

    # The context manager guarantees the report is flushed and closed even if a record fails to process.
    with open(sample + '.germline.vep91.xls', 'w') as out:
        out.write('CHROM\tPOS\tREF\tAllele\tTotal Allele Count\tTotal Position Coverage\tGene\tHGVSg\tTranscript_id'
                  '\tEffect\tIMPACT\tBIOTYPE\tCodons\tAmino_acids\tExisting_variation\tVARIANT_CLASS\tSIFT\tgnomAD_AF'
                  '\tCLIN_SIG\tCADD_PHRED\n')
        for record in vcf_in.fetch():
            # Only the first alternate allele is reported.
            (chrom, pos, ref, alt, alt_ct, tot_ct) = (record.contig, str(record.pos), record.ref, record.alts[0],
                                                      str(record.info['TR']), str(record.info['TC']))
            ann_list = [_.split('|') for _ in record.info['ANN']]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, tot_ct, ann_list, desired, out, ref_flag)
    return 0
コード例 #3
0
def gen_report(vcf):
    """
    Write ``<prefix>.indels.vep.prioritized_impact.report.xls`` for a VEP-annotated indel VCF.

    ``<prefix>`` is the part of the VCF base name before the first dot. Progress is logged to
    ``LOGS/<prefix>.indels.vep_priority.report.log``.

    :param vcf: path of the annotated VCF to read
    :return: 0 once the report has been written
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)

    # Maps each wanted ANN sub-field to its column index within a '|'-separated annotation.
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    # enumerate() replaces the xrange index loop, so this also runs on Python 3.
    for i, field in enumerate(desc_string.split('|')):
        if field in desired:
            desired[field] = i

    # The context manager guarantees the report is flushed and closed even if a record fails to process.
    with open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w') as out:
        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
                  '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
        for record in vcf_in.fetch():
            # Only the first alternate allele is reported.
            (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (
                record.contig, str(record.pos), record.ref, record.alts[0],
                str(record.info['MINCOV']), str(record.info['ALTCOV']), str(record.info['COVRATIO']))
            # Here ANN is one comma-separated string, so it is split into annotations first.
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
コード例 #4
0
def _lookup_base_coverage(coverage_file, contig, position):
    """Return the whitespace-split fields of the grep hit for contig/position, or [] when nothing matches."""
    pattern = contig + '\\s' + str(position)
    try:
        # An argument list (no shell) keeps contig names and file paths from being interpreted by a shell.
        # -w rejects partial hits such as position 99 matching 990 or contig chr1 matching xchr1.
        output = subprocess.check_output(['grep', '-w', '-e', pattern, coverage_file],
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        if e.returncode == 1:
            # grep exits with status 1 when no line matched; that is "no coverage", not a failure.
            return []
        print('Error executing command: ' + str(e.returncode))
        exit(1)
    return output.split()


def annotate_false_pos(folder, coverage_file, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype, coverage
    (total, ref base and alt base if appropriate)

    Coverage is looked up with grep for single-base substitutions only, using position ``rec.pos - 1``.
    Indels get a placeholder coverage entry.

    :param folder: Folder containing output from bcftools isec
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :param sample: container ID used in vcf file
    :return: list of variant dictionaries containing information on false positives
    """
    # Columns of a coverage line holding the depth of each base; column 2 is the total depth.
    base_columns = {'A': 3, 'C': 4, 'G': 5, 'T': 6}

    false_pos = VariantFile(folder + '/0001.vcf')
    try:
        # Read the records once rather than scanning the file a second time just to count them.
        records = list(false_pos.fetch())
        print(len(records))

        if not records:
            print('no false positives')
            return []

        print('false positives')
        variants = []
        for rec in records:
            call = rec.samples[sample]
            ref, alt = rec.alleles[0], rec.alleles[1]
            genotype = call['GT']
            allelic_depth = call['AD'] if 'AD' in call.keys() else 'N/A'
            total_depth = call['DP']

            if len(ref) == 1 and len(alt) == 1:
                fields = _lookup_base_coverage(coverage_file, rec.contig, rec.pos - 1)
                if fields:
                    coverage = {'total': fields[2], 'ref': fields[base_columns[ref]],
                                'alt': fields[base_columns[alt]]}
                else:
                    coverage = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
            else:
                coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

            variants.append({'chrom': rec.contig, 'pos': int(rec.pos), 'ref': ref, 'alt': alt,
                             'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                             'coverage': coverage})
        return variants
    finally:
        false_pos.close()
コード例 #5
0
def check_genotype(folder, sample):
    """
    Compare the genotype of every shared variant between the GIAB call set and the patient call set.

    ``<folder>/0002.vcf`` supplies the GIAB genotypes (sample column 'INTEGRATION') and
    ``<folder>/0003.vcf`` supplies the patient genotypes (sample column ``sample``).

    :param folder: location of results from the NGS analysis pipeline
    :param sample: sample number (used in vcf file)
    :return: dictionary with 'matching' (count) and 'mismatching' (list of detail dictionaries)
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')

    mismatches = []

    # chrom -> pos -> alleles -> GIAB genotype; the first record seen for a key wins.
    vars_giab = {}
    for giab_rec in shared_giab.fetch():
        giab_alleles = giab_rec.alleles
        by_alleles = vars_giab.setdefault(giab_rec.contig, {}).setdefault(giab_rec.pos, {})
        if giab_alleles not in by_alleles:
            by_alleles[giab_alleles] = giab_rec.samples['INTEGRATION']['GT']

    matching = 0
    for rec in shared_patient.fetch():
        chrom, pos, alleles = rec.contig, rec.pos, rec.alleles
        call = rec.samples[sample]
        allelic_depth = call['AD'] if 'AD' in call.keys() else 'N/A'
        total_depth = call['DP']
        giab_genotype = vars_giab[chrom][pos][alleles]
        gt = call['GT']

        # NOTE(review): allele order is ignored only when the first patient allele is None or 1, plus the
        # explicit 0/1 vs 1/0 cases; other reorderings (e.g. (0, 2) vs (2, 0)) count as mismatches -
        # confirm this is intended.
        same_genotype = (
            gt == giab_genotype
            or ((gt[0] is None or gt[0] == 1) and gt[0] == giab_genotype[1] and gt[1] == giab_genotype[0])
            or (gt[0] == 0 and gt[1] == 1 and giab_genotype[0] == 1 and giab_genotype[1] == 0)
            or (gt[0] == 1 and gt[1] == 0 and giab_genotype[0] == 0 and giab_genotype[1] == 1)
        )
        if same_genotype:
            matching += 1
            continue

        mismatches.append({'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                           'GT': {sample: gt, 'GIAB': giab_genotype},
                           'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})

    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': mismatches}
    print(results)
    return results
コード例 #6
0
def subset_by_callers(in_file, callers):
    """
    Write the records of ``in_file`` whose INFO 'set' value names at least one of ``callers``.

    The output is ``<in_file minus .vcf/.gz>-<callers joined by '_'>.vcf``. It is only regenerated
    when neither it nor its ``.gz`` form exists.

    :param in_file: path of the input VCF
    :param callers: iterable of caller names to keep
    :return: whatever ``vcfutils.bgzip_and_index`` returns for the output file
    """
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(".gz", ""), "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        count = 0
        reader = VariantFile(in_file)
        try:
            writer = VariantFile(out_file, "w", header=reader.header)
            try:
                for rec in reader:
                    # INFO 'set' is a '-'-separated list of caller names.
                    if want_callers.intersection(rec.info["set"].split("-")):
                        count += 1
                        writer.write(rec)
            finally:
                # Close before compressing so that all buffered records are on disk.
                writer.close()
        finally:
            reader.close()
        print("%s %s" % (callers, count))
    return vcfutils.bgzip_and_index(out_file, {})
コード例 #7
0
def gen_report(vcf, out, c, ref_flag):
    """
    Write ``<prefix>.subsitutions.vep.prioritized_impact.report.xls`` for a VEP-annotated VCF.

    ``<prefix>`` is the part of the VCF base name before the first dot. Progress is logged to
    ``LOGS/<prefix>.subsitutions.vep.priority_report.log``.

    :param vcf: path of the annotated VCF to read
    :param out: value passed to ``create_mutect_ind`` to build the added mutect info index
    :param c: 'n' for no target file, otherwise a value passed to ``create_target``; when given,
              records marked 'OFF' by ``mark_target`` are skipped
    :param ref_flag: 'n' to skip building an index, otherwise a value passed to ``create_index``
    :return: 0 once the report has been written
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.subsitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)

    # Maps each wanted ANN sub-field to its column index within a '|'-separated annotation.
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    for i, field in enumerate(desc_string.split('|')):
        if field in desired:
            desired[field] = i

    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)

    # A separate name for the report handle avoids rebinding the ``out`` parameter.
    # The context manager closes the file even if a record fails to process.
    with open(parts[0] + '.subsitutions.vep.prioritized_impact.report.xls', 'w') as report:
        report.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                     'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
                     'codon_change\tamino_acid_change\ton/off-target\n')
        for record in vcf_in.fetch():
            # Only the first alternate allele is reported.
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [_.split('|') for _ in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report, ref_flag)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
コード例 #8
0
ファイル: variant_predict.py プロジェクト: aghozlane/MBMA
def get_variants(filename):
    """
    Parse the sample VCF file and collect, per representative gene, the alleles
    supported by reads according to the 'AD' tag (read counts for the reference
    and alternative alleles).

    Args:
        filename [string] = sample filename

    Returns:
        var [dict] = snp variation information of the representative genes
                    key : representative gene name (rec.chrom)
                    value : variant [dict] with the snp position as key and a
                            list of (nucleotide variant, aligned reads number)
                            tuples as value; alleles with a count of 0 are
                            left out
    """
    vcf = VariantFile(filename)
    var = {}

    for rec in vcf.fetch():
        # Register the gene as soon as one of its records is seen. Storing it
        # only when the gene name changed dropped the last gene of the file.
        variant = var.setdefault(rec.chrom, defaultdict(list))
        for _sample_name, obj in rec.samples.items():
            if 'AD' not in obj:
                continue
            # AD is indexed like rec.alleles: reference first, then alternates.
            for i, nb in enumerate(obj['AD']):
                if nb != 0:
                    variant[rec.pos].append((rec.alleles[i], nb))

    return var
コード例 #9
0
ファイル: variant_predict.py プロジェクト: aghozlane/MBMA
def parse_vcf(filename):
    """
    Parse a database VCF file obtained by variant calling on a multiple
    alignment file, and build a matrix holding the variant of every clustered
    gene at each snp position.

    Args :
        filename [string] = VCF filename

    Returns:
        A tuple ``(name, [index, matrix])`` where
        name [string] = representative gene name (rec.chrom of the last
                        record; 0 when the file has no records)
        index [dict] = index of each snp position in the matrix lists:
                       key : snp position
                       value : index of the snp in the lists of matrix
        matrix [dict] = all variations
                          key : clustered gene (sample name)
                          value : list of the nucleotide variation, with ''
                                  where the first allele is missing
    """
    vcf = VariantFile(filename)
    index = {}
    matrix = defaultdict(list)
    name = 0

    for i, rec in enumerate(vcf.fetch()):
        name = rec.chrom  # representative gene name
        index[rec.pos] = i
        for gene, obj in rec.samples.items():
            indices = obj.allele_indices
            snp = indices[0] if indices else None
            # A missing allele is reported as -1 or as None depending on the
            # pysam version; neither may be used to index rec.alleles.
            if snp is None or snp == -1:
                matrix[gene].append('')
            else:
                matrix[gene].append(rec.alleles[snp])

    return name, [index, matrix]
コード例 #10
0
ファイル: GWASCatalog.py プロジェクト: welchr/Swiss
  def variants_missing_vcf(self,vcf_file):
    """
    Return the catalog rows whose variant is not present in the given VCF(s).

    vcf_file is either the path of one VCF, or (when the path contains '.json')
    a JSON file mapping chromosome to VCF path. Variants are compared by the
    identifier "chrom:pos_ref/alt" stored in the self.col_epacts column.
    """
    cat_chroms = set(self.data[self.col_chr].unique())
    cat_variants = set(self.data[self.col_epacts].unique())

    # The chromosome -> VCF map does not depend on the chromosome, so parse it once
    # instead of re-reading the JSON file on every loop iteration.
    vcf_dict = None
    if '.json' in vcf_file:
      import json
      with open(vcf_file) as jsin:
        vcf_dict = json.load(jsin)

    vcf_variants = set()
    for cat_chrom in cat_chroms:
      sys.stderr.write("Checking chromosome %s...\n" % str(cat_chrom))

      if vcf_dict is not None:
        vcf = vcf_dict.get(cat_chrom)
        if vcf is None:
          warning("GWAS catalog has variants on chromosome %s, but could not find this chromosome in your VCF (or JSON) file: %s" % (cat_chrom,vcf_file))
          continue
      else:
        vcf = vcf_file

      vcf_pysam = VariantFile(vcf)
      try:
        # Subset catalog to chromosome
        df_cat_for_chrom = self.data.query("{} == '{}'".format(self.col_chr,cat_chrom))

        # Catalog has repeated rows for variants depending on the number of traits * citations
        # But we just need each variant once
        df_cat_for_chrom = df_cat_for_chrom.drop_duplicates(self.col_epacts)

        # Loop over subsetted catalog, check if variant is in VCF
        for idx, row in df_cat_for_chrom.iterrows():
          chrom, pos = row[self.col_chr], row[self.col_pos]

          # NOTE(review): current pysam records expose `alts` (a tuple) rather than `alt`, and
          # fetch() takes 0-based half-open coordinates - confirm against the pysam version in use.
          for rec in vcf_pysam.fetch(chrom,pos,pos):
            epacts = "{}:{}_{}/{}".format(rec.chrom,rec.pos,rec.ref,rec.alt)
            vcf_variants.add(epacts)
      finally:
        # One file is opened per chromosome; release it before moving on.
        vcf_pysam.close()

    missing_variants = cat_variants.difference(vcf_variants)
    missing_rows = self.data[self.data[self.col_epacts].isin(missing_variants)]

    return missing_rows
コード例 #11
0
def annotate_false_pos(folder, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype and
    the depth values recorded in the VCF.

    The records of ``<folder>/0001.vcf`` are read once. The record count and a short status line are
    printed before the per-variant dictionaries are built.

    :param folder: Folder containing output from bcftools isec
    :param sample: container ID used in vcf file
    :return: list of variant dictionaries containing information on false positives
    """
    false_pos = VariantFile(folder + '/0001.vcf')
    try:
        # Materialise the records once instead of scanning the file a second time just to count them.
        records = list(false_pos.fetch())
        print(len(records))

        if not records:
            print('no false positives')
            return []

        print('false positives')
        variants = []
        for rec in records:
            call = rec.samples[sample]
            genotype = call['GT']
            # AD is optional in the sample columns; DP is required.
            allelic_depth = call['AD'] if 'AD' in call.keys() else 'N/A'
            total_depth = call['DP']
            variants.append({'chrom': rec.contig, 'pos': int(rec.pos), 'ref': rec.alleles[0],
                             'alt': rec.alleles[1], 'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})
        return variants
    finally:
        false_pos.close()
コード例 #12
0
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variant in aim of reduce
    ambiguity of confident variants in the merged vcf.

    The records are written uncompressed to ``out_vcf`` without its ``.gz``
    suffix, and that file is then compressed in place with ``bgzip -f``.

    :param in_vcf: path of the input VCF; also passed to ``get_caller``
    :param out_vcf: path of the output VCF, with or without a ``.gz`` suffix
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)
    passed_tag = "PASSED_{}".format(caller)

    i_vcf = VariantFile(in_vcf, "rb")
    new_header = i_vcf.header.copy()
    try:
        new_header.info.add(
            passed_tag,
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
        i_vcf.header.info.add(
            passed_tag,
            ".",
            "Flag",
            "this variants passed which caller(s)",
        )
    except ValueError:
        # Deliberate best effort; presumably raised when the field is already
        # declared in the header - TODO confirm.
        pass

    # Remove only a trailing ".gz". str.strip(".gz") strips any of the characters
    # '.', 'g' and 'z' from BOTH ends, e.g. "zebra.vcf.gz" became "ebra.vcf".
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=new_header)

    try:
        for record in i_vcf:
            new_rec = record.copy()
            filters = list(record.filter)
            if filters and filters[0] == "PASS":
                new_rec.info[passed_tag] = 1
            o_vcf.write(new_rec)
    finally:
        # Close both files so the output is complete on disk before bgzip runs.
        o_vcf.close()
        i_vcf.close()

    subprocess.check_call(["bgzip", "-f", raw_out])
0
def main():
    """
    Print the VCF named by the first command-line argument without its sample
    columns, adding a CAF INFO value derived from EUR_AF to every record.

    CAF is the rounded reference frequency (1 - sum of EUR_AF) followed by the
    rounded EUR_AF values, comma separated.
    """
    vcf = VariantFile(sys.argv[1], 'r', drop_samples=True)

    caf_header_line = ('##INFO=<ID=CAF,Number=.,Type=String,Description="An ordered, comma delimited list of '
                       'allele frequencies, starting with the reference allele followed by alternate alleles '
                       'as ordered in the ALT column.">')
    vcf.header.add_line(caf_header_line)

    # FORMAT declarations are removed from the header.
    for header_record in vcf.header.records:
        if header_record.key == "FORMAT":
            header_record.remove()

    # The last two elements of the split header text are dropped and the column line
    # is printed explicitly, without FORMAT or sample columns.
    header_lines = str(vcf.header).split('\n')
    print('\n'.join(header_lines[:-2]))
    print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

    for record in vcf:
        # NOTE(review): a record without EUR_AF makes sum() fail on None - confirm inputs always carry it.
        afs = record.info.get('EUR_AF')
        ref_freq = round(1.0 - sum(afs), 3)
        alt_freqs = ','.join([str(round(af, 3)) for af in afs])
        record.info['CAF'] = '{},{}'.format(ref_freq, alt_freqs)
        print(record, end='')
コード例 #14
0
def main():
    """
    Command-line entry point: print a VCF, tagging each record with a GENE INFO value.

    Records whose first sample has 0 as its first genotype allele are skipped
    unless ``-a`` is given. Passing the literal string "None" as the GTF path
    disables gene tagging.
    """
    parser = argparse.ArgumentParser(
        description="Tags the variants in a VCF with the corresponding gene.")
    parser.add_argument("-a", dest="all_flag", action="store_true",
                        help="Print all variants, not only the ones with genotype>0")
    parser.add_argument("vcf_path", metavar="VCF", type=str, help="Path to VCF file")
    parser.add_argument("gtf_path", metavar="GTF", type=str, help="Path to GTF file")
    args = parser.parse_args()

    with_anno = args.gtf_path != "None"
    if with_anno:
        gtf = open_gtf(args.gtf_path)
        cdss = extract_cdss(gtf)

    vcf = VariantFile(args.vcf_path, 'r', drop_samples=False)

    # The GENE declaration is added to the header whether or not annotation is enabled.
    vcf.header.add_line('##INFO=<ID=GENE,Number=1,Type=String,Description="Genic region">')
    header_lines = str(vcf.header).split('\n')
    print('\n'.join(header_lines[:-1]))

    for record in vcf:
        first_allele = record.samples[0]['GT'][0]
        if first_allele == 0 and not args.all_flag:
            continue
        if with_anno:
            record.info["GENE"] = get_gene_name(cdss, record.pos)
        print(record, end='')
コード例 #15
0
ファイル: validate.py プロジェクト: gberriz/bcbio-nextgen
def _pick_best_quality_score(vrn_file):
    """Choose the quality metric to use for validation from what the file provides.

    The leading records of the file are inspected and the first available key,
    in this priority order, is returned:

    AVR, INFO=VQSLOD, INFO=TLOD, GQ, QUAL, DP

    Background discussion:

    https://github.com/chapmanb/bcbio-nextgen/commit/a538cecd86c0000d17d3f9d4f8ac9d2da04f9884#commitcomment-14539249

    (RTG=AVR/GATK=VQSLOD/MuTect=t_lod_fstar, otherwise GQ, otherwise QUAL, otherwise DP.)
    MuTect2 has TLOD in the INFO field.

    Returns "DP" directly for a file without variants. Raises ValueError when
    the file cannot be parsed or no score is found.
    """
    # pysam fails on checking reference contigs if input is empty
    if not vcfutils.vcf_has_variants(vrn_file):
        return "DP"
    to_check = 25
    scores = collections.defaultdict(int)
    try:
        in_handle = VariantFile(vrn_file)
    except ValueError:
        raise ValueError("Failed to parse input file in preparation for validation: %s" % vrn_file)
    with contextlib.closing(in_handle) as val_in:
        for rec_index, rec in enumerate(val_in):
            if rec_index > to_check:
                break
            # INFO-level scores
            for info_key in ("VQSLOD", "TLOD"):
                if info_key in rec.info and rec.info.get(info_key) is not None:
                    scores["INFO=" + info_key] += 1
            # Sample-level scores are read from the first sample only.
            if len(rec.samples) > 0:
                for skey in ("AVR", "GQ", "DP"):
                    if rec.samples[0].get(skey) is not None:
                        scores[skey] += 1
            if rec.qual:
                scores["QUAL"] += 1
    priority = ("AVR", "INFO=VQSLOD", "INFO=TLOD", "GQ", "QUAL", "DP")
    for key in priority:
        if scores[key] > 0:
            return key
    raise ValueError("Did not find quality score for validation from %s" % vrn_file)
コード例 #16
0
def parse_filepaths(filepaths):
    """
    Open every VCF named in a list of paths.

    Paths are validated and opened one at a time, in order.

    Parameters
    ----------
    filepaths : list of str
        List of paths to standardized VCFs

    Returns
    -------
    vcfs : collections.deque of pysam.VariantFile

    Raises
    ------
    ValueError
        If a path does not split into exactly one whitespace-separated token.
    FileNotFoundError
        If a path does not point to an existing file.
    """
    opened = deque()
    for vcf_path in filepaths:
        n_columns = len(vcf_path.split())
        if n_columns != 1:
            raise ValueError('File list must be single column')
        if os.path.isfile(vcf_path):
            opened.append(VariantFile(vcf_path))
            continue
        raise FileNotFoundError('VCF {0} not found'.format(vcf_path))

    return opened
コード例 #17
0
def rename_samples_headers(in_vcf, out_vcf):
    """Replace hard-to-read and ambiguious header with clear header.

    The ``#CHROM`` column line of the gzip-compressed ``in_vcf`` is rewritten
    so that the sample columns read ``<caller>_NORMAL`` (one sample) or
    ``<caller>_NORMAL`` / ``<caller>_TUMOR`` (two samples). With any other
    sample count the column line is copied unchanged. The result is written
    to ``out_vcf`` without its ``.gz`` suffix, compressed with ``bgzip -f``,
    and ``in_vcf`` is deleted.
    """
    # Remove only a trailing ".gz"; str.strip(".gz") strips the characters
    # '.', 'g' and 'z' from both ends of the path.
    if out_vcf.endswith(".gz"):
        out_vcf = out_vcf[:-len(".gz")]
    caller = get_caller(in_vcf)

    vcf = VariantFile(in_vcf, "rb")
    try:
        samples_names = list(vcf.header.samples)
    finally:
        vcf.close()

    columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    if len(samples_names) == 2:
        header = columns + "\t{}_NORMAL\t{}_TUMOR\n".format(caller, caller)
    elif len(samples_names) == 1:
        header = columns + "\t{}_NORMAL\n".format(caller)
    else:
        # No replacement: writing the bare column prefix would lose the line
        # break and fuse the column line with the first record.
        header = None

    # Work on bytes on both sides so the comparison and the writes behave the
    # same on Python 2 and Python 3 (gzip.open yields bytes in binary mode).
    with open(out_vcf, "wb") as fout:
        with gzip.open(in_vcf, "rb") as fin:
            for line in fin:
                if header is not None and line.startswith(b"#CHROM"):
                    fout.write(header.encode("utf-8"))
                else:
                    fout.write(line)

    subprocess.check_call(["bgzip", "-f", out_vcf])
    os.remove(in_vcf)
コード例 #18
0
ファイル: vcf2pseudogenome.py プロジェクト: nf-core/bactmap
def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths):
    """
    Build one pseudo-sequence per reference from the records of a filtered BCF/VCF.

    Per record: a repeated position replaces the previous character with 'N';
    non-PASS records and indels give 'N'; a PASS reference call gives the
    lower-case reference base; a PASS variant call gives the upper-case ALT
    base, or 'N' when there is more than one ALT. Positions skipped between
    records and after the last record are filled using ``calculate_gaps_to_add``.

    :param filtered_bcf_file: path of the filtered BCF/VCF to read
    :param reference_lengths: mapping of reference name to its length
    :return: OrderedDict mapping reference name to a list of characters
    """
    # make dictionaries to capture sequence(s) and ongoing positions
    sequences = OrderedDict((chrom, []) for chrom in reference_lengths.keys())
    previous_positions = OrderedDict((chrom, 0) for chrom in reference_lengths.keys())

    with VariantFile(filtered_bcf_file) as vcf_reader:
        for record in vcf_reader.fetch():
            record_chrom = record.chrom
            chrom_seq = sequences[record_chrom]
            if record.pos == previous_positions[record_chrom]:  # Insertion - remove previous character and add 'N'
                chrom_seq.pop()  # remove last position
                chrom_seq.append('N')
            else:
                if previous_positions[record_chrom] + 1 < record.pos:  # large deletion add gaps before adding next position
                    chrom_seq.extend(calculate_gaps_to_add(previous_positions[record_chrom] + 1, record.pos))
                if 'PASS' in record.filter.keys():  # HQ SNP
                    gt = record.samples[0]['GT'][0]  # get genotype (1st from tuple)
                    if 'INDEL' in record.info:  # indel
                        chrom_seq.append('N')
                    elif gt == 0:  # reference base
                        chrom_seq.append(record.ref.lower())  # add reference base as lower case
                    elif len(record.alts) != 1:  # more than one ALT genotype so add N
                        chrom_seq.append('N')
                    else:  # add ALT SNP as upper case
                        chrom_seq.append(record.alts[gt - 1].upper())
                else:  # if not PASS it's a low qual SNP so add N
                    chrom_seq.append('N')
            previous_positions[record_chrom] = record.pos

    # check for gaps at end
    for chrom in sequences:
        # Compare against this reference's own length. Using the chromosome of the
        # last record mis-sized every other reference and failed on an empty file.
        if len(sequences[chrom]) != reference_lengths[chrom]:  # if gap at the end
            sequences[chrom].extend(calculate_gaps_to_add(len(sequences[chrom]), reference_lengths[chrom]))
    return sequences
コード例 #19
0
def main():
    """
    Command-line entry point: copy a VCF to a new file through ``remove_outliers``.

    The outlier table is grouped into a mapping of svtype -> list of sample
    names, which is passed to ``remove_outliers`` together with the input VCF.
    Every record it yields is written to the output.
    """
    parser = argparse.ArgumentParser("find_outliers.py")
    # Each positional argument gets its own description; previously all three
    # shared the same copy-pasted help text.
    parser.add_argument("input", type=str, help="input VCF/BCF to filter")
    parser.add_argument("output", type=str, help="path of the filtered VCF to write")
    parser.add_argument("outliers", type=str,
                        help="table of outliers with 'svtype' and 'sample' columns")

    args = parser.parse_args()

    vcf = VariantFile(args.input)

    outlier_table = pd.read_table(args.outliers)
    outliers = defaultdict(list)
    for _, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    filtered = VariantFile(args.output, mode='w', header=vcf.header)
    try:
        for record in remove_outliers(vcf, outliers):
            filtered.write(record)
    finally:
        # Close on every path so a failure mid-way does not leave the output unflushed.
        filtered.close()
        vcf.close()
コード例 #20
0
def _prep_vrn_file(in_file, vcaller, work_dir, somatic_info):
    """Write a CSV of candidate variants (chrom, start, end, freq) selected from in_file.

    A record is kept when ``_is_possible_loh`` returns a frequency for it and
    its chromosome is autosomal. The CSV is only regenerated when it is not
    up to date with respect to ``in_file``. Returns the CSV path.
    """
    data = somatic_info.tumor_data
    # Thresholds forwarded to _is_possible_loh.
    params = {"min_freq": 0.4, "max_freq": 0.6, "min_depth": 15}
    base_name = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (base_name, vcaller))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    sub_file = _create_subset_file(in_file, work_dir, data)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(["chrom", "start", "end", "freq"])
            bcf_in = VariantFile(sub_file)
            for rec in bcf_in:
                tumor_freq = _is_possible_loh(rec, params, somatic_info)
                if not chromhacks.is_autosomal(rec.chrom) or tumor_freq is None:
                    continue
                row = [_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq]
                writer.writerow(row)
    return out_file
コード例 #21
0
def extract_truth(truth_path):
    """
    Collect the genotypes of the single-allelic indels found in a truth VCF.

    Records for which ``is_snp`` is true, records with a symbolic ALT
    (starting with '<'), multi-allelic records and "0/0" genotypes are skipped.

    :param truth_path: path of the truth VCF
    :return: tuple ``(ref_idx, truth_gts, indels)`` where ``ref_idx`` is the
             chromosome of the last non-SNP record ("" if none), ``truth_gts``
             maps ("chr" + chrom, pos, ref, alts joined by "-") to an "a/b"
             genotype string, and ``indels`` maps genotype -> indel length ->
             a three-element counter list whose last element counts the records
    """
    truth_vcf = VariantFile(truth_path)
    truth_gts = {}
    indels = {}
    ref_idx = ""
    for record in truth_vcf:
        if is_snp(record):
            continue
        ref_idx = record.chrom  # Only if input is "chromosome-specific"
        vidx = ("chr" + record.chrom, record.pos, record.ref, "-".join(record.alts))

        # We consider only single-allelic, non-symbolic indels
        has_symbolic = any(alt[0] == '<' for alt in record.alts)
        if has_symbolic or len(record.alts) != 1:
            continue

        # Genotype of the first sample, written with the smaller allele index first
        gt = ""
        first_sample_fields = record.samples.items()[0][1]
        for type_name, content in first_sample_fields.items():
            if type_name != 'GT':
                continue
            low, high = min(content[0], content[1]), max(content[0], content[1])
            gt = str(low) + "/" + str(high)
        if gt == "0/0":
            continue

        # + is an insertion, - is a deletion
        l_indel = len(record.alts[0]) - len(record.ref)
        counters = indels.setdefault(gt, {}).setdefault(l_indel, [0, 0, 0])
        counters[2] += 1
        truth_gts[vidx] = gt
    return ref_idx, truth_gts, indels
コード例 #22
0
    def __init__(self, vcf_file, reference_file, prg_output_file, mode="normal"):
        """
        Set up the instance state, load the inputs and build the PRG.

        :param vcf_file: path of the VCF whose records are iterated via ``fetch()``
        :param reference_file: path of the reference, loaded with ``load_fasta``
        :param prg_output_file: stored as the output file prefix
        :param mode: must be one of ``self.acceptable_modes``; passed to ``_make_prg``
        :raises ValueError: if ``mode`` is not an acceptable mode
        """
        # Result containers and counters
        self.prg_bytes: Map[str, bytearray] = defaultdict(bytearray)
        self.num_sites = 0
        self.processed_refs = []
        self.skipped_records: int = 0

        if not (mode in self.acceptable_modes):
            raise ValueError(f"Mode not in {self.acceptable_modes}")

        # Inputs and outputs
        self.f_out_prefix = prg_output_file
        variant_file = VariantFile(vcf_file)
        self.vcf_in = variant_file.fetch()
        self.ref_in = reference_file

        self.ref_records = load_fasta(reference_file)

        self._make_prg(mode)
        if self.skipped_records <= 0:
            return
        # NOTE(review): the message mentions the FORMAT column; the skip criterion lives in
        # _make_prg, which is not visible here - confirm the wording.
        logger.warning(
            f"Skipped {self.skipped_records} because of no 'PASS' in their FORMAT column"
        )
コード例 #23
0
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str,
                      output_vcf: str) -> None:
    """
    Convert a dToxoG MAF into a minimal VCF holding only the dtoxo failures.

    Every MAF row is counted; rows whose ``oxoGCut`` column equals "1" are
    turned into VCF records and written. When the pysam output mode is "wz"
    a tabix index is built for the finished file.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    tag = "oxog"
    n_seen = 0
    n_written = 0

    # The header is built first, then the output mode is derived from the path.
    header = generate_header(reference_fa, tag)
    out_mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=out_mode, header=header)

    try:
        with open(input_maf, "rt") as maf_handle:
            for maf_row in maf_generator(maf_handle):
                n_seen += 1
                if maf_row["oxoGCut"] != "1":
                    continue
                writer.write(build_new_record(maf_row, writer, tag))
                n_written += 1
    finally:
        # The writer is closed whether or not the conversion succeeded.
        writer.close()

    if out_mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(n_seen, n_written))
コード例 #24
0
def run_process(opts, inputvcf):
    """
    Copy a VCF while adding an ``HGVS_p`` INFO value derived from ``ANN``.

    For every record that carries an ``ANN`` INFO entry, field index 10 of
    each '|'-separated annotation is passed through ``convert_hgvsp``; the
    results are joined with ',' and stored as ``HGVS_p`` ('.' stands in for
    an empty or missing conversion). Records without ``ANN`` are written
    unchanged. The ``HGVS_p`` and ``variant_type`` INFO definitions are
    added to the header before the output is opened.

    :param opts: Options object; ``opts.output`` is the output path. A falsy
        value makes the function write to '-' (pysam's name for stdout).
    :param inputvcf: Path of the VCF to read.
    """
    outputvcf = opts.output

    # Both handles are context-managed so that the output is flushed and
    # closed deterministically, even if a record fails to convert.
    with VariantFile(inputvcf) as vcf_in:
        # Add INFO to Header
        vcf_in.header.info.add("HGVS_p", ".", "String",
                               "HGVS.p Information (Single Character Amino Acid)")
        vcf_in.header.info.add("variant_type", ".", "String",
                               "Variant Type for Tiering System")

        # Write VCF
        with VariantFile(outputvcf if outputvcf else '-',
                         'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                if "ANN" in record.info:
                    new_hgvsp = []
                    for annstring in record.info["ANN"]:
                        # Index 10 of the annotation is the field handed to
                        # convert_hgvsp.
                        converted = convert_hgvsp(annstring.split("|")[10])
                        if converted is None or converted == "":
                            converted = '.'
                        new_hgvsp.append(converted)
                    record.info["HGVS_p"] = ",".join(new_hgvsp)

                vcf_out.write(record)
コード例 #25
0
def write_rephased_tenx_vcf(tenx_vcf, tenx_records, tenx_phase_sets, threshold,
                            workdir):
    """ Write a new 10X VCF containing the records of rephased phase sets.

    Only phase sets flagged ``rephased`` are considered. When a set's
    ``log2ratio`` is >= ``threshold`` its records are written with PS set
    to 1. When it is <= ``-threshold`` the first two GT alleles are swapped
    as well and the call is marked phased. Sets in between are not written.

    :param tenx_vcf: Path of the input 10X VCF (must end in '.vcf' or
        '.vcf.gz'); supplies the header and the output base name.
    :param tenx_records: Mapping '<chrom>:<pos>' -> variant record; the
        records are modified in place.
    :param tenx_phase_sets: Mapping of phase-set id -> object exposing
        ``rephased``, ``chrom``, ``log2ratio`` and ``positions``.
    :param threshold: Log2-ratio cut-off applied symmetrically.
    :param workdir: Directory in which the output file is created.
    :return: Path of the written VCF, or None when ``tenx_vcf`` has an
        unsupported extension (nothing is written in that case).
    """
    basename = os.path.basename(tenx_vcf)
    if basename.endswith('.vcf'):
        offset = -4
    elif basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return None
    tenx_rephased_vcf = workdir + '/' + basename[:offset] + '.filtered.het.rephased.vcf'

    # Context managers guarantee the output is flushed and closed before the
    # path is handed back to the caller.
    with VariantFile(tenx_vcf) as vcf_in, \
            VariantFile(tenx_rephased_vcf, 'w', header=vcf_in.header) as vcf_out:
        for ps_id in tenx_phase_sets:
            phase_set = tenx_phase_sets[ps_id]
            if not phase_set.rephased:
                continue

            keep_orientation = phase_set.log2ratio >= threshold
            swap_orientation = (not keep_orientation
                                and phase_set.log2ratio <= -threshold)
            if not (keep_orientation or swap_orientation):
                continue

            chrom = phase_set.chrom
            for pos in phase_set.positions:
                record = tenx_records[chrom + ':' + str(pos)]
                call = record.samples[0]
                call['PS'] = 1
                if swap_orientation:
                    genotype = call['GT']
                    call['GT'] = (genotype[1], genotype[0])
                    call.phased = True
                vcf_out.write(record)
    return tenx_rephased_vcf
コード例 #26
0
ファイル: vcf.py プロジェクト: rvicedomini/whatshap
class VcfAugmenter(ABC):
    """
    Base class for objects that read an input VCF and write an augmented
    copy of it.

    Subclasses implement setup_header() to add whatever header entries they
    need before the writer is created. Instances can be used as context
    managers; leaving the context calls close().
    """
    def __init__(self,
                 in_path,
                 command_line,
                 out_file=sys.stdout,
                 include_haploid_phase_sets=False):
        """
        in_path -- Path to input VCF, used as template.
        command_line -- A string that will be added as a VCF header entry
            (use None to not add this to the VCF header)
        out_file -- Open file-like object to which VCF is written.
        include_haploid_phase_sets -- If true, "HS" is appended to the list
            of FORMAT entries handed to augment_header() unless it is
            already in that list.
        """
        # TODO This is slow because it reads in the entire VCF one extra time
        contigs, formats, infos = missing_headers(in_path)
        # TODO It would actually look nicer if the custom HS header was directly below PS
        if include_haploid_phase_sets and "HS" not in formats:
            formats.append("HS")
        # We repair the header (adding missing contigs, formats, infos) of the *input* VCF because
        # we will modify the records that we read, and these are associated with the input file.
        self._reader = VariantFile(in_path)
        augment_header(self._reader.header, contigs, formats, infos)
        if command_line is not None:
            # Double quotes inside the command line are dropped before the
            # whole value is wrapped in quotes.
            command_line = '"' + command_line.replace('"', "") + '"'
            self._reader.header.add_meta("commandline", command_line)
        self.setup_header(self._reader.header)
        self._writer = VariantFile(out_file,
                                   mode="w",
                                   header=self._reader.header)
        # Record that was read ahead but belongs to the next chromosome.
        self._unprocessed_record = None
        self._reader_iter = iter(self._reader)

    @abstractmethod
    def setup_header(self, header):
        """Add subclass-specific entries to the given VCF header."""
        pass

    def close(self):
        """Close the output writer and the input reader."""
        try:
            self._writer.close()
        finally:
            # The reader is an open file handle as well; release it even if
            # closing the writer failed.
            self._reader.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @property
    def samples(self):
        """List of sample names taken from the input VCF header."""
        return list(self._reader.header.samples)

    def _iterrecords(self, chromosome):
        """Yield all records for the target chromosome"""
        n = 0
        if self._unprocessed_record is not None:
            # The record read ahead by the previous call must belong to the
            # chromosome requested now.
            assert self._unprocessed_record.chrom == chromosome
            yield self._unprocessed_record
            n += 1
        for record in self._reader_iter:
            n += 1
            if record.chrom != chromosome:
                # save it for later
                self._unprocessed_record = record
                # n == 1 would mean that not a single record was found for
                # the requested chromosome.
                assert n != 1
                return
            yield record
コード例 #27
0
ファイル: vcf.py プロジェクト: rvicedomini/whatshap
class VcfReader:
    """
    Read a VCF file chromosome by chromosome.

    Instances can be used as context managers; leaving the context closes
    the underlying VariantFile.
    """
    def __init__(
        self,
        path,
        indels=False,
        phases=False,
        genotype_likelihoods=False,
        ignore_genotypes=False,
        ploidy=None,
    ):
        """
        path -- Path to VCF file
        indels -- Whether to include also insertions and deletions in the list of
            variants.
        phases -- Whether to read phasing information (HP or GT/PS) from the calls.
        genotype_likelihoods -- Whether to read genotype likelihoods (GL, falling
            back to PL) from the calls.
        ignore_genotypes -- In case of genotyping algorithm, no genotypes may be given in
                                vcf, so ignore all genotypes
        ploidy -- Ploidy of the samples; if None, it is set from the first
            genotype or phase encountered that contains no missing value.
        """
        # TODO Always include deletions since they can 'overlap' other variants
        self._indels = indels
        self._vcf_reader = VariantFile(path)
        self._path = path
        self._phases = phases
        self._genotype_likelihoods = genotype_likelihoods
        self._ignore_genotypes = ignore_genotypes
        self.samples = list(
            self._vcf_reader.header.samples)  # intentionally public
        self.ploidy = ploidy
        logger.debug("Found %d sample(s) in the VCF file.", len(self.samples))

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # follows same structure as for ReadSetReader
        self.close()

    def close(self):
        """Close the underlying VariantFile."""
        self._vcf_reader.close()

    @property
    def path(self):
        """File name reported by the underlying reader, decoded to str."""
        return self._vcf_reader.filename.decode()

    def _fetch(self, chromosome: str, start=0, end=None):
        """
        Return the reader's record iterator for the given region.

        Two ValueError messages raised by the reader are translated:
        "invalid contig" becomes VcfInvalidChromosome and "fetch requires
        an index" becomes VcfIndexMissing. Any other ValueError is re-raised.
        """
        try:
            records = self._vcf_reader.fetch(chromosome, start=start, stop=end)
        except ValueError as e:
            if "invalid contig" in e.args[0]:
                raise VcfInvalidChromosome(e.args[0]) from None
            elif "fetch requires an index" in e.args[0]:
                raise VcfIndexMissing(
                    "{} is missing an index (.tbi or .csi)".format(
                        self._path)) from None
            else:
                raise
        return records

    def fetch(self, chromosome: str, start=0, end=None):
        """
        Fetch records from a single chromosome, optionally restricted to a single region.

        Return a VariantTable object.
        """
        records = list(self._fetch(chromosome, start=start, end=end))
        return self._process_single_chromosome(chromosome, records)

    def fetch_regions(self, chromosome: str, regions):
        """
        Fetch records from a single chromosome that overlap the given regions.

        :param regions: a list of start, end tuples (end can be None)
        """
        records = []
        for start, end in regions:
            records.extend(list(self._fetch(chromosome, start=start, end=end)))
        return self._process_single_chromosome(chromosome, records)

    def __iter__(self):
        """
        Yield VariantTable objects for each chromosome.

        Multi-ALT sites are skipped.
        """
        for chromosome, records in itertools.groupby(
                self._vcf_reader, lambda record: record.chrom):
            yield self._process_single_chromosome(chromosome, records)

    @staticmethod
    def _extract_HP_phase(call):
        """
        Build a VariantCallPhase from the HP field of a call.

        Each HP entry is parsed as "<block>-<haplotype>"; all entries must
        share the same block id. Return None if HP is absent or (".",).
        """
        hp = call.get("HP")
        if hp is None or hp == (".", ):
            return None
        fields = [[int(x) for x in s.split("-")] for s in hp]
        for i in range(len(fields)):
            assert fields[0][0] == fields[i][0]
        block_id = fields[0][0]
        # The haplotype number in the file is shifted down by one.
        phase = tuple(field[1] - 1 for field in fields)
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    @staticmethod
    def _extract_GT_PS_phase(call):
        """
        Build a VariantCallPhase from the GT and PS fields of a call.

        Return None if all GT alleles are equal or the call is not phased.
        A missing PS is treated as block id 0.
        """
        is_het = not all(x == call["GT"][0] for x in call["GT"])
        if not is_het:
            return None
        if not call.phased:
            return None
        block_id = call.get("PS", 0)
        phase = call["GT"]
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    def _process_single_chromosome(self, chromosome, records):
        """
        Convert the records of one chromosome into a VariantTable.

        Records without any ALT allele, multi-ALT records, non-SNVs (unless
        indels were requested) and repeated positions are skipped.

        Raises VcfNotSortedError if positions decrease, MixedPhasingError if
        both HP and GT/PS phasing are found, and PloidyError for ploidies
        that are unsupported or inconsistent.
        """
        phase_detected = None
        n_snvs = 0
        n_other = 0
        n_multi = 0
        table = VariantTable(chromosome, self.samples)
        prev_position = None
        for record in records:
            if not record.alts:
                # A record without ALT alleles has alts set to None; there
                # is no variant to add and len() below would fail on it.
                continue
            if len(record.alts) > 1:
                # Multi-ALT sites are not supported, yet
                n_multi += 1
                continue

            pos, ref, alt = record.start, str(record.ref), str(record.alts[0])
            if len(ref) == len(alt) == 1:
                n_snvs += 1
            else:
                n_other += 1
                if not self._indels:
                    continue

            if (prev_position is not None) and (prev_position > pos):
                raise VcfNotSortedError(
                    "VCF not ordered: {}:{} appears before {}:{}".format(
                        chromosome, prev_position + 1, chromosome, pos + 1))

            if prev_position == pos:
                logger.warning(
                    "Skipping duplicated position %s on chromosome %r",
                    pos + 1,
                    chromosome,
                )
                continue
            prev_position = pos

            # Read phasing information (allow GT/PS or HP phase information, but not both),
            # if requested
            if self._phases:
                phases = []
                for sample_name, call in record.samples.items():
                    phase = None
                    for extract_phase, phase_name in [
                        (self._extract_HP_phase, "HP"),
                        (self._extract_GT_PS_phase, "GT_PS"),
                    ]:
                        p = extract_phase(call)
                        if p is not None:
                            if phase_detected is None:
                                phase_detected = phase_name
                            elif phase_detected != phase_name:
                                raise MixedPhasingError(
                                    "Mixed phasing information in input VCF (e.g. mixing PS "
                                    "and HP fields)")
                            phase = p
                            # check for ploidy consistency and limits
                            phase_ploidy = len(p.phase)
                            if phase_ploidy > get_max_genotype_ploidy():
                                raise PloidyError(
                                    "Ploidies higher than {} are not supported."
                                    "".format(get_max_genotype_ploidy()))
                            elif p is None or None in p:
                                pass
                            elif self.ploidy is None:
                                self.ploidy = phase_ploidy
                            elif phase_ploidy != self.ploidy:
                                # Logged rather than printed so that nothing
                                # is written to stdout by this reader.
                                logger.debug("phase= %s", phase)
                                raise PloidyError(
                                    "Phasing information contains inconsistent ploidy ({} and "
                                    "{})".format(self.ploidy, phase_ploidy))
                    phases.append(phase)
            else:
                phases = [None] * len(record.samples)

            # Read genotype likelihoods, if requested
            if self._genotype_likelihoods:
                genotype_likelihoods = []
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    PL = call.get("PL", None)
                    # Prefer GLs (floats) over PLs (ints) if both should be present
                    if GL is not None:
                        genotype_likelihoods.append(GenotypeLikelihoods(GL))
                    elif PL is not None:
                        # PL values are rescaled by a factor of -1/10.
                        genotype_likelihoods.append(
                            GenotypeLikelihoods([pl / -10 for pl in PL]))
                    else:
                        genotype_likelihoods.append(None)
            else:
                genotype_likelihoods = [None] * len(record.samples)

            if not self._ignore_genotypes:
                # check for ploidy consistency and limits
                genotype_lists = [
                    call["GT"] for call in record.samples.values()
                ]
                for geno in genotype_lists:
                    geno_ploidy = len(geno)
                    if geno_ploidy > get_max_genotype_ploidy():
                        raise PloidyError(
                            "Ploidies higher than {} are not supported."
                            "".format(get_max_genotype_ploidy()))
                    elif geno is None or None in geno:
                        pass
                    elif self.ploidy is None:
                        self.ploidy = geno_ploidy
                    elif geno_ploidy != self.ploidy:
                        raise PloidyError("Inconsistent ploidy ({} and "
                                          "{})".format(self.ploidy,
                                                       geno_ploidy))

                genotypes = [
                    genotype_code(geno_list) for geno_list in genotype_lists
                ]
            else:
                # Genotypes are ignored: use empty genotypes and drop any
                # phase information that was read above.
                genotypes = [Genotype([]) for i in range(len(self.samples))]
                phases = [None] * len(self.samples)
            variant = VcfVariant(position=pos,
                                 reference_allele=ref,
                                 alternative_allele=alt)
            table.add_variant(variant, genotypes, phases, genotype_likelihoods)

        logger.debug(
            "Parsed %s SNVs and %s non-SNVs. Also skipped %s multi-ALTs.",
            n_snvs,
            n_other,
            n_multi,
        )

        # TODO remove overlapping variants
        return table
コード例 #28
0
ファイル: vcf.py プロジェクト: rvicedomini/whatshap
def missing_headers(path):
    """
    Find contigs, FORMATs and INFOs that are used within the body of a VCF file, but are
    not listed in the header or that have an incorrect type.

    Return a tuple (contigs, formats, infos) where each of the items are lists of
    strings. Contigs and FORMATs are listed in order of first appearance
    (FORMATs with an incorrect header definition come first); INFOs are
    sorted alphabetically so that the result is the same on every run.

    Raise VcfError if the header declares 'PS' with a type other than the
    predefined one.

    The reason this function exists is that pysam.VariantFile crashes when we
    try to write a VCF record to it that uses contigs, INFOs or FORMATs that
    are missing from the header. See also
    <https://github.com/pysam-developers/pysam/issues/771>
    """
    with VariantFile(path) as variant_file:
        header = variant_file.header.copy()
        # Check for FORMATs that do not have the expected type
        incorrect_formats = []
        for fmt, v in variant_file.header.formats.items():
            if fmt not in PREDEFINED_FORMATS:
                continue
            h = PREDEFINED_FORMATS[fmt]
            if v.number != h.number or v.type != h.typ:
                if fmt == "PS" and v.type != h.typ:
                    raise VcfError(
                        "The input VCF/BCF contains phase set ('PS') tags that are of the"
                        " non-standard type '{}' instead of 'Integer'. WhatsHap cannot"
                        " overwrite these as it could produce inconsistent files."
                        " To proceed, you can use 'whatshap unphase' to remove phasing"
                        " information from the input file".format(v.type))
                incorrect_formats.append(fmt)

        # Iterate through entire file and check which contigs, formats and
        # info fields are used
        contigs = []  # contigs encountered, in the proper order
        seen_contigs = set()
        formats = []  # FORMATs encountered, in the proper order
        seen_formats = set()
        seen_infos = set()  # INFOs encountered

        for record in variant_file:
            seen_infos.update(record.info)
            if record.alts is not None:
                # If there are "vague" ALT alleles such as <INS>, <DEL> etc, then
                # the header needs to contain an END info entry even if END
                # is never used
                if any(alt.startswith("<") for alt in record.alts):
                    seen_infos.add("END")

            # For the contigs, we maintain a set *and* a list because we want to
            # keep track of the order of the contigs.
            if record.contig not in seen_contigs:
                contigs.append(record.contig)
                seen_contigs.add(record.contig)

            for fmt in record.format:
                if fmt not in seen_formats:
                    formats.append(fmt)
                    seen_formats.add(fmt)

    # Determine which contigs are missing from the header
    header_contigs = set(header.contigs)
    missing_contigs = [
        contig for contig in contigs if contig not in header_contigs
    ]

    # Determine which FORMATs are missing from the header
    header_formats = set(header.formats)
    missing_formats = [fmt for fmt in formats if fmt not in header_formats]

    # Determine which INFOs are missing from the header. Sorted because set
    # iteration order of strings differs between interpreter runs.
    missing_infos = sorted(seen_infos - set(header.info))

    return (missing_contigs, incorrect_formats + missing_formats,
            missing_infos)
コード例 #29
0
ファイル: discover.py プロジェクト: bricoletc/gramtools
def _dump_rebased_vcf(records: List[VariantRecord], disco_paths: DiscoverPaths):
    """
    Write ``records`` to ``disco_paths.final_vcf``.

    The header of ``disco_paths.discov_vcf_cortex`` is used as the header of
    the output file. Both files are closed before returning, so the output
    is completely written once this function is done.
    """
    with VariantFile(disco_paths.discov_vcf_cortex) as template_vcf, \
            VariantFile(disco_paths.final_vcf, "w",
                        header=template_vcf.header) as output_vcf:
        for record in records:
            output_vcf.write(record)
コード例 #30
0
ファイル: queries.py プロジェクト: nlesc-ave/ave-data-manager
# `pysam` itself must be imported: the tabix queries below use pysam.asGTF().
import pysam
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query, as (contig, start, end)
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

# For each region: reference sequence slice, annotation rows parsed as GTF,
# and the variant records returned by fetch().
region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = list(annotations.fetch(*region1, parser=pysam.asGTF()))
region1_variants = list(variants.fetch(*region1))

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = list(annotations.fetch(*region2, parser=pysam.asGTF()))
region2_variants = list(variants.fetch(*region2))
コード例 #31
0
ファイル: VCFOverlap.py プロジェクト: wavefancy/BIDMC-PYTHON
                key = r.contig + str(r.pos) + r.alleles[0] + r.alleles[1]
                if key in self.currentMap:
                    sys.stderr.write('ERROR: repeated records detected, same meta info, error record:\n%s\n'%(r))
                else:
                    self.currentMap[key] = r

if __name__ == '__main__':
    args = docopt(__doc__, version='1.0')
    #print(args)

    if(args['--format']):
        ShowFormat()
        sys.exit(-1)

    vcfMetaCols=9       #number of colummns for vcf meta information.
    inF1 = VariantFile(args['<input1>'], 'r')
    inF2 = VariantFile(args['<input2>'], 'r')
    Record = Record(inF2)
    #check smaples in two input file, same samples, and same order.
    if len(inF1.header.samples) != len(inF2.header.samples):
        sys.stderr.write('ERROR: different number of samples in two input files.\n')
        sys.exit(-1)
    else:
        for x, y in zip( inF1.header.samples, inF2.header.samples):
            if x != y:
                sys.stderr.write('ERROR: two input files should have the same samples, and ordered in same order.\n')
                sys.exit(-1)

    #output vcf header
    sys.stdout.write('%s'%(str(inF1.header)))
    for line in inF1.fetch():
コード例 #32
0
#coding:utf-8
from sys import argv
from os.path import exists
import os
import pysam
import numpy as np
from pysam import VariantFile

# Exactly three command-line arguments are expected after the script name
# (input BAM, input VCF, output BAM); any other count fails on unpacking.
script, bam_file, vcf_file, output_bam_file = argv

bamfile = pysam.AlignmentFile(bam_file, "rb")
vcffile = VariantFile(vcf_file)
# The output BAM is created with the input BAM as its template.
output_bamfile = pysam.AlignmentFile(output_bam_file, "wb", template=bamfile)
# For every VCF record the BAM is scanned again, and each read whose `pos`
# equals the record's `pos` is copied to the output. A read is therefore
# written once per matching record.
# NOTE(review): the comparison does not look at the contig, and pysam
# documents VariantRecord.pos as 1-based but AlignedSegment.pos as 0-based --
# confirm whether this match is intended or off by one / cross-chromosome.
# NOTE(review): cost is (number of records) x (number of reads) because the
# inner fetch() restarts for each record.
for rec in vcffile.fetch():
    for read in bamfile.fetch():
        if (rec.pos == read.pos):
            output_bamfile.write(read)

output_bamfile.close()
bamfile.close()
vcffile.close()
コード例 #33
0
from labels import SVRecord_generic
from pysam import VariantFile
import re
import os

# Read the chromosome names, one per line, stripped of surrounding whitespace.
with open("../MinorResearchInternship/BAM/BAM_chr_list", "r") as f:
    chr_list = [line.strip() for line in f]

# Defined up front so that the final print also works for an empty list.
counter = 0

# NOTE(review): `chrom` is not used inside the loop and the filter below is
# hard-coded to chromosome "1", so every iteration recomputes the same
# count -- confirm whether the filter was meant to use `chrom`.
for chrom in chr_list:
    counter = 0
    for vcf_file in ["../MinorResearchInternship/VCF/delly.sym.vcf"]:
        assert os.path.isfile(vcf_file)
        # Context-managed so the VCF handle is closed after each pass.
        with VariantFile(vcf_file, 'r') as vcf_in:
            for rec in vcf_in.fetch():
                svrec = SVRecord_generic(rec, "delly")
                # Width of each confidence interval: |lower| + upper.
                startCI = abs(svrec.cipos[0]) + svrec.cipos[1]
                endCI = abs(svrec.ciend[0]) + svrec.ciend[1]
                if startCI <= 200 and endCI <= 200 and svrec.chrom == "1" and svrec.svtype == "DEL":
                    counter += 1

print(counter)
コード例 #34
0
    def make_vcf_header(self):
        """
        Build the header used for the output VCF.

        With ``self.preserve_header`` set, the header of the first input VCF
        is reused and modified in place: samples not yet present are added.
        Otherwise the packaged stock template is loaded and extended with
        all samples, the contigs and INFO fields of every input VCF, a
        ``##source`` line and one FORMAT field per source.

        In both cases a MEMBERS INFO field is added when ``self.preserve_ids``
        is set, and a CLUSTER INFO field when ``self.do_merge`` is false,
        unless the header already defines them.

        Returns
        -------
        pysam.VariantHeader
        """

        def add_cluster_infos(header):
            # Shared by both branches: optional MEMBERS / CLUSTER INFO lines.
            if self.preserve_ids and 'MEMBERS' not in header.info.keys():
                header.add_line(
                    '##INFO=<ID=MEMBERS,Number=.,Type=String,'
                    'Description="IDs of cluster\'s constituent records.">')

            if (not self.do_merge) and 'CLUSTER' not in header.info.keys():
                header.add_line(
                    '##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description="Cluster ID">'
                )

        if self.preserve_header:
            header = self.vcfs[0].header
            for sample in self.samples:
                if sample not in header.samples:
                    header.add_sample(sample)

            add_cluster_infos(header)
            return header

        # Read stock template
        template = pkg_resources.resource_filename(
            'svtk', 'data/vcfcluster_template.vcf')

        # Make header
        template = VariantFile(template)
        header = template.header

        # Add samples
        for sample in self.samples:
            header.add_sample(sample)

        # Add contigs (unique name/length pairs, in order of first appearance)
        contigs = []
        for vcf in self.vcfs:
            for contig in vcf.header.contigs.values():
                tup = (contig.name, contig.length)
                if tup not in contigs:
                    contigs.append(tup)

        contig_line = '##contig=<ID={0},length={1}>'
        for contig in contigs:
            header.add_line(contig_line.format(*contig))

        # Add INFO fields the template does not define yet
        infos = []
        for vcf in self.vcfs:
            for tag, info in vcf.header.info.items():
                if tag in header.info.keys():
                    continue
                tup = (info.name, info.number, info.type, info.description)
                if tup not in infos:
                    infos.append(tup)

        info_line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'
        for info in infos:
            header.add_line(info_line.format(*info))

        add_cluster_infos(header)

        # Add source
        sourcelist = sorted(set(self.sources))
        header.add_line('##source={0}'.format(','.join(sourcelist)))

        # Add source FORMAT fields. The line is closed with '>' like the
        # other structured header lines built in this method.
        meta = ('##FORMAT=<ID={0},Number=1,Type=Integer,'
                'Description="Called by {1}">')
        for source in self.sources:
            header.add_line(meta.format(source, source.capitalize()))

        return header
コード例 #35
0
def gen_report(vcf, c, ref_flag):
    """
    Write a prioritized-impact report for an annotated Strelka VCF.

    The report is a tab-separated file (with an .xls suffix) created in the
    current directory and named after the first dot-separated part of the
    VCF base name. If the base name contains 'indel' the indel layout is
    used, otherwise the SNV layout, which also carries the normal/tumor
    ref/alt counts. One header row is written, then every record is passed
    to output_highest_impact().

    :param vcf: Path of the annotated VCF; its header must define INFO 'ANN'.
    :param c: Target file handed to create_target(), or 'n' to disable
        on-target filtering. With a target file, records marked 'OFF' by
        mark_target() are skipped.
    :param ref_flag: Handed to create_index() unless it is 'n'; the result
        (or 'n') is passed on to output_highest_impact().
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc,
        date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(
            loc,
            date_time() +
            'Target file given, creating index for on target info\n')

    call_type = 'indel' if re.search('indel', fn) else 'snv'
    out_fn = (parts[0] + '.' + call_type +
              '.strelka.vep.prioritized_impact.report.xls')

    # Both handles are context-managed so they are closed even when a record
    # cannot be processed.
    with VariantFile(vcf) as vcf_in, open(out_fn, 'w') as out:
        # Annotation field name -> position within a '|'-separated ANN entry.
        desired = {
            'Consequence': 0,
            'IMPACT': 0,
            'SYMBOL': 0,
            'Feature': 0,
            'Protein_position': 0,
            'Amino_acids': 0,
            'Codons': 0,
            'Existing_variation': 0,
            'ExAC_MAF': 0,
            'BIOTYPE': 0,
            'VARIANT_CLASS': 0
        }

        # The ANN header description lists the field names in order.
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace(
            'Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        if call_type == 'snv':
            out.write(
                'chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
                'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n'
            )
        else:
            out.write(
                'chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            # dict contains what's different between strelka indel and snv reports
            (chrom, pos, ref, alt) = (record.contig, str(record.pos),
                                      record.ref, record.alts[0])
            if call_type == 'snv':
                # Counts are read from the '<base>U' sample fields; the
                # first value of each is used.
                not_shared = {
                    'norm_ref_ct':
                    record.samples['NORMAL'][(record.ref + 'U')][0],
                    'norm_alt_ct':
                    record.samples['NORMAL'][(record.alts[0] + 'U')][0],
                    'tum_ref_ct':
                    record.samples['TUMOR'][(record.ref + 'U')][0],
                    'tum_alt_ct':
                    record.samples['TUMOR'][(record.alts[0] + 'U')][0]
                }
            else:
                not_shared = {}
            ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list,
                                  desired, tflag, out, ref_flag, call_type)

    log(
        loc,
        date_time() + 'Creating prioritized report for ' + vcf +
        ' complete!\n')
    return 0
コード例 #36
0
ファイル: bcf2maf.py プロジェクト: smangul1/sbt

def sdi(data):
    """
    Return the absolute value of the summed ``p(n, N)`` terms of ``data``.

    ``N`` is the sum of ``data``; entries equal to zero are left out of the
    sum. For empty ``data`` the string "N/A" is returned instead of a number.
    Presumably this is a Shannon diversity index -- confirm against ``p``.
    """
    if len(data) == 0:
        return "N/A"
    N = sum(data)
    # Compare by value: `n is not 0` is an identity test, which lets a float
    # zero through and relies on small-integer caching for ints.
    return abs(-sum(p(n, N) for n in data if n != 0))


ap = argparse.ArgumentParser()
ap.add_argument('input_bcf', help='--')
ap.add_argument('out', help='---')
args = ap.parse_args()

bcf_in = VariantFile(args.input_bcf)  # auto-detect input format

#test=[0.75,0.25]
#print "---->",sdi(test)  0.811278124459 which is CORRECT!

dict = {}
pos = set()

for rec in bcf_in.fetch():
    #(0, 0, 8, 0)
    #VAF = (forward non-ref + reverse non-ref alleles) /  (forward ref alleles + reverse ref + forward non-ref + reverse non-ref alleles)

    ref = rec.info["DP4"][0] + rec.info["DP4"][1]
    non_ref = rec.info["DP4"][2] + rec.info["DP4"][3]
    if ref == 0:
        VAF = 1
コード例 #37
0
def check_genotype(folder, sample, coverage_file):
    """
    Compares the genotype for all shared variants

    Reference genotypes are read from ``<folder>/0002.vcf`` (sample
    'INTEGRATION') and compared with the genotypes of *sample* in
    ``<folder>/0003.vcf``. Genotypes match when they are equal or when the two
    alleles are simply swapped. Every mismatch is reported with its VCF depth
    and, for single-base substitutions, the per-base coverage looked up with
    ``grep`` in *coverage_file*.

    :param folder: location of results from the NGS analysis pipeline
    :param sample:  sample number (used in vcf file)
    :param coverage_file: file containing coverage information for each position in the panel
    :return: dictionary of number of matching variants and detailed information for any with mismatching genotypes
    """
    # Index the reference genotypes as contig -> position -> alleles -> GT.
    # The first record seen for a given key wins.
    vars_giab = {}
    shared_giab = VariantFile(folder + '/0002.vcf')
    for rec in shared_giab.fetch():
        by_alleles = vars_giab.setdefault(rec.contig, {}).setdefault(rec.pos, {})
        if rec.alleles not in by_alleles:
            by_alleles[rec.alleles] = rec.samples['INTEGRATION']['GT']
    shared_giab.close()

    variants = []
    matching = 0
    shared_patient = VariantFile(folder + '/0003.vcf')
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        call = rec.samples[sample]

        if 'AD' in call.keys():
            allelic_depth = call['AD']
        else:
            allelic_depth = 'N/A'
        # Fall back to NR, then 0, when the caller does not emit DP.
        if 'DP' in call.keys():
            total_depth = call['DP']
        elif 'NR' in call.keys():
            total_depth = call['NR']
        else:
            total_depth = 0

        genotype = call['GT']
        giab_genotype = vars_giab[chrom][pos][alleles]

        # Equal genotypes, or the same two alleles in swapped order, count as a match.
        is_match = (
            genotype == giab_genotype
            or ((genotype[0] is None or genotype[0] == 1)
                and genotype[0] == giab_genotype[1] and genotype[1] == giab_genotype[0])
            or (genotype[0] == 0 and genotype[1] == 1
                and giab_genotype[0] == 1 and giab_genotype[1] == 0)
            or (genotype[0] == 1 and genotype[1] == 0
                and giab_genotype[0] == 0 and giab_genotype[1] == 1)
        )
        if is_match:
            matching += 1
            continue

        variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                   'GT': {sample: genotype, 'GIAB': giab_genotype},
                   'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}}

        if len(alleles[0]) == 1 and len(alleles[1]) == 1:
            # The coverage file is searched for "<contig><whitespace><pos - 1>".
            # NOTE(review): the pattern is unanchored, so position 100 also
            # matches 1000, 1001, ...; the leading fields of the combined
            # output are what gets reported below.
            search = '\'' + chrom + '\\s' + str(pos - 1) + '\''
            command = 'grep ' + search + ' ' + coverage_file
            try:
                line = subprocess.check_output(command, shell=True, universal_newlines=True)
            except subprocess.CalledProcessError as e:
                # grep exits with 1 when nothing matched; that is "no coverage
                # information", not a failure. Anything else is a real error.
                if e.returncode != 1:
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                line = ''

            if line == '':
                variant['coverage'] = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
            else:
                # Column index of the per-base depth for each nucleotide.
                bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                fields = line.split()
                variant['coverage'] = {'total': fields[2],
                                       'ref': fields[bases[alleles[0]]],
                                       'alt': fields[bases[alleles[1]]]}
        else:
            variant['coverage'] = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}
        variants.append(variant)
    shared_patient.close()

    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': variants}
    print(results)
    return results
コード例 #38
0
def main():
    """Compare the variants called in a VCF against the variants expected by a BED file.

    Each non-comment BED line must have four tab-separated columns whose
    fourth column is ``pos:ref:alt``. VCF records that match an expected
    (chrom, pos, ref, alt) are true positives, every other VCF record is a
    false positive, and expected variants never seen in the VCF are false
    negatives. The summary is written as JSON to ``<out>/<sample>_summary.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='Results VCF to be compared', required=True)
    parser.add_argument('-bed', help='The reference BED file', required=True)
    parser.add_argument('-s', help='Sample ID in VCF', required=True)
    parser.add_argument('-out', help='The folder to putt results files', required=True)

    args = parser.parse_args()

    out_dir = args.out if args.out.endswith('/') else args.out + '/'
    sample = args.s

    # variants[chrom][pos][(ref, alt)] is flipped to True once the expected
    # variant has been found in the VCF. Positions are kept as strings.
    variants = {}
    with open(args.bed, 'r') as f:
        for region in f:
            region = region.strip('\n')
            # Skip comments and blank lines (e.g. a trailing newline at EOF).
            if not region.strip() or region.startswith('#'):
                continue
            chrom, start, end, name = region.split('\t')
            pos, ref, alt = name.split(':')
            variants.setdefault(chrom, {}).setdefault(pos, {})[(ref, alt)] = False

    vcf = VariantFile(args.vcf)
    false_pos = []
    false_neg = []
    true_pos = []
    for v in vcf.fetch():
        chrom = v.contig
        pos = str(v.pos)
        ref = v.alleles[0]
        alt = v.alleles[1]
        call = v.samples[sample]
        genotype = call['GT']

        # Depth fields differ between callers: AD/DP or NV/NR.
        if 'AD' in call.keys():
            allelic_depth = call['AD']
        elif 'NV' in call.keys():
            allelic_depth = call['NV']
        else:
            allelic_depth = 'N/A'
        if 'DP' in call.keys():
            total_depth = call['DP']
        elif 'NR' in call.keys():
            total_depth = call['NR']
        else:
            total_depth = 0

        variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': v.qual,
                   'GT': genotype, 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                   'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}

        # A contig or position that is absent from the BED is simply a false
        # positive; it must not raise KeyError.
        expected = variants.get(chrom, {}).get(pos, {})
        if (ref, alt) in expected:
            true_pos.append(variant)
            expected[(ref, alt)] = True
        else:
            false_pos.append(variant)
    vcf.close()

    # Anything still marked False was expected but never called.
    for chrom in variants.keys():
        for pos in variants[chrom].keys():
            for v in variants[chrom][pos].keys():
                if not variants[chrom][pos][v]:
                    variant = {'chrom': chrom, 'pos': pos, 'ref': v[0], 'alt': v[1], 'QUAL': 0,
                               'GT': (0,0),
                               'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}
                    false_neg.append(variant)

    # Fields this comparison does not compute are emitted as empty/zero placeholders.
    out = {'false_negative': {'indels':[],'no_coverage':[],'evidence_of_alt':[],'false_neg':false_neg}, 'false_positive': false_pos,
           'mismatching_genotype': [], 'matching_variants': len(true_pos),
           'num_true_negatives': 0, 'sensitivity': 0, 'MCC': 0,
           'small_panel_remainder_length': 0, 'percent_small_panel_covered': 0,
           'num_false_positive': len(false_pos), 'num_false_negative': {'indel': 0,
                                                                   'no_coverage': 0,
                                                                   'ev_of_alt': 0,
                                                                   'false_neg': 0,
                                                                   'total': len(false_neg)},
           'num_mismatching_genotype': 0}

    all_results = {sample:out}
    with open(out_dir + sample + '_summary.json', 'w') as f:
        f.write(json.dumps(all_results, indent=4) + '\n')
コード例 #39
0
                sys.stderr.write('ERROR: Index out of range. geno: %s, out index: %s\n'%(geno, str(outGenoArrayIndex)))
                sys.exit(-1)

    outGenoArrayIndex = []
    def setoutGenoArrayIndex(oldFormatTags):
        # Rebuild outGenoArrayIndex in place: for every requested tag (taken
        # from `tags` in the enclosing scope) store its position within the
        # colon-separated, upper-cased FORMAT string. Exits with -1 when a
        # requested tag is missing from the FORMAT field.
        outGenoArrayIndex.clear()
        formatFields = oldFormatTags.upper().split(':')
        for tag in tags:
            if tag not in formatFields:
                sys.stderr.write('ERROR: can not find tag: "%s", from input vcf FORMAT field.\n'%(tag))
                sys.exit(-1)
            outGenoArrayIndex.append(formatFields.index(tag))

    # Open the input VCF; '-' is presumably the stdin convention -- confirm
    # against the pysam documentation. The header is echoed to stdout unchanged.
    infile = VariantFile('-', 'r')
    sys.stdout.write(str(infile.header))
    for line in infile:
        # Work on the textual form of the record, split on whitespace.
        ss = str(line).strip().split()
        # Keep the leading vcfMetaCols columns (defined outside this view).
        out = ss[:vcfMetaCols]
        out[8] = otags                  # replace the FORMAT column with the output tag string
        setoutGenoArrayIndex(ss[8])     # recompute tag positions for every record's FORMAT
        for x in ss[vcfMetaCols:]:
            #if not outGenoArrayIndex:
            #    setoutGenoArrayIndex(ss[8])
            # Rewrite each per-sample column with reformat() (defined outside this view).
            out.append(reformat(x))

        sys.stdout.write('%s\n'%('\t'.join(out)))

    infile.close()
sys.stdout.flush()
コード例 #40
0
def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate)

    False Negatives are split into categories to aid final comparison:

    * Zero coverage - No reads present
    * Evidence of alternate allele - Coverage or quality too low for variant call
    * Indels - Coverage is more difficult to obtain in these cases; currently they must be investigated by hand
    * All other false negatives - In these cases there are reads present and no evidence of the alternate allele

    Records repeating an already seen (chrom, pos, ref, alt) are reported as
    "duplicate" and skipped.

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Variant dictionaries grouped under the keys 'indels', 'no_coverage', 'evidence_of_alt' and 'false_neg'
    :rtype: Dict
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    num_neg = len(list(false_negs.fetch()))
    print(num_neg)

    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [], 'false_neg': []}
    # Keys of the variants already processed. The original list used for this
    # was never filled, so duplicates were never detected.
    seen = set()
    count = 0
    if num_neg > 0:
        print('false negatives')
        for rec in false_negs.fetch():
            print(rec.samples)
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[ref_sample]['GT']

            key = (chrom, pos, ref, alt)
            if key in seen:
                print("duplicate")
                continue
            seen.add(key)
            count += 1

            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': qual, 'GT': genotype}

            if len(ref) != 1 or len(alt) != 1:
                # Per-base coverage is only looked up for single-base substitutions.
                variant['coverage'] = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A',
                                       'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            # The coverage file is searched for "<contig><whitespace><pos - 1>".
            # NOTE(review): the pattern is unanchored, so position 100 also
            # matches 1000, 1001, ...
            search = '\'' + chrom + '\\s' + str(rec.pos - 1) + '\''
            command = 'grep ' + search + ' ' + coverage_file
            try:
                # universal_newlines=True returns text, so the comparison with
                # '' below also works on Python 3 (bytes never equal '').
                line = subprocess.check_output(command, shell=True, universal_newlines=True)
            except subprocess.CalledProcessError as e:
                # grep exits with 1 when nothing matched; that means "no
                # coverage information". Any other status is a real failure.
                if e.returncode != 1:
                    print(command)
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                line = ''

            if line == '':
                variant['coverage'] = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            # Column index of the per-base depth for each nucleotide.
            bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
            fields = line.split()
            cov = fields[2]
            ref_cov = fields[bases[ref]]
            alt_cov = fields[bases[alt]]
            variant['coverage'] = {'total': cov, 'ref': ref_cov, 'alt': alt_cov}

            if int(cov) == 0:
                category = 'no_coverage'
            elif int(alt_cov) != 0:
                category = 'evidence_of_alt'
            else:
                category = 'false_neg'
            variants[category].append(variant)
    else:
        print('no false negatives')
    false_negs.close()
    print("false_negatives=" + str(count))
    return variants
コード例 #41
0
def main(argv):
    """Command-line entry point for ``svtk standardize``.

    Parses *argv*, builds the output header from a packaged template VCF
    (adding contigs from ``--contigs`` when given), tags the source algorithm
    in the header, and writes every record produced by the source-specific
    ``VCFStandardizer`` to the output VCF.

    :param argv: list of command-line arguments (without the program name);
                 an empty list prints the help text and exits with status 1
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs',
                        type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size',
                        type=int,
                        default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites',
                        action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')
    parser.add_argument('--sample-names',
                        type=str,
                        default=None,
                        help='Comma-delimited list of sample names to use in '
                        'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        for line in args.contigs:
            # Tolerate blank lines in the index file.
            if not line.strip():
                continue
            contig, length = line.split()[:2]
            # Explicit keywords instead of **locals(), which silently depends
            # on the names of unrelated local variables.
            header.add_line(contig_line.format(contig=contig, length=length))
        args.contigs.close()
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename('svtk',
                                                   'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Parse new sample names if provided
    if args.sample_names:
        sample_names_list = args.sample_names.split(',')
    else:
        sample_names_list = vcf.header.samples

    # Tag source in header. The structured line must be closed with '>'.
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout,
                                          sample_names_list, args.prefix,
                                          args.min_size,
                                          args.include_reference_sites,
                                          args.call_null_sites)

    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()
コード例 #42
0
def check_genotype(folder, sample, ref_sample, coverage_file):
    """
    Compares the genotype for all shared variants

    The number of matching variants are counted and those that do not match are annotated with basic variant info plus
    quality, genotype, coverage (total, ref base and alt base if appropriate)

    Reference genotypes are read from ``<folder>/0002.vcf`` and compared with
    ``<folder>/0003.vcf``. Genotypes match when they are equal or when the two
    alleles are simply swapped.

    :param folder: Location of results from the NGS analysis pipeline
    :type folder: String
    :param sample: Sample number (used in vcf file)
    :type sample: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing coverage information for each position in the panel
    :type coverage_file: String
    :return: Number of matching variants
    :rtype: Int
    :return: List of variant dictionaries with detailed information for mismatching genotypes
    :rtype: List
    """
    # Index the reference genotypes as contig -> position -> alleles -> GT.
    # The first record seen for a given key wins.
    vars_giab = {}
    shared_giab = VariantFile(folder + '/0002.vcf')
    for rec in shared_giab.fetch():
        by_alleles = vars_giab.setdefault(rec.contig, {}).setdefault(rec.pos, {})
        if rec.alleles not in by_alleles:
            by_alleles[rec.alleles] = rec.samples[ref_sample]['GT']
    shared_giab.close()

    variants = []
    matching = 0
    shared_patient = VariantFile(folder + '/0003.vcf')
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        call = rec.samples[sample]

        if 'AD' in call.keys():
            allelic_depth = call['AD']
        else:
            allelic_depth = 'N/A'
        # Total depth is DP, or NR for callers that report it instead.
        if 'DP' in call.keys():
            total_depth = call['DP']
        elif 'NR' in call.keys():
            total_depth = call['NR']
        else:
            total_depth = 0

        genotype = call['GT']
        giab_genotype = vars_giab[chrom][pos][alleles]

        # Equal genotypes, or the same two alleles in swapped order, count as a match.
        is_match = (
            genotype == giab_genotype
            or ((genotype[0] is None or genotype[0] == 1)
                and genotype[0] == giab_genotype[1] and genotype[1] == giab_genotype[0])
            or (genotype[0] == 0 and genotype[1] == 1
                and giab_genotype[0] == 1 and giab_genotype[1] == 0)
            or (genotype[0] == 1 and genotype[1] == 0
                and giab_genotype[0] == 0 and giab_genotype[1] == 1)
        )
        if is_match:
            matching += 1
            continue

        # The GT entry is keyed by the literal string "sample", as before.
        variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                   'GT': {"sample": genotype, 'GIAB': giab_genotype},
                   'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}}

        if len(alleles[0]) == 1 and len(alleles[1]) == 1:
            # The coverage file is searched for "<contig><whitespace><pos - 1>".
            # NOTE(review): the pattern is unanchored, so position 100 also
            # matches 1000, 1001, ...
            search = '\'' + chrom + '\\s' + str(pos - 1) + '\''
            command = 'grep ' + search + ' ' + coverage_file
            try:
                # universal_newlines=True returns text, so the comparison with
                # '' below also works on Python 3 (bytes never equal '').
                line = subprocess.check_output(command, shell=True, universal_newlines=True)
            except subprocess.CalledProcessError as e:
                # grep exits with 1 when nothing matched; that means "no
                # coverage information". Any other status is a real failure.
                if e.returncode != 1:
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                line = ''

            if line == '':
                variant['coverage'] = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
            else:
                # Column index of the per-base depth for each nucleotide.
                bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                fields = line.split()
                variant['coverage'] = {'total': fields[2],
                                       'ref': fields[bases[alleles[0]]],
                                       'alt': fields[bases[alleles[1]]]}
        else:
            variant['coverage'] = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}
        variants.append(variant)
    shared_patient.close()

    print(str(matching) + ' matching variants')

    return matching, variants
コード例 #43
0
class masking(object):
    """Mask known variants in a BAM file by rewriting alt bases to the reference base.

    Variants are loaded from a VCF on construction. ``maskAllVariants`` then
    copies every read of the BAM to a new file, replacing the read base with
    the reference base wherever a read carries the alt allele of a loaded
    single-alt variant.

    NOTE(review): read positions are computed by plain arithmetic from the
    alignment start, so reads with indels or soft clips before the variant
    are not handled correctly; in practice only single-base substitutions on
    gap-free alignments are masked reliably.
    """
    Bampath = ""
    variantfile = None
    Vcfpath = ""
    # Class-level default only. Each instance gets its own list in __init__;
    # appending to this shared list leaked variants between instances.
    Variants = []

    def __init__(self, bampath, vcfpath):
        """Open the BAM and VCF files and load all variants.

        :param bampath: path to the input BAM file
        :param vcfpath: path to the VCF file with the variants to mask
        """
        self.Bampath = bampath
        self.Bamfile = pysam.AlignmentFile(bampath, "rb")
        self.Vcfpath = vcfpath
        self.variantfile = VariantFile(self.Vcfpath)
        self.Variants = []
        self.populateVarlist()

    def populateVarlist(self):
        """Append every record of the VCF to ``self.Variants``."""
        self.Variants.extend(self.variantfile.fetch())

    def printMaskVars(self):
        """Print the loaded variants followed by the BAM and VCF paths."""
        for v in self.Variants:
            print(v.chrom, v.pos, v.ref, v.alts)

        print("Bampath :", self.Bampath)
        print("Vcfpath:", self.Vcfpath)

    def maskVariant(self, varRec, bamAlign):
        """Replace the alt base of *varRec* in *bamAlign* with the reference base.

        :param varRec: VariantRecord to mask
        :param bamAlign: AlignedSegment to edit in place
        :return: tuple ``(bamAlign, changed)`` where ``changed`` is True only
                 when the read sequence was modified
        """
        if not self.doesOverlap(varRec, bamAlign):
            return (bamAlign, False)
        # Only variants with exactly one alt allele are handled.
        if not varRec.alts or len(varRec.alts) != 1:
            return (bamAlign, False)

        print("found the overlap with variant")
        queryBases = bamAlign.query_sequence
        # reference_end is one past the last aligned base, so
        # reference_end - reference_length is the 0-based alignment start;
        # varRec.pos is 1-based.
        readStart = bamAlign.reference_end - bamAlign.reference_length
        AlIndex = (varRec.pos - 1) - readStart
        if queryBases is None or not 0 <= AlIndex < len(queryBases):
            return (bamAlign, False)

        readBase = queryBases[AlIndex]
        if readBase == varRec.ref:
            return (bamAlign, False)
        if readBase == varRec.alts[0]:
            queryBases = self.replaceChar(queryBases, varRec.ref, AlIndex)
            print("queryBas", queryBases)
            # Assigning query_sequence discards the base qualities in pysam,
            # so save them first and put them back afterwards.
            qualities = bamAlign.query_qualities
            bamAlign.query_sequence = queryBases
            if qualities is not None and len(qualities) == len(queryBases):
                bamAlign.query_qualities = qualities
            return (bamAlign, True)

        print("Unhandle case for maskvariant")
        return (bamAlign, False)

    def replaceChar(self, bamSeq, ref, index):
        """Return *bamSeq* with ``len(ref)`` characters at *index* replaced by *ref*."""
        bamSeq = bamSeq[:index] + ref + bamSeq[index + len(ref):]
        return bamSeq

    def doesOverlap(self, varRec, bamAlign):
        """Return True when the variant position lies within the read's aligned span.

        :param varRec: VariantRecord (1-based ``pos``)
        :param bamAlign: AlignedSegment (0-based, end-exclusive coordinates)
        :return: boolean
        """
        bamEndPos = bamAlign.reference_end
        # Unmapped reads have no aligned span.
        if bamEndPos is None or bamAlign.reference_length is None:
            return False
        # Same position on a different contig is not an overlap.
        if varRec.chrom != bamAlign.reference_name:
            return False
        bamStartPos = bamEndPos - bamAlign.reference_length
        return bamStartPos <= varRec.pos - 1 < bamEndPos

    def maskAllVariants(self, out_path="masked.norm.bam"):
        """Write every read of the input BAM to *out_path* with all loaded variants masked.

        :param out_path: path of the BAM file to write (defaults to the
                         previously hard-coded "masked.norm.bam")
        """
        masked_bam = pysam.AlignmentFile(out_path,
                                         "wb",
                                         template=self.Bamfile)
        try:
            for read in self.Bamfile.fetch():
                for v in self.Variants:
                    # maskVariant edits the read in place.
                    self.maskVariant(v, read)
                masked_bam.write(read)
        finally:
            masked_bam.close()
        return
コード例 #44
0
    #metavar='File',
    help='Filtered vcffile',
    type=str,
    required=True)

# Optional allele-frequency threshold; records below it are skipped.
parser.add_argument('-af',
                    '--minAF',
                    help='Minimal allele frequency to output',
                    default=0.25,
                    type=float,
                    required=False)

# NOTE(review): arguments are parsed at import time, outside the
# ``__main__`` guard below.
args = parser.parse_args()

if __name__ == '__main__':
    vcffile = VariantFile(args.infile)

    with open(args.outfile, 'w') as outfile:
        filtered_variant_dict = {}
        for rec in vcffile:
            # Skip minor allele variants (INFO/MAJOR equal to 0)
            if rec.info['MAJOR'] == 0:
                continue
            # Skip positions with an allele frequency below threshold
            if rec.info['AF'] < args.minAF:
                continue
            # Ignore out-of-frame indels: the length difference between the
            # first two alleles must be a multiple of 3
            if not (len(rec.alleles[0]) - len(rec.alleles[1])) % 3 == 0:
                continue

            # Add record when it passes all filters
コード例 #45
0
ファイル: vcf_popstats.py プロジェクト: padraicc/popgen_stats
# parser.add_argument('-y', '--pop_2', required=False, dest='pop2', help="Samples belonging to population 2")


args = parser.parse_args()

# Validate option combinations before opening any file.
if args.bed and args.exclude:
    sys.exit('\nCan not use the -e and -b options together\n')

if args.wwss and not args.bed:
    sys.exit('\nNeed to specify a bed file of regions with -b option\n')

if args.bed and not args.min_sites:
    sys.exit('\nThe --min option must be specified with -b option\n')


vcf_infile = VariantFile(args.vcf_infile)
sample_num = len(vcf_infile.header.samples)

# n is the number of sampled chromosomes: two per sample for diploids,
# one per sample for haploids.
if args.ploidy == 2:
    n = 2 * sample_num
elif args.ploidy == 1:
    n = sample_num
else:
    sys.exit("\nSpecify ploidy as 1 or 2\n")

# Integer array with (n // 2) + 1 bins, created only when --sfs is requested.
if args.sfs:
    sfs = np.zeros((n // 2) + 1, dtype=int)

if args.exclude:

    af = []
コード例 #46
0
ファイル: vcf.py プロジェクト: evansbenj/admixfrog
def vcf_to_ref(
    outfile,
    vcf_file,
    rec_file,
    pop2sample,
    random_read_samples=[],
    pos_id="Physical_Pos",
    map_ids=["AA_Map"],
    default_map="AA_Map",
    rec_rate=1e-8,
    chroms=None,
    bed=None,
    lax_alleles=False,
):
    """Write an xz-compressed CSV of per-population allele counts from VCF input.

    Each output row holds ``chrom,pos,ref,alt``, one or more genetic-map
    columns, and one count column per population and entry of ``EXT``.

    :param outfile: path of the xz-compressed CSV to write
    :param vcf_file: VCF path; may contain a ``{CHROM}`` placeholder
    :param rec_file: space-separated recombination map (may contain
        ``{CHROM}``), or None to use ``pos * rec_rate`` as map position
    :param pop2sample: mapping of population name to sample names
    :param random_read_samples: samples for which only the first genotype
        allele is counted (never mutated here)
    :param pos_id: name of the physical-position column in the map file
    :param map_ids: names of the map columns to output (never mutated here)
    :param default_map: map column copied into the leading 'map' column
    :param rec_rate: per-base rate used when no map file is given
    :param chroms: chromosomes to process; None uses the VCF header contigs
    :param bed: not used by this function
    :param lax_alleles: when True, alt alleles other than A/C/G/T are kept
    """
    pprint(pop2sample)

    if chroms is None:
        vcf_first = vcf_file
    else:
        vcf_first = vcf_file.format(CHROM=chroms[0])

    #  get chromosomes and the samples that are actually present in the VCF
    with VariantFile(vcf_first) as vcf:
        if chroms is None:
            chroms = [i for i in vcf.header.contigs]
        else:
            chroms = parse_chroms(chroms)
        log_.info("chroms found: %s", chroms)

        sample2pop = defaultdict(list)
        for pop, v in pop2sample.items():
            for sample in v:
                if sample in vcf.header.samples:
                    sample2pop[sample].append(pop)

    samples = sample2pop.keys()
    pops = set(pop for s, v in sample2pop.items() for pop in v)
    pprint(sample2pop)
    pprint(pops)

    # Rebinding creates a new list, so the default argument is left untouched.
    map_ids = ['map'] + map_ids

    data_cols = [f"{p}_{e}" for p in pops for e in EXT]

    def interpolate(lo, hi, pos):
        # Linear interpolation of the map columns between two map rows.
        # The slope is already per base pair, so it is multiplied by the
        # distance only (dividing by the interval again was a bug).
        slope = (hi[map_ids] - lo[map_ids]) / (hi[pos_id] - lo[pos_id])
        return lo[map_ids] + slope * (pos - lo[pos_id])

    with lzma.open(outfile, "wt") as ref:
        ref.write("chrom,pos,ref,alt,")
        if rec_file is None:
            ref.write("map,")
        else:
            ref.write(",".join(map_ids))
            ref.write(",")
        ref.write(",".join(data_cols))
        ref.write("\n")
        for chrom in chroms:

            # set up rec file
            if rec_file is not None:
                rec = pd.read_csv(rec_file.format(CHROM=chrom), sep=" ")
                if "chrom" in rec:
                    rec = rec[rec.chrom == chrom]

                rec['map'] = rec[default_map]
                rec_file_cols = list((pos_id, *map_ids))
                rec = rec[rec_file_cols]

                # R0/R1 are the map rows bracketing the current position;
                # R1 is None once the map is exhausted.
                rec_iter = rec.iterrows()
                R0 = next(rec_iter)[1]
                R1 = next(rec_iter, (None, None))[1]

            #skip chrom if empty
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                try:
                    next(vcf)
                except StopIteration:
                    continue
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                vcf.subset_samples(samples)
                for row in vcf.fetch(chrom):

                    alt_ix = 0

                    if len(row.alleles) <= 1 or len(row.alleles) > 3:
                        continue

                    # With two alt alleles, keep the site only when exactly
                    # one of them is observed in the retained samples.
                    if len(row.alleles) == 3:
                        alleles = [i for v in row.samples.values() for i in v["GT"]]
                        if 3 in alleles:
                            continue
                        elif 1 in alleles and 2 in alleles:
                            continue
                        elif 1 not in alleles and 2 not in alleles:
                            continue
                        elif 1 in alleles:
                            alt_ix = 0
                        elif 2 in alleles:
                            alt_ix = 1
                        else:
                            raise ValueError(f"weird alleles {row.alleles}")
                        log_.debug(f"{row.chrom}, {row.pos}, {row.alleles}, {Counter(alleles)}")

                    # Skip non-nucleotide alt alleles unless lax_alleles is
                    # set. The previous test (`... or lax_alleles`) skipped
                    # every row when lax_alleles was True, and its substring
                    # check accepted multi-base alts such as "CG".
                    if not lax_alleles and row.alts[alt_ix] not in ("A", "C", "G", "T"):
                        continue

                    D = defaultdict(int)
                    # rec stuff
                    if rec_file is None:
                        map_ = row.pos * rec_rate
                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},{map_},"
                        )
                    else:
                        if R1 is None:
                            map_ = R0[map_ids]
                        elif row.pos <= R0[pos_id]:
                            map_ = R0[map_ids]
                        elif R0[pos_id] < row.pos <= R1[pos_id]:
                            map_ = interpolate(R0, R1, row.pos)
                        elif row.pos > R1[pos_id]:
                            # Advance the bracket until it contains row.pos
                            # or the map runs out.
                            try:
                                while row.pos > R1[pos_id]:
                                    R0, R1 = R1, next(rec_iter)[1]
                            except StopIteration:
                                R0, R1 = R1, None
                            if R1 is None:
                                map_ = R0[map_ids]
                            else:
                                map_ = interpolate(R0, R1, row.pos)

                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},"
                        )
                        map_str = ",".join((str(m) for m in map_))
                        ref.write(f"{map_str},")

                    # Count alleles per population; EXT is indexed by whether
                    # the allele index is non-zero.
                    sample_data = row.samples
                    for s in sample_data:
                        if s in random_read_samples:
                            allele = sample_data[s]["GT"][0]
                            if allele is not None:
                                for pop in sample2pop[s]:
                                    D[f"{pop}_{EXT[allele > 0]}"] += 1
                        else:
                            for allele in sample_data[s]["GT"]:
                                if allele is not None:
                                    for pop in sample2pop[s]:
                                        D[f"{pop}_{EXT[allele > 0]}"] += 1

                    ref.write(",".join((str(D[c]) for c in data_cols)))
                    ref.write("\n")
コード例 #47
0
def _grep_coverage_line(chrom, pos, coverage_file):
    """
    Look up the coverage line(s) for one position by running grep on the coverage file.

    :param chrom: Contig name to search for
    :type chrom: String
    :param pos: Position to search for, already converted to the coverage file's numbering
    :type pos: Integer
    :param coverage_file: File containing per base coverage
    :type coverage_file: String
    :return: grep output as text, or an empty string when nothing matched
    :rtype: String
    """
    # NOTE(review): the pattern is unanchored (as in the original shell command), so
    # contig '1' / position 99 also matches e.g. '11<tab>990'. Confirm the coverage
    # file layout before tightening it.
    pattern = chrom + r'\s' + str(pos)
    # Argument list instead of a shell string: no quoting problems and no shell injection.
    command = ['grep', '-e', pattern, '--', coverage_file]
    try:
        # universal_newlines=True returns text rather than bytes on Python 3.
        return subprocess.check_output(command, universal_newlines=True)
    except subprocess.CalledProcessError as e:
        # grep exits with status 1 when no line matched; that is "no coverage
        # information", not a failure.
        if e.returncode == 1:
            return ''
        print(' '.join(command))
        print('Error executing command: ' + str(e.returncode))
        raise SystemExit(1)


def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate).
    Each variant is filed under exactly one key of the returned dictionary:

    * ``indels`` - ref or alt allele is longer than one base, so no coverage lookup is attempted
    * ``no_coverage`` - no coverage line was found, or the total coverage is zero
    * ``evidence_of_alt`` - the coverage line reports a non-zero count for the alt base
    * ``false_neg`` - the position is covered but the alt base count is zero

    NOTE(review): ``ref_sample`` is currently unused; the genotype is always read from the
    sample named 'Venter.il_st'. Confirm whether the parameter was meant to select the sample.

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Dictionary of lists of variant dictionaries containing information on false negatives
    :rtype: Dictionary
    :raises KeyError: If a single-base allele is not one of A, C, G or T
    """
    # Field index of each base count within a whitespace-split coverage line
    # (field 2 is read as the total coverage).
    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [], 'false_neg': []}

    false_negs = VariantFile(folder + '/0000.vcf')
    try:
        # Read the file once; the record list serves both the count and the loop.
        records = list(false_negs.fetch())
        print(len(records))

        if not records:
            print('no false negatives')
            return variants

        print('false negatives')
        for rec in records:
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            variant = {'chrom': rec.contig, 'pos': int(rec.pos), 'ref': ref, 'alt': alt,
                       'QUAL': rec.qual, 'GT': rec.samples['Venter.il_st']['GT']}

            if len(ref) != 1 or len(alt) != 1:
                variant['coverage'] = {'total': 'indel: no coverage could be obtained',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            # The coverage file is searched with the VCF position minus one.
            fields = _grep_coverage_line(rec.contig, rec.pos - 1, coverage_file).split()
            if not fields:
                variant['coverage'] = {'total': 'no coverage information',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            cov = fields[2]
            ref_cov = fields[bases[ref]]
            alt_cov = fields[bases[alt]]
            # Coverage values are stored as the text read from the file.
            variant['coverage'] = {'total': cov, 'ref': ref_cov, 'alt': alt_cov}

            # Compare numerically: the fields are strings, so 'cov == 0' was never true.
            if float(cov) == 0:
                variants['no_coverage'].append(variant)
            elif float(alt_cov) != 0:
                variants['evidence_of_alt'].append(variant)
            else:
                variants['false_neg'].append(variant)
    finally:
        false_negs.close()

    return variants
コード例 #48
0
    
    Do we want the record, a dictionary, both??
    """
    return(None)


# Script entry point: dump records from a BCF file into a CSV file.
if __name__ == '__main__':

    # Command-line options.
    # NOTE(review): --output_dir and --merge_strands are parsed but not referenced in
    # the part of the script visible here.
    parser = argparse.ArgumentParser(description = "Takes a list of input files? Or Idrectory...TBD")
    parser.add_argument("--input_file", default = "./101.bcf")
    parser.add_argument("--output_dir", default = "./extract_output/")
    parser.add_argument("--merge_strands", action = "store_true")

    args = parser.parse_args()

    # NOTE(review): this opens the literal "101.bcf" rather than args.input_file, while
    # the CSV name below is derived from args.input_file -- confirm this is intended.
    infile = VariantFile("101.bcf")
    # The CSV is written next to the input path, with '.bcf' replaced by '.csv'.
    csv_out_name = args.input_file.replace('.bcf', '.csv')
    ofile = open(csv_out_name, "w")

    # Column names for output
    writer = csv.writer(ofile)
    writer.writerow(["chr", "pos", "reference", "call", "methylated", "unmethylated", "strand"])

    # The things in rec.format
    # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX

    # Iterator over the records fetched for 'chr1' between 100000 and 110000
    # (region is hard-coded).
    I = infile.fetch('chr1', 100000, 110000, threads=4)

    # Iterate two records at a time if merging...
    # Passing the same iterator object twice makes every step consume two consecutive
    # records; rec2 is None (zip_longest's default fill value) when the number of
    # records is odd.
    for rec1, rec2 in itertools.zip_longest(*[I]*2):
コード例 #49
0
def main(argv):
    """
    Command-line entry point for ``svtools standardize``.

    Builds the output header from the packaged template VCF plus the samples of the
    input VCF, tags the header with the source algorithm, and writes every record
    produced by the standardizer that passes ``any_called`` (or every record when
    ``--include-reference-sites`` is given). When ``--prefix`` is given, written
    records are renamed ``<prefix>_<n>`` with ``n`` counting from 1.

    :param argv: Command-line arguments, without the program name
    :type argv: List
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p',
                        '--prefix',
                        help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites',
                        action='store_true',
                        default=False,
                        help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')

    # Print help if no arguments specified
    if not argv:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template = VariantFile(
        pkg_resources.resource_filename('svtools',
                                        'data/standard_template.vcf'))
    vcf = VariantFile(args.vcf)
    fout = None
    try:
        # Template header includes all necessary FILTER, INFO, and FORMAT fields
        # Just need to add samples from VCF being standardized
        header = template.header
        for sample in vcf.header.samples:
            header.add_sample(sample)

        # Tag source in header. The structured meta line must be terminated by '>';
        # the closing bracket was missing, leaving an incomplete header line.
        meta = ('##FORMAT=<ID={0},Number=1,Type=Integer,'
                'Description="Called by {1}">')
        meta = meta.format(args.source, args.source.capitalize())
        header.add_line(meta)
        header.add_line('##source={0}'.format(args.source))

        fout = VariantFile(args.fout, mode='w', header=header)

        standardizer = VCFStandardizer.create(args.source, vcf, fout)
        idx = 1
        for record in standardizer.standardize_vcf():
            if any_called(record) or args.include_reference_sites:
                # The counter only advances for records that are actually written,
                # so generated names are consecutive.
                if args.prefix is not None:
                    record.id = '{0}_{1}'.format(args.prefix, idx)
                    idx += 1

                fout.write(record)
    finally:
        # Close every file handle, including the template, even if standardization fails.
        if fout is not None:
            fout.close()
        vcf.close()
        template.close()
コード例 #50
0
def run_process(opts, inputvcf):
    """
    Annotate every record of a VCF with the population databases whose MAF reaches a threshold.

    Four INFO fields are added to the header and set on each record:

    * ``ngb_popmaf_snp_db_list`` - '|'-joined names of the databases at or above the threshold, or '.'
    * ``ngb_popmaf_snp_db_cnt`` - number of such databases
    * ``ngb_popmaf_snp_db_eastasian`` - 'Y' if the list contains "EAS", otherwise 'N'
    * ``ngb_popmaf_snp_db_korean`` - 'Y' if the list contains "KRGDB" or "KoEXID", otherwise 'N'

    The INFO keys that are checked come from the module-level ``freq_check_list`` plus
    ``esp6500_MAF``, whose three values are divided by 100 before comparison.

    :param opts: Options object; ``opts.output`` is the output path (falsy writes to '-')
                 and ``opts.popfreq`` is the MAF threshold
    :param inputvcf: Path of the VCF to read
    """
    outputvcf = opts.output
    popfreq = float(opts.popfreq)

    # Failures expected from a missing or unparsable INFO value. Anything else
    # (including KeyboardInterrupt) is no longer swallowed by a bare except.
    value_errors = (KeyError, IndexError, TypeError, ValueError)

    # (index into esp6500_MAF, key recorded for it), in the order they are recorded
    esp_fields = ((2, 'esp6500_MAF_ALL'), (1, 'esp6500_MAF_AA'), (0, 'esp6500_MAF_EA'))

    # Open VCF
    vcf_in = VariantFile(inputvcf)
    vcf_out = None
    try:
        # Add INFO to Header
        vcf_in.header.info.add("ngb_popmaf_snp_db_cnt",".","Integer","Population Database Count above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_list",".","String","Population Database List above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_eastasian",".","String","East Asian Exist Flag above setting MAF")
        vcf_in.header.info.add("ngb_popmaf_snp_db_korean",".","String","Korean Exist Flag above setting MAF")

        # Write VCF
        vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=vcf_in.header)

        for record in vcf_in.fetch():
            record_data = OrderedDict()

            # Check Population MAF; multi-valued fields are judged by their first value
            for key in freq_check_list:
                try:
                    value = record.info[key]
                    if isinstance(value, (list, tuple)):
                        value = value[0]
                    maf = float(value)
                except value_errors:
                    continue
                if maf >= popfreq:
                    record_data[key] = maf

            # Check ESP6500; entries recorded before a failing value are kept
            try:
                value_list = record.info['esp6500_MAF']
                for index, name in esp_fields:
                    maf = float(value_list[index]) / 100
                    if maf >= popfreq:
                        record_data[name] = maf
            except value_errors:
                pass

            # Joining the dict iterates its keys in insertion order on Python 2 and 3
            # (iterkeys() does not exist on Python 3).
            filtered_db_list = '|'.join(record_data) or '.'

            record.info['ngb_popmaf_snp_db_list'] = filtered_db_list
            record.info['ngb_popmaf_snp_db_cnt'] = len(record_data)
            record.info['ngb_popmaf_snp_db_eastasian'] = 'Y' if "EAS" in filtered_db_list else 'N'
            if ("KRGDB" in filtered_db_list) or ("KoEXID" in filtered_db_list):
                record.info['ngb_popmaf_snp_db_korean'] = 'Y'
            else:
                record.info['ngb_popmaf_snp_db_korean'] = 'N'

            # Write VCF
            vcf_out.write(record)
    finally:
        # Close explicitly so the output is flushed deterministically.
        if vcf_out is not None:
            vcf_out.close()
        vcf_in.close()
コード例 #51
0
def gen_report(vcf, c, ref_flag):
    """
    Write a prioritized-impact report for a VEP-annotated Strelka VCF.

    The report layout (snv or indel) is chosen by whether the VCF basename contains
    'indel'. For every record the column positions of the wanted ANN sub-fields,
    the parsed annotations and (for snv files) the NORMAL/TUMOR allele counts are
    handed to ``output_highest_impact``. Records marked 'OFF' by ``mark_target`` are
    skipped when a target definition is supplied.

    :param vcf: Path to the annotated VCF
    :param c: Argument for ``create_target``; 'n' disables on-target filtering
    :param ref_flag: Argument for ``create_index``; 'n' is passed through unchanged
    :return: 0 after the report has been written
    """
    vcf_name = os.path.basename(vcf)
    name_parts = vcf_name.split('.')
    loc = 'LOGS/' + name_parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')

    # Optional on-target index
    filter_targets = c != 'n'
    on_dict = {}
    if filter_targets:
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')

    vcf_in = VariantFile(vcf)

    # Report flavour and output file depend on the input file name
    if 'indel' in vcf_name:
        call_type = 'indel'
        out = open(name_parts[0] + '.indel.strelka.vep.prioritized_impact.report.xls', 'w')
    else:
        call_type = 'snv'
        out = open(name_parts[0] + '.snv.strelka.vep.prioritized_impact.report.xls', 'w')
    is_snv = call_type == 'snv'

    # ANN sub-field name -> column position inside each '|'-separated annotation
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}

    # The ANN header description lists the sub-field names in column order
    description = vcf_in.header.info['ANN'].record['Description'].strip('"')
    description = description.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    for column, field_name in enumerate(description.split('|')):
        if field_name in desired:
            desired[field_name] = column

    if is_snv:
        header_line = ('chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                       'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
                       'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n')
    else:
        header_line = ('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                       'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
    out.write(header_line)

    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)

    for record in vcf_in.fetch():
        chrom = record.contig
        pos = str(record.pos)
        ref = record.ref
        alt = record.alts[0]

        # Allele counts exist only in the snv layout; indel reports pass an empty dict
        not_shared = {}
        if is_snv:
            ref_key = ref + 'U'
            alt_key = alt + 'U'
            normal = record.samples['NORMAL']
            not_shared['norm_ref_ct'] = normal[ref_key][0]
            not_shared['norm_alt_ct'] = normal[alt_key][0]
            tumor = record.samples['TUMOR']
            not_shared['tum_ref_ct'] = tumor[ref_key][0]
            not_shared['tum_alt_ct'] = tumor[alt_key][0]

        ann_list = [annotation.split('|') for annotation in record.info['ANN'].split(',')]

        tflag = 'NA'
        if filter_targets:
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue

        output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list, desired, tflag, out, ref_flag, call_type)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0