def annotate_false_negs(folder):
    """
    Get information for any false negative results - returns basic variant info plus quality and genotype
    :param folder: Folder containing output from bcftools isec
    :return: array of variant dictionaries containing information on false negatives
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    num_neg = len(list(false_negs.fetch()))
    print(num_neg)

    variants = []

    if num_neg > 0:
        print('false negatives')
        for rec in false_negs.fetch():
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples['INTEGRATION']['GT']

            variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                       'GT':genotype}

            variants.append(variant)
    else:
        print('no false negatives')

    return variants
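
A minimal usage sketch for the function above, assuming pysam is installed and that isec_out (a hypothetical directory name) holds the output of bcftools isec, with 0000.vcf containing records private to the truth set:

from pysam import VariantFile  # required by annotate_false_negs

fn_variants = annotate_false_negs('isec_out')
for v in fn_variants:
    print('%s:%d %s>%s QUAL=%s GT=%s' % (v['chrom'], v['pos'], v['ref'], v['alt'], v['QUAL'], v['GT']))
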
def annotate_false_pos(folder, coverage_file, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype, coverage
    (total, ref base and alt base if appropriate)
    :param folder: Folder containing output from bcftools isec
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    false_pos = VariantFile(folder + '/0001.vcf')
    num_pos = len(list(false_pos.fetch()))
    print(num_pos)

    variants = []

    if num_pos > 0:
        print('false positives')
        for rec in false_pos.fetch():
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[sample]['GT']
            if 'AD' in rec.samples[sample]:
                allelic_depth = rec.samples[sample]['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = rec.samples[sample]['DP']
            if len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
                # rec.pos is 1-based; the coverage file is assumed to use 0-based positions
                search = "'" + rec.contig + "\\s" + str(rec.pos - 1) + "'"
                command = 'grep ' + search + ' ' + coverage_file
                try:
                    # decode so the result compares and splits as str under Python 3
                    line = subprocess.check_output(command, shell=True).decode()
                except subprocess.CalledProcessError as e:
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                if line == '':
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'vcf_depth':{'DP':total_depth, 'AD':allelic_depth},
                               'coverage':{'total':'no coverage information', 'ref':'N/A', 'alt':'N/A'}}
                else:
                    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                    fields = line.split()
                    cov = fields[2]
                    ref_cov = fields[bases[rec.alleles[0]]]
                    alt_cov = fields[bases[rec.alleles[1]]]
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'vcf_depth':{'DP':total_depth, 'AD':allelic_depth},
                               'coverage':{'total':cov, 'ref':ref_cov, 'alt':alt_cov}}
            else:
                variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                            'GT':genotype, 'vcf_depth':{'DP':total_depth, 'AD':allelic_depth},
                           'coverage':{'total':'indel: no coverage could be obtained', 'ref':'N/A', 'alt':'N/A'}}
            variants.append(variant)
    else:
        print('no false positives')

    return variants
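
The grep lookup above assumes a whitespace-delimited per-base coverage file whose columns are: chromosome, 0-based position, total depth, then depth per base for A, C, G and T (hence the bases index map). A hedged pure-Python equivalent of that lookup, avoiding the shell call (the column layout is an assumption inferred from the code):

def lookup_coverage(coverage_file, contig, pos0):
    """Return the coverage fields for contig at 0-based position pos0, or None.
    Assumed columns: chrom, pos (0-based), total, A, C, G, T."""
    with open(coverage_file) as fh:
        for line in fh:
            fields = line.split()
            if fields[0] == contig and fields[1] == str(pos0):
                return fields
    return None
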
def check_genotype(folder, sample):
    """
    Compares the genotype for all shared variants
    :param folder: Folder containing output from bcftools isec
    :param sample: sample ID used in the vcf file
    :return: dictionary of number of matching variants and detailed information for any with mismatching genotypes
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')

    variants = []

    vars_giab = {}
    for rec in shared_giab.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if chrom not in vars_giab:
            vars_giab[chrom] = {}
        if pos not in vars_giab[chrom]:
            vars_giab[chrom][pos] = {}
        if alleles not in vars_giab[chrom][pos]:
            vars_giab[chrom][pos][alleles] = rec.samples['INTEGRATION']['GT']

    matching = 0
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if 'AD' in rec.samples[sample]:
            allelic_depth = rec.samples[sample]['AD']
        else:
            allelic_depth = 'N/A'
        total_depth = rec.samples[sample]['DP']
        giab_genotype = vars_giab[chrom][pos][alleles]
        if rec.samples[sample]['GT'] == giab_genotype:
            matching += 1
        elif tuple(reversed(rec.samples[sample]['GT'])) == tuple(giab_genotype):
            # allele order differences (e.g. 1|0 vs 0|1) still count as a match
            matching += 1
        else:
            variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                       'GT': {sample: rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                       'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}}

            variants.append(variant)
    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': variants}
    print(results)
    return results
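
Together, the three helpers above cover the four VCFs emitted by bcftools isec (0000 = records private to the truth set, 0001 = private to the query, 0002/0003 = shared records from each side). A hypothetical driver, with the folder, coverage file and sample ID all assumed:

folder = 'isec_out'                                   # hypothetical isec output directory
fn = annotate_false_negs(folder)
fp = annotate_false_pos(folder, 'truth_regions.cov', 'NA12878')
gt = check_genotype(folder, 'NA12878')
print('FN=%d FP=%d matching=%d mismatching=%d' %
      (len(fn), len(fp), gt['matching'], len(gt['mismatching'])))
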
def gen_report(vcf, sample, ref_flag):
    vcf_in = VariantFile(vcf)
    # run cadd twice over snv and indel file
    out = open(sample + '.germline.vep91.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'HGVSg': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'BIOTYPE': 0, 'SIFT': 0, 'Existing_variation': 0, 'VARIANT_CLASS': 0,
               'gnomAD_AF': 0, 'CLIN_SIG': 0, 'CADD_PHRED': []}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"').replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(ann_size):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            if desc_list[i] == 'CADD_PHRED':
                desired[desc_list[i]].append(i)
            else:
                desired[desc_list[i]] = i
    out.write('CHROM\tPOS\tREF\tAllele\tTotal Allele Count\tTotal Position Coverage\tGene\tHGVSg\tTranscript_id'
              '\tEffect\tIMPACT\tBIOTYPE\tCodons\tAmino_acids\tExisting_variation\tVARIANT_CLASS\tSIFT\tgnomAD_AF'
              '\tCLIN_SIG\tCADD_PHRED\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, tot_ct) = (record.contig, str(record.pos), record.ref, record.alts[0],
                                                  str(record.info['TR']), str(record.info['TC']))
        ann_list = [_.split('|') for _ in record.info['ANN']]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, tot_ct, ann_list, desired, out, ref_flag)
    out.close()
    return 0
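
The header parsing above relies on the Description text of the VEP ANN INFO field, which lists the pipe-separated sub-fields of each annotation entry. A standalone sketch of the same index-building step (the field names here are illustrative, not the full VEP set):

desc = '"Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Feature"'
desc = desc.strip('"').replace('Consequence annotations from Ensembl VEP. Format: ', '')
field_index = {name: i for i, name in enumerate(desc.split('|'))}
print(field_index['IMPACT'])  # -> 2: position of IMPACT within each ANN entry
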
Example #5
def gen_report(vcf):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)

    out = open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"').replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(ann_size):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
            '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref, record.alts[0],
                                str(record.info['MINCOV']), str(record.info['ALTCOV']), str(record.info['COVRATIO']))
        ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def annotate_false_pos(folder, sample):
    """
    Get information for any false positive results - returns basic variant info plus quality, genotype and
    vcf depths (total and allelic depth where available)
    :param folder: Folder containing output from bcftools isec
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    false_pos = VariantFile(folder + '/0001.vcf')
    num_pos = len(list(false_pos.fetch()))
    print(num_pos)

    variants = []

    if num_pos > 0:
        print('false positives')
        for rec in false_pos.fetch():
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[sample]['GT']
            if 'AD' in rec.samples[sample]:
                allelic_depth = rec.samples[sample]['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = rec.samples[sample]['DP']
            variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                       'GT':genotype, 'vcf_depth':{'DP':total_depth, 'AD':allelic_depth}}

            variants.append(variant)
    else:
        print('no false positives')

    return variants
def gen_report(vcf, out, c, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.substitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)

    out = open(parts[0] + '.substitutions.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"').replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(ann_size):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
              'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
              'codon_change\tamino_acid_change\ton/off-target\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
        ann_list = [_.split('|') for _ in record.info['ANN']]
        tflag = 'NA'
        if c != 'n':
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue
        output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, out, ref_flag)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
Example #8
def get_variants(filename):
    """
    Function that parse the sample VCF file. This function get snp found in the
    representative genes, and uses the tag 'AD', a list containing the number of
    read mapped for reference and alternative variant.
    
    Args: 
        filename [string] = sample filename
        
    Returns:
        var [dict] = contain snp variation informations of representative genes 
                    key : representative gene name
                    value : variant [dict] containing snp position as key,
                            and a list of (nucleotide variant, aligned reads 
                            number)
    """
    # open VCF file
    vcf = VariantFile(filename)
    # initialise
    var = {}
    flag = 0
    
    for rec in vcf.fetch():
        # only for the first record, set variable name
        if flag == 0:
            name = rec.chrom # rec.chrom is the representative gene name
            variant = defaultdict(list)
            flag = 1
        # if snps are found in another representative gene
        if rec.chrom != name:
            var[name] = variant # store the variant
            name = rec.chrom # change the representative gene name
            variant = defaultdict(list) # create a new variant dictionary
        # read the snp information
        for gene, obj in rec.samples.items():
            i = 0
            if 'AD' in obj:
                for nb in obj['AD']:
                    if nb != 0:
                        variant[rec.pos].append((rec.alleles[i], nb))
                    i += 1
    
    # store the variants for the final representative gene (otherwise the
    # last gene parsed would be dropped)
    if flag:
        var[name] = variant

    return var
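
A usage sketch for get_variants (the VCF path is hypothetical; the function itself needs "from pysam import VariantFile" and "from collections import defaultdict"). The result maps each representative gene to {position: [(allele, read_count), ...]}:

variants = get_variants('sample.vcf.gz')        # hypothetical file
for gene, positions in variants.items():
    for pos, alleles in positions.items():
        print(gene, pos, alleles)
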
Example #9
def parse_vcf(filename):
    """
    Function that parse a database VCF file obtained by a variant calling using
    a multiple alignment file. It parses the VCF file and output a matrix 
    containing all the variant at each snp position of all the clustered genes
    
    Args :
        filename [string] = VCF filename
        
    Returns:
        name [string] = representative gene name 
        index [dict] = a dictionary containing index of snp position in list:
                       key : snp position
                       value : index of the snp in the list of the dict versions
        matrix [dict] = dictionary containing all variations
                          key : clustered gene
                          value : list of the nucleotide variation
    """
    # open VCF file
    vcf = VariantFile(filename)
    # initialise
    index = {}  
    matrix = defaultdict(list)
    i = 0 # index of snp
    name = 0

    for rec in vcf.fetch():
        name = rec.chrom # representative gene name
        # get the snp position (rec.pos) and his index (i)
        index[rec.pos] = i  
        i += 1
        # build the matrix for the cluster: gene iterates over the clustered
        # genes, obj holds that gene's information about the snp
        for gene, obj in rec.samples.items():
            snp = obj.allele_indices[0]
            if snp != -1:
                matrix[gene].append(rec.alleles[snp])
            else: # deletion
                matrix[gene].append('')
            
    return name, [index, matrix]
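
A matching usage sketch (file name hypothetical): parse_vcf returns the representative gene name plus the position index and per-gene variation matrix built above:

name, (index, matrix) = parse_vcf('cluster.vcf.gz')   # hypothetical file
print(name, len(index), 'snp positions')
for gene, alleles in matrix.items():
    print(gene, ''.join(alleles))
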
Example #10
  def variants_missing_vcf(self,vcf_file):
    cat_chroms = set(self.data[self.col_chr].unique())
    cat_variants = set(self.data[self.col_epacts].unique())

    vcf_variants = set()
    for cat_chrom in cat_chroms:
      print("Checking chromosome %s..." % str(cat_chrom), file=sys.stderr)

      if '.json' in vcf_file:
        import json
        with open(vcf_file) as jsin:
          vcf_dict = json.load(jsin)

        vcf = vcf_dict.get(cat_chrom)
        if vcf is None:
          warning("GWAS catalog has variants on chromosome %s, but could not find this chromosome in your VCF (or JSON) file: %s" % (cat_chrom,vcf_file))
          continue
      else:
        vcf = vcf_file

      vcf_pysam = VariantFile(vcf)

      # Subset catalog to chromosome
      df_cat_for_chrom = self.data.query("{} == '{}'".format(self.col_chr,cat_chrom))

      # Catalog has repeated rows for variants depending on the number of traits * citations
      # But we just need each variant once
      df_cat_for_chrom = df_cat_for_chrom.drop_duplicates(self.col_epacts)

      # Loop over subsetted catalog, check if variant is in VCF
      for idx, row in df_cat_for_chrom.iterrows():
        chrom, pos = row[self.col_chr], row[self.col_pos]

        # pysam fetch uses 0-based half-open coordinates; the catalog position
        # is assumed to be 1-based
        for rec in vcf_pysam.fetch(chrom, pos - 1, pos):
          epacts = "{}:{}_{}/{}".format(rec.chrom, rec.pos, rec.ref, rec.alts[0])
          vcf_variants.add(epacts)

    missing_variants = cat_variants.difference(vcf_variants)
    missing_rows = self.data[self.data[self.col_epacts].isin(missing_variants)]

    return missing_rows
Example #11
#!/group/ctan/anaconda3/envs/snakemake/bin/python

import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples= ["AC","BD","Commander","EC2.1","EC2.2","EC7.1","EC7.2","Fleet","Hindmarsh","La_Trobe","Scope","Vlamingh","W1","WI4304","X1","barke","bowman","haruna_Nijo","igri","spontaneum_B1k-04-12"]
smps = [samples[3],samples[4],samples[5],samples[6]]

ibcf = VariantFile(sys.argv[1])
#obcf = VariantFile(sys.argv[2],'w',header=ibcf.header)
ofile = open(sys.argv[2],"w")
hd = "\t".join(["#chr","pos","len","ref","ref_num","alt","alt_num"]) + "\n"
ofile.write(hd)
for one in ibcf.fetch("chr3H"):
    record = samvcf(one)
    if record.flt and record.diff_repeat(smps):
        # append the GT sum and AD string for each of the four selected samples
        opt = record.opt[:]
        for s in smps:
            opt.append(str(sum(one.samples[s]['GT'])))
            opt.append(",".join(map(str, one.samples[s]['AD'])))
        ofile.write("\t".join(opt) + "\n")
Example #12
def force_calling(bam_path, ivcf_path, output_path, sigs_dir,
                  max_cluster_bias_dict, threshold_gloab_dict, gt_round,
                  threads):
    logging.info('Check the parameter -Ivcf: OK.')
    logging.info('Enable to perform force calling.')
    #print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    sv_dict = dict()
    #'''
    for sv_type in ["DEL", "DUP"]:
        sv_dict[sv_type] = parse_sigs(sv_type, sigs_dir)
    sv_dict['INS'] = parse_inssigs(sigs_dir)
    sv_dict['INV'] = parse_invsigs(sigs_dir)
    sv_dict['TRA'] = parse_trasigs(sigs_dir)
    #'''
    vcf_reader = VariantFile(ivcf_path, 'r')
    row_count = 0
    for record in vcf_reader.fetch():
        row_count += 1
    idx = -1
    #gt_list = Manager().list([[] for x in range(row_count)])
    gt_list = list()
    result = []
    process_pool = Pool(processes=threads)
    vcf_reader = VariantFile(ivcf_path, 'r')
    for record in vcf_reader.fetch():
        idx += 1
        sv_type, chrom, sv_chr2, pos, sv_end, sv_strand = parse_record(record)
        if sv_type not in ["DEL", "INS", "DUP", "INV", "TRA"]:
            continue
        search_id_list = []
        if sv_type == 'TRA' and 'TRA' in sv_dict and chrom in sv_dict[
                'TRA'] and sv_chr2 in sv_dict['TRA'][chrom]:
            search_id_list = sv_dict['TRA'][chrom][sv_chr2]
        elif sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict['INV']:
            if sv_strand in sv_dict['INV'][chrom]:
                search_id_list = sv_dict['INV'][chrom][sv_strand]
            else:
                for strand_iter in sv_dict['INV'][chrom]:
                    sv_strand = strand_iter
                    search_id_list = sv_dict['INV'][chrom][strand_iter]
                    break
        elif sv_type != 'TRA' and sv_type != 'INV' and sv_type in sv_dict and chrom in sv_dict[
                sv_type]:
            search_id_list = sv_dict[sv_type][chrom]
        max_cluster_bias = 0
        if sv_type == 'INS' or sv_type == 'DEL':
            read_id_list, max_cluster_bias, indel_seq, CIPOS, CILEN = find_in_indel_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos,
                sv_end, threshold_gloab_dict[sv_type])
        else:
            read_id_list, max_cluster_bias = find_in_list(
                sv_type, search_id_list, max_cluster_bias_dict[sv_type], pos,
                sv_end)
            CIPOS = '.,.'
            CILEN = '.,.'
        if sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict[
                'INV'] and len(read_id_list) == 0:
            for strand_iter in sv_dict['INV'][chrom]:
                if strand_iter != sv_strand:
                    search_id_list = sv_dict['INV'][chrom][strand_iter]
                    read_id_list, max_cluster_bias = find_in_list(
                        sv_type, search_id_list,
                        max_cluster_bias_dict[sv_type], pos, sv_end)
                    if len(read_id_list) != 0:
                        sv_strand = strand_iter
                        break
        #print(read_id_list)
        if sv_type == 'INS':
            max_cluster_bias = max(1000, max_cluster_bias)
        else:
            max_cluster_bias = max(max_cluster_bias_dict[sv_type],
                                   max_cluster_bias)
        para = Para(record, CIPOS, CILEN)
        '''
        if sv_type == 'INS':
            fx_para = [([bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'INS')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'DEL':
            fx_para = [([bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'DEL')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'INV':
            fx_para = [([bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'INV')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'DUP':
            fx_para = [([bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'DUP')]
            gt_list.append(call_gt_wrapper(fx_para))
        if sv_type == 'TRA':
            fx_para = [([bam_path, pos, sv_end, chrom, sv_chr2, read_id_list, max_cluster_bias, gt_round], idx, row_count, para, sv_strand, 'TRA')]
            gt_list.append(call_gt_wrapper(fx_para))
        '''
        #'''
        if sv_type == 'INS':
            fx_para = [([
                bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, indel_seq, 'INS')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'DEL':
            fx_para = [([
                bam_path, pos, chrom, read_id_list, max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, '<DEL>', 'DEL')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'INV':
            fx_para = [([
                bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias,
                gt_round
            ], idx, row_count, para, sv_strand, '<INV>', 'INV')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'DUP':
            fx_para = [([
                bam_path, pos, sv_end, chrom, read_id_list, max_cluster_bias,
                gt_round
            ], idx, row_count, para, sv_strand, '<DUP>', 'DUP')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        if sv_type == 'TRA':
            fx_para = [([
                bam_path, pos, sv_end, chrom, sv_chr2, read_id_list,
                max_cluster_bias, gt_round
            ], idx, row_count, para, sv_strand, '<TRA>', 'TRA')]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))
        #'''
    process_pool.close()
    process_pool.join()

    semi_result = list()
    for item in gt_list:
        try:
            semi_result.append(item.get()[0])
        except Exception:
            # genotyping failed for this record; skip it
            pass
    logging.info('Finished force calling.')
    return semi_result
Example #13
def Main():
    parser = argparse.ArgumentParser(
        description="loading vcf and interaction files")
    parser.add_argument("interactionfile",
                        help="Interaction calls from HiCap method")
    parser.add_argument(
        "vcfile", help="Variant calls from either HiCap or sequencing samples")
    parser.add_argument("-o",
                        "--output",
                        help="output of interaction files",
                        action='store',
                        default=None)
    args = parser.parse_args()
    Vcfin = VariantFile(args.vcfile)

    result_title = [
        "RefSeqName", "TranscriptName", "Feature_ID", "Feature_Chr",
        "Feature_Start", "Feature_End", "Annotation", "Strand",
        "Interactor_Chr", "Interactor_Start", "Interactor_End", "Distance",
        "SNPs", "SNP_ID", "Ind_count", "Swed_Freq", "TAV2431", "TAV2515",
        "TAV2709", "BAV2375", "BAV2424", "BAV2714"
    ]

    with open(args.output, "w") as output_file:
        output_file.write("\t".join(result_title) + "\n")

    with open(args.interactionfile, 'r') as f:
        next(f)

        for line in f:
            line = line.strip().split("\t")
            all_fields = tuple(line[0:12])
            region = (line[8][3:], line[9], line[10])  # strip the 'chr' prefix

            TAV2431 = [line[12], line[13]]
            TAV2515 = [line[15], line[16]]
            TAV2709 = [line[18], line[19]]
            BAV2375 = [line[21], line[22]]
            BAV2424 = [line[24], line[25]]
            BAV2714 = [line[27], line[28]]

            interaction_sample = [
                TAV2431, TAV2515, TAV2709, BAV2375, BAV2424, BAV2714
            ]
            interaction_binary = int2binary(interaction_sample)

            sample_list = [3, 4, 5, 0, 1, 2]
            for rec in Vcfin.fetch(region[0], int(region[1]), int(region[2])):
                genotype_binary = []
                for test in rec.samples.values():

                    genotype = "/".join([str(x) for x in test["GT"]])
                    if genotype == "None/None":
                        continue
                    elif genotype == "0/1" or genotype == "1/1":
                        genotype_binary.append("1")
                    elif genotype == "0/0":
                        genotype_binary.append("0")

                    swed_freq = "0"
                    for f, v in rec.info.items():
                        if pattern.match(f):
                            swed_freq = v

                    if rec.id is None:
                        rec.id = "X"

                sorted_genotype = [
                    x for _, x in sorted(zip(sample_list, genotype_binary))
                ]
                zip_array = list(zip(interaction_binary, sorted_genotype))

                count = 0
                for a, b in zip_array:
                    if a == b:
                        count = count + 1

                if count == 6:
                    allele = "|".join(rec.alleles)
                    count_int_allele = 0
                    for a, b in zip_array:
                        if (a, b) == ('1', '1'):
                            count_int_allele = count_int_allele + 1

                    changed_freq = "".join(str(x) for x in swed_freq)
                    unzip_array = ["|".join(x) for x in zip_array]
                    snp = (line[8], rec.start, rec.stop, allele,
                           list(rec.filter.keys())[0])
                    str_snp = "_".join(str(x) for x in snp)

                    result = "\t".join(
                        all_fields
                    ), str_snp, rec.id, count_int_allele, changed_freq, "\t".join(
                        unzip_array)
                    combined_result = "\t".join(str(x) for x in result)

                    with open(args.output, "a") as output_file:
                        output_file.write(combined_result + "\n")
Example #14
    # Column names for output
    writer = csv.writer(ofile)
    writer.writerow([
        "chr", "pos", "reference", "call", "methylated", "unmethylated",
        "strand"
    ])

    # The things in rec.format
    # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX
    # 480 minutes per one bcf--unacceptable!!!
    # 7 minutes for chrom 22--using 4 threads
    # 7 minutes for chrom 22--using 8 threads

    # Iterator
    #I = infile.fetch('chr1', 100000, 110000)
    I = infile.fetch('chr22')

    # Iterate two records at a time if merging...
    #for rec1, rec2 in zip_longest(*[I]*2):
    for rec2 in I:
        #data_1 = list(list(rec1.samples.values())[0].items())
        # materialise the items view so it can be indexed under Python 3
        data_2 = list(list(rec2.samples.values())[0].items())

        # rec2 should be the base. Is it CpG? Then do the conditional tests
        # rec2 can be negative strand (and should still be written out)
        if (data_2[10][1] == 'Y'):
            m2, um2 = get_methylation_estimate(data_2[7][1], data_2[9][1])
            # This is the merge condition. The records need to be one position away from each other
            # They need to both be CpGs
            # They need for the first position on
            #if (rec2.pos - rec1.pos == 1 and data_1[10][1] == 'Y' and data_1[9][1] == "+" and data_2[9][1] == "-"):
Example #15
    def main(self, args):
        command.Command.main(self, args)
        self.validate(args)
        for i in [1, 2]:
            attr = "pop%d" % i
            pid, ary = getattr(args, attr)
            if len(ary) == 1 and ary[0][0] == "@":
                setattr(args, attr, SampleList(
                    pid, open(ary[0][1:], "rt").read().strip().split("\n")))
        pop_d = dict([args.pop1, args.pop2])
        for pid in pop_d:
            if pop_d[pid]:
                c = Counter(pop_d[pid])
                if max(c.values()) > 1:
                    raise RuntimeError(
                        "Population %s has duplicated samples: %s" %
                        (pid, [item for item in c.items() if item[1] > 1]))
        dist = [[], []]
        if not args.d:
            first_sid = args.pop1.samples[0]
            args.d = [first_sid] * 2
        args.d = [args.d[0] + ":0", args.d[1] + ":1"]
        all_samples = set(args.pop1.samples) | set(args.pop2.samples)
        for sid_i in args.d:
            sid, i = sid_i.split(":")
            i = int(i)
            if sid not in all_samples:
                raise RuntimeError("%s is not in the sample list" % sid)
            if sid in args.pop1.samples:
                d = dist[0]
            else:
                assert sid in args.pop2.samples
                d = dist[1]
            d.append((sid, i))
        undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
                  for p, d in zip((args.pop1, args.pop2), dist)]
        npop = 1

        def print_pop(i):
            logger.info("Population %d:" % i)
            logger.info("Distinguished lineages: " +
                        ", ".join("%s:%d" % t for t in dist[i - 1]))
            logger.info("Undistinguished lineages: " +
                        ", ".join("%s:%d" % t for t in undist[i - 1]))
        print_pop(1)
        if args.pop2.pid is not None:
            npop = 2
            common = set(args.pop1.samples) & set(args.pop2.samples)
            if common:
                logger.error("Populations 1 and 2 should be disjoint, "
                             "but both contain " + ", ".join(common))
                sys.exit(1)
            print_pop(2)

        # Start parsing
        vcf = VariantFile(args.vcf)
        with optional_gzip(args.out, "wt") as out:
            samples = list(vcf.header.samples)
            dist = dist[:npop]
            undist = undist[:npop]
            if not set([dd[0] for d in dist for dd in d]) <= set(samples):
                raise RuntimeError("Distinguished lineages not found in data?")
            missing = [s for u in undist for s, _ in u if s not in samples]
            if missing:
                msg = "The following samples were not found in the data: %s. " % ", ".join(
                    missing)
                if args.ignore_missing:
                    logger.warning(msg)
                else:
                    msg += "If you want to continue without these samples, use --ignore-missing."
                    raise RuntimeError(msg)
            undist = [[t for t in u if t[0] not in missing] for u in undist]

            # Write header
            pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
            out.write("# SMC++ ")
            json.dump({"version": version, "pids": pids,
                       "undist": undist, "dist": dist}, out)
            out.write("\n")
            na = list(map(len, dist))
            nb = list(map(len, undist))

            # function to convert a VCF record to our format:
            # <span, dist gt, # undist gt, # undist, [...]>
            def rec2gt(rec):
                ref = rec.alleles[0]
                da = [[rec.samples[d].alleles[i]
                       for d, i in di] for di in dist]
                a = [sum([x != ref for x in d])
                     if None not in d else -1 for d in da]
                bs = [[rec.samples[d].alleles[i] != ref
                       for d, i in un
                       if rec.samples[d].alleles[i] is not None]
                      for un in undist]
                b = [sum(_) for _ in bs]
                nb = [len(_) for _ in bs]
                # Fold non-polymorphic (in subsample) sites
                if np.array_equal(b, nb) and np.array_equal(a, na):
                    a = [0] * len(a)
                    b = [0] * len(b)
                return list(sum(zip(a, b, nb), tuple()))

            try:
                region_iterator = vcf.fetch(contig=args.contig)
            except ValueError as e:
                logger.error("VCF reader threw an error: %s", e)
                logger.error("Make sure the VCF is indexed:")
                logger.error("")
                logger.error("    $ tabix %s", args.vcf)
                logger.error("")
                sys.exit(1)

            contig_length = args.length or vcf.header.contigs[args.contig].length
            if contig_length is None:
                logger.error("Failed to acquire contig length from VCF header. See the --length option.")
                sys.exit(1)
            if args.mask:
                mask_iterator = TabixFile(
                    args.mask).fetch(reference=args.contig)
                args.missing_cutoff = np.inf
            else:
                mask_iterator = iter([])
                if args.missing_cutoff is None:
                    args.missing_cutoff = np.inf
            mask_iterator = (x.split("\t") for x in mask_iterator)
            mask_iterator = ((x[0], int(x[1]), int(x[2]))
                             for x in mask_iterator)
            snps_only = (
                rec for rec in region_iterator if
                len(rec.alleles) <= 2 and
                all(len(a) == 1 for a in rec.alleles)
                )

            def interleaved():
                cmask = next(mask_iterator, None)
                csnp = next(snps_only, None)
                while cmask or csnp:
                    if cmask is None:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp is None:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        if csnp.pos < cmask[1]:
                            yield "snp", csnp
                            csnp = next(snps_only, None)
                        elif csnp.pos <= cmask[2]:
                            while csnp is not None and csnp.pos <= cmask[2]:
                                csnp = next(snps_only, None)
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)
                        else:
                            yield "mask", cmask
                            cmask = next(mask_iterator, None)

            abnb_miss = [-1, 0, 0] * len(nb)
            abnb_nonseg = sum([[0, 0, x] for x in nb], [])
            multiples = set()
            with RepeatingWriter(out) as rw, \
                    tqdm.tqdm(total=contig_length, unit='bases', unit_scale=True) as bar:
                def write(x):
                    if not write.first or not args.drop_first_last:
                        rw.write(x)
                    write.first = False
                write.first = True
                last_pos = 0
                for ty, rec in interleaved():
                    if ty == "mask":
                        span = rec[1] - last_pos
                        write([span] + abnb_nonseg)
                        write([rec[2] - rec[1] + 1] + abnb_miss)
                        last_pos = rec[2]
                        continue
                    bar.update(rec.pos - last_pos)
                    abnb = rec2gt(rec)
                    if rec.pos == last_pos:
                        multiples.add(rec.pos)
                        continue
                    span = rec.pos - last_pos - 1
                    if 1 <= span <= args.missing_cutoff:
                        write([span] + abnb_nonseg)
                    elif span > args.missing_cutoff:
                        write([span] + abnb_miss)
                    write([1] + abnb)
                    last_pos = rec.pos
                if not args.drop_first_last:
                    write([contig_length - last_pos] + abnb_nonseg)
            if multiples:
                # FIXME: what to do with multiple records at same site
                logger.warning(
                    "Multiple entries found at %d positions; skipped all but the first", len(multiples))
Example #16
from pysam import VariantFile
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

baseDir = "/scratch/users/fhol/elife_data/"
saveDir = "/scratch/users/fhol/elife_data/varfilesDENV01/"
dataDir = glob.glob(baseDir + '/10017006*')

for d in dataDir:
    filename = glob.glob(d + "/*.vcf")
    for i in filename:
        varFileName = os.path.basename(i)
        SNVs = VariantFile(i)
        # build the frame in one pass; DataFrame.append was removed in pandas 2.0
        rows = [[rec.pos, rec.info["AF"]] for rec in SNVs.fetch()]
        df = pd.DataFrame(rows, columns=['pos', 'af'])
        os.chdir(saveDir)
        df.to_pickle(os.path.splitext(varFileName)[0] + '_df.pkl')
def gen_report(vcf, c, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    call_type = 'snv'
    if re.search('indel', fn):
        out = open(parts[0] + '.indel.strelka.vep.prioritized_impact.report.xls', 'w')
        call_type = 'indel'
    else:
        out = open(parts[0] + '.snv.strelka.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"').replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(ann_size):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    if call_type == 'snv':
        out.write('chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
              'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
              'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n')
    else:
        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                  'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)

    for record in vcf_in.fetch():
        # dict contains what's different between strelka indel and snv reports
        (chrom, pos, ref, alt) = (record.contig, str(record.pos),
                                  record.ref, record.alts[0])
        if call_type == 'snv':
            # strelka per-base counts live in the AU/CU/GU/TU FORMAT fields;
            # index [0] takes the tier-1 count
            not_shared = {'norm_ref_ct': record.samples['NORMAL'][(record.ref + 'U')][0],
                          'norm_alt_ct': record.samples['NORMAL'][(record.alts[0] + 'U')][0],
                          'tum_ref_ct': record.samples['TUMOR'][(record.ref + 'U')][0],
                          'tum_alt_ct': record.samples['TUMOR'][(record.alts[0] + 'U')][0]}
        else:
            not_shared = {}
        ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
        tflag = 'NA'
        if c != 'n':
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue
        output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list, desired, tflag, out, ref_flag, call_type)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
Example #18
    def __init__(self, vcf_path):
        self.pairs = {}
        vcf = VariantFile(vcf_path)
        for rec in vcf.fetch():
            # keep only records on the primary chromosomes (chr1-22, X, Y)
            if re.match('^chr[0-9XY]+$', rec.chrom):
                self.add_entry(rec)
Exemple #19
0
def find_homopolymer_cases(vcf_file, fasta_file):
    small_variant_vcf = VariantFile(vcf_file)
    assembly_fasta_file = FastaFile(fasta_file)

    homopolymer_changes = 0
    total_records = 0
    for rec in small_variant_vcf.fetch():
        alternate_allele = rec.alleles[1]
        if len(alternate_allele) > 50:
            continue
        rec_len = rec.stop - rec.start
        if rec_len > 50:
            continue
        total_records += 1
        reference_start = rec.start - 200
        reference_end = rec.stop + 200
        reference_sequence = assembly_fasta_file.fetch(reference=rec.contig,
                                                       start=rec.start - 200,
                                                       end=rec.stop + 200)

        homopolymer_positions = [0] * len(reference_sequence)
        for i in range(0, len(reference_sequence)):
            if i == 0:
                homopolymer_positions[i] = reference_start
            elif reference_sequence[i] == reference_sequence[i - 1]:
                homopolymer_positions[i] = homopolymer_positions[i - 1]
            else:
                homopolymer_positions[i] = i + reference_start

        homopolymer_start = 0
        homopolymer_end = 0
        # print(rec, end='')
        for i in range(0, len(reference_sequence)):
            if i + reference_start == rec.start:
                homopolymer_start = homopolymer_positions[i]

            if i + reference_start > rec.stop and homopolymer_positions[
                    i] != homopolymer_start:
                homopolymer_end = max(homopolymer_positions[i], rec.stop + 1)
                break

        sequence_in_assembly = assembly_fasta_file.fetch(reference=rec.contig,
                                                         start=rec.start,
                                                         end=rec.stop + 1)

        polished_homopolymer = assembly_fasta_file.fetch(
            reference=rec.contig, start=homopolymer_start,
            end=rec.start) + alternate_allele + assembly_fasta_file.fetch(
                reference=rec.contig, start=rec.stop, end=homopolymer_end)
        sequence_in_polished = assembly_fasta_file.fetch(
            reference=rec.contig, start=rec.start,
            end=rec.start) + alternate_allele + assembly_fasta_file.fetch(
                reference=rec.contig, start=rec.stop, end=rec.stop + 1)

        homopolymer_record_end = homopolymer_start
        while reference_sequence[homopolymer_record_end -
                                 reference_start] == reference_sequence[
                                     homopolymer_start - reference_start]:
            homopolymer_record_end += 1
        # print(assembly_fasta_file.fetch(reference=rec.contig, start=rec.start-1, end=rec.start))
        # print(alternate_allele)
        # print(assembly_fasta_file.fetch(reference=rec.contig, start=rec.stop, end=rec.stop+10))
        # print("Assembly", sequence_in_assembly)
        # print("Polish", sequence_in_polished)
        # if rec.contig != 'chr22':
        #     continue

        # print(rec, end='')
        # print(sequence_in_assembly)
        # print(sequence_in_polished)
        true_homopolymer = True
        if len(sequence_in_assembly) > 1:
            start_index = 2
            while start_index < len(sequence_in_assembly):
                if sequence_in_assembly[start_index] != sequence_in_assembly[
                        start_index - 1]:
                    true_homopolymer = False
                    break
                start_index += 1

        if len(sequence_in_polished) > 1:
            start_index = 2
            while start_index < len(sequence_in_polished):
                if sequence_in_polished[start_index] != sequence_in_polished[
                        start_index - 1]:
                    true_homopolymer = False
                    break
                start_index += 1

        if true_homopolymer:
            print(rec.contig + "\t" + str(rec.start) + "\t" + str(rec.stop))
Example #20
def gen_report(vcf, out, c, ref_flag, cache):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    sample = parts[0]
    loc = 'LOGS/' + sample + '.substitutions.vep' + cache + '.priority_report.log'
    suffix = '.substitutions.vep' + cache + '.prioritized_impact.report.xls'
    log(loc,
        date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(
            loc,
            date_time() +
            'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)
    out_fn = sample + suffix
    out = open(out_fn, 'w')
    desired = {
        'Consequence': 0,
        'IMPACT': 0,
        'SYMBOL': 0,
        'Feature': 0,
        'HGVSg': 0,
        'Protein_position': 0,
        'Amino_acids': 0,
        'Codons': 0,
        'Existing_variation': 0,
        'gnomAD_AF': 0,
        'BIOTYPE': 0
    }

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.strip('"').replace(
        'Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(ann_size):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write(
        'chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
        'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\tHGVSg\ttx_id\teffect\timpact\t'
        'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref,
         alt) = record.contig, str(record.pos), record.ref, record.alts[0]
        ann_list = [_.split('|') for _ in record.info['ANN']]
        tflag = 'NA'
        if c != 'n':
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue
        output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict,
                              desired, tflag, out, ref_flag)

    out.close()
    log(
        loc,
        date_time() + 'Creating prioritized report for ' + vcf +
        ' complete!\n')
    return 0
from pysam import VariantFile
import sys

bcfin = VariantFile(sys.argv[1])
var_calls = []
min_score = 10000
max_score = -1
for i in range(1, 23):
    for rec in bcfin.fetch('chr%d' % i):
        if rec.ref in rec.alts:
            continue
        var_calls.append(rec)

header = '''##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
Example #22
class VcfAltParser():
    '''
    Class to iterate over a vcf in batches, returns strings of DNA-sequences.
    
    Loosely inspired by janggu.dna.VariantStreamer
    
    :ivar pysam.VariantFile vcf: VariantFile, the variant calls
    :ivar pysam.FastaFile ref: FastaFile, the reference sequence
    :ivar str idx_path: Path to the exported (compatible) variants in bed-format
    :ivar int bin_size: size of the DNA-sequences
    :ivar int n_variants: number of exported (compatible) variants
    '''
    def __init__(self,
                 ref_fa_path=None,
                 vcf_path=None,
                 idx_path=None,
                 batch_size=32,
                 bin_size=100,
                 tie='r'):
        '''
        :param str ref_fa_path: Path to indexed reference fasta
        :param str vcf_path: Path to indexed vcf
        :param str idx_path: Path to bed-file which will contain the names and locations of compatible variants
        :param int batch_size: Batch size
        :param int bin_size: Length of the DNA-sequences (centered on the start position of the variant)
        '''
        self.vcf = VariantFile(vcf_path)
        self.ref = FastaFile(ref_fa_path)
        assert os.path.isfile(
            ref_fa_path +
            '.fai'), 'Error: no index found for Fasta-file: {}'.format(
                ref_fa_path)
        self.idx_path = idx_path
        self.batch_size = batch_size
        self.bin_size = bin_size
        assert tie in ['l', 'r']
        self.tie = tie
        if not bin_size % 2:
            self.offset = 0 if tie == 'r' else 1
        else:
            self.offset = 0
        self.n_variants = self._initialize_index()
        self._verify_refmatch()

    def get_flanking_centered(self, variant):
        '''
        get flanking sequence, variant will be centered
        '''
        # centers the alt variant (note: ref centering not implemented)
        # flank
        lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        lenref, lenalt = len(variant.ref), len(variant.alts[0])
        d = lenalt - lenref
        # len diff
        ld, rd = ceil(d / 2), floor(d / 2)
        pos = variant.pos - 1  # 0-based
        left_seq = self.ref.fetch(variant.chrom, pos - lf + ld + self.offset,
                                  pos)
        right_seq = self.ref.fetch(variant.chrom, pos + lenref,
                                   pos + rf - rd + self.offset)
        return left_seq, right_seq

    def get_flanking_right(self, variant):
        '''
        get flanking sequence, variant will be aligned to the right of the center
        '''
        # aligns the variant to the right of the center
        # flank
        if self.bin_size % 2:
            rf = floor(self.bin_size / 2)
            lf = rf
        else:
            lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        lenref, lenalt = len(variant.ref), len(variant.alts[0])
        d = lenalt - lenref
        # len diff
        pos = variant.pos - 1  # 0-based
        left_seq = self.ref.fetch(variant.chrom, pos - lf + self.offset, pos)
        right_seq = self.ref.fetch(variant.chrom, pos + lenref,
                                   pos + rf - d + self.offset)
        return left_seq, right_seq

    def get_alt(self, variant):
        '''
        get alternative sequence for a variant
        '''
        l, r = self.get_flanking_right(variant)
        return (l + variant.alts[0] + r).upper()

    def get_ref(self, variant):
        '''
        get reference sequence for a variant
        '''
        if self.bin_size % 2:
            rf = floor(self.bin_size / 2)
            lf = rf
        else:
            lf, rf = ceil(self.bin_size / 2), floor(self.bin_size / 2)
        return self.ref.fetch(variant.chrom,
                              variant.pos - lf - 1 + self.offset,
                              variant.pos + rf - 1 + self.offset).upper()

    def is_compatible(self, variant):
        '''
        simple test for compatibility 
        '''
        if len(variant.alts[0]) >= (self.bin_size / 2):
            return False
        if len(variant.alts) > 1:
            return False
        return True

    def _verify_refmatch(self):
        variants = self.vcf.fetch()
        err = 0
        proc = 0
        varid = []
        ref = []
        true_ref = []
        for i in range(50000):
            try:
                variant = next(variants)
                proc += 1
            except StopIteration:
                break
            if variant.ref != self.ref.fetch(
                    variant.chrom, variant.pos - 1,
                    variant.pos - 1 + len(variant.ref)):
                err += 1
                ref.append(variant.ref)
                true_ref.append(
                    self.ref.fetch(variant.chrom, variant.pos - 1,
                                   variant.pos - 1 + len(variant.ref)))
                varid.append(variant.id)

        if err:
            print(
                'Warning: {} mismatches with reference based on the first {} variants.'
                .format(err, min(proc, 50000)))
            for i in range(min(err, 10)):
                print('variant: {}, vcf ref : {}, actual ref: {}'.format(
                    varid[i], ref[i], true_ref[i]))
            if err > 10:
                print('...')

    def _initialize_index(self):
        '''
        create a bed-file containing the variant locations and ids
        '''
        bedtool = BedTool(
            (Interval(record.chrom,
                      record.pos - 1,
                      record.pos - 1 + len(record.ref),
                      name='{}_{}>{}'.format(record.id, record.ref,
                                             record.alts[0]))
             for record in self.vcf.fetch() if self.is_compatible(record)))
        bedtool.saveas(self.idx_path)
        n = -1  # stays -1 if the index file turns out to be empty
        with open(self.idx_path, 'r') as infile:
            for n, _ in enumerate(infile):
                pass
        try:
            import subprocess
            subprocess.call(['gzip', '-f', self.idx_path])
            self.idx_path += '.gz'
        except OSError:  # gzip binary unavailable; keep the uncompressed index
            pass
        return n + 1

    def batch_generator(self):
        '''
        returns a generator that iterates over pairs of reference and alternative sequences
        '''
        variants = self.vcf.fetch()
        ibatch = 0
        try:
            while True:
                br = []
                ba = []
                ids = []
                while ibatch < self.batch_size:
                    variant = next(variants)
                    if not self.is_compatible(variant):
                        continue
                    ids.append(variant.id)
                    br.append(self.get_ref(variant))
                    ba.append(self.get_alt(variant))
                    ibatch += 1
                yield np.array(ids), (np.array(br), np.array(ba))
                ibatch = 0
        except StopIteration:
            yield np.array(ids), (np.array(br), np.array(ba))
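A hedged usage sketch for the generator above. The snippet begins mid-class, so the class name (VariantBatcher) and the constructor arguments below are assumptions; only bin_size, batch_size, offset, self.vcf and self.ref are visible in the original code.

batcher = VariantBatcher('calls.vcf.gz', 'reference.fa',
                         bin_size=200, batch_size=32)
for ids, (ref_seqs, alt_seqs) in batcher.batch_generator():
    # each batch pairs reference and alternative sequence windows by variant id
    print(len(ids), ref_seqs.shape, alt_seqs.shape)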
    pos = int(cols[pos_ind])
    ref = cols[ref_ind]
    alt = cols[alt_ind]
    marker_id = cols[marker_id_ind]
    rsid = "."
    infos_out = []
    allele = "."
    gene = "."
    annotation = "."
    hgvs_c = "."
    hgvs_p = "."

    #####
    # Get annotation
    #####
    for rec in vcf_handle.fetch(chrom, pos - 1, pos + 1):
        ####
        # Match by position
        ####
        if rec.chrom == chrom and rec.pos == pos:

            if "ANN" in rec.info:
                ann_field = rec.info["ANN"]
                # ANN may contain several annotations; use the first one
                ann = ann_field[0]
                ann_cols = ann.split("|")
                allele = ann_cols[0]
                gene = ann_cols[3]
                annotation = ann_cols[1]
                hgvs_c = ann_cols[9]
                hgvs_p = ann_cols[10]
Example #24
    dest='call_vcf',
    help='Called vcf to search for variants not found in reference vcf')
parser.add_argument(
    '-o',
    '--out-vcf',
    action='store',
    dest='out_vcf',
    help='Output vcf that is a subset of called vcf meeting criteria')

args = parser.parse_args()

ref_vcf = VariantFile(args.ref_vcf)
called_vcf = VariantFile(args.call_vcf, threads=4)
out_vcf = VariantFile(args.out_vcf, "w", header=called_vcf.header, threads=4)
x = 0
m = 1000
for record in called_vcf.fetch():
    if x % m == 0:
        sys.stderr.write('Processed ' + str(x) + " records\n")
        sys.stderr.flush()
    f = 0
    for comp in ref_vcf.fetch(record.contig, record.start, record.stop):
        if record.pos == comp.pos and record.alleles == comp.alleles:
            f = 1
            break
    if not f:
        out_vcf.write(record)
    x += 1
out_vcf.close()
ref_vcf.close()
called_vcf.close()
Example #25
#!/bin/python3.6
import sys
from pysam import VariantFile
import subprocess

vcf_in = VariantFile(sys.argv[1])
new_header = vcf_in.header
vcf_out = VariantFile(sys.argv[2], 'w', header=new_header)
sv_out = sys.argv[2] + '.svtypeDEL.txt'
indelArteFile = sys.argv[3]

for record in vcf_in.fetch():
    try:
        if record.info["SVTYPE"] == 'DEL':
            with open(sv_out, 'a+') as svtype_out:
                svtype_out.write(str(record))
    except KeyError:
        if len(record.ref) != len(record.alts[0]):  # if InDel
            if (
                "mutect2" in record.info["CALLERS"] or "vardict" in record.info["CALLERS"]
            ):  # Supported by either Mutect2 or VarDict, ok.
                # Check if indel artefact
                write = 1
                cmdIndelArte = 'grep -w ' + str(record.pos) + ' ' + indelArteFile
                artefactLines = (
                    subprocess.run(cmdIndelArte, stdout=subprocess.PIPE, shell=True).stdout.decode('utf-8').strip()
                )
                for artefactLine in artefactLines.split("\n"):
                    if (
Example #26
def run_process(opts, inputvcf):
    reference = opts.reference
    outputvcf = opts.output
    infoname = "HOMOPOLYX"
    maxbp = opts.maxpolypadding
    minbp = opts.minpolybp

    # STDERR
    sys.stderr.write("Maximum basepair of reference region around variant : " +
                     str(maxbp) + "\n")
    sys.stderr.write("Minumum basepair of homopolymer detection : " +
                     str(minbp) + "\n")

    # Load Reference Fasta
    genome = FastaFile(reference)

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.info, infoname):
        vcf_in.header.info.add(infoname, ".", "String",
                               "Homopolymer Basepair Count")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    # Found count init
    homopolymer_cnt = 0

    # Fetch VCF Record
    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        info_value_list = list()
        for alt in alts:
            ret = ngb_functions.pairdiff(ref, alt)
            if (ret['variant_type'] == 'ins' or ret['variant_type']
                    == 'del') and ret['diff_basepair_composition_count'] == 1:
                diffbasepair = ret['diff_basepair_composition'][0]

                match_cnt = 0
                around_sequence = (genome.fetch(chrom, pos,
                                                pos + maxbp)).upper()
                for seq in around_sequence:
                    if diffbasepair == seq:
                        match_cnt += 1
                    else:
                        break

                if match_cnt >= int(minbp):
                    info_value_list.append(match_cnt)

        if info_value_list != []:
            info_value = ','.join(str(e) for e in info_value_list)
            record.info[infoname] = info_value
            homopolymer_cnt += 1

        vcf_out.write(record)

    sys.stderr.write("Found homopolymer(s) : " + str(homopolymer_cnt) + "\n")
def find_dimer_repeats(bam_file, vcf_file, fasta_file):
    assembly_fasta_file = FastaFile(fasta_file)
    small_variant_vcf = VariantFile(vcf_file)
    samfile = pysam.AlignmentFile(bam_file, "rb")

    for rec in small_variant_vcf.fetch():
        alternate_allele = rec.alleles[1]
        if len(alternate_allele) > 50:
            continue
        rec_len = rec.stop - rec.start
        if rec_len > 20:
            continue

        reference_sequence = assembly_fasta_file.fetch(reference=rec.contig,
                                                       start=rec.start,
                                                       end=rec.stop + 200)

        in_dimer = False
        end_index = 1
        dimer_base = '**'
        reference_dimer_length = 0
        for i in range(len(reference_sequence) - 1):
            if reference_sequence[i] != reference_sequence[i + 1]:
                dimer_base = reference_sequence[i] + reference_sequence[i + 1]
                end_index = extend_dimers(reference_sequence, dimer_base, i)
                reference_dimer_length = int((end_index - i) / 2)
                if i == 1 and reference_dimer_length > 1:
                    in_dimer = True
                    break

        if not in_dimer:
            continue

        all_reads = samfile.fetch(rec.contig, rec.start - 10,
                                  rec.start + end_index)

        read_dimers = []
        for read in all_reads:
            aligned_pairs = read.get_aligned_pairs()

            read_start_index = -1
            for index, position in aligned_pairs:
                if index is None:
                    continue
                if position == rec.start - 1:
                    read_start_index = index + 2
                    break

            if read_start_index < 0:
                continue
            if read.query_sequence is None:
                continue
            read_end_index = read_start_index + end_index
            if read_start_index >= len(
                    read.query_sequence) or read_end_index >= len(
                        read.query_sequence):
                continue

            read_sequence = read.query_sequence[read_start_index:]

            read_dimer_base = read_sequence[0:2]
            if read_dimer_base != dimer_base:
                continue

            read_end_index_late = extend_dimers(read_sequence, dimer_base, 0)
            read_dimer_length = int((read_end_index_late - 0) / 2)
            read_dimers.append(read_dimer_length)

        if len(read_dimers) == 0:
            continue
        print(
            str(rec.contig) + "\t" + str(rec.start) + "\t" +
            str(rec.start + end_index) + "\t" + str(reference_dimer_length) +
            "\t" + str(','.join([str(x) for x in read_dimers])))
Example #28
class AnnotateHelper:
    def __init__(self):
        self._gene_database = DataBase(settings.GENE_DATABASE)
        self._omim_gene_database = DataBase(settings.OMIM_GENE_DATABASE)
        self._func_region_database = DataBase(settings.FUNC_REGION_DATABASE)
        self._hi_gene_database = DataBase(settings.HI_GENE_DATABASE)
        self._hi_exon_database = DataBase(settings.HI_EXON_DATABASE)
        self._hi_cds_database = DataBase(settings.HI_CDS_DATABASE)
        self._clinvar_pathogenic_database = VariantFile(
            settings.CLINVAR_PATHOGENIC_DATABASE)
        self._uhi_gene_database = DataBase(settings.UHI_GENE_DATABASE)
        self._hi_region_database = DataBase(settings.HI_REGION_DATABASE)
        self._uhi_region_database = DataBase(settings.UHI_REGION_DATABASE)
        self._decipher_gene_database = DataBase(
            settings.DECIPHER_GENE_DATABASE)
        self._ts_gene_database = DataBase(settings.TS_GENE_DATABASE)
        self._ts_region_database = DataBase(settings.TS_REGION_DATABASE)
        self._uts_gene_database = DataBase(settings.UTS_GENE_DATABASE)
        self._uts_region_database = DataBase(settings.UTS_REGION_DATABASE)
        self._dgv_gain_database = DataBase(settings.DGV_GAIN_DATABASE)
        self._dgv_loss_database = DataBase(settings.DGV_LOSS_DATABASE)
        self._gnomad_del_database = DataBase(settings.GNOMAD_DEL_DATABASE)
        self._gnomad_dup_database = DataBase(settings.GNOMAD_DUP_DATABASE)
        self._cnv_syndrome_del_database = DataBase(
            settings.CNV_SYNDROME_DEL_DATABASE)
        self._cnv_syndrome_dup_database = DataBase(
            settings.CNV_SYNDROME_DUP_DATABASE)

    @staticmethod
    def _norm_chrom(ch):
        """
        normalize chromosome name, e.g. 2 -> chr2, 23 -> chrX
        :param ch: input chromosome name
        :return: normalized name
        >>> AnnotateHelper._norm_chrom(2)
        'chr2'
        >>> AnnotateHelper._norm_chrom('chr23')
        'chrX'
        """
        ch = str(ch).replace('chr', '')
        if ch == '23':
            return 'chrX'
        if ch == '24':
            return 'chrY'
        return f'chr{ch}'

    @staticmethod
    def _annotate_loss(**annotation):
        """
        计算拷贝数减少的CNV的证据项
        :param annotation: 已注释的CNV
        :return: 注释后的CNV
        """
        loss = dict()

        # Section 1

        if len(annotation['outer_overlap_genes']) + len(
                annotation['overlap_func_regions']) > 0:
            loss['1A'] = True
        else:
            loss['1B'] = True

        # Section 2

        # HI regions
        for region, overlap, coverage in annotation['overlap_hi_regions']:
            if coverage == 1:  # region fully covered
                loss['2A'] = True
            elif len(
                    set(gene.symbol
                        for gene, *_ in annotation['overlap_hi_genes'])) == 0:
                # no HI genes covered
                loss['2B'] = True

        # HI genes
        for gene, overlap, coverage in annotation['overlap_hi_genes']:
            if coverage == 1:  # gene fully covered
                loss['2A'] = True
            elif overlap < 1:  # CNV only partially overlaps the gene
                if any(exon.last_exon == 'True'
                       for exon, *_ in annotation['overlap_hi_exons'][
                           gene.gene_id]):  # covers the last exon?
                    if len(annotation['overlap_hi_exons'][gene.gene_id]) >= 2:
                        # covers at least two exons
                        loss['2D-4'] = True
                    elif gene.gene_id in annotation['overlap_hi_cds'] \
                            and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:  # covers CDS?
                        if len(annotation['variants']) > 0:  # pathogenic variants in the last exon?
                            loss['2D-2'] = True
                        else:  # no pathogenic variants in the last exon
                            loss['2D-3'] = True
                    else:
                        # does not cover the CDS region
                        loss['2D-1'] = True
                # last exon not covered
                elif gene.gene_id in annotation['overlap_hi_cds'] \
                        and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:  # covers the 5' CDS?
                    loss['2C-1'] = True
                else:  # 5' CDS not covered
                    loss['2C-2'] = True
            # CNV lies entirely within the gene
            else:
                cnv = CNVRecord(annotation['chromosome'],
                                annotation['inner_start'],
                                annotation['inner_end'], annotation['func'])
                tx = get_transcript(gene.transcript, transcripts)
                pvs1 = PVS1CNV(cnv, None, tx)
                loss['2E'] = True
                loss['pvs1'] = PVS1[pvs1.verify_DEL()[0]]

        # contains predicted HI genes
        if len(annotation['overlap_hi_genes']) + len(annotation['overlap_hi_regions']) == 0 \
                and len(annotation['overlap_decipher_genes']) > 0:
            loss['2H'] = True

        # falls entirely within a benign (uHI) gene
        for gene, overlap, coverage in annotation['overlap_uhi_genes']:
            if overlap == 1:
                loss['2F'] = True

        # falls within a benign (uHI) region
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        for region, overlap, coverage in annotation['overlap_uhi_regions']:
            if len(genes - set(region.genes.split(','))) > 0:
                loss['2G'] = True
            else:
                loss['2F'] = True

        # Section 3

        # number of genes covered
        gene_count = len(annotation['outer_overlap_genes'])
        if gene_count >= 35:
            loss['3C'] = True
        elif gene_count >= 25:
            loss['3B'] = True
        elif gene_count >= 0:
            loss['3A'] = True

        # Section 4

        # DGV gold-standard and gnomAD records
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        l, m = 0, 0
        for record, overlap, coverage in chain(
                annotation['dgv_loss_records'],
                annotation['gnomad_del_records']):
            if overlap == 1 and any(
                    float(v) >= 0.01 for f, v in record._asdict().items()
                    if f.startswith('af')):  # fully covers the query CNV at frequency >= 1%
                loss['4O'] = True
                break
            elif overlap >= 0.5 and len(genes -
                                        set(record.genes.split(','))) == 0:
                # overlaps the query CNV by >= 50% and covers all protein-coding genes
                if any(
                        float(v) < 0.01 for f, v in record._asdict().items()
                        if f.startswith('af')):
                    # frequency below 1%
                    m += 1
                else:
                    # frequency at or above 1%
                    l += 1
        else:
            if l > 0 and m == 0:  # some record at frequency >= 1% and none below
                loss['4O'] = True

        annotation['rules'] = loss
        return annotation

    @staticmethod
    def _annotate_gain(**annotation):
        """
        计算拷贝数减少的CNV的证据项
        :param annotation: 已注释的CNV
        :return: 注释后的CNV
        """
        gain = dict()

        # Section 1

        if len(annotation['outer_overlap_genes']) + len(
                annotation['overlap_func_regions']) > 0:
            gain['1A'] = True
        else:
            gain['1B'] = True

        # Section 2

        # TS regions
        for region, overlap, coverage in annotation['overlap_ts_regions']:
            if coverage == 1:  # whole region covered
                gain['2A'] = True
            elif len(
                    set(gene.symbol
                        for gene, *_ in annotation['overlap_ts_genes'])) == 0:
                # no TS genes covered
                gain['2B'] = True

        for gene, overlap, coverage in annotation['overlap_ts_genes']:
            # gene fully covered
            if coverage == 1:
                gain['2A'] = True

        # falls entirely within a benign (uTS) gene
        for gene, overlap, coverage in annotation['overlap_uts_genes']:
            if overlap == 1:
                gain['2D'] = True

        # falls within a benign (uTS) region
        for region, overlap, coverage in annotation['overlap_uts_regions']:
            genes = set(gene.symbol
                        for gene, *_ in annotation['inner_overlap_genes'])
            region_genes = set(region.genes.split(','))
            if overlap == coverage == 1:  # identical to the benign region
                gain['2C'] = True
            elif len(genes - region_genes) > 0:  # more protein-coding genes than the benign region
                gain['2G'] = True
            # disrupts a protein-coding gene
            elif any(c < 1 for *_, c in annotation['inner_overlap_genes']):
                gain['2E'] = True
            elif overlap == 1:  # fully contained in the benign region
                gain['2D'] = True
            else:
                gain['2F'] = True

        # HI genes
        hi_genes = set()
        for gene, overlap, coverage in annotation['overlap_hi_genes']:
            hi_genes.add(gene.symbol)
            if coverage == 1:  # gene fully covered
                gain['2H'] = True
            elif overlap == 1:  # both breakpoints inside the gene
                cnv = CNVRecord(annotation['chromosome'],
                                annotation['inner_start'],
                                annotation['inner_end'], annotation['func'])
                tx = get_transcript(gene.transcript, transcripts)
                pvs1 = PVS1CNV(cnv, None, tx)
                gain['2I'] = True
                gain['pvs1'] = PVS1[pvs1.verify_DUP()[0]]

        # non-HI genes broken by the CNV
        for gene, overlap, coverage in annotation['inner_overlap_genes']:
            if gene.symbol not in hi_genes and coverage != 1:
                gain['2L'] = True
                annotation['break_point_genes'].append(gene.symbol)

        # Section 3

        # number of genes covered
        gene_count = len(annotation['inner_overlap_genes'])
        if gene_count >= 50:
            gain['3C'] = True
        elif gene_count >= 35:
            gain['3B'] = True
        elif gene_count >= 0:
            gain['3A'] = True

        # Section 4

        # DGV gold-standard and gnomAD records
        genes = set(gene.symbol
                    for gene, *_ in annotation['outer_overlap_genes'])
        l, m = 0, 0
        for record, overlap, coverage in chain(
                annotation['dgv_gain_records'],
                annotation['gnomad_dup_records']):
            if overlap == 1 and any(
                    float(v) >= 0.01 for f, v in record._asdict().items()
                    if f.startswith('af')):  # fully covers the query CNV at frequency >= 1%
                gain['4O'] = True
                break
            elif overlap >= 0.5 and len(genes -
                                        set(record.genes.split(','))) == 0:
                # overlaps the query CNV by >= 50% and covers all protein-coding genes
                if any(
                        float(v) < 0.01 for f, v in record._asdict().items()
                        if f.startswith('af')):
                    # frequency below 1%
                    m += 1
                else:
                    # frequency at or above 1%
                    l += 1
        else:
            if l > 0 and m == 0:  # some record at frequency >= 1% and none below
                gain['4O'] = True

        annotation['rules'] = gain
        return annotation

    @staticmethod
    def merge_score(func, **rules):
        """
        整合所有证据项得分
        :param func: 变异类型
        :param rules: 证据项
        :return: 生成各证据项得分
        """
        groups = defaultdict(list)
        for rule, score in rules.items():
            try:  # 需要分组计分的证据项先收集起来
                groups[SCORE_GROUP[func][rule]].append(score)
            except KeyError:  # 无需分组计分的证据项直接计分
                yield score
        for _, scores in groups.items():  # 分组计分的证据项只计算最大分值
            yield max(scores)
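        # Toy illustration of the grouping (the SCORE_GROUP contents here are
        # invented for the example; real values come from settings):
        #   SCORE_GROUP = {'del': {'2C-1': 'sec2', '2C-2': 'sec2'}}
        #   list(merge_score('del', **{'1A': 0, '2C-1': 0.9, '2C-2': 0.45}))
        #   -> [0, 0.9]  # 0 from the ungrouped rule, max(0.9, 0.45) for 'sec2'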

    @staticmethod
    def judge(func, **rules):
        """
        判断给定的证据项组合最终的致病性
        :param func: 变异类型
        :param rules: 勾选的证据项
        :return: 证据项、得分和致病性
        """
        # 获取所有证据项得分
        # rules = {
        #     rule: settings.DEFAULT_SCORE[func][rule] for rule, check in rules.items() if check
        # }
        rules_value = {}
        for rule, check in rules.items():
            if check in PVS1.values():
                rules_value['pvs1'] = settings.DEFAULT_SCORE[func][check]
            elif check:
                rules_value[rule] = settings.DEFAULT_SCORE[func][rule]
        # 整合所有证据项得分
        score = sum(AnnotateHelper.merge_score(func, **rules_value))
        # 判断致病性
        for op, cutoff, level in PATHOGENICITY_LEVELS[:-1]:
            if op(score, cutoff):
                pathogenicity = level
                break
        else:
            pathogenicity = PATHOGENICITY_LEVELS[-1][2]
        return rules_value, score, pathogenicity

    def annotate(self, chromosome, start, end, func, error=0):
        """
        对给定CNV进行注释
        :param chromosome: 染色体编号
        :param start: 起始位置
        :param end: 终止位置
        :param func: 变异类型
        :param error: 误差值
        :return: 注释结果
        """
        annotation = dict(chromosome=chromosome,
                          start=start,
                          end=end,
                          length=end - start,
                          error=error,
                          outer_start=start - error,
                          outer_end=end + error,
                          inner_start=start + error,
                          inner_end=end - error,
                          func=func,
                          break_point_genes=list())

        annotation['inner_overlap_genes'] = list(
            self._gene_database.overlap(
                chromosome,
                annotation['inner_start'],
                annotation['inner_end'],
            ))

        annotation['outer_overlap_genes'] = list(
            self._gene_database.overlap(
                chromosome,
                annotation['outer_start'],
                annotation['outer_end'],
            ))

        annotation['overlap_omim_genes'] = list(
            self._omim_gene_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_func_regions'] = list(
            self._func_region_database.overlap(chromosome,
                                               annotation['outer_start'],
                                               annotation['outer_end']))

        annotation['overlap_hi_genes'] = list(
            self._hi_gene_database.overlap(chromosome,
                                           annotation['inner_start'],
                                           annotation['inner_end']))

        annotation['overlap_hi_exons'] = self._hi_exon_database.overlap_groups(
            chromosome, annotation['inner_start'], annotation['inner_end'],
            lambda record: record[0].gene_id)

        annotation['overlap_hi_cds'] = self._hi_cds_database.overlap_groups(
            chromosome, annotation['inner_start'], annotation['inner_end'],
            lambda record: record[0].gene_id)

        try:
            annotation['variants'] = list(
                self._clinvar_pathogenic_database.fetch(
                    chromosome, annotation['inner_start'],
                    annotation['inner_end']))
        except ValueError:
            annotation['variants'] = []

        annotation['overlap_hi_regions'] = list(
            self._hi_region_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_decipher_genes'] = list(
            self._decipher_gene_database.overlap(chromosome,
                                                 annotation['inner_start'],
                                                 annotation['inner_end']))

        annotation['overlap_uhi_genes'] = list(
            self._uhi_gene_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['overlap_uhi_regions'] = list(
            self._uhi_region_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['overlap_ts_genes'] = list(
            self._ts_gene_database.overlap(chromosome,
                                           annotation['inner_start'],
                                           annotation['inner_end']))

        annotation['overlap_ts_regions'] = list(
            self._ts_region_database.overlap(chromosome,
                                             annotation['inner_start'],
                                             annotation['inner_end']))

        annotation['overlap_uts_genes'] = list(
            self._uts_gene_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['overlap_uts_regions'] = list(
            self._uts_region_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['dgv_gain_records'] = list(
            self._dgv_gain_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['dgv_loss_records'] = list(
            self._dgv_loss_database.overlap(chromosome,
                                            annotation['outer_start'],
                                            annotation['outer_end']))

        annotation['gnomad_del_records'] = list(
            self._gnomad_del_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['gnomad_dup_records'] = list(
            self._gnomad_dup_database.overlap(chromosome,
                                              annotation['outer_start'],
                                              annotation['outer_end']))

        annotation['cnv_syndrome_loss'] = list(
            self._cnv_syndrome_del_database.overlap(chromosome,
                                                    annotation['outer_start'],
                                                    annotation['outer_end']))
        annotation['cnv_syndrome_gain'] = list(
            self._cnv_syndrome_dup_database.overlap(chromosome,
                                                    annotation['outer_start'],
                                                    annotation['outer_end']))

        if func == 'del':
            annotation = self._annotate_loss(**annotation)
        elif func == 'dup':
            annotation = self._annotate_gain(**annotation)
        else:
            raise ValueError('Unknown func `{}`'.format(func))

        annotation['rules'], annotation['score'], annotation[
            'pathogenicity'] = self.judge(func, **annotation['rules'])
        # PVS1
        if func == 'del' and '2E' in annotation['rules'].keys():
            annotation['rules']['2E'] = annotation['rules'].get('pvs1')
        elif func == 'dup' and '2I' in annotation['rules'].keys():
            annotation['rules']['2I'] = annotation['rules'].get('pvs1')
        annotation['pvs1'] = annotation['rules'].pop('pvs1', None)

        return annotation

    def _serializer(self, anno_result):
        seri = {}
        seri['inner_gene'] = ','.join(
            x[0].symbol for x in anno_result['inner_overlap_genes'])
        seri['inner_omim_gene'] = ','.join(
            x[0].symbol for x in anno_result['overlap_omim_genes'])
        seri['HI_gene'] = ','.join(f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
                                   for x in anno_result['overlap_hi_genes'])
        seri['HI_region'] = SEP.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_hi_regions'])
        seri['TS_gene'] = ','.join(f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
                                   for x in anno_result['overlap_ts_genes'])
        seri['TS_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_ts_regions'])
        seri['Pred_HI_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_decipher_genes'])
        seri['auto_evidence'] = ','.join(sorted(anno_result['rules']))
        seri['auto_evidence_score'] = ','.join(
            f'{k}:{anno_result["rules"][k]}'
            for k in sorted(anno_result['rules']))
        seri['benign_hi_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uhi_genes'])
        seri['benign_hi_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uhi_regions'])
        seri['benign_ts_gene'] = ','.join(
            f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uts_genes'])
        seri['benign_ts_region'] = ','.join(
            f'{x[0].name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['overlap_uts_regions'])
        seri['dgv_loss_records'] = ','.join(
            f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['dgv_loss_records'])
        seri['dgv_gain_records'] = ','.join(
            f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['dgv_gain_records'])
        seri['gnomad_loss_records'] = ','.join(
            f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['gnomad_del_records'])
        seri['gnomad_gain_records'] = ','.join(
            f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['gnomad_dup_records'])
        seri['cnv_syndrome_gain'] = ','.join(
            f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['cnv_syndrome_gain'])
        seri['cnv_syndrome_loss'] = ','.join(
            f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})'
            for x in anno_result['cnv_syndrome_loss'])
        seri['auto_score'] = anno_result['score']
        seri['auto_pathogenicity'] = anno_result['pathogenicity']
        seri['pvs1'] = anno_result['pvs1']
        return seri

    def _seri_anno(self, seri: pd.Series) -> pd.Series:
        anno_result = self.annotate(seri['chr'], seri['start'], seri['end'],
                                    seri['type'], seri['error'])
        # pd.concat replaces Series.append, which newer pandas versions removed
        return pd.concat([
            seri,
            pd.Series(self._serializer(anno_result)).replace('', '-').fillna(
                DEFAULT_EMPTY_VALUE)
        ])

    def annotation_file(self, file_path, result_path):
        """
        annotate specified file, required columns: chr, start, end, type, error
        :param file_path: input file (TSV)
        :param result_path: result file path (TSV)
        :return: -
        """

        if file_path.endswith('xlsx'):
            input_df = pd.read_excel(file_path)
        else:
            input_df = pd.read_csv(file_path, sep='\t')
        input_df['chr'] = input_df['chr'].map(self._norm_chrom)
        try:
            from tqdm import tqdm
            tqdm.pandas()
            input_df = input_df.progress_apply(self._seri_anno, axis=1)
        except ImportError:
            input_df = input_df.apply(self._seri_anno, axis=1)
        if result_path.endswith('xlsx'):
            input_df.to_excel(result_path, index=False)
        else:
            input_df.to_csv(result_path, sep='\t', index=False)
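A hedged usage sketch for the helper class above; the file names are illustrative, and the input TSV must provide the chr, start, end, type and error columns noted in the annotation_file docstring.

helper = AnnotateHelper()
# 'type' is 'del' or 'dup'; 'error' is the breakpoint error margin in bp
helper.annotation_file('cnv_calls.tsv', 'cnv_calls.annotated.tsv')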
Example #29
def get_metrics(ftest, fbase_vcf, fbase_bed, contigs, variant_types, min_ro,
                padding, samples, metric_prefix, max_warnings):

    test_vcf = VariantFile(ftest)

    check_header(test_vcf, samples)

    genotyped = check_if_genotyped(test_vcf)
    has_vargq = check_if_vargq(test_vcf)
    collect_evidence = check_if_evidence(test_vcf)
    test_records = list(test_vcf.fetch())

    unfiltered_variant_type_counts = get_count_by_type(test_records,
                                                       variant_types)

    pass_filter_set = set(PASSING_FILTERS)
    pass_records = [
        r for r in test_records
        if ("PASS" in r.filter or len(set(r.filter) - pass_filter_set) == 0)
    ]

    error_counts = count_errors(test_records, contigs, max_warnings)

    variant_type_counts = get_count_by_type(pass_records, variant_types)
    size_counts = get_distributions_by_type(pass_records,
                                            variant_types,
                                            "SVLEN",
                                            SIZES,
                                            exclude_types=['BND'])

    metrics = add_error_count_metrics({}, error_counts, metric_prefix)

    if fbase_vcf is not None:
        base_vcf = VariantFile(fbase_vcf)
        if genotyped != check_if_genotyped(base_vcf):
            raise ValueError(
                "One of the vcfs seems to be genotyped but the other does not")
        if has_vargq != check_if_vargq(base_vcf):
            raise ValueError(
                "One of the vcfs has the varGQ field but the other does not")
        if collect_evidence != check_if_evidence(base_vcf):
            raise ValueError(
                "One of the vcfs has the EVIDENCE field but the other does not"
            )
        base_records = list(base_vcf.fetch())
        test_tree = iu.create_trees_from_records(test_records,
                                                 variant_types,
                                                 contigs,
                                                 padding=padding)
        base_tree = iu.create_trees_from_records(base_records,
                                                 variant_types,
                                                 contigs,
                                                 padding=padding)
        base_pass_records = [
            r for r in base_records
            if ("PASS" in r.filter or len(set(r.filter) -
                                          pass_filter_set) == 0)
        ]
        base_pass_tree = iu.create_trees_from_records(base_pass_records,
                                                      variant_types,
                                                      contigs,
                                                      padding=padding)
    elif fbase_bed is not None:
        base_records = parse_bed_file(fbase_bed)
        test_tree = iu.create_trees_from_records(test_records,
                                                 variant_types,
                                                 contigs,
                                                 padding=padding)
        base_tree = iu.create_trees_from_bed_records(base_records,
                                                     variant_types,
                                                     contigs,
                                                     padding=padding)
        base_pass_tree = None
    else:
        base_tree = None
        base_pass_tree = None

    if base_tree is not None:
        metrics, fp_intervals, fn_intervals = add_evaluation_metrics(
            metrics, test_tree, base_tree, variant_types, min_ro,
            metric_prefix)
    else:
        fp_intervals = None
        fn_intervals = None

    if base_pass_tree is not None:
        metrics, fp_intervals_pass, fn_intervals_pass = add_evaluation_metrics(
            metrics,
            test_tree,
            base_pass_tree,
            variant_types,
            min_ro,
            metric_prefix,
            metric_suffix="_pass")
    else:
        fp_intervals_pass = None
        fn_intervals_pass = None

    if genotyped:
        allele_frequencies, num_singletons = get_allele_frequency_counts(
            pass_records, test_vcf.header, variant_types)
    if has_vargq:
        vargq_counts = get_distributions_by_type(pass_records, variant_types,
                                                 "varGQ", VARGQ_BINS)
    if collect_evidence:
        evidence_counts = collect_evidence_fields(pass_records, variant_types)

    for type in variant_types:
        metrics[metric_prefix + VCF_METRIC_STR + type +
                "_count"] = unfiltered_variant_type_counts[type]
        metrics[metric_prefix + VCF_METRIC_STR + type +
                "_pass_count"] = variant_type_counts[type]
        if type != 'BND':
            metrics = add_binned_metrics(size_counts, SIZES, type, metrics,
                                         metric_prefix, "pass_size")
        if genotyped:
            metrics = add_binned_metrics(allele_frequencies, AF_BINS, type,
                                         metrics, metric_prefix, "pass_af")
            if type in num_singletons:
                metrics[metric_prefix + VCF_METRIC_STR + type +
                        "_pass_ac_1"] = num_singletons[type]
        if has_vargq:
            metrics = add_binned_metrics(vargq_counts, VARGQ_BINS, type,
                                         metrics, metric_prefix, "pass_vargq")
        if collect_evidence:
            metrics = add_metrics_from_dict(evidence_counts, type, metrics,
                                            metric_prefix, "pass_evidence")

    return metrics, fp_intervals, fn_intervals, fp_intervals_pass, fn_intervals_pass
def check_genotype(folder, sample, coverage_file):
    """
    Compares the genotype for all shared variants
    :param folder: location of results from the NGS analysis pipeline
    :param sample:  sample number (used in vcf file)
    :param coverage_file: file containing coverage information for each position in the panel
    :return: dictionary of number of matching variants and detailed information for any with mismatching genotypes
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')

    variants = []

    vars_giab = {}
    for rec in shared_giab.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if chrom not in vars_giab:
            vars_giab[chrom] = {}
        if pos not in vars_giab[chrom]:
            vars_giab[chrom][pos] = {}
        if alleles not in vars_giab[chrom][pos]:
            vars_giab[chrom][pos][alleles] = rec.samples['INTEGRATION']['GT']

    matching = 0
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if 'AD' in rec.samples[sample].keys():
            allelic_depth = rec.samples[sample]['AD']
        else:
            allelic_depth = 'N/A'
        total_depth = rec.samples[sample]['DP']
        giab_genotype = vars_giab[chrom][pos][alleles]
        if rec.samples[sample]['GT'] == giab_genotype:
            matching += 1
        elif (rec.samples[sample]['GT'][0] in (None, 1)
              and rec.samples[sample]['GT'][0] == giab_genotype[1]
              and rec.samples[sample]['GT'][1] == giab_genotype[0]):
            matching += 1
        elif rec.samples[sample]['GT'] == (0, 1) and giab_genotype == (1, 0):
            matching += 1
        elif rec.samples[sample]['GT'] == (1, 0) and giab_genotype == (0, 1):
            matching += 1
        else:
            if len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
                search = '\'' + rec.contig + '\s' + str(rec.pos - 1) + '\''
                command = 'grep ' + search + ' ' + coverage_file
                try:
                    line = subprocess.check_output(command, shell=True)
                except subprocess.CalledProcessError as e:
                    print 'Error executing command: ' + str(e.returncode)
                    exit(1)
                if line == '':
                    variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                               'GT': {sample: rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                               'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                               'coverage':{'total':'no coverage information', 'ref':'N/A', 'alt':'N/A'}}
                else:
                    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                    fields = line.split()
                    cov = fields[2]
                    ref_cov = fields[bases[rec.alleles[0]]]
                    alt_cov = fields[bases[rec.alleles[1]]]
                    variant = {'chrom':chrom, 'pos':pos, 'ref':alleles[0], 'alt':alleles[1], 'QUAL':rec.qual,
                                    'GT':{sample:rec.samples[sample]['GT'], 'GIAB':giab_genotype},
                                    'vcf_depth':{'DP':total_depth, 'AD':allelic_depth},
                                    'coverage':{'total':cov, 'ref':ref_cov, 'alt':alt_cov}}
            else:
                variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                           'GT': {sample: rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                           'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                           'coverage': {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}}
            variants.append(variant)
    print str(matching) + ' matching variants'
    results = {'matching':matching, 'mismatching':variants}
    print results
    return results
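A hedged usage sketch for check_genotype; the paths and sample name are illustrative. The folder is expected to hold bcftools isec output (the 0002.vcf and 0003.vcf files read above).

results = check_genotype('isec_output', 'SAMPLE1', 'per_base_coverage.txt')
# results['matching'] counts agreeing genotypes; results['mismatching'] lists details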
def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate)

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: List of variant dictionaries containing information on false negatives
    :rtype: List
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    num_neg = len(list(false_negs.fetch()))
    print(num_neg)

    variants = {'indels':[],'no_coverage':[],'evidence_of_alt':[],'false_neg':[]}

    if num_neg > 0:
        print('false negatives')
        for rec in false_negs.fetch():
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[ref_sample]['GT']
            if len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
                search = '\'' + rec.contig + '\s' + str(rec.pos - 1) + '\''
                command = 'grep ' + search + ' ' + coverage_file
                try:
                    # decode() so grep output is str, not bytes (Python 3)
                    line = subprocess.check_output(command, shell=True).decode()
                except subprocess.CalledProcessError as e:
                    print(command)
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)

                if line == '':
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'coverage':{'total':'no coverage information', 'ref':'N/A', 'alt':'N/A'}}
                    variants['no_coverage'].append(variant)
                else:
                    line = line.strip('\n')
                    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                    fields = line.split()
                    cov = fields[2]
                    ref_cov = fields[bases[rec.alleles[0]]]
                    alt_cov = fields[bases[rec.alleles[1]]]
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'coverage':{'total':cov, 'ref':ref_cov, 'alt':alt_cov}}

                    if int(cov) == 0:
                        variants['no_coverage'].append(variant)
                    elif int(alt_cov) != 0:
                        variants['evidence_of_alt'].append(variant)
                    else:
                        variants['false_neg'].append(variant)
            else:
                variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                            'GT':genotype, 'coverage':{'total':'indel: no coverage could be obtained', 'ref':'N/A',
                                                        'alt':'N/A'}}
                variants['indels'].append(variant)

    else:
        print('no false negatives')

    return variants
Example #32
def run_process(opts, inputvcf):
    outputvcf = opts.output

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    vcf_in.header.info.add(
        "TYPE", "A", "String",
        "The type of allele, either snp, ins, del, or complex.")

    # Add FORMAT to Header
    vcf_in.header.formats.add(
        "NGB_DP", "1", "Integer",
        "Approximate read depth; some reads may have been filtered")
    vcf_in.header.formats.add("NGB_AO", "A", "Integer",
                              "Alternate allele observation count")
    vcf_in.header.formats.add("NGB_RO", "1", "Integer",
                              "Reference allele observation count")
    vcf_in.header.formats.add(
        "NGB_VAF", "A", "Float",
        "Allele fractions of alternate alleles in the tumor")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        variant_type_list = list()
        ngb_dp_list = list()
        ngb_ao_list = list()
        ngb_vaf_list = list()
        tmp_dp = sum(record.samples[0]['AD'])
        tmp_ro = record.samples[0]['AD'][0]
        for n, alt in enumerate(alts):
            # Get Variant TYPE (freebayes format)
            ret = ngb_functions.pairdiff(ref, alt)
            vartype = ret['variant_type']
            variant_type_list.append(vartype)

            # Get DP,AO,RO,VAF
            tmp_vaf = float(record.samples[0]['AD'][(n + 1)]) / float(tmp_dp)
            tmp_ao = int(record.samples[0]['AD'][(n + 1)])
            ngb_dp_list.append(tmp_dp)
            ngb_ao_list.append(tmp_ao)
            ngb_vaf_list.append(tmp_vaf)

        if variant_type_list != []:
            record.info['TYPE'] = variant_type_list
        if ngb_dp_list != []:
            record.samples[0]["NGB_DP"] = ngb_dp_list[0]
            record.samples[0]["NGB_AO"] = tuple(ngb_ao_list)
            record.samples[0]["NGB_RO"] = tmp_ro
            record.samples[0]["NGB_VAF"] = tuple(ngb_vaf_list)

        # Write VCF
        vcf_out.write(record)
Example #33
#!/group/ctan/anaconda3/envs/snakemake/bin/python

import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = [
    "AC", "BD", "Commander", "EC2.1", "EC2.2", "EC7.1", "EC7.2", "Fleet",
    "Hindmarsh", "La_Trobe", "Scope", "Vlamingh", "W1", "WI4304", "X1",
    "barke", "bowman", "haruna_Nijo", "igri", "spontaneum_B1k-04-12"
]
grp1 = [samples[1], samples[10], samples[15], samples[17]]
grp2 = [samples[2], samples[8], samples[9], samples[11], samples[16]]

ibcf = VariantFile(sys.argv[1])
ofile = open(sys.argv[2], "w")
hd = ["#chr", "pos", "len", "ref", "alt", "gt_count"]
for one in grp1 + grp2:
    hd = hd + [one, "Reads"]
ofile.write("\t".join(hd) + "\n")
for one in ibcf.fetch("chr5H", 544822373, 546294499):
    record = samvcf(one)
    if record.diff_group(grp1, grp2):
        opt = record.opt + record.diff_group(grp1, grp2)
        ofile.write("\t".join(opt) + "\n")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='Results VCF to be compared', required=True)
    parser.add_argument('-bed', help='The reference BED file', required=True)
    parser.add_argument('-s', help='Sample ID in VCF', required=True)
    parser.add_argument('-out', help='The folder to put results files in', required=True)

    args = parser.parse_args()

    if args.out.endswith('/'):
        out_dir = args.out
    else:
        out_dir = args.out + '/'

    sample = args.s

    vcf_file = args.vcf
    bed = args.bed

    f = open(bed, 'r')
    regions = [line.strip('\n') for line in f.readlines()]
    f.close()

    variants = {}
    for region in regions:
        if region.startswith('#'):
            continue
        chrom, start, end, name = region.split('\t')
        pos, ref, alt = name.split(':')
        if chrom not in variants:
            variants[chrom] = {pos:{(ref, alt):False,}}
        elif pos not in variants[chrom]:
            variants[chrom][pos] = {(ref, alt):False,}
        else:
            variants[chrom][pos][(ref, alt)] = False

    vcf = VariantFile(vcf_file)
    false_pos = []
    false_neg = []
    true_pos = []
    for v in vcf.fetch():
        chrom = v.contig
        pos = str(v.pos)
        ref = v.alleles[0]
        alt = v.alleles[1]
        qual = v.qual
        genotype = v.samples[sample]['GT']
        if 'AD' in v.samples[sample].keys():
            allelic_depth = v.samples[sample]['AD']
        elif 'NV' in v.samples[sample].keys():
            allelic_depth = v.samples[sample]['NV']
        else:
            allelic_depth = 'N/A'
        if 'DP' in v.samples[sample].keys():
            total_depth = v.samples[sample]['DP']
        elif 'NR' in v.samples[sample].keys():
            total_depth = v.samples[sample]['NR']
        else:
            total_depth = 0
        if chrom in variants and pos in variants[chrom].keys():
            if (ref,alt) in variants[chrom][pos].keys():
                variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': qual,
                           'GT': genotype, 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                           'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}
                true_pos.append(variant)
                variants[chrom][pos][(ref, alt)] = True
            else:
                variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': qual,
                           'GT': genotype, 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                           'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}
                false_pos.append(variant)
        else:
            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': qual,
                       'GT': genotype, 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                       'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}
            false_pos.append(variant)

    for chrom in variants.keys():
        for pos in variants[chrom].keys():
            for v in variants[chrom][pos].keys():
                if not variants[chrom][pos][v]:
                    variant = {'chrom': chrom, 'pos': pos, 'ref': v[0], 'alt': v[1], 'QUAL': 0,
                               'GT': (0,0),
                               'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}
                    false_neg.append(variant)

    out = {'false_negative': {'indels':[],'no_coverage':[],'evidence_of_alt':[],'false_neg':false_neg}, 'false_positive': false_pos,
           'mismatching_genotype': [], 'matching_variants': len(true_pos),
           'num_true_negatives': 0, 'sensitivity': 0, 'MCC': 0,
           'small_panel_remainder_length': 0, 'percent_small_panel_covered': 0,
           'num_false_positive': len(false_pos), 'num_false_negative': {'indel': 0,
                                                                   'no_coverage': 0,
                                                                   'ev_of_alt': 0,
                                                                   'false_neg': 0,
                                                                   'total': len(false_neg)},
           'num_mismatching_genotype': 0}

    all_results = {sample:out}
    f = open(out_dir + sample + '_summary.json', 'w')
    j = json.dumps(all_results, indent=4)
    print >> f, j
    f.close()
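An example invocation of the script above; the script name and file paths are illustrative, but the flags match the argparse definitions in main().

# python compare_vcf_to_bed.py -vcf results.vcf -bed truth_panel.bed -s SAMPLE1 -out results/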
Example #35
def read_vcf(fh, alleles, slh=None):
    vcf_in = VariantFile(fh)
    sample = list(vcf_in.header.samples)[0]
    availcols = next(vcf_in.fetch()).format.keys()
    vcf_in.seek(0)

    # Check if sample size info is in header
    global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0]
    if alleles:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
        usecols = list(dtype_dict.keys())

        # Read in data
        if 'SS' in availcols:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.samples[sample]['SS'][0], rec.alts[0], rec.ref
            ] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.alts[0], rec.ref
            ] for rec in vcf_in.fetch()]
            if 'TotalControls' in global_fields.keys(
            ) and 'TotalCases' in global_fields.keys():
                N = pd.Series([
                    float(global_fields['TotalControls']) +
                    float(global_fields['TotalCases'])
                ] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')

        p = pd.DataFrame({
            'SNP': pd.Series([x[0] for x in o], dtype='str'),
            'Z': pd.Series([x[1] for x in o], dtype='float'),
            'N': N,
            'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o], dtype='str'),
            'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o], dtype='str')
        })
    else:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float}
        usecols = list(dtype_dict.keys())
        if 'SS' in availcols:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                rec.samples[sample]['SS'][0]
            ] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[
                rec.id,
                rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0]
            ] for rec in vcf_in.fetch()]
            if ('TotalControls' in global_fields.keys()
                    and 'TotalCases' in global_fields.keys()):
                N = pd.Series([float(global_fields['TotalControls']) +
                               float(global_fields['TotalCases'])] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')

        p = pd.DataFrame({
            'SNP': pd.Series([x[0] for x in o], dtype='str'),
            'Z': pd.Series([x[1] for x in o], dtype='float'),
            'N': N
        })

    vcf_in.close()

    if slh is not None:
        compression = get_compression(slh)
        sl = []
        opener = gzip.open if compression == "gzip" else open
        try:
            # open in text mode so gzip yields str rather than bytes
            with opener(slh, 'rt') as f:
                for line in f:
                    sl.append(line.strip())
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args))
        p = p.loc[p['SNP'].isin(sl)]

    return p
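# Hedged usage sketch for read_vcf above: the file names are placeholders, and
# get_compression is assumed to be the helper referenced in the body.
if __name__ == '__main__':
    sumstats = read_vcf('gwas_summary.vcf.gz', alleles=True, slh='hapmap3_snps.txt.gz')
    print(sumstats.head())  # expected columns: SNP, Z, N, A1, A2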
#!/usr/bin/env python3
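# Keep only the first record seen at each (chrom, pos); assumes the input VCF
# is coordinate-sorted so that duplicate positions are consecutive.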
from pysam import VariantFile
import sys

vcf_in = VariantFile(sys.argv[1], 'r')
vcf_out = VariantFile('-', 'w', header=vcf_in.header)
cp = (0, 0)
for rec in vcf_in.fetch():
    if (rec.chrom, rec.pos) != cp:
        vcf_out.write(rec)
    cp = (rec.chrom, rec.pos)
def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate)

    False Negatives are split into categories to aid final comparison:

    * Zero coverage - No reads present
    * Evidence of alternate allele - Coverage or quality too low for variant call
    * Indels - Coverage is more difficult to obtain in these cases; currently they must be investigated by hand
    * All other false negatives - In these cases there are reads present and no evidence of the alternate allele

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: List of variant dictionaries containing information on false negatives
    :rtype: List
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    num_neg = len(list(false_negs.fetch()))
    print(num_neg)

    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [], 'false_neg': []}
    v_list = []
    count = 0
    if num_neg > 0:
        print('false negatives')
        for rec in false_negs.fetch():
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[ref_sample]['GT']
            if [chrom, pos, ref, alt] in v_list:
                print("duplicate")
                continue
            v_list.append([chrom, pos, ref, alt])  # track seen variants so duplicates are detected
            count += 1
            if len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
                search = '\'' + rec.contig + '\s' + str(rec.pos - 1) + '\''
                command = 'grep ' + search + ' ' + coverage_file
                try:
                    line = subprocess.check_output(command, shell=True).decode()
                except subprocess.CalledProcessError as e:
                    print(command)
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)

                if line == '':
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'coverage':{'total':'no coverage information', 'ref':'N/A', 'alt':'N/A'}}
                    variants['no_coverage'].append(variant)
                else:
                    line = line.strip('\n')
                    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                    fields = line.split()
                    cov = fields[2]
                    ref_cov = fields[bases[rec.alleles[0]]]
                    alt_cov = fields[bases[rec.alleles[1]]]
                    variant = {'chrom':chrom, 'pos':pos, 'ref':ref, 'alt':alt, 'QUAL':qual,
                               'GT':genotype, 'coverage':{'total':cov, 'ref':ref_cov, 'alt':alt_cov}}

                    if int(cov) == 0:
                        variants['no_coverage'].append(variant)
                    elif int(alt_cov) != 0:
                        variants['evidence_of_alt'].append(variant)
                    else:
                        variants['false_neg'].append(variant)
            else:
                variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': qual,
                           'GT': genotype,
                           'coverage': {'total': 'indel: no coverage could be obtained',
                                        'ref': 'N/A', 'alt': 'N/A'}}
                variants['indels'].append(variant)

    else:
        print('no false negatives')
    print("false_negatives=" + str(count))
    return variants
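# The per-base coverage lookup above shells out to grep once per SNV. A sketch
# of an in-memory index that avoids the subprocess round trip, assuming the
# coverage file is whitespace-delimited with chrom and 0-based position in the
# first two columns and per-base A/C/G/T counts in columns 4-7, as the field
# indexing above implies:
def load_coverage(coverage_file):
    """Index a per-base coverage file by (chrom, 0-based pos) for O(1) lookups."""
    cov = {}
    with open(coverage_file) as fh:
        for line in fh:
            fields = line.split()
            cov[(fields[0], int(fields[1]))] = fields
    return cov
# usage: fields = coverage_index.get((rec.contig, rec.pos - 1))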
Example #38
    def fetch(self, chrm, pos_start, pos_end, return_samples=False):
        vcf_file = "%s.%s.vcf.gz" % (self.pop_vcf_stem, chrm)
        vcf_open = VariantFile(vcf_file, drop_samples=(not return_samples))
        return vcf_open.fetch(chrm, pos_start, pos_end)
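    # Hedged usage sketch: assumes this method lives on a class whose
    # pop_vcf_stem attribute points at per-chromosome, tabix-indexed VCFs.
    # for rec in panel.fetch("22", 16050000, 16060000):
    #     print(rec.pos, rec.ref, rec.alts)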
Example #39
# NOTE: the signature below was truncated in the listing; it is reconstructed
# from the call sites further down, and the dist default is an assumption.
def has_match(query, targets, dist=1000, ignore_strands=False, ignore_alt_pos=False):
    """
    Args:
        query (pysam.VariantRecord): query breakend
        targets (pysam.VariantFile): vcf file with target breakends
        ignore_strands, ignore_alt_pos (bool): argument to pass to record_matches

    Returns:
        bool: If there is any match, returns True. If there is no match, returns False.
    """
    within_distance_targets = targets.fetch(query.chrom, query.start - dist, query.start + dist)
    for candidate_hit in within_distance_targets:
        if record_matches(query, candidate_hit, ignore_strands=ignore_strands,
                          ignore_alt_pos=ignore_alt_pos):
            return True

    return False




## Read vcf records from input file
out_file = open(sys.argv[3], "w")
for rec in vcf_query.fetch():
    if rec.info["SVTYPE"] in ("BND", "DEL", "DUP"):
        rec_has_match = has_match(rec, vcf_target)
    elif rec.info["SVTYPE"] == "INS":
        rec_has_match = has_match(rec, vcf_target, ignore_alt_pos=True, ignore_strands=True)
    else:
        continue  # skip other SV types rather than reuse a stale or unbound result
    out_file.write(rec.id + "\t" + rec.info["SVTYPE"] + "\t" + str(rec_has_match) + "\n")

out_file.close()


Example #40
def run_process(opts, inputvcf):
    db_file = opts.database
    outputvcf = opts.output
    minhomopolyx = int(opts.minhomopolyx)
    minrepeatcount = int(opts.minrepeatcount)
    maxvaf = float(opts.maxvaf)
    indelmaxdp = int(opts.indelmaxdp)
    indelmaxao = int(opts.indelmaxao)
    indelmaxvaf = float(opts.indelmaxvaf)
    snvmaxdp = int(opts.snvmaxdp)

    # Get Lowconf Database (obj1 : standard, obj2 : range)
    lowconfobj1, lowconfobj2 = lowconfdb2obj(db_file)

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    if not ngb_functions.vcfHeaderCheck(vcf_in.header.info, "LOW_CONFIDENCE"):
        vcf_in.header.info.add("LOW_CONFIDENCE", ".", "String",
                               "Low Confidence Type")

    # Add FILTERs to Header
    filter_defs = [
        ("homopolymer", "Homopolymer Sequence Region"),
        ("repeat_sequence", "Repeat Sequence Region"),
        ("sequencing_error", "Sequencing Error Low Confidence Region"),
        ("mapping_error", "Mapping Error Low Confidence Region"),
        ("snp_candidate", "SNP Candidates"),
        ("strand_biased", "Strand Biased (Freebayes)"),
        ("lowcoverage_indel", "Low Coverage (DP,AO,VAF) Indels"),
        ("lowcoverage_snv", "Low Coverage (DP) SNVs"),
    ]
    for name, desc in filter_defs:
        if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters, name):
            vcf_in.header.filters.add(name, None, None, desc)

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-',
                          'w',
                          header=vcf_in.header)

    for record in vcf_in.fetch():
        chrom = record.chrom
        pos = record.pos
        ref = record.ref
        alts = record.alts

        vaf = float(record.samples[0]["NGB_VAF"][0])
        ao = int(record.samples[0]["NGB_AO"][0])
        dp = int(record.samples[0]["NGB_DP"])
        vtype = record.info["TYPE"][0]
        reflen = len(record.ref)
        altlen = len(record.alts[0])
        """
        if "ngb_cv_rcv_sig_description" in record.info:
            tmpcv = record.info["ngb_cv_rcv_sig_description"][0]
            cv = tmpcv.split("|")
        else:
            cv = list()
        """

        seqerror_info_list = list()
        strandbiased_info_list = list()
        homopolymer_info_list = list()
        repeat_info_list = list()
        saf_format_list = list()
        sar_format_list = list()
        lowcov_indel_list = list()
        lowcov_snv_list = list()
        for i, alt in enumerate(alts):
            # Get Lowconf info from the exact-match database
            id1 = chrom + '-' + str(pos) + '-' + ref + '-' + alt
            lowconf = lowconfobj1.get(id1, "")

            # Get Lowconf Info from range database
            for lowconfdata in lowconfobj2:
                if chrom == lowconfdata["chrom"] and pos in range(
                        int(lowconfdata["start"]),
                        int(lowconfdata["end"]) + 1):
                    lowconf = lowconfdata["type"]
            seqerror_info_list.append(lowconf)

            # Get Strand Biased Information
            strandbiased = ""
            # (Freebayes)
            if "SAF" in record.info:
                if record.info["SAF"][i] == 0 or record.info["SAR"][
                        i] == 0 or record.info["RPR"][i] < 1 or record.info[
                            "RPL"][i] < 1:
                    strandbiased = "strand_biased"
                else:
                    strandbiased = ""
            """
            # Mutect
            elif "F1R2" in record.format:
                alt_f1r2 = record.samples[0]['F1R2'][i+1]
                alt_f2r1 = record.samples[0]['F2R1'][i+1]
                if alt_f1r2 == 0 or alt_f2r1 == 0:
                    strandbiased = "strand_biased"
                else:
                    strandbiased = ""
                saf_format_list.append(alt_f1r2)
                sar_format_list.append(alt_f2r1)
            """
            strandbiased_info_list.append(strandbiased)

            # Homopolymer & Repeat Sequence Filtering (VAF, CV)
            homopolymerinfo = ""
            repeatinfo = ""
            #if vaf < maxvaf and ("Pathogenic" not in cv) and ("Likely_pathogenic" not in cv):
            if vaf < maxvaf:
                # Get Homopolymer Info
                if "HOMOPOLYX" in record.info:
                    if int(record.info["HOMOPOLYX"][0]) >= minhomopolyx:
                        homopolymerinfo = "homopolymer"
                    else:
                        homopolymerinfo = ""
                # Get Repeat Info
                if "REPEAT_COUNT" in record.info:
                    if int(record.info["REPEAT_COUNT"][0]) >= minrepeatcount:
                        repeatinfo = "repeat_sequence"
                    else:
                        repeatinfo = ""
            homopolymer_info_list.append(homopolymerinfo)
            repeat_info_list.append(repeatinfo)

            # Indel Filtering
            lowcovindelinfo = ""
            if (altlen != reflen) and (vtype == "ins" or vtype == "del"
                                       or vtype == "complex"):
                if vaf < indelmaxvaf or ao < indelmaxao or dp < indelmaxdp:
                    lowcovindelinfo = "lowcoverage_indel"
                else:
                    lowcovindelinfo = ""
            else:
                lowcovindelinfo = ""
            lowcov_indel_list.append(lowcovindelinfo)

            # SNV Filtering
            lowcovsnvinfo = ""
            if (altlen == reflen) and (vtype == "snp" or vtype == "complex"):
                if dp < snvmaxdp:
                    lowcovsnvinfo = "lowcoverage_snv"
                else:
                    lowcovsnvinfo = ""
            else:
                lowcovsnvinfo = ""
            lowcov_snv_list.append(lowcovsnvinfo)

        lowconf_info_list = list()
        for parts in zip(seqerror_info_list, strandbiased_info_list,
                         homopolymer_info_list, repeat_info_list,
                         lowcov_indel_list, lowcov_snv_list):
            itemn = "|".join(p for p in parts if p != '')
            if itemn != '':
                lowconf_info_list.append(itemn)

        if lowconf_info_list != []:
            info_value = ','.join(str(e) for e in lowconf_info_list)
            record.info['LOW_CONFIDENCE'] = info_value

        # Add FILTER
        lowconf_infolist = list()
        if 'LOW_CONFIDENCE' in record.info:
            for lowconf_info in record.info['LOW_CONFIDENCE']:
                lowconf_infolist += lowconf_info.split("|")
        lowconf_infolist = list(set(lowconf_infolist))
        for lowconf_info in lowconf_infolist:
            record.filter.add(lowconf_info)

        # PASS FILTER
        if list(record.filter) == []:
            record.filter.add("PASS")

        # Remove unwanted filters
        for rf in remove_filter_list:
            if rf in list(record.filter):
                del record.filter[rf]

        # Write VCF
        vcf_out.write(record)
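# Hedged invocation sketch: the attribute names mirror the opts fields read at
# the top of run_process; all paths and thresholds are placeholders, and the
# lowconfdb2obj/ngb_functions helpers must come from the surrounding package.
if __name__ == '__main__':
    from types import SimpleNamespace
    opts = SimpleNamespace(database='lowconf_db.txt', output='filtered.vcf',
                           minhomopolyx=6, minrepeatcount=4, maxvaf=0.1,
                           indelmaxdp=100, indelmaxao=10, indelmaxvaf=0.05,
                           snvmaxdp=20)
    run_process(opts, 'input.vcf.gz')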
Example #41
    if not tb:
        return novel
    try:
        records = list(tb.query(chr, pos - 1, pos))
        if not records:
            return novel
        return records[0][2]
    except tabix.TabixError:
        return novel


reader = Vcf(infile)
writer.meta.add('Variant')
writer.meta.add(*tuple(reader.header.samples))
writer.writeHead()
for r in reader.fetch():
    alts = r.alts
    if not alts: continue
    if bialt and len(alts) != 1: continue
    if bialt and len(r.ref) != 1: continue
    alts = list(alts)
    refalts = [r.ref] + alts
    name = r.id if useid and r.id and r.id != '.' else getRsName(
        r.chrom, r.pos)
    for alt in alts:
        record = TsvRecord()
        record.Variant = '{chr}_{pos}_{name}_{ref}_{alt}'.format(chr=r.chrom,
                                                                 pos=r.pos,
                                                                 name=name,
                                                                 ref=r.ref,
                                                                 alt=alt)
Example #42
import pysam  # needed for pysam.asGTF() below
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = [a for a in annotations.fetch(*region1, parser=pysam.asGTF())]
region1_variants = [a for a in variants.fetch(*region1)]

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = [a for a in annotations.fetch(*region2, parser=pysam.asGTF())]
region2_variants = [a for a in variants.fetch(*region2)]
Example #43
from pysam import VariantFile as Vcf
from pyppl import Box
from bioprocs.utils import alwaysList

infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
rmfilter = {{args.rmfilter | repr}}
if rmfilter:
    rmfilter = alwaysList(rmfilter)

invcf = Vcf(infile)
outvcf = open(outfile, 'w')
outvcf.write(str(invcf.header))
for rec in invcf.fetch():
    parts = str(rec).split('\t')
    filters = parts[6].split(';')
    if not rmfilter:
        filters = 'PASS'
    else:
        filters = ';'.join(f for f in filters if f not in rmfilter)
        filters = filters or 'PASS'
    parts[6] = filters
    outvcf.write('\t'.join(parts))
outvcf.close()
Example #44
    inF1 = VariantFile(args['<input1>'], 'r')
    inF2 = VariantFile(args['<input2>'], 'r')
    record_db = Record(inF2)  # renamed to avoid shadowing the Record class
    # check samples in the two input files: same samples, in the same order.
    if len(inF1.header.samples) != len(inF2.header.samples):
        sys.stderr.write('ERROR: different number of samples in two input files.\n')
        sys.exit(-1)
    else:
        for x, y in zip( inF1.header.samples, inF2.header.samples):
            if x != y:
                sys.stderr.write('ERROR: two input files should have the same samples, and ordered in same order.\n')
                sys.exit(-1)

    #output vcf header
    sys.stdout.write('%s'%(str(inF1.header)))
    for line in inF1.fetch():
        if len(line.alleles) != 2:
            sys.stderr.write('ERROR: please decompose the input vcf; only one alt allele is permitted per line, error record:\n%s\n'
                             % (line))
            sys.exit(-1)
        ss = str(line).strip().split()
        #print(ss[0])
        key = ss[0] + ss[1] + ss[3] + ss[4]
        line2 = record_db.getRecord(key, ss[0], int(ss[1]))
        if line2:
            out = ss[:vcfMetaCols]
            ss2 = str(line2).strip().split()
            for x, y in zip(ss[vcfMetaCols:], ss2[vcfMetaCols:]):
                if x[0] == '.' or y[0] == '.':
                    out.append('.')
                elif x[0] == y[0] and x[2] == y[2]:
Example #45
#!/group/ctan/anaconda3/envs/snakemake/bin/python

import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = [
    "AC", "BD", "Commander", "EC2.1", "EC2.2", "EC7.1", "EC7.2", "Fleet",
    "Hindmarsh", "La_Trobe", "Scope", "Vlamingh", "W1", "WI4304", "X1",
    "barke", "bowman", "haruna_Nijo", "igri", "spontaneum_B1k-04-12"
]
grp = ["bam/YSX-W_HJMFHALXX_L5.rmdup.bam", "bam/TBT-M_HJMFHALXX_L4.rmdup.bam"]

ibcf = VariantFile(sys.argv[1])
#obcf = VariantFile(sys.argv[2],'w',header=ibcf.header)
ofile = open(sys.argv[2], "w")
hd = ["#chr", "pos", "len", "ref", "alt", "gt_count"]
for one in grp:
    hd = hd + [one, "Reads"]
ofile.write("\t".join(hd) + "\n")
for one in ibcf.fetch():
    record = samvcf(one)
    ext = record.extract(grp)  # call extract once and reuse the result
    if ext:
        ofile.write("\t".join(record.opt + ext) + "\n")
ofile.close()
Example #46
        contigs = set(vcf_infile.header.contigs)
        contigs = contigs.difference(set(exclude_chr))

    chrom_list = []
    total_af = []
    total_ac = []
    total_sites = 0
    total_callable = 0

    with open(args.outfile, 'w') as outfile:
        print('Population', 'Chromosome', 'Chromosome_length', 'Sites', 'S', 'thetaW', 'pi', 'tajd', sep='\t',
              file=outfile)

        for c in contigs:
            af = []
            ac_list = []
            for site in vcf_infile.fetch(c):
                ac = site.info['AC'][0]
                ac_list.append(ac)
                af.append(ac / float(n))

            total_ac += ac_list
            total_af += af

            sites = calc_stats_chr(args.pop_id, af, c, args.callable, outfile)
            total_sites += sites[0]
            total_callable += sites[1]

        if args.sfs:
            for c in total_ac:
                sfs[min(n - c, c)] += 1
Example #47
def phase_structural_variants(sv_vcf, long_reads_bam, workdir):
    sv_vcf_basename = os.path.basename(sv_vcf)
    if sv_vcf_basename.endswith('.vcf'):
        offset = -4
    elif sv_vcf_basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return

    sv_filtered_phased_vcf = workdir + '/' + sv_vcf_basename[:offset] + '.filtered.phased.vcf'
    vcf_in = VariantFile(sv_vcf)
    vcf_out = VariantFile(sv_filtered_phased_vcf, 'w', header=vcf_in.header)
    bam_in = AlignmentFile(long_reads_bam)
    phasing_stat_f = open(workdir + '/' + 'phasing_stat.txt', 'w')

    
    chr_to_include = [str(i) for i in range(1, 23)] + ['X', 'Y']
    # For reference builds with 'chr'-prefixed names, use instead:
    # chr_to_include = ['chr' + c for c in chr_to_include]

    phasing_stat = {'INS' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DEL' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'INV' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'BND' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DUP:TANDEM' : {'Total':0, 'Phased HOM':0, 'Phased HET':0},
                    'DUP_INT' : {'Total':0, 'Phased HOM':0, 'Phased HET':0}}

    prev_chrom = ''
    for rec in vcf_in.fetch():
        sv_chrom = rec.chrom
        if sv_chrom in chr_to_include:
            if sv_chrom != prev_chrom:
                logging.info('Processing {0}'.format(sv_chrom))
            prev_chrom = sv_chrom
            if rec.filter.keys()[0] == 'PASS':
                sv_pos = rec.pos
                sv_read_ids = rec.info['READS']
                sv_support = rec.info['SUPPORT']
                sv_type = rec.info['SVTYPE']

                phasing_stat[sv_type]['Total'] += 1

                begin_pos = sv_pos - 1
                if 'END' in rec.info:
                    end_pos = rec.info['END']
                else:
                    end_pos = sv_pos

                hap1_counter = 0
                hap2_counter = 0
                try:
                    read_iterator = bam_in.fetch(sv_chrom, begin_pos-2000, end_pos+2000)
                except ValueError:
                    read_iterator = bam_in.fetch(sv_chrom, begin_pos, end_pos)
                for read in read_iterator:
                    if read.query_name in sv_read_ids:
                        if read.has_tag('HP'):
                            read_hp = read.get_tag('HP')
                            hap1_counter += read_hp == 1
                            hap2_counter += read_hp == 2

                threshold_read_count = max(int(0.85 * sv_support), 5)
                threshold_het = 0.8
                threshold_hom = 0.2

                if (hap1_counter + hap2_counter) >= threshold_read_count:
                    allele_frequency_hap1 = hap1_counter / float(hap1_counter + hap2_counter)
                    allele_frequency_hap2 = hap2_counter / float(hap1_counter + hap2_counter)

                    if allele_frequency_hap1 >= threshold_hom and allele_frequency_hap1 < threshold_het:
                        rec.samples[0]['GT'] = (1, 1)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HOM'] += 1
                    elif allele_frequency_hap1 >= threshold_het:
                        rec.samples[0]['GT'] = (1, 0)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HET'] += 1
                    elif allele_frequency_hap2 >= threshold_het:
                        rec.samples[0]['GT'] = (0, 1)
                        rec.samples[0].phased = True
                        phasing_stat[sv_type]['Phased HET'] += 1

                    vcf_out.write(rec)
    
    phasing_stat_f.write('\tTotal\tPhased HOM\tPhased HET\n')
    for sv in phasing_stat:
        phasing_stat_f.write('{0}:\t{1}\t{2}\t{3}\n'.format(sv, phasing_stat[sv]['Total'], phasing_stat[sv]['Phased HOM'], phasing_stat[sv]['Phased HET']))
    phasing_stat_f.close()
    vcf_out.close()
    bam_in.close()
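# Hedged invocation sketch: paths are placeholders, and the BAM is assumed to
# carry HP haplotype tags from a prior haplotagging step (e.g. whatshap haplotag).
# phase_structural_variants('sv_calls.vcf.gz', 'haplotagged_reads.bam', 'workdir')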
def check_genotype(folder, sample, ref_sample, coverage_file):
    """
    Compares the genotype for all shared variants

    The number of matching variants are counted and those that do not match are annotated with basic variant info plus
    quality, genotype, coverage (total, ref base and alt base if appropriate)

    :param folder: Location of results from the NGS analysis pipeline
    :type folder: String
    :param sample: Sample number (used in vcf file)
    :type sample: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing coverage information for each position in the panel
    :type coverage_file: String
    :return: Number of matching variants
    :rtype: Int
    :return: List of variant dictionaries with detailed information for mismatching genotypes
    :rtype: List
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')

    variants = []

    vars_giab = {}
    for rec in shared_giab.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if chrom not in vars_giab:
            vars_giab[chrom] = {}
        if pos not in vars_giab[chrom]:
            vars_giab[chrom][pos] = {}
        if alleles not in vars_giab[chrom][pos]:
            vars_giab[chrom][pos][alleles] = rec.samples[ref_sample]['GT']

    matching = 0
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        if 'AD' in rec.samples[sample].keys():
            allelic_depth = rec.samples[sample]['AD']
        else:
            allelic_depth = 'N/A'
        if 'DP' in rec.samples[sample].keys():
            total_depth = rec.samples[sample]['DP']
        elif 'NR' in rec.samples[sample].keys():
            total_depth = rec.samples[sample]['NR']
        else:
            total_depth = 0
        giab_genotype = vars_giab[chrom][pos][alleles]
        if rec.samples[sample]['GT'] == giab_genotype:
            matching += 1
        elif ((rec.samples[sample]['GT'][0] is None or rec.samples[sample]['GT'][0] == 1)
              and rec.samples[sample]['GT'][0] == giab_genotype[1]
              and rec.samples[sample]['GT'][1] == giab_genotype[0]):
            matching += 1
        elif rec.samples[sample]['GT'][0] == 0 and rec.samples[sample]['GT'][1] == 1 and giab_genotype[0] == 1 and giab_genotype[1] == 0:
            matching += 1
        elif rec.samples[sample]['GT'][0] == 1 and rec.samples[sample]['GT'][1] == 0 and giab_genotype[0] == 0 and giab_genotype[1] == 1:
            matching += 1
        else:
            if len(rec.alleles[0]) == 1 and len(rec.alleles[1]) == 1:
                search = '\'' + rec.contig + '\s' + str(rec.pos - 1) + '\''
                command = 'grep ' + search + ' ' + coverage_file
                try:
                    line = subprocess.check_output(command, shell=True).decode()
                except subprocess.CalledProcessError as e:
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                if line == '':
                    variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                               'GT': {"sample": rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                               'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                               'coverage':{'total':'no coverage information', 'ref':'N/A', 'alt':'N/A'}}
                else:
                    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                    fields = line.split()
                    cov = fields[2]
                    ref_cov = fields[bases[rec.alleles[0]]]
                    alt_cov = fields[bases[rec.alleles[1]]]
                    variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                               'GT': {"sample": rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                               'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                               'coverage': {'total': cov, 'ref': ref_cov, 'alt': alt_cov}}
            else:
                variant = {'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1], 'QUAL': rec.qual,
                           'GT': {"sample": rec.samples[sample]['GT'], 'GIAB': giab_genotype},
                           'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                           'coverage': {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}}
            variants.append(variant)
    print(str(matching) + ' matching variants')

    return matching, variants
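# Sketch of an order-insensitive genotype comparison that the chain of swap
# checks above approximates; a simplification, not a drop-in replacement for
# the None-handling branch.
def genotypes_match(gt_a, gt_b):
    """Treat unphased diploid GT tuples as equal regardless of allele order."""
    order = lambda allele: -1 if allele is None else allele
    return sorted(gt_a, key=order) == sorted(gt_b, key=order)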
Example #49
worksheetIntron.write('A11', 'Regions: ')
row = 11
col = 0
for gene in intronDict:
    worksheetIntron.write_row('B'+str(row), [gene]+intronDict[gene])
    row += 1

row += 1
worksheetIntron.write('A'+str(row), 'Coverage below '+str(medCov)+'x', italicFormat)
row += 1
tableheading = ['RunID', 'DNAnr', 'Gene', 'Chr', 'Pos', 'Ref', 'Alt', 'AF', 'DP',
                'Transcript', 'Mutation cds', 'ENSP', 'Consequence', 'Max popAF', 'Max Pop', 'Callers']
worksheetIntron.write_row('A'+str(row), tableheading, tableHeadFormat)  # 1 index

for snv in vcf_snv.fetch():
    if "PopAF" not in snv.filter.keys():
        if snv.contig in introns:
            for pair in introns[snv.contig]:
                if snv.pos >= pair[0] and snv.pos <= pair[1] and snv.info["AF"][0] >= 0.2:
                    # import pdb; pdb.set_trace()
                    csq = snv.info["CSQ"][0]
                    gene = csq.split("|")[3]
                    transcript = csq.split("|")[10].split(":")[0]
                    if len(csq.split("|")[10].split(":")) > 1:
                        codingName = csq.split("|")[10].split(":")[1]
                    else:
                        codingName = ''
                    ensp = csq.split("|")[11]
                    consequence = csq.split("|")[1]
                    popFreqsPop = ['AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomAD_AF', 'gnomAD_AFR_AF',