def annotate_false_negs(folder):
    """
    Get information for any false negative results - returns basic variant info
    plus quality and genotype.

    :param folder: Folder containing output from bcftools isec
    :return: array of variant dictionaries (keys chrom, pos, ref, alt, QUAL, GT)
             containing information on false negatives
    """
    variants = []
    # Read the file once: the count is taken from the collected list instead of
    # loading every record into memory a first time only to count them.
    with VariantFile(folder + '/0000.vcf') as false_negs:
        for rec in false_negs.fetch():
            variants.append({
                'chrom': rec.contig,
                'pos': int(rec.pos),
                'ref': rec.alleles[0],
                'alt': rec.alleles[1],
                'QUAL': rec.qual,
                # The genotype is read from the sample column named 'INTEGRATION'.
                'GT': rec.samples['INTEGRATION']['GT'],
            })

    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false negatives')
    else:
        print('no false negatives')
    return variants
def gen_report(vcf, sample, ref_flag):
    """
    Write a tab-delimited report for every record of a VEP-annotated VCF.

    The positions of the wanted VEP fields are looked up in the Description of
    the ANN INFO header; each record is then handed to output_highest_impact
    together with those positions and the open report file.

    :param vcf: path of the annotated VCF to read
    :param sample: prefix of the report file (<sample>.germline.vep91.xls)
    :param ref_flag: 'n' to pass the flag through unchanged, otherwise the value
                     is replaced by the result of create_index(ref_flag)
    :return: 0
    """
    # Field name -> index inside one '|'-separated ANN entry. CADD_PHRED collects
    # every matching index because the annotation is run twice (snv and indel).
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'HGVSg': 0, 'Protein_position': 0,
               'Amino_acids': 0, 'Codons': 0, 'BIOTYPE': 0, 'SIFT': 0, 'Existing_variation': 0, 'VARIANT_CLASS': 0,
               'gnomAD_AF': 0, 'CLIN_SIG': 0, 'CADD_PHRED': []}
    # Context managers close both handles even when a record fails half-way.
    with VariantFile(vcf) as vcf_in, open(sample + '.germline.vep91.xls', 'w') as out:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                if field == 'CADD_PHRED':
                    desired[field].append(i)
                else:
                    desired[field] = i
        out.write('CHROM\tPOS\tREF\tAllele\tTotal Allele Count\tTotal Position Coverage\tGene\tHGVSg\tTranscript_id'
                  '\tEffect\tIMPACT\tBIOTYPE\tCodons\tAmino_acids\tExisting_variation\tVARIANT_CLASS\tSIFT\tgnomAD_AF'
                  '\tCLIN_SIG\tCADD_PHRED\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)
        for record in vcf_in.fetch():
            (chrom, pos, ref, alt, alt_ct, tot_ct) = (record.contig, str(record.pos), record.ref, record.alts[0],
                                                      str(record.info['TR']), str(record.info['TC']))
            ann_list = [ann.split('|') for ann in record.info['ANN']]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, tot_ct, ann_list, desired, out, ref_flag)
    return 0
def gen_report(vcf):
    """
    Write the prioritized-impact report for a VEP-annotated indel VCF.

    The report is written to <prefix>.indels.vep.prioritized_impact.report.xls
    and progress is logged to LOGS/<prefix>.indels.vep_priority.report.log,
    where <prefix> is the VCF file name up to its first '.'.

    :param vcf: path of the annotated VCF to read
    :return: 0
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    # Field name -> index inside one '|'-separated ANN entry.
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}
    # Context managers close both handles even when a record fails half-way.
    with VariantFile(vcf) as vcf_in, open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w') as out:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        # enumerate replaces the Python 2-only xrange index loop.
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
                  '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
        for record in vcf_in.fetch():
            (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref,
                                                               record.alts[0], str(record.info['MINCOV']),
                                                               str(record.info['ALTCOV']),
                                                               str(record.info['COVRATIO']))
            # Here ANN is read as one comma-joined string, unlike a tuple of entries.
            ann_list = [ann.split('|') for ann in record.info['ANN'].split(',')]
            output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def annotate_false_pos(folder, coverage_file, sample):
    """
    Get information for any false positive results - returns basic variant info
    plus quality, genotype, VCF depth and coverage (total, ref base and alt base
    if appropriate).

    :param folder: Folder containing output from bcftools isec
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    # Column of each base's count in a whitespace-separated coverage line;
    # column 2 holds the total.
    bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
    variants = []
    with VariantFile(folder + '/0001.vcf') as false_pos:
        for rec in false_pos.fetch():
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            call = rec.samples[sample]
            genotype = call['GT']
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']

            if len(ref) == 1 and len(alt) == 1:
                # Anchor on both sides so that e.g. position 100 cannot match
                # 1000, and contig '1' cannot match '11'.
                pattern = '^' + rec.contig + r'\s' + str(rec.pos - 1) + r'\s'
                # Argument list instead of a shell string: nothing from the VCF
                # or the file name is interpreted by a shell.
                proc = subprocess.Popen(['grep', '-e', pattern, coverage_file],
                                        stdout=subprocess.PIPE, universal_newlines=True)
                line = proc.communicate()[0]
                # grep exits with 1 when nothing matched; only >1 is a real error.
                if proc.returncode > 1:
                    print('Error executing command: ' + str(proc.returncode))
                    raise SystemExit(1)
                if line == '':
                    coverage = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                else:
                    fields = line.split()
                    coverage = {'total': fields[2], 'ref': fields[bases[ref]], 'alt': fields[bases[alt]]}
            else:
                coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

            variants.append({'chrom': rec.contig, 'pos': int(rec.pos), 'ref': ref, 'alt': alt,
                             'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                             'coverage': coverage})

    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false positives')
    else:
        print('no false positives')
    return variants
def check_genotype(folder, sample):
    """
    Compares the genotype for all shared variants

    :param folder: location of results from the NGS analysis pipeline
    :param sample: sample number (used in vcf file)
    :return: dictionary of number of matching variants and detailed information
             for any with mismatching genotypes
    """
    # chrom -> pos -> alleles -> genotype of the 'INTEGRATION' sample.
    # The first record seen for a given allele tuple wins.
    vars_giab = {}
    with VariantFile(folder + '/0002.vcf') as shared_giab:
        for rec in shared_giab.fetch():
            by_alleles = vars_giab.setdefault(rec.contig, {}).setdefault(rec.pos, {})
            if rec.alleles not in by_alleles:
                by_alleles[rec.alleles] = rec.samples['INTEGRATION']['GT']

    variants = []
    matching = 0
    with VariantFile(folder + '/0003.vcf') as shared_patient:
        for rec in shared_patient.fetch():
            chrom = rec.contig
            pos = rec.pos
            alleles = rec.alleles
            call = rec.samples[sample]
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']
            giab_genotype = vars_giab[chrom][pos][alleles]
            genotype = call['GT']
            # Identical genotypes match; the remaining tests accept the same
            # alleles written in the opposite order.
            if (genotype == giab_genotype
                    or ((genotype[0] is None or genotype[0] == 1)
                        and genotype[0] == giab_genotype[1] and genotype[1] == giab_genotype[0])
                    or (genotype[0] == 0 and genotype[1] == 1
                        and giab_genotype[0] == 1 and giab_genotype[1] == 0)
                    or (genotype[0] == 1 and genotype[1] == 0
                        and giab_genotype[0] == 0 and giab_genotype[1] == 1)):
                matching += 1
            else:
                variants.append({'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1],
                                 'QUAL': rec.qual,
                                 'GT': {sample: genotype, 'GIAB': giab_genotype},
                                 'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})

    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': variants}
    print(results)
    return results
def subset_by_callers(in_file, callers):
    """Write the records supported by at least one of the given callers.

    A record is kept when the '-'-separated caller names in its INFO 'set'
    field share at least one name with ``callers``. The subset is only
    generated when neither the plain nor the ``.gz`` output exists yet.

    :param in_file: path of the input VCF (``.vcf`` or ``.vcf.gz``)
    :param callers: iterable of caller names to keep
    :return: result of ``vcfutils.bgzip_and_index`` for the subset file
    """
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(".gz", ""), "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        count = 0
        # Both handles are closed here so the output is fully flushed to disk
        # before it is compressed and indexed below.
        with VariantFile(in_file) as reader, VariantFile(out_file, "w", header=reader.header) as writer:
            for rec in reader:
                if want_callers.intersection(rec.info["set"].split("-")):
                    count += 1
                    writer.write(rec)
        # Same text as the former ``print callers, count`` on Python 2 and 3.
        print("{} {}".format(callers, count))
    return vcfutils.bgzip_and_index(out_file, {})
def gen_report(vcf, out, c, ref_flag):
    """
    Write the prioritized-impact report for a VEP-annotated substitution VCF.

    The report is written to <prefix>.subsitutions.vep.prioritized_impact.report.xls
    and progress is logged to LOGS/<prefix>.subsitutions.vep.priority_report.log,
    where <prefix> is the VCF file name up to its first '.'.

    :param vcf: path of the annotated VCF to read
    :param out: value passed to create_mutect_ind to build the mutect index
    :param c: 'n' for no target filtering, otherwise passed to create_target;
              records that mark_target flags as 'OFF' are then skipped
    :param ref_flag: 'n' to pass the flag through unchanged, otherwise the value
                     is replaced by the result of create_index(ref_flag)
    :return: 0
    """
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.subsitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    # Field name -> index inside one '|'-separated ANN entry.
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}
    report_path = parts[0] + '.subsitutions.vep.prioritized_impact.report.xls'
    # Context managers close both handles even when a record fails half-way.
    # The report handle gets its own name rather than rebinding the ``out`` parameter.
    with VariantFile(vcf) as vcf_in, open(report_path, 'w') as report:
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i
        report.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                     'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
                     'codon_change\tamino_acid_change\ton/off-target\n')
        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)
        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
            ann_list = [ann.split('|') for ann in record.info['ANN']]
            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, report, ref_flag)
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def get_variants(filename):
    """
    Parse the sample VCF file.

    Collects the SNPs found in the representative genes using the 'AD' tag, a
    list holding the number of reads mapped for the reference and for each
    alternative allele.

    Args:
        filename [string] = sample filename

    Returns:
        var [dict] = SNP information per representative gene
                     key   : representative gene name (record chrom)
                     value : variant [dict] keyed by SNP position, holding a
                             list of (allele, number of aligned reads) for every
                             allele whose read count is not 0
    """
    var = {}
    name = None
    variant = None
    with VariantFile(filename) as vcf:
        for rec in vcf.fetch():
            if variant is None:
                # first record: rec.chrom is the representative gene name
                name = rec.chrom
                variant = defaultdict(list)
            elif rec.chrom != name:
                # records of another gene start: store the finished gene
                var[name] = variant
                name = rec.chrom
                variant = defaultdict(list)
            for _, obj in rec.samples.items():
                if 'AD' in obj:
                    # AD entries are in allele order, so the index picks the allele
                    for i, nb in enumerate(obj['AD']):
                        if nb != 0:
                            variant[rec.pos].append((rec.alleles[i], nb))
    # The gene being collected when the file ends must be stored as well;
    # without this the last (or only) gene is missing from the result.
    if variant is not None:
        var[name] = variant
    return var
def parse_vcf(filename):
    """
    Parse a database VCF file obtained by variant calling on a multiple
    alignment file, and build a matrix holding the allele of every clustered
    gene at each SNP position.

    Args:
        filename [string] = VCF filename

    Returns:
        name [string] = representative gene name (chrom of the last record,
                        or 0 when the file holds no record)
        [index, matrix] [list] =
            index [dict]  : key = SNP position, value = index of that SNP in
                            the lists of matrix
            matrix [dict] : key = clustered gene (sample name), value = list
                            of alleles, '' where the gene has no called allele
    """
    index = {}
    matrix = defaultdict(list)
    name = 0
    with VariantFile(filename) as vcf:
        for i, rec in enumerate(vcf.fetch()):
            name = rec.chrom  # representative gene name
            index[rec.pos] = i
            # gene is the clustered gene, obj holds its call at this SNP
            for gene, obj in rec.samples.items():
                indices = obj.allele_indices
                snp = indices[0] if indices else None
                # A missing allele may be reported as None as well as -1;
                # both are stored as '' instead of indexing the allele tuple.
                if snp is None or snp == -1:
                    matrix[gene].append('')
                else:
                    matrix[gene].append(rec.alleles[snp])
    return name, [index, matrix]
def variants_missing_vcf(self,vcf_file):
    """Return the catalog rows whose variant was not found in the VCF.

    vcf_file is either a VCF path used for every chromosome, or (when its name
    contains '.json') a JSON file mapping chromosome to VCF path. For every
    catalog chromosome the VCF is queried at each catalog position and the
    EPACTS-style IDs of the records found are collected.

    Returns the rows of self.data whose self.col_epacts value was not seen.
    """
    cat_chroms = set(self.data[self.col_chr].unique())
    cat_variants = set(self.data[self.col_epacts].unique())
    vcf_variants = set()

    is_json = '.json' in vcf_file
    vcf_dict = None  # chromosome -> VCF mapping, loaded once on first use

    for cat_chrom in cat_chroms:
        # sys.stderr.write works on Python 2 and 3, unlike ``print >>``.
        sys.stderr.write("Checking chromosome %s...\n" % str(cat_chrom))

        if is_json:
            if vcf_dict is None:
                import json
                with open(vcf_file) as jsin:
                    vcf_dict = json.load(jsin)

            vcf = vcf_dict.get(cat_chrom)
            if vcf is None:
                warning("GWAS catalog has variants on chromosome %s, but could not find this chromosome in your VCF (or JSON) file: %s" % (cat_chrom,vcf_file))
                continue
        else:
            vcf = vcf_file

        # Subset catalog to chromosome
        df_cat_for_chrom = self.data.query("{} == '{}'".format(self.col_chr,cat_chrom))

        # Catalog has repeated rows for variants depending on the number of traits * citations
        # But we just need each variant once
        df_cat_for_chrom = df_cat_for_chrom.drop_duplicates(self.col_epacts)

        vcf_pysam = VariantFile(vcf)
        try:
            # Loop over subsetted catalog, check if variant is in VCF
            for idx, row in df_cat_for_chrom.iterrows():
                chrom, pos = row[self.col_chr], row[self.col_pos]
                for rec in vcf_pysam.fetch(chrom,pos,pos):
                    # NOTE(review): pysam records expose ``alts`` (a tuple);
                    # confirm that ``rec.alt`` exists for the reader used here.
                    epacts = "{}:{}_{}/{}".format(rec.chrom,rec.pos,rec.ref,rec.alt)
                    vcf_variants.add(epacts)
        finally:
            # one handle is opened per chromosome; do not let them pile up
            vcf_pysam.close()

    missing_variants = cat_variants.difference(vcf_variants)
    missing_rows = self.data[self.data[self.col_epacts].isin(missing_variants)]

    return missing_rows
def annotate_false_pos(folder, sample):
    """
    Get information for any false positive results - returns basic variant info
    plus quality, genotype and the depth values stored in the VCF.

    :param folder: Folder containing output from bcftools isec
    :param sample: container ID used in vcf file
    :return: array of variant dictionaries containing information on false positives
    """
    variants = []
    # Read the file once: the count is taken from the collected list instead of
    # loading every record into memory a first time only to count them.
    with VariantFile(folder + '/0001.vcf') as false_pos:
        for rec in false_pos.fetch():
            call = rec.samples[sample]
            genotype = call['GT']
            if 'AD' in call.keys():
                allelic_depth = call['AD']
            else:
                allelic_depth = 'N/A'
            total_depth = call['DP']
            variants.append({'chrom': rec.contig, 'pos': int(rec.pos),
                             'ref': rec.alleles[0], 'alt': rec.alleles[1],
                             'QUAL': rec.qual, 'GT': genotype,
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth}})

    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(len(variants))
    if variants:
        print('false positives')
    else:
        print('no false positives')
    return variants
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variant in aim of
    reduce ambiguity of confident variants in the merged vcf.

    The records are written uncompressed to ``out_vcf`` without its ``.gz``
    suffix and that file is then compressed with ``bgzip -f``.
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)
    passed_key = "PASSED_{}".format(caller)
    description = "this variants passed which caller(s)"

    # Remove only a trailing ".gz". str.strip(".gz") would strip any of the
    # characters '.', 'g', 'z' from BOTH ends ("germline.vcf.gz" -> "ermline.vcf").
    raw_out = out_vcf[:-3] if out_vcf.endswith(".gz") else out_vcf

    i_vcf = VariantFile(in_vcf, "rb")
    try:
        new_header = i_vcf.header.copy()
        try:
            new_header.info.add(passed_key, ".", "Flag", description)
            i_vcf.header.info.add(passed_key, ".", "Flag", description)
        except ValueError:
            # Deliberate best effort: a ValueError from adding the field
            # (presumably already declared) is ignored.
            pass

        o_vcf = VariantFile(raw_out, "w", header=new_header)
        try:
            for record in i_vcf:
                new_rec = record.copy()
                filters = list(record.filter)
                if filters and filters[0] == "PASS":
                    new_rec.info[passed_key] = 1
                o_vcf.write(new_rec)
        finally:
            # must be closed (flushed) before bgzip reads the file
            o_vcf.close()
    finally:
        i_vcf.close()

    subprocess.check_call(["bgzip", "-f", raw_out])
def main():
    """Print the VCF named by the first command-line argument without samples.

    The header is printed with a CAF INFO definition added and FORMAT lines
    removed; every record gets a CAF value built from its EUR_AF frequencies
    (reference frequency first, each rounded to three decimals).
    """
    source = VariantFile(sys.argv[1], 'r', drop_samples=True)
    source.header.add_line(
        '##INFO=<ID=CAF,Number=.,Type=String,Description="An ordered, comma delimited list of allele frequencies, starting with the reference allele followed by alternate alleles as ordered in the ALT column.">'
    )
    for header_record in source.header.records:
        if header_record.key == "FORMAT":
            header_record.remove()

    # The last two lines of the rendered header are dropped and a sample-free
    # column line is printed in their place.
    header_lines = str(source.header).split('\n')
    print('\n'.join(header_lines[:-2]))
    print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO")

    for variant in source:
        alt_freqs = variant.info.get('EUR_AF')
        ref_freq = round(1.0 - sum(alt_freqs), 3)
        rounded_alts = ','.join(str(round(freq, 3)) for freq in alt_freqs)
        variant.info['CAF'] = '{},{}'.format(ref_freq, rounded_alts)
        print(variant, end='')
def main():
    """Print a VCF with each variant tagged by its gene in the GENE INFO field.

    Variants whose first genotype allele of the first sample is 0 are skipped
    unless ``-a`` is given. Passing the literal string "None" as the GTF path
    disables the gene annotation.
    """
    arg_parser = argparse.ArgumentParser(
        description="Tags the variants in a VCF with the corresponding gene.")
    arg_parser.add_argument(
        "-a",
        dest="all_flag",
        action="store_true",
        help="Print all variants, not only the ones with genotype>0")
    arg_parser.add_argument("vcf_path", metavar="VCF", type=str, help="Path to VCF file")
    arg_parser.add_argument("gtf_path", metavar="GTF", type=str, help="Path to GTF file")
    options = arg_parser.parse_args()

    annotate = options.gtf_path != "None"
    if annotate:
        cdss = extract_cdss(open_gtf(options.gtf_path))

    variants = VariantFile(options.vcf_path, 'r', drop_samples=False)
    variants.header.add_line(
        '##INFO=<ID=GENE,Number=1,Type=String,Description="Genic region">')
    header_lines = str(variants.header).split('\n')
    print('\n'.join(header_lines[:-1]))

    for variant in variants:
        first_allele = variant.samples[0]['GT'][0]
        if first_allele == 0 and not options.all_flag:
            continue
        if annotate:
            variant.info["GENE"] = get_gene_name(cdss, variant.pos)
        print(variant, end='')
def _pick_best_quality_score(vrn_file):
    """Flexible quality score selection, picking the best available.

    Implementation based on discussion:

    https://github.com/chapmanb/bcbio-nextgen/commit/a538cecd86c0000d17d3f9d4f8ac9d2da04f9884#commitcomment-14539249

    (RTG=AVR/GATK=VQSLOD/MuTect=t_lod_fstar, otherwise GQ, otherwise QUAL, otherwise DP.)

    For MuTect, it's not clear how to get t_lod_fstar, the right quality score, into VCF cleanly.
    MuTect2 has TLOD in the INFO field.

    Only the leading records of the file are inspected; the first score in
    priority order that was seen at least once is returned.
    """
    # pysam fails on checking reference contigs if input is empty
    if not vcfutils.vcf_has_variants(vrn_file):
        return "DP"
    last_index = 25
    seen = collections.defaultdict(int)
    try:
        handle = VariantFile(vrn_file)
    except ValueError:
        raise ValueError("Failed to parse input file in preparation for validation: %s" % vrn_file)
    with contextlib.closing(handle) as records:
        for idx, rec in enumerate(records):
            if idx > last_index:
                break
            for info_key in ("VQSLOD", "TLOD"):
                if info_key in rec.info and rec.info.get(info_key) is not None:
                    seen["INFO=" + info_key] += 1
            for sample_key in ("AVR", "GQ", "DP"):
                if len(rec.samples) > 0 and rec.samples[0].get(sample_key) is not None:
                    seen[sample_key] += 1
            if rec.qual:
                seen["QUAL"] += 1
    # Priority order, best score first.
    for candidate in ("AVR", "INFO=VQSLOD", "INFO=TLOD", "GQ", "QUAL", "DP"):
        if seen[candidate] > 0:
            return candidate
    raise ValueError("Did not find quality score for validation from %s" % vrn_file)
def parse_filepaths(filepaths):
    """
    Open every VCF named in a list of paths.

    Parameters
    ----------
    filepaths : list of str
        List of paths to standardized VCFs

    Returns
    -------
    vcfs : deque of pysam.VariantFile
        One open file per path, in input order.

    Raises
    ------
    ValueError
        If an entry does not consist of exactly one whitespace-free token.
    FileNotFoundError
        If an entry is not an existing file.
    """
    opened = deque()
    for entry in filepaths:
        n_columns = len(entry.split())
        if n_columns != 1:
            raise ValueError('File list must be single column')
        if not os.path.isfile(entry):
            raise FileNotFoundError('VCF {0} not found'.format(entry))
        opened.append(VariantFile(entry))
    return opened
def rename_samples_headers(in_vcf, out_vcf):
    """Replace hard-to-read and ambiguous header with clear header.

    The sample columns of the ``#CHROM`` line become ``{caller}_NORMAL`` (one
    sample) or ``{caller}_NORMAL`` and ``{caller}_TUMOR`` (two samples). For any
    other number of samples the original ``#CHROM`` line is kept unchanged.

    The result is written to ``out_vcf`` without its ``.gz`` suffix, compressed
    with ``bgzip -f``, and ``in_vcf`` is deleted afterwards.
    """
    # Remove only a trailing ".gz". str.strip(".gz") would strip any of the
    # characters '.', 'g', 'z' from BOTH ends of the path.
    if out_vcf.endswith(".gz"):
        out_vcf = out_vcf[:-3]
    caller = get_caller(in_vcf)

    vcf = VariantFile(in_vcf, "rb")
    try:
        samples_names = list(vcf.header.samples)
    finally:
        vcf.close()

    columns = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
    if len(samples_names) == 2:
        header = columns + "\t{}_NORMAL\t{}_TUMOR\n".format(caller, caller)
    elif len(samples_names) == 1:
        header = columns + "\t{}_NORMAL\n".format(caller)
    else:
        # No replacement: writing the bare column list here would leave the line
        # without a newline and glue it to the first record.
        header = None

    # Copy as bytes on both sides; gzip yields bytes, so comparing against or
    # writing text strings would fail on Python 3.
    with open(out_vcf, "wb") as fout:
        with gzip.open(in_vcf, "rb") as fin:
            for line in fin:
                if header is not None and line.startswith(b"#CHROM"):
                    fout.write(header.encode("utf-8"))
                else:
                    fout.write(line)
    subprocess.check_call(["bgzip", "-f", out_vcf])
    os.remove(in_vcf)
def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths):
    """Build one pseudo-sequence per reference contig from a filtered VCF/BCF.

    Per record one character is appended: the reference base in lower case for
    a PASS reference call, the ALT base in upper case for a PASS single-ALT
    variant, and 'N' for indels, multi-ALT sites and non-PASS records.
    Positions missing between records and at the end of a contig are filled
    with the output of calculate_gaps_to_add.

    :param filtered_bcf_file: path of the filtered VCF/BCF
    :param reference_lengths: mapping of contig name to contig length
    :return: OrderedDict of contig name to list of characters
    """
    # per-contig sequence so far and last position handled
    sequences = OrderedDict((chrom, []) for chrom in reference_lengths)
    previous_positions = OrderedDict((chrom, 0) for chrom in reference_lengths)

    with VariantFile(filtered_bcf_file) as vcf_reader:
        for record in vcf_reader.fetch():
            record_chrom = record.chrom
            bases = sequences[record_chrom]
            if record.pos == previous_positions[record_chrom]:
                # Insertion - replace the character added for this position with 'N'
                bases.pop()
                bases.append('N')
            else:
                if previous_positions[record_chrom] + 1 < record.pos:
                    # large deletion: add gaps before adding next position
                    bases.extend(calculate_gaps_to_add(previous_positions[record_chrom] + 1, record.pos))
                if 'PASS' in record.filter.keys():  # HQ SNP
                    gt = record.samples[0]['GT'][0]  # get genotype (1st from tuple)
                    if 'INDEL' in record.info:
                        bases.append('N')
                    elif gt == 0:
                        # reference base as lower case
                        bases.append(record.ref.lower())
                    elif len(record.alts) != 1:
                        # more than one ALT genotype, so add N
                        bases.append('N')
                    else:
                        # ALT SNP as upper case
                        bases.append(record.alts[gt - 1].upper())
                else:
                    # not PASS: low quality SNP, so add N
                    bases.append('N')
            previous_positions[record_chrom] = record.pos

    # check for gaps at end
    for chrom in sequences:
        # Compare against THIS contig's length. Using the contig of the last
        # record read is wrong for every other contig and is undefined when the
        # file holds no records at all.
        if len(sequences[chrom]) != reference_lengths[chrom]:
            sequences[chrom].extend(calculate_gaps_to_add(len(sequences[chrom]), reference_lengths[chrom]))
    return sequences
def main():
    """Copy a VCF while dropping the calls of outlier samples.

    Reads the outlier table (columns 'svtype' and 'sample'), groups the sample
    names by svtype and writes every record yielded by remove_outliers to the
    output file, which shares the header of the input.
    """
    arg_parser = argparse.ArgumentParser("find_outliers.py")
    for positional in ("input", "output", "outliers"):
        arg_parser.add_argument(positional, type=str, help="list of samples names")
    options = arg_parser.parse_args()

    source = VariantFile(options.input)
    table = pd.read_table(options.outliers)
    sink = VariantFile(options.output, mode='w', header=source.header)

    # svtype -> list of outlier sample names
    outliers_by_type = defaultdict(list)
    for _, entry in table.iterrows():
        outliers_by_type[entry['svtype']].append(entry['sample'])

    for kept in remove_outliers(source, outliers_by_type):
        sink.write(kept)
    sink.close()
def _prep_vrn_file(in_file, vcaller, work_dir, somatic_info):
    """Select heterozygous variants in the normal sample with sufficient depth.

    Writes a CSV (chrom, start, end, freq) with one row per autosomal record
    for which _is_possible_loh returns a tumor frequency, and returns its path.
    The file is only regenerated when it is not up to date with ``in_file``.
    """
    data = somatic_info.tumor_data
    params = {"min_freq": 0.4, "max_freq": 0.6, "min_depth": 15}
    base_name = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (base_name, vcaller))
    if utils.file_uptodate(out_file, in_file):
        return out_file

    sub_file = _create_subset_file(in_file, work_dir, data)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            rows = csv.writer(out_handle)
            rows.writerow(["chrom", "start", "end", "freq"])
            variants = VariantFile(sub_file)
            for rec in variants:
                tumor_freq = _is_possible_loh(rec, params, somatic_info)
                if not chromhacks.is_autosomal(rec.chrom) or tumor_freq is None:
                    continue
                rows.writerow([_to_ucsc_style(rec.chrom), rec.start, rec.stop, tumor_freq])
    return out_file
def extract_truth(truth_path):
    """Collect the genotypes of the single-ALT, non-symbolic indels of a truth VCF.

    Records for which is_snp is true, records with a symbolic ALT ('<...>') or
    more than one ALT, and records genotyped 0/0 are skipped.

    Returns a tuple (ref_idx, truth_gts, indels):
      ref_idx   -- chrom of the last non-SNP record read ("" if none)
      truth_gts -- ("chr" + chrom, pos, ref, ALTs joined by "-") -> "a/b"
                   genotype with the smaller allele index first
      indels    -- genotype -> indel length (ALT minus REF, positive for an
                   insertion) -> [0, 0, count]
    """
    truth_gts = {}
    indels = {}
    ref_idx = ""
    for record in VariantFile(truth_path):
        if is_snp(record):
            continue
        ref_idx = record.chrom  # Only if input is "chromosome-specific"
        vidx = ("chr" + record.chrom, record.pos, record.ref, "-".join(record.alts))

        # We consider only single-allelic, non-symbolic indels
        no_symbolic = all(alt[0] != '<' for alt in record.alts)
        if not (no_symbolic and len(record.alts) == 1):
            continue

        gt = ""
        first_sample_fields = record.samples.items()[0][1]
        for field_name, value in first_sample_fields.items():
            if field_name == 'GT':
                gt = str(min(value[0], value[1])) + "/" + str(max(value[0], value[1]))
        if gt == "0/0":
            continue

        # + is an insertion, - is a deletion
        l_indel = len(record.alts[0]) - len(record.ref)
        by_length = indels.setdefault(gt, {})
        if l_indel not in by_length:
            by_length[l_indel] = [0, 0, 0]
        by_length[l_indel][2] += 1
        truth_gts[vidx] = gt
    return ref_idx, truth_gts, indels
def __init__(self, vcf_file, reference_file, prg_output_file, mode="normal"):
    """Set up the builder state, load the inputs and build the PRG.

    vcf_file -- path of the VCF whose records are fetched
    reference_file -- path of the reference FASTA, loaded with load_fasta
    prg_output_file -- stored as the output prefix
    mode -- must be one of self.acceptable_modes, otherwise ValueError is raised

    A warning is logged when records were skipped while building.
    """
    # Counters and accumulators filled in by _make_prg.
    self.prg_bytes: Map[str, bytearray] = defaultdict(bytearray)
    self.num_sites = 0
    self.processed_refs = []
    self.skipped_records: int = 0

    allowed = self.acceptable_modes
    if mode not in allowed:
        raise ValueError(f"Mode not in {allowed}")

    self.f_out_prefix = prg_output_file
    variant_file = VariantFile(vcf_file)
    self.vcf_in = variant_file.fetch()
    self.ref_in = reference_file
    self.ref_records = load_fasta(reference_file)

    self._make_prg(mode)

    skipped = self.skipped_records
    if skipped > 0:
        logger.warning(
            f"Skipped {skipped} because of no 'PASS' in their FORMAT column"
        )
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str, output_vcf: str) -> None:
    """
    Transforms dToxoG MAF to minimal VCF of only dtoxo failures.

    Only MAF records whose ``oxoGCut`` value is ``"1"`` are written.

    :param input_maf: The annotated dtoxog MAF output file.
    :param reference_fa: Reference fasta used to make seqdict header.
    :param output_vcf: The output minimal VCF with only failed dtoxog records
                       BGzip and tabix-index created if ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dtoxo failures")

    tag = "oxog"
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=generate_header(reference_fa, tag))

    n_seen = 0
    n_written = 0
    try:
        with open(input_maf, "rt") as fh:
            for n_seen, record in enumerate(maf_generator(fh), start=1):
                if record["oxoGCut"] != "1":
                    continue
                writer.write(build_new_record(record, writer, tag))
                n_written += 1
    finally:
        writer.close()

    # Index only the output written in mode "wz".
    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(n_seen, n_written))
def run_process(opts, inputvcf):
    """Copy a VCF, adding an HGVS_p INFO value to every record that has ANN.

    For each ANN entry, field 10 of the '|'-separated annotation is converted
    with convert_hgvsp ('.' when the conversion yields "" or None); the results
    are joined with ',' into HGVS_p. The header also declares a variant_type
    INFO field.

    :param opts: options object; ``opts.output`` is the output path, a false
                 value writes to standard output ('-')
    :param inputvcf: path of the VCF to read
    """
    outputvcf = opts.output

    vcf_in = VariantFile(inputvcf)
    try:
        # Add INFO to Header
        vcf_in.header.info.add("HGVS_p", ".", "String", "HGVS.p Information (Single Character Amino Acid)")
        vcf_in.header.info.add("variant_type", ".", "String", "Variant Type for Tiering System")

        vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=vcf_in.header)
        try:
            for record in vcf_in.fetch():
                if "ANN" in record.info:
                    new_hgvsp = []
                    for annstring in record.info["ANN"]:
                        ann = annstring.split("|")
                        converted = convert_hgvsp(ann[10])
                        if converted == "" or converted is None:
                            converted = '.'
                        new_hgvsp.append(converted)
                    record.info["HGVS_p"] = ",".join(new_hgvsp)
                vcf_out.write(record)
        finally:
            # Close explicitly so the output is flushed when the function
            # returns instead of whenever the handle is garbage collected.
            vcf_out.close()
    finally:
        vcf_in.close()
def write_rephased_tenx_vcf(tenx_vcf, tenx_records, tenx_phase_sets, threshold, workdir):
    """
    Writes new 10X VCF file and switches genotypes if logratios above / below threshold

    Only phase sets flagged ``rephased`` are written. With log2ratio >= threshold
    the records are written with PS set to 1; with log2ratio <= -threshold the
    two genotype alleles are additionally swapped and the call is marked phased.
    Phase sets in between are left out.

    :param tenx_vcf: path of the 10X VCF (.vcf or .vcf.gz), used for the header
                     and for naming the output
    :param tenx_records: mapping of 'chrom:pos' to the record to write
    :param tenx_phase_sets: mapping of phase set id to an object with rephased,
                            chrom, log2ratio and positions
    :param threshold: log2 ratio threshold
    :param workdir: directory the output is written to
    :return: path of the written VCF, or None for any other file extension
    """
    basename = os.path.basename(tenx_vcf)
    if basename.endswith('.vcf'):
        offset = -4
    elif basename.endswith('.vcf.gz'):
        offset = -7
    else:
        return None

    tenx_rephased_vcf = workdir + '/' + basename[:offset] + '.filtered.het.rephased.vcf'
    # Closing the writer before returning guarantees the file is complete on
    # disk when the caller opens the returned path.
    with VariantFile(tenx_vcf) as vcf_in, VariantFile(tenx_rephased_vcf, 'w', header=vcf_in.header) as vcf_out:
        for ps_id in tenx_phase_sets:
            phase_set = tenx_phase_sets[ps_id]
            if not phase_set.rephased:
                continue
            chrom = phase_set.chrom
            if phase_set.log2ratio >= threshold:
                swap = False
            elif phase_set.log2ratio <= -threshold:
                swap = True
            else:
                continue
            for pos in phase_set.positions:
                record = tenx_records[chrom + ':' + str(pos)]
                record.samples[0]['PS'] = 1
                if swap:
                    genotype = record.samples[0]['GT']
                    record.samples[0]['GT'] = (genotype[1], genotype[0])
                    record.samples[0].phased = True
                vcf_out.write(record)

    return tenx_rephased_vcf
class VcfAugmenter(ABC):
    """Base class for writers that copy an input VCF with an adjusted header."""

    def __init__(self, in_path, command_line, out_file=sys.stdout, include_haploid_phase_sets=False):
        """
        in_path -- Path to input VCF, used as template.
        command_line -- A string that will be added as a VCF header entry
            (use None to not add this to the VCF header)
        out_file -- Open file-like object to which VCF is written.
        include_haploid_phase_sets -- If true, "HS" is added to the FORMAT
            header entries to add when it is not among them already.
        """
        # TODO This is slow because it reads in the entire VCF one extra time
        contigs, formats, infos = missing_headers(in_path)
        # TODO It would actually look nicer if the custom HS header was directly below PS
        if include_haploid_phase_sets and "HS" not in formats:
            formats.append("HS")

        # We repair the header (adding missing contigs, formats, infos) of the *input* VCF because
        # we will modify the records that we read, and these are associated with the input file.
        self._reader = VariantFile(in_path)
        header = self._reader.header
        augment_header(header, contigs, formats, infos)
        if command_line is not None:
            # Double quotes inside the command line are dropped before quoting it.
            quoted = '"' + command_line.replace('"', "") + '"'
            self._reader.header.add_meta("commandline", quoted)
        self.setup_header(self._reader.header)
        self._writer = VariantFile(out_file, mode="w", header=self._reader.header)
        self._unprocessed_record = None
        self._reader_iter = iter(self._reader)

    @abstractmethod
    def setup_header(self, header):
        """Hook for subclasses to adjust the header before the writer is created."""
        pass

    def close(self):
        """Close the output VCF."""
        self._writer.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @property
    def samples(self):
        """Sample names from the input header, as a list."""
        return list(self._reader.header.samples)

    def _iterrecords(self, chromosome):
        """Yield all records for the target chromosome"""
        consumed = 0
        pending = self._unprocessed_record
        if pending is not None:
            # Left over from the previous call: first record of this chromosome.
            assert pending.chrom == chromosome
            yield pending
            consumed = 1
        for consumed, record in enumerate(self._reader_iter, start=consumed + 1):
            if record.chrom != chromosome:
                # save it for later
                self._unprocessed_record = record
                assert consumed != 1
                return
            yield record
class VcfReader:
    """
    Read a VCF file chromosome by chromosome.
    """

    def __init__(
        self,
        path,
        indels=False,
        phases=False,
        genotype_likelihoods=False,
        ignore_genotypes=False,
        ploidy=None,
    ):
        """
        path -- Path to VCF file
        indels -- Whether to include also insertions and deletions in the list of
            variants.
        phases -- Whether to read phasing information (HP or GT/PS) per call.
        genotype_likelihoods -- Whether to read GL/PL values per call.
        ignore_genotypes -- In case of genotyping algorithm, no genotypes may be given in
            vcf, so ignore all genotypes
        ploidy -- Ploidy of the samples
        """
        # TODO Always include deletions since they can 'overlap' other variants
        self._indels = indels
        self._vcf_reader = VariantFile(path)
        self._path = path
        self._phases = phases
        self._genotype_likelihoods = genotype_likelihoods
        self._ignore_genotypes = ignore_genotypes
        # intentionally public
        self.samples = list(self._vcf_reader.header.samples)
        self.ploidy = ploidy
        logger.debug("Found %d sample(s) in the VCF file.", len(self.samples))

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # follows same structure as for ReadSetReader
        self.close()

    def close(self):
        """Close the underlying VCF file."""
        self._vcf_reader.close()

    @property
    def path(self):
        """File name reported by the underlying reader, decoded to str."""
        return self._vcf_reader.filename.decode()

    def _fetch(self, chromosome: str, start=0, end=None):
        """Call fetch() on the underlying reader and translate its ValueErrors."""
        try:
            return self._vcf_reader.fetch(chromosome, start=start, stop=end)
        except ValueError as e:
            message = e.args[0]
            if "invalid contig" in message:
                raise VcfInvalidChromosome(message) from None
            if "fetch requires an index" in message:
                raise VcfIndexMissing(
                    "{} is missing an index (.tbi or .csi)".format(self._path)
                ) from None
            raise

    def fetch(self, chromosome: str, start=0, end=None):
        """
        Fetch records from a single chromosome, optionally restricted to a single region.

        Return a VariantTable object.
        """
        fetched = list(self._fetch(chromosome, start=start, end=end))
        return self._process_single_chromosome(chromosome, fetched)

    def fetch_regions(self, chromosome: str, regions):
        """
        Fetch records from a single chromosome that overlap the given regions.

        :param regions: a list of start, end tuples (end can be None)
        """
        collected = []
        for region_start, region_end in regions:
            collected.extend(self._fetch(chromosome, start=region_start, end=region_end))
        return self._process_single_chromosome(chromosome, collected)

    def __iter__(self):
        """
        Yield VariantTable objects for each chromosome.

        Multi-ALT sites are skipped.
        """
        grouped = itertools.groupby(self._vcf_reader, key=lambda record: record.chrom)
        for chromosome, records in grouped:
            yield self._process_single_chromosome(chromosome, records)

    @staticmethod
    def _extract_HP_phase(call):
        """Build a VariantCallPhase from the call's HP field, or return None if absent."""
        hp = call.get("HP")
        if hp is None or hp == (".", ):
            return None
        # Every HP entry is split at "-" into a list of ints.
        fields = [[int(part) for part in entry.split("-")] for entry in hp]
        # All entries must share the first number, which is used as block id.
        for field in fields:
            assert fields[0][0] == field[0]
        block_id = fields[0][0]
        phase = tuple(field[1] - 1 for field in fields)
        return VariantCallPhase(block_id=block_id, phase=phase, quality=call.get("PQ", None))

    @staticmethod
    def _extract_GT_PS_phase(call):
        """Build a VariantCallPhase from GT/PS, or return None for homozygous or unphased calls."""
        if all(allele == call["GT"][0] for allele in call["GT"]):
            # not heterozygous
            return None
        if not call.phased:
            return None
        return VariantCallPhase(
            block_id=call.get("PS", 0),
            phase=call["GT"],
            quality=call.get("PQ", None),
        )

    def _process_single_chromosome(self, chromosome, records):
        """Convert the given records of one chromosome into a VariantTable."""
        phase_detected = None
        n_snvs = 0
        n_other = 0
        n_multi = 0
        table = VariantTable(chromosome, self.samples)
        prev_position = None
        phase_extractors = (
            (self._extract_HP_phase, "HP"),
            (self._extract_GT_PS_phase, "GT_PS"),
        )

        for record in records:
            if len(record.alts) > 1:
                # Multi-ALT sites are not supported, yet
                n_multi += 1
                continue

            pos = record.start
            ref = str(record.ref)
            alt = str(record.alts[0])
            if len(ref) == len(alt) == 1:
                n_snvs += 1
            else:
                n_other += 1
                if not self._indels:
                    continue

            if (prev_position is not None) and (prev_position > pos):
                raise VcfNotSortedError(
                    "VCF not ordered: {}:{} appears before {}:{}".format(
                        chromosome, prev_position + 1, chromosome, pos + 1))
            if prev_position == pos:
                logger.warning(
                    "Skipping duplicated position %s on chromosome %r",
                    pos + 1,
                    chromosome,
                )
                continue
            prev_position = pos

            # Read phasing information (allow GT/PS or HP phase information,
            # but not both), if requested
            if self._phases:
                phases = []
                for call in record.samples.values():
                    phase = None
                    for extract_phase, phase_name in phase_extractors:
                        p = extract_phase(call)
                        if p is None:
                            continue
                        if phase_detected is None:
                            phase_detected = phase_name
                        elif phase_detected != phase_name:
                            raise MixedPhasingError(
                                "Mixed phasing information in input VCF (e.g. mixing PS "
                                "and HP fields)")
                        phase = p

                        # check for ploidy consistency and limits
                        phase_ploidy = len(p.phase)
                        if phase_ploidy > get_max_genotype_ploidy():
                            raise PloidyError(
                                "Ploidies higher than {} are not supported.".format(
                                    get_max_genotype_ploidy()))
                        elif p is None or None in p:
                            pass
                        elif self.ploidy is None:
                            self.ploidy = phase_ploidy
                        elif phase_ploidy != self.ploidy:
                            print("phase= {}".format(phase))
                            raise PloidyError(
                                "Phasing information contains inconsistent ploidy ({} and "
                                "{})".format(self.ploidy, phase_ploidy))
                    phases.append(phase)
            else:
                phases = [None] * len(record.samples)

            # Read genotype likelihoods, if requested
            if self._genotype_likelihoods:
                genotype_likelihoods = []
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    PL = call.get("PL", None)
                    # Prefer GLs (floats) over PLs (ints) if both should be present
                    if GL is not None:
                        likelihoods = GenotypeLikelihoods(GL)
                    elif PL is not None:
                        likelihoods = GenotypeLikelihoods([pl / -10 for pl in PL])
                    else:
                        likelihoods = None
                    genotype_likelihoods.append(likelihoods)
            else:
                genotype_likelihoods = [None] * len(record.samples)

            if self._ignore_genotypes:
                genotypes = [Genotype([]) for _ in range(len(self.samples))]
                phases = [None] * len(self.samples)
            else:
                # check for ploidy consistency and limits
                genotype_lists = [call["GT"] for call in record.samples.values()]
                for geno in genotype_lists:
                    geno_ploidy = len(geno)
                    if geno_ploidy > get_max_genotype_ploidy():
                        raise PloidyError(
                            "Ploidies higher than {} are not supported.".format(
                                get_max_genotype_ploidy()))
                    elif geno is None or None in geno:
                        pass
                    elif self.ploidy is None:
                        self.ploidy = geno_ploidy
                    elif geno_ploidy != self.ploidy:
                        raise PloidyError("Inconsistent ploidy ({} and "
                                          "{})".format(self.ploidy, geno_ploidy))
                genotypes = [genotype_code(geno_list) for geno_list in genotype_lists]

            variant = VcfVariant(position=pos, reference_allele=ref, alternative_allele=alt)
            table.add_variant(variant, genotypes, phases, genotype_likelihoods)

        logger.debug(
            "Parsed %s SNVs and %s non-SNVs. Also skipped %s multi-ALTs.",
            n_snvs,
            n_other,
            n_multi,
        )

        # TODO remove overlapping variants
        return table
def missing_headers(path):
    """
    Find contigs, FORMATs and INFOs that are used within the body of a VCF file,
    but are not listed in the header or that have an incorrect type.

    Return a tuple (contigs, formats, infos) where each of the items are lists of
    strings. Contigs and FORMATs are listed in order of first appearance (FORMATs
    with an incorrect definition come first); INFOs are sorted alphabetically so
    that the result does not depend on set iteration order.

    The reason this function exists is that pysam.VariantFile crashes when we
    try to write a VCF record to it that uses contigs, INFOs or FORMATs that
    are missing from the header. See also
    https://github.com/pysam-developers/pysam/issues/771

    Raises VcfError if the header declares 'PS' with a type other than the
    predefined one.
    """
    with VariantFile(path) as variant_file:
        # Work on a copy so the header can still be inspected after the file
        # has been closed.
        header = variant_file.header.copy()

        # Check for FORMATs that do not have the expected type
        incorrect_formats = []
        for fmt, v in variant_file.header.formats.items():
            if fmt not in PREDEFINED_FORMATS:
                continue
            h = PREDEFINED_FORMATS[fmt]
            if v.number != h.number or v.type != h.typ:
                if fmt == "PS" and v.type != h.typ:
                    raise VcfError(
                        "The input VCF/BCF contains phase set ('PS') tags that are of the"
                        " non-standard type '{}' instead of 'Integer'. WhatsHap cannot"
                        " overwrite these as it could produce inconsistent files."
                        " To proceed, you can use 'whatshap unphase' to remove phasing"
                        " information from the input file".format(v.type))
                incorrect_formats.append(fmt)

        # Iterate through entire file and check which contigs, formats and
        # info fields are used. For contigs and FORMATs a set *and* a list are
        # maintained because the order of first appearance must be kept.
        contigs = []
        seen_contigs = set()
        formats = []
        seen_formats = set()
        seen_infos = set()
        for record in variant_file:
            seen_infos.update(record.info)
            if record.alts is not None:
                # A symbolic ("vague") ALT allele, i.e. one starting with an
                # opening angle bracket, makes the header need an END info
                # entry even if END never appears in a record.
                if any(alt.startswith("<") for alt in record.alts):
                    seen_infos.add("END")
            if record.contig not in seen_contigs:
                contigs.append(record.contig)
                seen_contigs.add(record.contig)
            for fmt in record.format:
                if fmt not in seen_formats:
                    formats.append(fmt)
                    seen_formats.add(fmt)

    # Determine which contigs are missing from the header
    header_contigs = set(header.contigs)
    missing_contigs = [contig for contig in contigs if contig not in header_contigs]

    # Determine which FORMATs are missing from the header
    header_formats = set(header.formats)
    missing_formats = [fmt for fmt in formats if fmt not in header_formats]

    # Determine which INFOs are missing from the header (sorted: deterministic)
    missing_infos = sorted(seen_infos - set(header.info))

    return (missing_contigs, incorrect_formats + missing_formats, missing_infos)
def _dump_rebased_vcf(records: List[VariantRecord], disco_paths: DiscoverPaths):
    """
    Write ``records`` to ``disco_paths.final_vcf``.

    The header of the VCF at ``disco_paths.discov_vcf_cortex`` is used as the
    header of the output file. Both files are closed before returning so the
    output is fully flushed to disk.
    """
    with VariantFile(disco_paths.discov_vcf_cortex) as template_vcf, \
            VariantFile(disco_paths.final_vcf, "w", header=template_vcf.header) as output_vcf:
        for record in records:
            output_vcf.write(record)
# Query a reference FASTA, a tabix-indexed GFF and a BCF for two regions.
import pysam  # needed for pysam.asGTF() below; the from-imports do not bind `pysam`
from pysam import VariantFile
from pysam import TabixFile
from pyfaidx import Fasta

# data files
reference_file = 'S_lycopersicum_chromosomes.2.40.fa'
annotation_file = 'gene_models.gff.gz'
variant_file = 'tomato_snps.bcf'

# load reference
reference = Fasta(reference_file)

# load annotations
annotations = TabixFile(annotation_file)

# load variants
variants = VariantFile(variant_file)

# regions to query, given as (contig, start, end)
region1 = ("SL2.40ch01", 15000, 21000)
region2 = ("SL2.40ch01", 20000, 70000)

region1_reference = reference[region1[0]][region1[1]: region1[2]]
region1_annotations = list(annotations.fetch(*region1, parser=pysam.asGTF()))
region1_variants = list(variants.fetch(*region1))

region2_reference = reference[region2[0]][region2[1]: region2[2]]
region2_annotations = list(annotations.fetch(*region2, parser=pysam.asGTF()))
region2_variants = list(variants.fetch(*region2))
key = r.contig + str(r.pos) + r.alleles[0] + r.alleles[1] if key in self.currentMap: sys.stderr.write('ERROR: repeated records detected, same meta info, error record:\n%s\n'%(r)) else: self.currentMap[key] = r if __name__ == '__main__': args = docopt(__doc__, version='1.0') #print(args) if(args['--format']): ShowFormat() sys.exit(-1) vcfMetaCols=9 #number of colummns for vcf meta information. inF1 = VariantFile(args['<input1>'], 'r') inF2 = VariantFile(args['<input2>'], 'r') Record = Record(inF2) #check smaples in two input file, same samples, and same order. if len(inF1.header.samples) != len(inF2.header.samples): sys.stderr.write('ERROR: different number of samples in two input files.\n') sys.exit(-1) else: for x, y in zip( inF1.header.samples, inF2.header.samples): if x != y: sys.stderr.write('ERROR: two input files should have the same samples, and ordered in same order.\n') sys.exit(-1) #output vcf header sys.stdout.write('%s'%(str(inF1.header))) for line in inF1.fetch():
#coding:utf-8
# Copy to a new BAM every read whose `pos` equals the `pos` of a VCF record.
# Usage: <script> <bam_file> <vcf_file> <output_bam_file>
from sys import argv
from os.path import exists
import os
import pysam
import numpy as np
from pysam import VariantFile

script, bam_file, vcf_file, output_bam_file = argv

bamfile = pysam.AlignmentFile(bam_file, "rb")
vcffile = VariantFile(vcf_file)
# The output BAM reuses the header of the input BAM.
output_bamfile = pysam.AlignmentFile(output_bam_file, "wb", template=bamfile)

# NOTE(review): only positions are compared, not contigs, and the two `pos`
# attributes may use different coordinate conventions - confirm against the
# pysam documentation before relying on the output.
# NOTE(review): the whole BAM is re-read once per VCF record.
for rec in vcffile.fetch():
    for read in bamfile.fetch():
        if not (rec.pos == read.pos):
            continue
        output_bamfile.write(read)

output_bamfile.close()
bamfile.close()
vcffile.close()
# Count, once per listed chromosome, the delly DEL calls whose confidence
# intervals around both breakpoints are at most 200 wide.
from labels import SVRecord_generic
from pysam import VariantFile
import re
import os

# One chromosome name per line.
with open("../MinorResearchInternship/BAM/BAM_chr_list", "r") as f:
    chr_list = [line.strip() for line in f]

for chrom in chr_list:
    #filename = "genomewide_windowpairs/delly/"+chrom+"_windowpairs_DEL.npy.gz"
    #with gzip.GzipFile(filename, "rb") as f:
    ##shape = X.shape
    counter = 0
    for vcf_file in ["../MinorResearchInternship/VCF/delly.sym.vcf"]:
        assert os.path.isfile(vcf_file)
        caller = re.findall(r'^\w*', vcf_file)
        # Close the VCF after each pass instead of leaking one handle per
        # chromosome.
        with VariantFile(vcf_file, 'r') as vcf_in:
            for rec in vcf_in.fetch():
                svrec = SVRecord_generic(rec, "delly")
                startCI = abs(svrec.cipos[0]) + svrec.cipos[1]
                endCI = abs(svrec.ciend[0]) + svrec.ciend[1]
                # NOTE(review): the chromosome filter is the literal "1", not
                # the loop variable `chrom`, so every iteration counts the
                # same records - confirm whether that is intended.
                if startCI <= 200 and endCI <= 200 and svrec.chrom == "1" and svrec.svtype == "DEL":
                    counter += 1
    #if counter == shape[0]:
    #print("OK")
    print(counter)
def make_vcf_header(self):
    """
    Add samples and sources to VCF template header.

    With ``self.preserve_header`` set, the header of the first input VCF is
    reused and only missing samples and the MEMBERS/CLUSTER INFO lines are
    added. Otherwise the stock svtk template header is extended with all
    samples, the contigs and INFO fields of every input VCF, the
    MEMBERS/CLUSTER INFO lines, a ``##source`` line and one FORMAT line per
    source.

    Returns
    -------
    pysam.VariantHeader
    """

    def add_cluster_infos(header):
        # MEMBERS is only needed when record IDs are preserved, CLUSTER only
        # when records are not merged; neither is added twice.
        if self.preserve_ids and 'MEMBERS' not in header.info.keys():
            header.add_line(
                '##INFO=<ID=MEMBERS,Number=.,Type=String,'
                'Description="IDs of cluster\'s constituent records.">')
        if (not self.do_merge) and 'CLUSTER' not in header.info.keys():
            header.add_line(
                '##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description="Cluster ID">')

    if self.preserve_header:
        header = self.vcfs[0].header
        for sample in self.samples:
            if sample not in header.samples:
                header.add_sample(sample)
        add_cluster_infos(header)
        return header

    # Read stock template
    template = pkg_resources.resource_filename(
        'svtk', 'data/vcfcluster_template.vcf')

    # Make header
    template = VariantFile(template)
    header = template.header

    # Add samples
    for sample in self.samples:
        header.add_sample(sample)

    # Add contigs (each distinct name/length pair once, in first-seen order)
    contigs = []
    seen_contigs = set()
    for vcf in self.vcfs:
        for contig in vcf.header.contigs.values():
            tup = (contig.name, contig.length)
            if tup not in seen_contigs:
                seen_contigs.add(tup)
                contigs.append(tup)
    contig_line = '##contig=<ID={0},length={1}>'
    for contig in contigs:
        header.add_line(contig_line.format(*contig))

    # Add INFO fields that the template does not define yet
    infos = []
    for vcf in self.vcfs:
        for tag, info in vcf.header.info.items():
            if tag in header.info.keys():
                continue
            tup = (info.name, info.number, info.type, info.description)
            if tup not in infos:
                infos.append(tup)
    info_line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'
    for info in infos:
        header.add_line(info_line.format(*info))

    add_cluster_infos(header)

    # Add source
    sourcelist = sorted(set(self.sources))
    header.add_line('##source={0}'.format(','.join(sourcelist)))

    # Add source FORMAT fields. The structured line must end with a closing
    # angle bracket, which the previous template string was missing.
    meta = ('##FORMAT=<ID={0},Number=1,Type=Integer,'
            'Description="Called by {1}">')
    for source in self.sources:
        header.add_line(meta.format(source, source.capitalize()))

    return header
def gen_report(vcf, c, ref_flag):
    """
    Write a tab-separated prioritized-impact report for a VEP-annotated VCF.

    The call type is ``indel`` when the VCF file name contains ``indel`` and
    ``snv`` otherwise; it selects the output file name, the column header and
    whether per-sample allele counts are read from the NORMAL/TUMOR samples.

    :param vcf: path of the annotated VCF
    :param c: target file handed to create_target(), or 'n' for none; with a
        target file, records flagged 'OFF' by mark_target() are skipped
    :param ref_flag: handed to create_index() unless it is 'n'
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')

    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')

    call_type = 'indel' if 'indel' in fn else 'snv'
    out_name = parts[0] + '.' + call_type + '.strelka.vep.prioritized_impact.report.xls'

    # Values are replaced by the column index of each field within an ANN entry.
    desired = {
        'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0,
        'Protein_position': 0, 'Amino_acids': 0, 'Codons': 0,
        'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0,
        'VARIANT_CLASS': 0
    }

    # Both files are closed even when a record cannot be processed.
    with VariantFile(vcf) as vcf_in, open(out_name, 'w') as out:
        # The ANN description lists the '|'-separated field names after a
        # fixed prefix.
        desc_string = vcf_in.header.info['ANN'].record['Description']
        desc_string = desc_string.strip('"')
        desc_string = desc_string.replace(
            'Consequence annotations from Ensembl VEP. Format: ', '')
        for i, field in enumerate(desc_string.split('|')):
            if field in desired:
                desired[field] = i

        if call_type == 'snv':
            out.write(
                'chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
                'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n'
            )
        else:
            out.write(
                'chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')

        if ref_flag != 'n':
            ref_flag = create_index(ref_flag)

        for record in vcf_in.fetch():
            (chrom, pos, ref, alt) = (record.contig, str(record.pos), record.ref, record.alts[0])

            # dict contains what's different between strelka indel and snv reports
            if call_type == 'snv':
                normal = record.samples['NORMAL']
                tumor = record.samples['TUMOR']
                # Counts are stored under '<base>U'; the first value is used.
                not_shared = {
                    'norm_ref_ct': normal[(record.ref + 'U')][0],
                    'norm_alt_ct': normal[(record.alts[0] + 'U')][0],
                    'tum_ref_ct': tumor[(record.ref + 'U')][0],
                    'tum_alt_ct': tumor[(record.alts[0] + 'U')][0]
                }
            else:
                not_shared = {}

            # ANN may arrive as one comma-joined string or already split into
            # a tuple of entries, depending on how the field is declared.
            annotations = record.info['ANN']
            if isinstance(annotations, str):
                annotations = annotations.split(',')
            ann_list = [entry.split('|') for entry in annotations]

            tflag = 'NA'
            if c != 'n':
                tflag = mark_target(chrom, pos, on_dict)
                # only outputting ON TARGET hits
                if tflag == 'OFF':
                    continue
            output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list,
                                  desired, tflag, out, ref_flag, call_type)

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
def sdi(data):
    """
    Return the absolute value of the summed ``p(n, N)`` terms of ``data``.

    ``N`` is the sum of ``data``; entries equal to zero are left out of the
    sum. Returns the string "N/A" for empty input.
    presumably a Shannon diversity index, with ``p`` (defined elsewhere)
    computing one term - TODO confirm against ``p``.
    """
    if len(data) == 0:
        return "N/A"
    else:
        N = sum(data)
        # Compare by value: an identity test against the literal 0 lets
        # float zeros such as 0.0 through.
        return abs(-sum(p(n, N) for n in data if n != 0))


ap = argparse.ArgumentParser()
ap.add_argument('input_bcf', help='--')
ap.add_argument('out', help='---')
args = ap.parse_args()

bcf_in = VariantFile(args.input_bcf)  # auto-detect input format

#test=[0.75,0.25]
#print "---->",sdi(test) 0.811278124459 which is CORRECT!

dict = {}
pos = set()
for rec in bcf_in.fetch():
    #(0, 0, 8, 0)
    #VAF = (forward non-ref + reverse non-ref alleles) / (forward ref alleles + reverse ref + forward non-ref + reverse non-ref alleles)
    ref = rec.info["DP4"][0] + rec.info["DP4"][1]
    non_ref = rec.info["DP4"][2] + rec.info["DP4"][3]
    if ref == 0:
        VAF = 1
def check_genotype(folder, sample, coverage_file):
    """
    Compares the genotype for all shared variants

    Genotypes of the 'INTEGRATION' sample in ``<folder>/0002.vcf`` are compared
    with those of ``sample`` in ``<folder>/0003.vcf``. Variants whose genotypes
    do not match are annotated with quality, genotypes, VCF depths and - for
    single-base ref/alt alleles - coverage grepped from ``coverage_file``.

    :param folder: location of results from the NGS analysis pipeline
    :param sample: sample number (used in vcf file)
    :param coverage_file: file containing coverage information for each position in the panel
    :return: dictionary of number of matching variants and detailed information for any with mismatching genotypes
    """
    shared_giab = VariantFile(folder + '/0002.vcf')
    shared_patient = VariantFile(folder + '/0003.vcf')

    # chrom -> pos -> alleles -> reference genotype (first occurrence wins)
    vars_giab = {}
    for rec in shared_giab.fetch():
        by_alleles = vars_giab.setdefault(rec.contig, {}).setdefault(rec.pos, {})
        if rec.alleles not in by_alleles:
            by_alleles[rec.alleles] = rec.samples['INTEGRATION']['GT']

    variants = []
    matching = 0
    for rec in shared_patient.fetch():
        chrom = rec.contig
        pos = rec.pos
        alleles = rec.alleles
        call = rec.samples[sample]

        if 'AD' in call.keys():
            allelic_depth = call['AD']
        else:
            allelic_depth = 'N/A'
        # DP first, then NR, then 0 (same fallback as the sibling variant of
        # this function that takes ref_sample).
        if 'DP' in call.keys():
            total_depth = call['DP']
        elif 'NR' in call.keys():
            total_depth = call['NR']
        else:
            total_depth = 0

        giab_genotype = vars_giab[chrom][pos][alleles]
        genotype = call['GT']
        if genotype == giab_genotype:
            matching += 1
            continue
        if (genotype[0] is None or genotype[0] == 1) and genotype[0] == giab_genotype[1] \
                and genotype[1] == giab_genotype[0]:
            matching += 1
            continue
        if genotype[0] == 0 and genotype[1] == 1 and giab_genotype[0] == 1 and giab_genotype[1] == 0:
            matching += 1
            continue
        if genotype[0] == 1 and genotype[1] == 0 and giab_genotype[0] == 0 and giab_genotype[1] == 1:
            matching += 1
            continue

        if len(alleles[0]) == 1 and len(alleles[1]) == 1:
            search = '\'' + rec.contig + r'\s' + str(rec.pos - 1) + '\''
            command = 'grep ' + search + ' ' + coverage_file
            try:
                # Text mode so the comparison with '' below is valid on
                # Python 3 as well.
                line = subprocess.check_output(command, shell=True, universal_newlines=True)
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                # grep exits with 1 when nothing matched: no coverage line.
                line = ''
            if line == '':
                coverage = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
            else:
                # Column index of the per-base count in the coverage line.
                bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
                fields = line.split()
                coverage = {'total': fields[2],
                            'ref': fields[bases[alleles[0]]],
                            'alt': fields[bases[alleles[1]]]}
        else:
            coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

        variants.append({'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1],
                         'QUAL': rec.qual,
                         'GT': {sample: genotype, 'GIAB': giab_genotype},
                         'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                         'coverage': coverage})

    print(str(matching) + ' matching variants')
    results = {'matching': matching, 'mismatching': variants}
    print(results)
    return results
def main():
    """
    Compare the variants of one VCF sample against a reference BED file.

    Each non-comment BED line is tab-separated with a fourth column of the
    form ``pos:ref:alt``. VCF records found in the BED are counted as true
    positives, all others are reported as false positives, and BED entries
    never seen in the VCF are reported as false negatives. The summary is
    written as JSON to ``<out>/<sample>_summary.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-vcf', help='Results VCF to be compared', required=True)
    parser.add_argument('-bed', help='The reference BED file', required=True)
    parser.add_argument('-s', help='Sample ID in VCF', required=True)
    parser.add_argument('-out', help='The folder to putt results files', required=True)
    args = parser.parse_args()

    out_dir = args.out if args.out.endswith('/') else args.out + '/'
    sample = args.s

    # chrom -> pos (string) -> (ref, alt) -> "seen in the VCF" flag
    variants = {}
    with open(args.bed, 'r') as bed_file:
        for raw_line in bed_file:
            region = raw_line.strip('\n')
            # Blank lines and comments carry no variant.
            if not region or region.startswith('#'):
                continue
            fields = region.split('\t')
            chrom, name = fields[0], fields[3]
            pos, ref, alt = name.split(':')
            variants.setdefault(chrom, {}).setdefault(pos, {})[(ref, alt)] = False

    false_pos = []
    true_pos = []
    with VariantFile(args.vcf) as vcf:
        for v in vcf.fetch():
            chrom = v.contig
            pos = str(v.pos)
            ref = v.alleles[0]
            alt = v.alleles[1]
            call = v.samples[sample]

            if 'AD' in call.keys():
                allelic_depth = call['AD']
            elif 'NV' in call.keys():
                allelic_depth = call['NV']
            else:
                allelic_depth = 'N/A'
            if 'DP' in call.keys():
                total_depth = call['DP']
            elif 'NR' in call.keys():
                total_depth = call['NR']
            else:
                total_depth = 0

            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt,
                       'QUAL': v.qual, 'GT': call['GT'],
                       'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                       'coverage': {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}}

            # A contig or position that is absent from the BED simply yields
            # an empty lookup, i.e. a false positive.
            expected = variants.get(chrom, {}).get(pos, {})
            if (ref, alt) in expected:
                true_pos.append(variant)
                expected[(ref, alt)] = True
            else:
                false_pos.append(variant)

    false_neg = []
    for chrom in variants:
        for pos in variants[chrom]:
            for alleles, seen in variants[chrom][pos].items():
                if not seen:
                    false_neg.append({'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1],
                                      'QUAL': 0, 'GT': (0, 0),
                                      'coverage': {'total': 'no coverage information',
                                                   'ref': 'N/A', 'alt': 'N/A'}})

    out = {'false_negative': {'indels': [], 'no_coverage': [], 'evidence_of_alt': [],
                              'false_neg': false_neg},
           'false_positive': false_pos,
           'mismatching_genotype': [],
           'matching_variants': len(true_pos),
           'num_true_negatives': 0,
           'sensitivity': 0,
           'MCC': 0,
           'small_panel_remainder_length': 0,
           'percent_small_panel_covered': 0,
           'num_false_positive': len(false_pos),
           'num_false_negative': {'indel': 0, 'no_coverage': 0, 'ev_of_alt': 0,
                                  'false_neg': 0, 'total': len(false_neg)},
           'num_mismatching_genotype': 0}
    all_results = {sample: out}

    with open(out_dir + sample + '_summary.json', 'w') as f:
        f.write(json.dumps(all_results, indent=4) + '\n')
sys.stderr.write('ERROR: Index out of range. geno: %s, out index: %s\n'%(geno, str(outGenoArrayIndex))) sys.exit(-1) outGenoArrayIndex = [] def setoutGenoArrayIndex(oldFormatTags): outGenoArrayIndex.clear() ss = oldFormatTags.upper().split(':') for x in tags: try: y = ss.index(x) outGenoArrayIndex.append(y) except ValueError: sys.stderr.write('ERROR: can not find tag: "%s", from input vcf FORMAT field.\n'%(x)) sys.exit(-1) infile = VariantFile('-', 'r') sys.stdout.write(str(infile.header)) for line in infile: ss = str(line).strip().split() out = ss[:vcfMetaCols] out[8] = otags #update tags genotyp tags info. setoutGenoArrayIndex(ss[8]) #Check format line by line. for x in ss[vcfMetaCols:]: #if not outGenoArrayIndex: # setoutGenoArrayIndex(ss[8]) out.append(reformat(x)) sys.stdout.write('%s\n'%('\t'.join(out))) infile.close() sys.stdout.flush()
def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate)

    False Negatives are split into categories to aid final comparison:
        * Zero coverage - No reads present
        * Evidence of alternate allele - Coverage or quality too low for variant call
        * Indels - Coverage is more difficult to obtain in these cases; currently they must be investigated by hand
        * All other false negatives - In these cases there are reads present and no evidence of the alternate allele

    A variant (chrom, pos, ref, alt) that occurs more than once is only
    reported the first time.

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Dictionary mapping each category ('indels', 'no_coverage',
        'evidence_of_alt', 'false_neg') to a list of variant dictionaries
    :rtype: Dict
    """
    false_negs = VariantFile(folder + '/0000.vcf')
    num_neg = len(list(false_negs.fetch()))
    print(num_neg)

    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [], 'false_neg': []}
    seen = set()
    count = 0

    if num_neg > 0:
        print('false negatives')
        for rec in false_negs.fetch():
            print(rec.samples)
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            qual = rec.qual
            genotype = rec.samples[ref_sample]['GT']

            key = (chrom, pos, ref, alt)
            if key in seen:
                print("duplicate")
                continue
            # Remember the variant so that a later repeat is recognised.
            seen.add(key)
            count += 1

            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt,
                       'QUAL': qual, 'GT': genotype}

            if not (len(ref) == 1 and len(alt) == 1):
                variant['coverage'] = {'total': 'indel: no coverage could be obtained',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            search = '\'' + rec.contig + r'\s' + str(rec.pos - 1) + '\''
            command = 'grep ' + search + ' ' + coverage_file
            try:
                # Text mode: without it the output is bytes on Python 3 and
                # never compares equal to ''.
                line = subprocess.check_output(command, shell=True, universal_newlines=True)
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:
                    print(command)
                    print('Error executing command: ' + str(e.returncode))
                    exit(1)
                # grep exits with 1 when nothing matched: no coverage line.
                line = ''

            if line == '':
                variant['coverage'] = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            # Column index of the per-base count in the coverage line.
            bases = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
            fields = line.split()
            cov = fields[2]
            ref_cov = fields[bases[ref]]
            alt_cov = fields[bases[alt]]
            variant['coverage'] = {'total': cov, 'ref': ref_cov, 'alt': alt_cov}
            if int(cov) == 0:
                variants['no_coverage'].append(variant)
            elif int(alt_cov) != 0:
                variants['evidence_of_alt'].append(variant)
            else:
                variants['false_neg'].append(variant)
    else:
        print('no false negatives')

    print("false_negatives=" + str(count))
    return variants
def main(argv):
    """
    Command line entry point of ``svtk standardize``.

    Parses ``argv``, builds the output header from a packaged template (with
    the contigs of ``--contigs`` if given, otherwise the GRCh37 template),
    tags the source algorithm in the header and writes every record produced
    by the standardizer to the output VCF. Prints the help text and exits
    with status 1 when ``argv`` is empty.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Positional arguments
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source',
                        help='Source algorithm. [delly,lumpy,manta,wham,melt]')

    # Options
    parser.add_argument('-p', '--prefix',
                        help='If provided, variant names will be overwritten '
                             'with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False,
                        help='Include records where all samples are called '
                             '0/0 or ./.')
    parser.add_argument('--standardizer',
                        help='Path to python file with custom standardizer '
                             'definition. (Not yet supported.)')
    parser.add_argument('--contigs', type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                             'contigs in index will be used in VCF header. '
                             'Otherwise all GRCh37 contigs will be used in '
                             'header. Variants on contigs not in provided '
                             'list will be removed.')
    parser.add_argument('--min-size', type=int, default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites', action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                             'useful when an algorithm has been run on a '
                             'single sample and has only reported variant '
                             'sites.')
    parser.add_argument('--sample-names', type=str, default=None,
                        help='Comma-delimited list of sample names to use in '
                             'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.contigs:
        # Add contigs to header if provided: the first two columns of every
        # index line are taken as contig name and length.
        template_path = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template_path)
        header = template.header
        for line in args.contigs:
            contig, length = line.split()[:2]
            header.add_line(
                '##contig=<ID={contig},length={length}>'.format(
                    contig=contig, length=length))
    else:
        # Use GRCh37 by default
        template_path = pkg_resources.resource_filename(
            'svtk', 'data/GRCh37_template.vcf')
        template = VariantFile(template_path)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Parse new sample names if provided
    sample_names_list = (args.sample_names.split(',')
                         if args.sample_names else vcf.header.samples)

    # Tag source in header
    format_line = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    header.add_line(format_line.format(args.source, args.source.capitalize()))
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)
    standardizer = VCFStandardizer.create(
        args.source, vcf, fout, sample_names_list, args.prefix,
        args.min_size, args.include_reference_sites, args.call_null_sites)

    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()
def _genotype_coverage_row(coverage_file, chrom, pos0):
    """Return the split columns of the coverage row for one base, or None.

    Scans ``coverage_file`` for the first row whose first two
    whitespace-separated columns are exactly ``chrom`` and ``pos0``.

    NOTE(review): assumes contig and position are the first two columns of
    the coverage file (the caller reads total depth from column 2 and
    A/C/G/T depths from columns 3-6) -- confirm against the file format.

    :param coverage_file: Path to the per-base coverage file
    :type coverage_file: String
    :param chrom: Contig name to look for
    :type chrom: String
    :param pos0: Position to look for, as stored in the coverage file
    :type pos0: Int
    :return: List of column strings, or None when no row matches
    :rtype: List
    """
    wanted = str(pos0)
    with open(coverage_file) as handle:
        for row in handle:
            fields = row.split()
            if len(fields) > 1 and fields[0] == chrom and fields[1] == wanted:
                return fields
    return None


def _genotypes_match(sample_gt, giab_gt):
    """Return True when the two genotype tuples count as the same call.

    Genotypes match when they are equal, or when they hold the same pair of
    alleles in swapped order for the specific patterns tested below.
    """
    if sample_gt == giab_gt:
        return True
    if ((sample_gt[0] is None or sample_gt[0] == 1)
            and sample_gt[0] == giab_gt[1] and sample_gt[1] == giab_gt[0]):
        return True
    if (sample_gt[0] == 0 and sample_gt[1] == 1
            and giab_gt[0] == 1 and giab_gt[1] == 0):
        return True
    return (sample_gt[0] == 1 and sample_gt[1] == 0
            and giab_gt[0] == 0 and giab_gt[1] == 1)


def check_genotype(folder, sample, ref_sample, coverage_file):
    """
    Compares the genotype for all shared variants

    The number of matching variants are counted and those that do not match are annotated with basic variant info
    plus quality, genotype, coverage (total, ref base and alt base if appropriate)

    Coverage is looked up by reading ``coverage_file`` directly and matching contig and position exactly.
    A position that is absent from the file is reported as 'no coverage information'.

    :param folder: Location of results from the NGS analysis pipeline
    :type folder: String
    :param sample: Sample number (used in vcf file)
    :type sample: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing coverage information for each position in the panel
    :type coverage_file: String
    :return: Number of matching variants
    :rtype: Int
    :return: List of variant dictionaries with detailed information for mismatching genotypes
    :rtype: List
    """
    base_columns = {'A': 3, 'C': 4, 'G': 5, 'T': 6}

    # Reference genotypes keyed by (contig, position, alleles); the first
    # record seen for a key wins.
    vars_giab = {}
    shared_giab = VariantFile(folder + '/0002.vcf')
    try:
        for rec in shared_giab.fetch():
            key = (rec.contig, rec.pos, rec.alleles)
            if key not in vars_giab:
                vars_giab[key] = rec.samples[ref_sample]['GT']
    finally:
        shared_giab.close()

    variants = []
    matching = 0
    shared_patient = VariantFile(folder + '/0003.vcf')
    try:
        for rec in shared_patient.fetch():
            chrom = rec.contig
            pos = rec.pos
            alleles = rec.alleles
            call = rec.samples[sample]
            sample_gt = call['GT']
            giab_genotype = vars_giab[(chrom, pos, alleles)]

            if _genotypes_match(sample_gt, giab_genotype):
                matching += 1
                continue

            format_keys = call.keys()
            allelic_depth = call['AD'] if 'AD' in format_keys else 'N/A'
            if 'DP' in format_keys:
                total_depth = call['DP']
            elif 'NR' in format_keys:
                total_depth = call['NR']
            else:
                total_depth = 0

            if len(alleles[0]) == 1 and len(alleles[1]) == 1:
                try:
                    # The coverage file is looked up at VCF position minus one.
                    fields = _genotype_coverage_row(coverage_file, chrom, pos - 1)
                except (IOError, OSError) as e:
                    print('Error reading coverage file: ' + str(e))
                    raise SystemExit(1)
                if fields is None:
                    coverage = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                else:
                    coverage = {'total': fields[2],
                                'ref': fields[base_columns[alleles[0]]],
                                'alt': fields[base_columns[alleles[1]]]}
            else:
                coverage = {'total': 'indel: no coverage could be obtained', 'ref': 'N/A', 'alt': 'N/A'}

            variants.append({'chrom': chrom, 'pos': pos, 'ref': alleles[0], 'alt': alleles[1],
                             'QUAL': rec.qual,
                             'GT': {"sample": sample_gt, 'GIAB': giab_genotype},
                             'vcf_depth': {'DP': total_depth, 'AD': allelic_depth},
                             'coverage': coverage})
    finally:
        shared_patient.close()

    print(str(matching) + ' matching variants')
    return matching, variants
class masking(object):
    """Rewrite reads of a BAM so that known variants show the reference base.

    The variants are read from a VCF once at construction time.
    ``maskAllVariants`` then streams the BAM, replaces the alt base with the
    ref base wherever a read carries a single-alt variant, and writes every
    read to a new BAM.
    """

    # Class-level defaults are kept so that attribute access on the class
    # itself keeps working; __init__ rebinds each of them on the instance.
    Bampath = ""
    variantfile = None
    Vcfpath = ""
    Variants = []

    def __init__(self, bampath, vcfpath):
        """Open the BAM and VCF and load all variant records.

        :param bampath: path to the input BAM file
        :param vcfpath: path to the VCF file holding the variants to mask
        """
        self.Bampath = bampath
        self.Bamfile = pysam.AlignmentFile(bampath, "rb")
        self.Vcfpath = vcfpath
        self.variantfile = VariantFile(self.Vcfpath)
        # Per-instance list: appending to the class-level list would leak
        # variants from one instance into every other one.
        self.Variants = []
        self.populateVarlist()

    def populateVarlist(self):
        """Append every record of the VCF to ``self.Variants``."""
        for rec in self.variantfile.fetch():
            self.Variants.append(rec)

    def printMaskVars(self):
        """Print the loaded variants followed by the BAM and VCF paths."""
        for v in self.Variants:
            print(v.chrom, v.pos, v.ref, v.alts)
        print("Bampath :", self.Bampath)
        print("Vcfpath:", self.Vcfpath)

    def maskVariant(self, varRec, bamAlign):
        """Replace the alt base of ``varRec`` in ``bamAlign`` with the ref base.

        The read is modified in place. Only variants with exactly one alt
        allele are considered.

        :param varRec: VariantRecord to mask
        :param bamAlign: AlignedSegment to inspect
        :return: tuple ``(bamAlign, changed)`` where ``changed`` is True only
            when the read sequence was rewritten
        """
        if not self.doesOverlap(varRec, bamAlign):
            return (bamAlign, False)
        if varRec.alts is None or len(varRec.alts) != 1:
            return (bamAlign, False)

        print("found the overlap with variant")
        queryBases = bamAlign.query_sequence
        if queryBases is None:
            return (bamAlign, False)

        # Map the variant to an index in the read through the alignment
        # itself, so soft clips, insertions and deletions are honoured.
        # VariantRecord.pos is 1-based; the alignment positions are 0-based.
        refPositions = bamAlign.get_reference_positions(full_length=True)
        try:
            AlIndex = refPositions.index(varRec.pos - 1)
        except ValueError:
            # The variant position falls in a deleted/skipped stretch.
            return (bamAlign, False)

        if queryBases[AlIndex] == varRec.ref:
            return (bamAlign, False)
        if queryBases[AlIndex] == varRec.alts[0]:
            queryBases = self.replaceChar(queryBases, varRec.ref, AlIndex)
            print("queryBas", queryBases)
            # pysam drops the base qualities when query_sequence is
            # assigned, so save them first and put them back afterwards.
            qualities = bamAlign.query_qualities
            bamAlign.query_sequence = queryBases
            bamAlign.query_qualities = qualities
            return (bamAlign, True)

        print("Unhandle case for maskvariant")
        return (bamAlign, False)

    def replaceChar(self, bamSeq, ref, index):
        """Return ``bamSeq`` with ``ref`` written over it starting at ``index``."""
        bamSeq = bamSeq[:index] + ref + bamSeq[index + len(ref):]
        return bamSeq

    def doesOverlap(self, varRec, bamAlign):
        """Return True when the variant position lies inside the alignment.

        The read must be mapped to the same contig as the variant. The
        alignment covers 0-based ``[start, end)``, which is 1-based
        ``start + 1 .. end``; ``varRec.pos`` is 1-based.

        :param varRec: VariantRecord
        :param bamAlign: AlignedSegment
        :return: boolean
        """
        bamEndPos = bamAlign.reference_end
        if bamEndPos is None:
            # Unmapped read: it has no reference span.
            return False
        if varRec.chrom != bamAlign.reference_name:
            return False
        bamStartPos = bamEndPos - bamAlign.reference_length
        return bamStartPos < varRec.pos <= bamEndPos

    def maskAllVariants(self, outpath="masked.norm.bam"):
        """Mask every loaded variant in every read and write the result.

        :param outpath: path of the BAM to write; defaults to the historical
            ``masked.norm.bam`` in the current directory
        """
        masked_bam = pysam.AlignmentFile(outpath, "wb", template=self.Bamfile)
        try:
            for read in self.Bamfile.fetch():
                # maskVariant edits the read in place, so the same object is
                # written whether or not anything changed.
                for v in self.Variants:
                    self.maskVariant(v, read)
                masked_bam.write(read)
        finally:
            masked_bam.close()
        return
#metavar='File', help='Filtered vcffile', type=str, required=True) parser.add_argument('-af', '--minAF', help='Minimal allele frequency to output', default=0.25, type=float, required=False) args = parser.parse_args() if __name__ == '__main__': vcffile = VariantFile(args.infile) with open(args.outfile, 'w') as outfile: filtered_variant_dict = {} for rec in vcffile: #Skip minor allele variants if rec.info['MAJOR'] == 0: continue #Skip positions with an allele frequency below threshold if rec.info['AF'] < args.minAF: continue #Ignore out-of-frame indels if not (len(rec.alleles[0]) - len(rec.alleles[1])) % 3 == 0: continue #Add record when it passes all filters
# parser.add_argument('-y', '--pop_2', required=False, dest='pop2', help="Samples belonging to population 2") args = parser.parse_args() if args.bed and args.exclude: sys.exit('\nCan not use the -e and -b options together\n') if args.wwss and not args.bed: sys.exit('\nNeed to specify a bed file of regions with -b option\n') if args.bed and not args.min_sites: sys.exit('\nThe --min option must be specified with -b option\n') vcf_infile = VariantFile(args.vcf_infile) sample_num = len(vcf_infile.header.samples) if args.ploidy == 2: n = 2 * sample_num elif args.ploidy == 1: n = sample_num else: sys.exit("\nSpecify ploidy as 1 or 2\n") if args.sfs: sfs = np.zeros((n // 2) + 1, dtype=int) if args.exclude: af = []
def vcf_to_ref(
    outfile,
    vcf_file,
    rec_file,
    pop2sample,
    random_read_samples=(),
    pos_id="Physical_Pos",
    map_ids=("AA_Map",),
    default_map="AA_Map",
    rec_rate=1e-8,
    chroms=None,
    bed=None,
    lax_alleles=False,
):
    """Write per-population ref/alt allele counts from a VCF to an xz CSV.

    One row is written per retained site: ``chrom,pos,ref,alt``, the genetic
    map position(s), then one count column per population and entry of the
    module-level ``EXT``.

    Sites are skipped when they have no alt allele or more than two alts, or
    when two alts are present and the genotypes carry both or neither of
    them. Unless ``lax_alleles`` is set, sites whose retained alt is not a
    single A/C/G/T base are skipped too.

    :param outfile: path of the xz-compressed CSV to write
    :param vcf_file: VCF path; may contain a ``{CHROM}`` placeholder
    :param rec_file: space-separated recombination map (may contain a
        ``{CHROM}`` placeholder), or None to use ``pos * rec_rate``
    :param pop2sample: mapping of population name to sample names
    :param random_read_samples: samples for which only the first genotype
        allele is counted
    :param pos_id: name of the position column in the recombination map
    :param map_ids: names of the map columns to interpolate and write
    :param default_map: map column that is copied to the ``map`` column
    :param rec_rate: per-bp rate used when ``rec_file`` is None
    :param chroms: chromosomes to process; None takes all contigs from the
        VCF header, anything else is passed through ``parse_chroms``
    :param bed: currently unused
    :param lax_alleles: keep sites whose alt allele is not a single A/C/G/T
    """
    pprint(pop2sample)

    if chroms is None:
        vcf_first = vcf_file
    else:
        vcf_first = vcf_file.format(CHROM=chroms[0])

    # get chromosomes and the samples that are actually present in the VCF
    with VariantFile(vcf_first) as vcf:
        if chroms is None:
            chroms = [i for i in vcf.header.contigs]
        else:
            chroms = parse_chroms(chroms)
        log_.info("chroms found: %s", chroms)

        sample2pop = defaultdict(list)
        for pop, v in pop2sample.items():
            for sample in v:
                if sample in vcf.header.samples:
                    sample2pop[sample].append(pop)

    samples = sample2pop.keys()
    pops = set(pop for s, v in sample2pop.items() for pop in v)
    pprint(sample2pop)
    pprint(pops)

    # A fresh list, so neither the default nor the caller's list is touched.
    map_ids = ["map"] + list(map_ids)
    data_cols = [f"{p}_{e}" for p in pops for e in EXT]
    random_read_samples = frozenset(random_read_samples)

    with lzma.open(outfile, "wt") as ref:
        ref.write("chrom,pos,ref,alt,")
        if rec_file is None:
            ref.write("map,")
        else:
            ref.write(",".join(map_ids))
            ref.write(",")
        ref.write(",".join(data_cols))
        ref.write("\n")

        for chrom in chroms:
            # set up rec file
            if rec_file is not None:
                rec = pd.read_csv(rec_file.format(CHROM=chrom), sep=" ")
                if "chrom" in rec:
                    # copy so adding the column below does not write into a
                    # view of the unfiltered frame
                    rec = rec[rec.chrom == chrom].copy()
                rec["map"] = rec[default_map]
                rec = rec[[pos_id, *map_ids]]
                rec_iter = rec.iterrows()
                R0 = next(rec_iter)[1]
                # A map with a single row has no second anchor; the lookup
                # below already treats R1 None as "past the last anchor".
                R1 = next(rec_iter, (None, None))[1]

            # skip chrom if empty
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                try:
                    next(vcf)
                except StopIteration:
                    continue

            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                vcf.subset_samples(samples)
                for row in vcf.fetch(chrom):
                    alt_ix = 0

                    if len(row.alleles) <= 1 or len(row.alleles) > 3:
                        continue

                    if len(row.alleles) == 3:
                        alleles = [i for v in row.samples.values() for i in v["GT"]]
                        if 3 in alleles:
                            continue
                        has_alt1 = 1 in alleles
                        has_alt2 = 2 in alleles
                        # keep the site only when exactly one of the two
                        # alts is carried by the selected samples
                        if has_alt1 == has_alt2:
                            continue
                        alt_ix = 0 if has_alt1 else 1
                        log_.debug(f"{row.chrom}, {row.pos}, {row.alleles}, {Counter(alleles)}")

                    alt = row.alts[alt_ix]
                    # Strict mode keeps single-base A/C/G/T alts only. The
                    # length test is needed because `in "ACGT"` alone is a
                    # substring test that multi-base alts like "CG" pass.
                    if not lax_alleles and (len(alt) != 1 or alt not in "ACGT"):
                        continue

                    D = defaultdict(int)

                    # rec stuff
                    if rec_file is None:
                        map_ = row.pos * rec_rate
                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{alt},{map_},"
                        )
                    else:
                        # advance the anchors until row.pos <= R1 or the map
                        # is exhausted
                        if R1 is not None and row.pos > R1[pos_id]:
                            try:
                                while row.pos > R1[pos_id]:
                                    R0, R1 = R1, next(rec_iter)[1]
                            except StopIteration:
                                R0, R1 = R1, None

                        if R1 is None or row.pos <= R0[pos_id]:
                            # outside the anchored range: clamp to R0
                            map_ = R0[map_ids]
                        else:
                            # linear interpolation between R0 and R1; slope
                            # is already per bp, so it is multiplied by the
                            # bp offset only
                            slope = (R1[map_ids] - R0[map_ids]) / (
                                R1[pos_id] - R0[pos_id]
                            )
                            map_ = R0[map_ids] + slope * (row.pos - R0[pos_id])

                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{alt},"
                        )
                        map_str = ",".join((str(m) for m in map_))
                        ref.write(f"{map_str},")

                    for s, call in row.samples.items():
                        genotype = call["GT"]
                        if s in random_read_samples:
                            observed = (genotype[0],)
                        else:
                            observed = genotype
                        for allele in observed:
                            if allele is not None:
                                for pop in sample2pop[s]:
                                    D[f"{pop}_{EXT[allele > 0]}"] += 1

                    ref.write(",".join((str(D[c]) for c in data_cols)))
                    ref.write("\n")
def _false_neg_coverage_row(coverage_file, chrom, pos0):
    """Return the split columns of the coverage row for one base, or None.

    Scans ``coverage_file`` for the first row whose first two
    whitespace-separated columns are exactly ``chrom`` and ``pos0``.

    NOTE(review): assumes contig and position are the first two columns of
    the coverage file (the caller reads total depth from column 2 and
    A/C/G/T depths from columns 3-6) -- confirm against the file format.

    :param coverage_file: Path to the per-base coverage file
    :type coverage_file: String
    :param chrom: Contig name to look for
    :type chrom: String
    :param pos0: Position to look for, as stored in the coverage file
    :type pos0: Int
    :return: List of column strings, or None when no row matches
    :rtype: List
    """
    wanted = str(pos0)
    with open(coverage_file) as handle:
        for row in handle:
            fields = row.split()
            if len(fields) > 1 and fields[0] == chrom and fields[1] == wanted:
                return fields
    return None


def annotate_false_negs(folder, ref_sample, coverage_file):
    """
    Get information for any false negative results.

    Returns basic variant info plus quality, genotype, coverage (total, ref base and alt base if appropriate).
    Each variant is placed in exactly one bucket of the returned dictionary:

    * ``indels``          - ref or alt allele is longer than one base, no coverage lookup is done
    * ``no_coverage``     - position missing from the coverage file, or total coverage is zero
    * ``evidence_of_alt`` - position is covered and the alt base has non-zero coverage
    * ``false_neg``       - position is covered and the alt base has zero coverage

    :param folder: Folder containing output from bcftools isec
    :type folder: String
    :param ref_sample: Sample number for reference vcf
    :type ref_sample: String
    :param coverage_file: File containing per base coverage for the truth_regions panel
    :type coverage_file: String
    :return: Dictionary of lists of variant dictionaries containing information on false negatives
    :rtype: Dict
    """
    base_columns = {'A': 3, 'C': 4, 'G': 5, 'T': 6}
    variants = {'indels': [], 'no_coverage': [], 'evidence_of_alt': [], 'false_neg': []}

    false_negs = VariantFile(folder + '/0000.vcf')
    try:
        records = list(false_negs.fetch())
        num_neg = len(records)
        print(num_neg)

        if num_neg == 0:
            print('no false negatives')
            return variants

        print('false negatives')
        for rec in records:
            chrom = rec.contig
            pos = int(rec.pos)
            ref = rec.alleles[0]
            alt = rec.alleles[1]
            # Use the caller's reference sample; fall back to the sample name
            # that used to be hard-coded here so existing inputs keep working.
            sample_name = ref_sample if ref_sample in rec.samples else 'Venter.il_st'
            variant = {'chrom': chrom, 'pos': pos, 'ref': ref, 'alt': alt, 'QUAL': rec.qual,
                       'GT': rec.samples[sample_name]['GT']}

            if len(ref) != 1 or len(alt) != 1:
                variant['coverage'] = {'total': 'indel: no coverage could be obtained',
                                       'ref': 'N/A', 'alt': 'N/A'}
                variants['indels'].append(variant)
                continue

            try:
                # The coverage file is looked up at VCF position minus one.
                fields = _false_neg_coverage_row(coverage_file, chrom, pos - 1)
            except (IOError, OSError) as e:
                print('Error reading coverage file: ' + str(e))
                raise SystemExit(1)

            if fields is None:
                variant['coverage'] = {'total': 'no coverage information', 'ref': 'N/A', 'alt': 'N/A'}
                variants['no_coverage'].append(variant)
                continue

            cov = fields[2]
            ref_cov = fields[base_columns[ref]]
            alt_cov = fields[base_columns[alt]]
            variant['coverage'] = {'total': cov, 'ref': ref_cov, 'alt': alt_cov}

            # The columns are text; compare them as numbers, otherwise
            # '0' == 0 is always False and nothing is ever a false negative.
            if float(cov) == 0:
                variants['no_coverage'].append(variant)
            elif float(alt_cov) != 0:
                variants['evidence_of_alt'].append(variant)
            else:
                variants['false_neg'].append(variant)
    finally:
        false_negs.close()

    return variants
Do we want the record, a dictionary, both?? """ return(None) if __name__ == '__main__': parser = argparse.ArgumentParser(description = "Takes a list of input files? Or Idrectory...TBD") parser.add_argument("--input_file", default = "./101.bcf") parser.add_argument("--output_dir", default = "./extract_output/") parser.add_argument("--merge_strands", action = "store_true") args = parser.parse_args() infile = VariantFile("101.bcf") csv_out_name = args.input_file.replace('.bcf', '.csv') ofile = open(csv_out_name, "w") # Column names for ouptut writer = csv.writer(ofile) writer.writerow(["chr", "pos", "reference", "call", "methylated", "unmethylated", "strand"]) # The things in rec.format # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX # Iterator I = infile.fetch('chr1', 100000, 110000, threads=4) # Iterate two records at a time if merging... for rec1, rec2 in itertools.zip_longest(*[I]*2):
def main(argv):
    """Command-line entry point for ``svtools standardize``.

    Parses ``argv``, copies the samples of the input VCF onto the packaged
    template header, tags the source algorithm in that header and writes the
    standardized records to the output VCF. Records are written when
    ``any_called`` accepts them or ``--include-reference-sites`` is set; with
    ``--prefix`` the written records are renamed ``<prefix>_<n>`` with ``n``
    counting from 1.

    :param argv: Command-line arguments without the program name. An empty
        sequence prints the help text and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)

    cli_spec = (
        (('vcf',), {'help': 'Raw VCF.'}),
        (('fout',), {'help': 'Standardized VCF.'}),
        (('source',), {
            'help': 'Source algorithm. [delly,lumpy,manta,wham,melt]'}),
        (('-p', '--prefix'), {
            'help': 'If provided, variant names '
                    'will be overwritten with this prefix.'}),
        (('--include-reference-sites',), {
            'action': 'store_true',
            'default': False,
            'help': 'Include records where all '
                    'samples are called 0/0 or ./.'}),
        (('--standardizer',), {
            'help': 'Path to python file with '
                    'custom standardizer definition. (Not yet supported.)'}),
    )
    for flags, options in cli_spec:
        parser.add_argument(*flags, **options)

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template_path = pkg_resources.resource_filename(
        'svtools', 'data/standard_template.vcf')
    template = VariantFile(template_path)
    vcf = VariantFile(args.vcf)

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add samples from VCF being standardized
    header = template.header
    for sample_name in vcf.header.samples:
        header.add_sample(sample_name)

    # Tag source in header
    format_line = ('##FORMAT=<ID={0},Number=1,Type=Integer,'
                   'Description="Called by {1}">')
    header.add_line(format_line.format(args.source, args.source.capitalize()))
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout)
    next_id = 1
    for record in standardizer.standardize_vcf():
        if not (any_called(record) or args.include_reference_sites):
            continue
        if args.prefix is not None:
            record.id = '{0}_{1}'.format(args.prefix, next_id)
            next_id += 1
        fout.write(record)

    fout.close()
    vcf.close()
def run_process(opts, inputvcf):
    """Annotate each record with the population databases above a MAF cutoff.

    For every record the INFO keys listed in the module-level
    ``freq_check_list`` and the three ``esp6500_MAF`` values (given in
    percent) are compared with ``opts.popfreq``. The names of the databases
    at or above the cutoff are written to ``ngb_popmaf_snp_db_list`` (``.``
    when there are none), their number to ``ngb_popmaf_snp_db_cnt``, and two
    Y/N flags record whether an East Asian ("EAS") or Korean ("KRGDB" /
    "KoEXID") database is among them.

    :param opts: options object; ``opts.output`` is the output path (falsy
        means standard output) and ``opts.popfreq`` the MAF cutoff
    :param inputvcf: path of the VCF to annotate
    """
    outputvcf = opts.output
    popfreq = float(opts.popfreq)
    # Failures that just mean "this record has no usable value here".
    unusable = (KeyError, IndexError, TypeError, ValueError)

    # Open VCF
    vcf_in = VariantFile(inputvcf)

    # Add INFO to Header
    header_info = vcf_in.header.info
    header_info.add("ngb_popmaf_snp_db_cnt", ".", "Integer", "Population Database Count above setting MAF")
    header_info.add("ngb_popmaf_snp_db_list", ".", "String", "Population Database List above setting MAF")
    header_info.add("ngb_popmaf_snp_db_eastasian", ".", "String", "East Asian Exist Flag above setting MAF")
    header_info.add("ngb_popmaf_snp_db_korean", ".", "String", "Korean Exist Flag above setting MAF")

    # Write VCF
    vcf_out = VariantFile(outputvcf if outputvcf else '-', 'w', header=vcf_in.header)
    try:
        for record in vcf_in.fetch():
            record_data = OrderedDict()

            # Check Population MAF
            for key in freq_check_list:
                try:
                    value = record.info[key]
                    if isinstance(value, (list, tuple)):
                        value = value[0]
                    freq = float(value)
                except unusable:
                    continue
                if freq >= popfreq:
                    record_data[key] = freq

            # Check ESP6500; the first unusable value ends the check and
            # keeps whatever was recorded before it.
            try:
                value_list = record.info['esp6500_MAF']
                if float(value_list[2]) / 100 >= popfreq:
                    record_data['esp6500_MAF_ALL'] = float(value_list[2]) / 100
                if float(value_list[1]) / 100 >= popfreq:
                    record_data['esp6500_MAF_AA'] = float(value_list[1]) / 100
                if float(value_list[0]) / 100 >= popfreq:
                    record_data['esp6500_MAF_EA'] = float(value_list[0]) / 100
            except unusable:
                pass

            filtered_db_list = '|'.join(record_data)
            if filtered_db_list == '':
                filtered_db_list = '.'

            record.info['ngb_popmaf_snp_db_list'] = filtered_db_list
            record.info['ngb_popmaf_snp_db_cnt'] = len(record_data)

            # Substring tests on the joined names, e.g. any key containing "EAS"
            if "EAS" in filtered_db_list:
                record.info['ngb_popmaf_snp_db_eastasian'] = 'Y'
            else:
                record.info['ngb_popmaf_snp_db_eastasian'] = 'N'

            if ("KRGDB" in filtered_db_list) or ("KoEXID" in filtered_db_list):
                record.info['ngb_popmaf_snp_db_korean'] = 'Y'
            else:
                record.info['ngb_popmaf_snp_db_korean'] = 'N'

            # Write VCF
            vcf_out.write(record)
    finally:
        # Close explicitly so the output is flushed even when a record fails.
        vcf_out.close()
        vcf_in.close()
def gen_report(vcf, c, ref_flag):
    """Write the prioritized-impact report for one VEP-annotated Strelka VCF.

    The report is a tab-separated ``.xls`` file named after the first
    dot-separated part of the VCF file name. A file name containing
    ``indel`` produces the indel report; anything else produces the snv
    report, which additionally carries the normal/tumor ref and alt counts.
    Progress is logged under ``LOGS/``.

    :param vcf: path of the annotated VCF
    :param c: target file used to mark records as on/off target, or ``'n'``
        for none; records marked ``'OFF'`` are left out of the report
    :param ref_flag: argument for ``create_index``, or ``'n'`` to skip it
    :return: 0
    """
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.snv.strelka.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')

    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')

    call_type = 'indel' if re.search('indel', fn) else 'snv'
    report_name = parts[0] + '.' + call_type + '.strelka.vep.prioritized_impact.report.xls'

    vcf_in = VariantFile(vcf)
    try:
        with open(report_name, 'w') as out:
            # Position of each wanted field inside a '|'-separated ANN entry,
            # read from the ANN description in the VCF header.
            desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0,
                       'Amino_acids': 0, 'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0,
                       'VARIANT_CLASS': 0}
            desc_string = vcf_in.header.info['ANN'].record['Description']
            desc_string = desc_string.strip('"')
            desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
            for i, field_name in enumerate(desc_string.split('|')):
                if field_name in desired:
                    desired[field_name] = i

            if call_type == 'snv':
                out.write('chr\tpos\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
                          'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\t'
                          'variant_class_effect\teffect\timpact\tbiotype\tcodon_change\tamino_acid_change\ton/off-target\n')
            else:
                out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact\t'
                          'biotype\tcodon_change\tamino_acid_change\ton/off-target\n')

            if ref_flag != 'n':
                ref_flag = create_index(ref_flag)

            for record in vcf_in.fetch():
                (chrom, pos, ref, alt) = (record.contig, str(record.pos), record.ref, record.alts[0])

                # dict contains what's different between strelka indel and snv reports
                if call_type == 'snv':
                    normal = record.samples['NORMAL']
                    tumor = record.samples['TUMOR']
                    not_shared = {'norm_ref_ct': normal[(record.ref + 'U')][0],
                                  'norm_alt_ct': normal[(record.alts[0] + 'U')][0],
                                  'tum_ref_ct': tumor[(record.ref + 'U')][0],
                                  'tum_alt_ct': tumor[(record.alts[0] + 'U')][0]}
                else:
                    not_shared = {}

                # ANN is a single comma-joined string or already a sequence
                # of entries, depending on how the header declares it.
                raw_ann = record.info['ANN']
                if hasattr(raw_ann, 'split'):
                    raw_ann = raw_ann.split(',')
                ann_list = [_.split('|') for _ in raw_ann]

                tflag = 'NA'
                if c != 'n':
                    tflag = mark_target(chrom, pos, on_dict)
                    # only outputting ON TARGET hits
                    if tflag == 'OFF':
                        continue
                output_highest_impact(chrom, pos, ref, alt, not_shared, ann_list, desired, tflag, out, ref_flag,
                                      call_type)
    finally:
        vcf_in.close()

    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0