def makeHeader(self):
    """Assemble the VCF header used for pbAA output.

    Registers the FILTER, INFO and FORMAT definitions, one contig per
    entry of ``self.reference``, the command line returned by
    ``self._getCommandLine()`` and every name in ``self.samples``.

    Returns
    -------
    pysam.VariantHeader
        The populated header.
    """
    hdr = pysam.VariantHeader()

    # FILTER definitions: (id, description)
    filter_defs = (
        ('PBAAFAIL', 'Consensus failed pbAA filters'),
        ('HP', 'Homopolymer length variant'),
    )
    for filter_id, description in filter_defs:
        hdr.filters.add(filter_id, None, None, description)

    # INFO definitions: (id, number, type, description)
    info_defs = (
        ('NS', 1, 'Integer', 'Number of samples with data'),
        ('AF', 'A', 'Float', 'Allele frequency'),
    )
    for definition in info_defs:
        hdr.info.add(*definition)

    # FORMAT definitions, registered in this exact order
    format_defs = (
        ('GT', 1, 'String', 'Genotype '),
        ('DP', 1, 'Integer', "Read depth"),
        ('FT', '.', 'String', 'pbAA filter'),
        ('AQ', '.', 'Float', "pbAA mean cluster quality"),
        ('AD', '.', 'Integer', "Reads supporting each alt call"),
        ('VAF', '.', 'Float', "pbAA cluster frequency"),
        ('TG', '.', 'String', 'pbAA guide'),
        ('HP', '.', 'Integer', "pbAA cluster identifier"),
        ('DV', '.', 'Float', "pbAA diversity score"),
        ('CH', '.', 'Float', "pbAA chimera score"),
    )
    for definition in format_defs:
        hdr.formats.add(*definition)

    # One contig line per reference sequence, with its length
    reference = self.reference
    for name, size in zip(reference.references, reference.lengths):
        hdr.contigs.add(name, length=size)

    hdr.add_meta('commandline', value=self._getCommandLine())

    for sample_name in self.samples:
        hdr.add_sample(sample_name)
    return hdr
def build_header(
    reader: VariantFileT,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    reference_name: str,
) -> VcfHeaderT:
    """
    Build the GDC-formatted header from the input VCF and user arguments.

    Every header record of ``reader`` is carried over except ``fileDate``,
    ``fileformat`` and ``reference``; GDC metadata lines are appended after
    them, and the sample columns of ``reader`` are re-added last.
    """
    # Keys that are regenerated below and therefore dropped from the input
    regenerated_keys = ("fileDate", "fileformat", "reference")
    header_lines = [
        str(rec)
        for rec in reader.header.records
        if rec.key not in regenerated_keys
    ]

    # GDC specific metadata
    today = datetime.date.today().strftime("%Y%m%d")
    normal_line = "##SAMPLE=<ID=NORMAL,NAME={0},ALIQUOT_ID={1},BAM_ID={2}>".format(
        normal_barcode, normal_aliquot_uuid, normal_bam_uuid
    )
    tumor_line = "##SAMPLE=<ID=TUMOR,NAME={0},ALIQUOT_ID={1},BAM_ID={2}>".format(
        tumor_barcode, tumor_aliquot_uuid, tumor_bam_uuid
    )
    header_lines += [
        "##fileDate={0}".format(today),
        '##center="NCI Genomic Data Commons (GDC)"',
        "##reference={0}".format(reference_name),
        "##INDIVIDUAL=<NAME={0},ID={1}>".format(patient_barcode, case_id),
        normal_line,
        tumor_line,
    ]

    # Populate a fresh header object line by line, then the sample columns
    gdc_header = pysam.VariantHeader()
    for text in header_lines:
        gdc_header.add_line(text)
    for sample_name in reader.header.samples:
        gdc_header.add_sample(sample_name)

    return gdc_header
def get_header(old_header: VariantHeaderT) -> VariantHeaderT:
    """
    Creates a new header with the new elements that will be handled in this
    tool.

    Records of ``old_header`` are copied with these changes:

    * an INFO ``forcedHet`` flag definition is inserted just before the
      first INFO record (or appended at the end when ``old_header`` has no
      INFO records at all, so the definition is never missing);
    * the INFO record with ID ``SVTYPE`` is re-added under the ID
      ``TYPEOFSV``, without its ``IDX`` item and with double quotes
      stripped from its values;
    * a generic ``center`` record whose value is ``""`` is dropped.

    The sample columns of ``old_header`` are then added in order.
    """
    forced_het_items = [
        ("ID", "forcedHet"),
        ("Number", 0),
        ("Type", "Flag"),
        (
            "Description",
            "The original homozygous-reference call "
            "was converted to heterozygous-alt.",
        ),
    ]

    header = pysam.VariantHeader()
    added_flag = False
    for record in old_header.records:
        if record.type == "INFO":
            if not added_flag:
                header.add_meta("INFO", items=forced_het_items)
                added_flag = True
            if record.get("ID", "") == "SVTYPE":
                # Rename SVTYPE -> TYPEOFSV; IDX is skipped and quotes
                # are removed from the remaining values.
                curr = [
                    (k, "TYPEOFSV") if k == "ID" else (k, v.replace('"', ""))
                    for k, v in record.items()
                    if k != "IDX"
                ]
                header.add_meta(record.key, items=curr)
            else:
                header.add_record(record)
        elif (
            record.type == "GENERIC"
            and record.key == "center"
            and record.value == '""'
        ):
            # Drop the empty center line
            continue
        else:
            header.add_record(record)

    # The input may have no INFO records; still declare the flag.
    if not added_flag:
        header.add_meta("INFO", items=forced_het_items)

    for sample in old_header.samples:
        header.add_sample(sample)
    return header
def _get_test_vcf_header(meta=None, samples=None):
    """Return a ``GenericVcfObject`` wrapping a freshly built header.

    ``meta`` is one dict of ``add_meta`` keyword arguments or a list of
    such dicts; ``samples`` is one sample name or a list of names. Falsy
    values for either argument add nothing.
    """
    header = pysam.VariantHeader()

    if meta:
        # Treat a single dict the same as a one-element list
        meta_entries = meta if isinstance(meta, list) else [meta]
        for kwargs in meta_entries:
            header.add_meta(**kwargs)

    if samples:
        sample_names = samples if isinstance(samples, list) else [samples]
        for name in sample_names:
            header.add_sample(name)

    return GenericVcfObject(header=header)
def write_vcf(call_data, output_file, verboseness=0):
    """
    Write out variant calling data to a VCF file.

    By default only writes entries where something else than the reference
    has been observed.

    Parameters
    ----------
    call_data : VariantCallData
    output_file : file
    verboseness : int
        Determines the verboseness of the VCF. Higher value will result
        in more entries in the VCF. Accepted values:

        0: Only output StrainGR strong SNPs
        1: Output strong and weak SNPs
        2: Output an entry for every position in the genome, even if nothing
           else but the reference is observed.
    """
    scaffolds = call_data.scaffolds_data.values()

    contig_lengths = [
        f"##contig=<ID={scaffold.name},length={scaffold.length}>"
        for scaffold in scaffolds
    ]

    header = pysam.VariantHeader()
    header_str = VCF_TEMPLATE.format(date=datetime.now(),
                                     ref=call_data.reference_fasta,
                                     contig_lengths="\n".join(contig_lengths))

    for line in header_str.split('\n'):
        header.add_line(line)

    header.add_sample("straingr")

    # The context manager closes (and flushes) the writer even when
    # record generation fails part-way through.
    with pysam.VariantFile(output_file, 'w', header=header) as vcf_writer:
        # Records are produced lazily, so they must be consumed while the
        # writer is still open.
        record_iter = itertools.chain.from_iterable(
            vcf_records_for_scaffold(vcf_writer, scaffold, verboseness)
            for scaffold in scaffolds)

        for record in record_iter:
            vcf_writer.write(record)
def create_vcf_file(path, sample):
    """
    Creates VCF header and Variant File.
    Writes VCF header in Variant File and returns it.

    Contig lines are taken from the first two tab-separated columns of the
    hard-coded index file ``test_data/human_g1k_v37_decoy.fasta.fai``.

    Parameters
    ----------
    path: str
        Name and path of an output vcf file, for example output/out.vcf
    sample: str
        Name of a sample to add to the VCF file

    Returns
    -------
    pysam.VariantFile
        Created VCF file with header written in it. The caller is
        responsible for closing it.
    """
    vcf_header = pysam.VariantHeader()
    vcf_header.add_sample(sample)

    current_time = datetime.datetime.now()
    date = current_time.strftime('%Y%m%d')
    vcf_header.add_line('##fileDate=' + date)
    vcf_header.add_line('##source=Ema&Nikola')

    # ``with`` guarantees the index file is closed even if a contig line
    # is rejected by the header.
    with open("test_data/human_g1k_v37_decoy.fasta.fai") as faifile:
        for line in faifile:
            split_line = line.split("\t")
            contig = '##contig=<ID=' + str(split_line[0]) + ', length=' + str(
                split_line[1]) + '>'
            vcf_header.add_line(contig)

    vcf_header.add_line(
        "##ALT=<ID=*,Description=Different allele than referent.>")
    vcf_header.add_line(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=Genotype>")
    vcf_header.add_line(
        "##FORMAT=<ID=VAF,Number=1,Type=String,Description=Variant allele frequency>"
    )

    vcf = pysam.VariantFile(path, 'w', header=vcf_header)
    return vcf
def _compile_common_header(self, varcall_template, no_filters=False):
    """Build a corrected header on top of the one found in a template VCF.

    The new header gets the reference, the reference contigs, optionally
    the filter definitions (skipped when ``no_filters`` is true), a
    ``source`` entry and the INDEL INFO flag. The samples of the header
    read from ``varcall_template`` are then added and that header is
    merged in.
    """
    fixed = pysam.VariantHeader()

    # reference and contig information
    fixed.add_meta('reference', value=self.ref_genome)
    self._add_ref_contigs_to_header(fixed)

    # filter definitions are optional
    if not no_filters:
        self._add_filters_to_header(fixed)

    # replace the source information
    fixed.add_meta('source', value='varscan.py')

    # INDEL flag for record INFO fields
    self._add_indel_info_flag_to_header(fixed)

    # everything else comes from the template header
    with pysam.VariantFile(varcall_template, 'r') as template:
        template_header = template.header
    for sample_name in template_header.samples:
        fixed.samples.add(sample_name)
    fixed.merge(template_header)

    return fixed
def testConstructionWithRecords(self):
    """Build a header from the input file's header records and open a writer.

    NOTE: the method returns right after the (no-op) record loop, so the
    ``vcf_out.close()`` and ``complete_check`` calls after the ``return``
    are never executed.
    """
    fn_in = os.path.join(DATADIR, self.filename)
    fn_out = get_temp_filename(suffix=".vcf")
    vcf_in = pysam.VariantFile(fn_in)

    # Copy every header record of the input into a fresh header
    header = pysam.VariantHeader()

    for record in vcf_in.header.records:
        header.add_record(record)

    # The writer is opened on this fixed file name, not on fn_out
    fn = str("tmp_VariantFileTest_testConstructionWithRecords") + ".vcf"
    vcf_out = pysam.VariantFile(fn, "w", header=header)
    for record in vcf_in:
        # currently segfaults here:
        # vcf_out.write(record)
        pass
    # Early exit: everything below is unreachable while the write is disabled
    return
    vcf_out.close()
    self.complete_check(fn_in, fn_out)
def testConstructionWithLines(self):
    """Round-trip a VCF through a header rebuilt from text lines.

    The header is recreated by adding the input samples and then the
    string form of each input header record; all records are written to
    a temporary file, which is compared with the input through
    ``complete_check``.
    """
    source_path = os.path.join(CBCF_DATADIR, self.filename)
    target_path = get_temp_filename(suffix=".vcf")
    source = pysam.VariantFile(source_path)
    source_header = source.header

    rebuilt = pysam.VariantHeader()
    # samples first, then the header lines
    for sample_name in source_header.samples:
        rebuilt.add_sample(sample_name)
    for header_record in source_header.records:
        rebuilt.add_line(str(header_record))

    target = pysam.VariantFile(target_path, "w", header=rebuilt)
    for variant in source:
        target.write(variant)

    target.close()
    source.close()

    self.complete_check(source_path, target_path)
def VCFHeader(sample):
    '''Creates VariantHeader object for the vcf file containing variant
    calling results for sample from the input.

    Contig lines are built from the first two tab-separated columns of
    each line of the hard-coded index file
    ``human_g1k_v37_decoy.fasta.fai``, opened relative to the current
    working directory. ALT, GT and VAF definitions are added after the
    contigs.
    '''
    VCFheader = pysam.VariantHeader()
    VCFheader.add_sample(sample)

    # ``with`` closes the index file even if a contig line is rejected
    with open("human_g1k_v37_decoy.fasta.fai") as faifile:
        for line in faifile:
            split_line = line.split("\t")
            contig = '##contig=<ID=' + str(split_line[0]) + ', length=' + str(
                split_line[1]) + '>'
            VCFheader.add_line(contig)

    VCFheader.add_line(
        "##ALT=<ID=*,Description=Represents allele(s) other than observed.>")
    VCFheader.add_line(
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=Genotype>")
    VCFheader.add_line(
        "##FORMAT=<ID=VAF,Number=1,Type=String,Description=Variant allele frequency>"
    )
    return VCFheader
def extract(self, contig, start, end, lVarFiles):
    """Extract region with 0-based indexing system.

    For every file in ``lVarFiles`` the records fetched for
    ``contig:start-end`` are visited, and for each sample of each record
    a ``Position`` is appended to the matching entry of ``self.dSamples``.
    The genotype is read from the ``GT`` tag when the depth reported by
    ``self.getAllelesFromPosition`` is positive, and is ``(None, None)``
    otherwise.

    Each variant file is closed as soon as it has been processed.
    """
    logging.info('Extracting from variant files')
    self.checkIndexOfVariantFiles(lVarFiles)
    for varFile in lVarFiles:
        varFileHeader = pysam.VariantHeader()
        # Context manager: do not keep one open handle per input file
        with pysam.VariantFile(varFile, header=varFileHeader) as f:
            # get list of samples
            lCurrentSamples = self.addSamples(list(f.header.samples), varFile)
            # get list of pos
            for val in f.fetch(str(contig), int(start), int(end)):
                for sampleName, sampleCall in val.samples.items():
                    lAlleles, depth = self.getAllelesFromPosition(
                        list(val.alleles), dict(val.info),
                        sampleCall.items())
                    if depth > 0:
                        genotype = (self._getFormatTagValue(
                            sampleCall.items(), 'GT', index=0),
                                    self._getFormatTagValue(
                                        sampleCall.items(), 'GT', index=1))
                    else:
                        # no reads: genotype is unknown
                        genotype = (None, None)
                    self.dSamples[sampleName].addPosition(
                        Position(val.contig,
                                 val.pos,
                                 genotype=genotype,
                                 depth=depth,
                                 lAlleles=lAlleles,
                                 lFilters=val.filter))
type=str, nargs='+', required=True, help= 'List of sample names. Order must correspond to samtools mpileup input BAMs/CRAMs.' ) argparser.add_argument('-o', '--output', metavar='file', dest='output_file', required=True, help='Output file compressed using bgzip.') if __name__ == '__main__': args = argparser.parse_args() header = pysam.VariantHeader() header.add_line( '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of bases">') for chrom in list(map(str, range(1, 23))) + [ 'X', 'Y' ]: # add all possible chromosome names. needed by htslibs to track chromosomes in records. header.contigs.add(chrom) header.contigs.add('chr' + chrom) for sample in args.in_samples: header.add_sample(sample) with pysam.VariantFile(args.output_file, 'wb', header=header) as ofile: for line in args.in_mpileup_file: fields = line.rstrip().split('\t') record = header.new_record(contig=fields[0], start=int(fields[1]) - 1, stop=int(fields[1]),
def vcf_header():
    """Return a header with samples ``sample1``/``sample2`` and contig ``1``."""
    header = pysam.VariantHeader()
    for sample_name in ("sample1", "sample2"):
        header.add_sample(sample_name)
    header.contigs.add("1")
    return header
def write_to_file(gwas_file,
                  gwas_idx,
                  path,
                  fasta,
                  build,
                  trait_id,
                  sample_metadata=None,
                  file_metadata=None,
                  csi=False):
    """Write pickled GWAS results to a VCF/BCF file and index it.

    Parameters
    ----------
    gwas_file : file object
        Seekable binary file holding pickled result objects.
    gwas_idx : dict
        Maps contig name to a heap of entries; element ``[1]`` of each
        popped entry is used as the byte offset into ``gwas_file``.
        The heaps are consumed (emptied) by this function.
    path : str
        Output file path.
    fasta : object
        Reference providing ``references`` and ``lengths``.
    build : str
        Assembly name written on every contig line.
    trait_id : str
        Name of the single sample column.
    sample_metadata : dict, optional
        Key/value pairs appended to the ``##SAMPLE`` line of ``trait_id``.
        The line is written whenever this argument is given.
    file_metadata : dict, optional
        Key/value pairs written as ``##key=value`` header lines.
    csi : bool
        Passed through to ``pysam.tabix_index``.
    """
    logging.info("Writing headers to BCF/VCF: {}".format(path))

    header = pysam.VariantHeader()

    header_lines = (
        # INFO
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        # FORMAT
        '##FORMAT=<ID=ES,Number=A,Type=Float,Description="Effect size estimate relative to the alternative allele">',
        '##FORMAT=<ID=SE,Number=A,Type=Float,Description="Standard error of effect size estimate">',
        '##FORMAT=<ID=LP,Number=A,Type=Float,Description="-log10 p-value for effect estimate">',
        '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Alternate allele frequency in the association study">',
        '##FORMAT=<ID=SS,Number=A,Type=Integer,Description="Sample size used to estimate genetic effect">',
        '##FORMAT=<ID=EZ,Number=A,Type=Float,Description="Z-score provided if it was used to derive the EFFECT and SE fields">',
        '##FORMAT=<ID=SI,Number=A,Type=Float,Description="Accuracy score of summary data imputation">',
        '##FORMAT=<ID=NC,Number=A,Type=Integer,Description="Number of cases used to estimate genetic effect">',
        '##FORMAT=<ID=ID,Number=1,Type=String,Description="Study variant identifier">',
        # META
        '##META=<ID=TotalVariants,Number=1,Type=Integer,Description="Total number of variants in input">',
        '##META=<ID=VariantsNotRead,Number=1,Type=Integer,Description="Number of variants that could not be read">',
        '##META=<ID=HarmonisedVariants,Number=1,Type=Integer,Description="Total number of harmonised variants">',
        '##META=<ID=VariantsNotHarmonised,Number=1,Type=Integer,Description="Total number of variants that could not be harmonised">',
        '##META=<ID=SwitchedAlleles,Number=1,Type=Integer,Description="Total number of variants strand switched">',
        '##META=<ID=TotalControls,Number=1,Type=Integer,Description="Total number of controls in the association study">',
        '##META=<ID=TotalCases,Number=1,Type=Integer,Description="Total number of cases in the association study">',
        '##META=<ID=StudyType,Number=1,Type=String,Description="Type of GWAS study [Continuous or CaseControl]">',
    )
    for line in header_lines:
        header.add_line(line)

    # SAMPLES
    header.samples.add(trait_id)

    # Guard on sample_metadata itself: it is the mapping iterated here.
    # (Guarding on file_metadata crashed when only file metadata was given
    # and dropped the sample metadata when only that was given.)
    if sample_metadata is not None:
        s = "".join(",{}={}".format(k, sample_metadata[k])
                    for k in sample_metadata)
        header.add_line('##SAMPLE=<ID={}{}>'.format(trait_id, s))

    # CONTIG
    assert len(fasta.references) == len(fasta.lengths)
    for contig, length in zip(fasta.references, fasta.lengths):
        header.add_line("##contig=<ID={},length={}, assembly={}>".format(
            contig, length, build))

    # add metadata
    if file_metadata is not None:
        for k in file_metadata:
            header.add_line('##{}={}'.format(k, file_metadata[k]))

    # The context manager closes the output even if a record fails
    with pysam.VariantFile(path, "w", header=header) as vcf:

        # recall variant objects in chromosome position order
        logging.info("Writing variants to BCF/VCF: {}".format(path))
        for contig in fasta.references:
            if contig not in gwas_idx:
                continue

            while gwas_idx[contig]:
                chr_pos = heappop(gwas_idx[contig])

                # load GWAS result
                # NOTE: pickle.load must only be used on trusted files
                gwas_file.seek(chr_pos[1])
                result = pickle.load(gwas_file)
                lpval = -np.log10(result.pval)

                # check floats
                if Vcf.is_float32_lossy(result.b):
                    logging.warning(
                        "Effect field cannot fit into float32. Expect loss of precision for: {}"
                        .format(result.b))
                if Vcf.is_float32_lossy(result.se):
                    # replace by the smallest positive normal float32;
                    # the warning reports the replacement value
                    result.se = np.float64(np.finfo(np.float32).tiny).item()
                    logging.warning(
                        "Standard error field cannot fit into float32. Expect loss of precision for: {}"
                        .format(result.se))
                if Vcf.is_float32_lossy(lpval):
                    logging.warning(
                        "-log10(pval) field cannot fit into float32. Expect loss of precision for: {}"
                        .format(lpval))
                if Vcf.is_float32_lossy(result.alt_freq):
                    logging.warning(
                        "Allele frequency field cannot fit into float32. Expect loss of precision for: {}"
                        .format(result.alt_freq))
                if Vcf.is_float32_lossy(result.imp_z):
                    logging.warning(
                        "Imputation Z score field cannot fit into float32. Expect loss of precision for: {}"
                        .format(result.imp_z))
                if Vcf.is_float32_lossy(result.imp_info):
                    logging.warning(
                        "Imputation INFO field cannot fit into float32. Expect loss of precision for: {}"
                        .format(result.imp_info))

                record = vcf.new_record()
                record.chrom = result.chrom
                assert " " not in record.chrom
                record.pos = result.pos
                assert record.pos > 0
                record.id = Vcf.remove_illegal_chars(result.dbsnpid)
                record.alleles = (result.ref, result.alt)
                record.filter.add(result.vcf_filter)

                if result.alt_freq is not None:
                    record.info['AF'] = result.alt_freq

                # per-sample fields are only set when a value is available
                sample_fields = record.samples[trait_id]
                if result.b is not None:
                    sample_fields['ES'] = result.b
                if result.se is not None:
                    sample_fields['SE'] = result.se
                if lpval is not None:
                    sample_fields['LP'] = lpval
                if result.alt_freq is not None:
                    sample_fields['AF'] = result.alt_freq
                if result.n is not None:
                    sample_fields['SS'] = round(result.n)
                if result.imp_z is not None:
                    sample_fields['EZ'] = result.imp_z
                if result.imp_info is not None:
                    sample_fields['SI'] = result.imp_info
                if result.ncase is not None:
                    sample_fields['NC'] = round(result.ncase)
                if result.dbsnpid is not None:
                    sample_fields['ID'] = record.id

                # write to file
                vcf.write(record)

    # index output file
    logging.info("Indexing output file")
    pysam.tabix_index(path, preset="vcf", force=True, csi=csi)
def writevcf_pysam(outpath, bampath):
    """Write the merged DeBreak SV calls as ``<outpath>debreak.vcf``.

    Contigs are taken from the header of the alignment file ``bampath``,
    which is also used as the sample name. Calls are read from
    ``<outpath>debreak-allsv-merged-final``, one tab-separated call per
    line; lines for which no record can be created (for example an
    unknown contig or a non-numeric position) are skipped.

    The alignment file, the call list and the output VCF are all closed
    before returning, also when an error occurs.

    Returns
    -------
    int
        Always 0.
    """
    # Only the reference names/lengths are needed from the alignment file
    with pysam.AlignmentFile(bampath, 'r') as bamfile:
        bamhead = bamfile.header
        contigs = list(zip(bamhead.references, bamhead.lengths))

    filedate = str(datetime.datetime.now()).split(' ')[0]

    vcfheader = pysam.VariantHeader()
    vcfheader.add_sample(bampath)
    vcfheader.add_line('##source=DeBreak\n')
    vcfheader.add_line('##fileDate=' + filedate + '\n')
    for name, length in contigs:
        vcfheader.contigs.add(name, length=length)

    vcfheader.add_line('##ALT=<ID=DEL,Description="Deletion">\n')
    vcfheader.add_line('##ALT=<ID=INS,Description="Insertion">\n')
    vcfheader.add_line('##ALT=<ID=DUP,Description="Duplication">\n')
    vcfheader.add_line('##ALT=<ID=INV,Description="Inversion">\n')
    vcfheader.add_line('##ALT=<ID=TRA,Description="Translocation">\n')

    vcfheader.info.add('CHR2', 1, 'String', 'Chromosome for END')
    vcfheader.info.add('END', 1, 'Integer', 'End position of the structural variant')
    vcfheader.info.add('MAPQ', 1, 'Integer', 'Mean mapping quality of supporting reads')
    vcfheader.info.add('SUPPREAD', 1, 'Integer', 'Number of supporting reads')
    vcfheader.info.add('SVLEN', 1, 'Integer', 'Length of the SV')
    vcfheader.info.add('SVMETHOD', 1, 'String', 'Type of approach used to detect SV')
    vcfheader.info.add('SVTYPE', 1, 'String', 'Type of structural variant')
    vcfheader.info.add('PRECISE', 0, 'Flag', 'Variant with precise breakpoint position from POA')
    vcfheader.info.add('MULTI', 0, 'Flag', 'If the SV is multi-allelic SV')
    vcfheader.info.add('LARGEINS', 0, 'Flag', 'Large insertion indentified from local assembly')
    vcfheader.info.add('START2', 1, 'Integer', 'SV start position on the second haplotype of multi-allilic SV')
    vcfheader.info.add('END2', 1, 'Integer', 'SV end position on the second haplotype of multi-allilic SV')
    vcfheader.info.add('SVLEN2', 1, 'Integer', 'SV length on the second haplotype of multi-allilic SV')

    vcfheader.add_meta('FORMAT', items=[('ID', "GT"), ('Number', 1), ('Type', 'String'), ('Description', 'Genotype')])
    vcfheader.add_line('##CommandLine=debreak ' + " ".join(sys.argv[1:]) + '\n')

    with pysam.VariantFile(outpath + 'debreak.vcf', 'w', header=vcfheader) as f:
        # The element after the final newline is dropped, as before
        with open(outpath + 'debreak-allsv-merged-final', 'r') as svfile:
            allsv = svfile.read().split('\n')[:-1]

        svid = 1
        for sv in allsv:
            sv = sv.split('\t')
            try:
                newrec = f.new_record(contig=sv[0], start=int(sv[1]), filter='PASS')
            except Exception:
                # Best effort: skip lines that cannot be turned into a
                # record, but let KeyboardInterrupt/SystemExit propagate.
                continue
            newrec.id = 'DB' + str(svid)
            newrec.ref = 'N'
            svid += 1

            is_translocation = 'Translocation' in sv
            is_insertion = 'Insertion' in sv
            is_rescued = 'rescue_largeins_' in ''.join(sv)

            newrec.info['CHR2'] = sv[2] if is_translocation else sv[0]

            if is_insertion:
                newrec.stop = newrec.start + 1
            elif is_translocation:
                newrec.stop = int(sv[3])
            else:
                newrec.stop = newrec.start + int(sv[2])

            if is_translocation:
                newrec.info['SVLEN'] = 0
                newrec.info['SUPPREAD'] = int(sv[4])
                newrec.info['MAPQ'] = int(float(sv[5]))
            else:
                newrec.info['SVLEN'] = int(sv[2])
                if is_rescued:
                    newrec.info['SUPPREAD'] = 0
                else:
                    newrec.info['SUPPREAD'] = int(sv[3])
                newrec.info['MAPQ'] = int(float(sv[5]))

            newrec.info['SVMETHOD'] = 'DeBreak'
            if is_insertion and is_rescued:
                newrec.info['LARGEINS'] = True
            if 'Precise' in sv:
                newrec.info['PRECISE'] = True

            # First matching keyword wins; anything else is a translocation
            if 'Deletion' in sv:
                svtype = 'DEL'
            elif is_insertion:
                svtype = 'INS'
            elif 'Inversion' in sv:
                svtype = 'INV'
            elif 'Duplication' in sv:
                svtype = 'DUP'
            else:
                svtype = 'TRA'
            newrec.info['SVTYPE'] = svtype

            if 'GT=1/0' in sv:
                newrec.samples[bampath]['GT'] = (0, 1)
            if 'GT=1/1' in sv:
                newrec.samples[bampath]['GT'] = (1, 1)
            # NOTE(review): an unknown genotype is written as 0/1 -- confirm
            # this is intended rather than a missing genotype.
            if 'GT=./.' in sv:
                newrec.samples[bampath]['GT'] = (0, 1)
            if 'CompoundSV' in sv:
                newrec.info['MULTI'] = True
            f.write(newrec)
    return 0