def main():
    """Write a one-record demo VCF whose INFO header mirrors the input's FORMAT header.

    Command line: ``input.vcf output.vcf``.

    The output header declares a single sample ("Sample1"), the FORMAT
    fields GT and DP, and one INFO line for every FORMAT line found in the
    input VCF. A single hard-coded SNV record is then written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input", metavar='input.vcf', action='store',
                        help='vcf file.', type=str)
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)
    args = parser.parse_args()
    outvcf = args.output
    invcf = args.input

    # The header can contain fixed-type lines (INFO, FORMAT, FILTER, ...) and
    # general key/value lines. Here it records the program that generated the
    # file and the name of the analysed sample.
    # "fileformat" is listed first: the VCF specification requires it to be
    # the first line of the file, and header lines are kept in this order.
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1"]),
    )

    # FORMAT lines of the output file
    header.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Filtered read depth (MAPQ > 30)")]))

    # Turn every FORMAT header line of the input file into an INFO header
    # line of the output file.
    with vcfpy.Reader.from_path(invcf) as reader:
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT',
            #       '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
            #        for the ref and alt alleles in the order listed">', {...})
            #   key = 'FORMAT', value = '<ID=AD,Number=R,...>'
            # str_to_mapping is defined elsewhere in this file; presumably it
            # parses the '<...>' value string into a mapping -- confirm.
            header.add_info_line(str_to_mapping(format_line.value))

    # Write the VCF with a single record
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=[],
            INFO={},
            FORMAT=["GT", "DP"],
            calls=[vcfpy.Call("Sample1",
                              OrderedDict([("GT", "0/1"), ("DP", "47")]))],
        )
        writer.write_record(record)
def _open(self):
    """Assemble the output header and open the VCF writer on the temp file.

    The header holds the fileformat line, four FORMAT lines (AD, DP, GQ, GT)
    and one contig line per entry of ``CONTIGS_GRCH37``; its samples are
    ``self.members``. The writer is stored in ``self.vcf_writer``.
    """
    # (ID, Number, Type, Description) of each FORMAT field, in output order
    format_specs = (
        ("AD", "R", "Integer",
         "Allelic depths for the ref and alt alleles in the order listed"),
        ("DP", "1", "Integer", "Approximate read depth at the locus"),
        ("GQ", "1", "Integer", "Phred-scaled genotype quality"),
        ("GT", "1", "String", "Genotype"),
    )

    lines = [vcfpy.HeaderLine("fileformat", "VCFv4.2")]
    lines.extend(
        vcfpy.FormatHeaderLine.from_mapping({
            "ID": ident,
            "Number": number,
            "Type": value_type,
            "Description": description,
        })
        for ident, number, value_type, description in format_specs
    )

    # Contig header lines.
    # TODO: switch based on release in case
    lines.extend(
        vcfpy.ContigHeaderLine.from_mapping({"ID": name, "length": length})
        for name, length in CONTIGS_GRCH37
    )

    header = vcfpy.Header(lines=lines,
                          samples=vcfpy.SamplesInfos(self.members))

    # Open VCF writer
    self.vcf_writer = vcfpy.Writer.from_path(self.tmp_file.name, header)
def __init__(self, input_vcf, info_fields, sample_fields, caller_priority,
             output_vcf):
    """Open *input_vcf*, build the output header, select records and write them.

    The whole pipeline runs from the constructor: the reader is opened, an
    empty header carrying the input samples is created, the header-building
    methods are called in order, the records are selected and finally
    handed to ``write_merged``.
    """
    self.reader = vcfpy.Reader.from_path(input_vcf)
    self.info_fields, self.sample_fields = info_fields, sample_fields
    self.caller_priority = caller_priority

    # Start from a header that only knows the input samples; the steps below
    # are defined elsewhere in the class and run in this fixed order.
    self.write_header = vcfpy.Header(samples=self.reader.header.samples)
    header_steps = (
        self.add_file_format,
        self.select_contig_header,
        self.select_filter_header,
        self.select_info_header,
        self.select_format_header,
    )
    for build_step in header_steps:
        build_step()

    self.records = self.select_record_fields()
    self.output_vcf = output_vcf
    self.write_merged(self.records)
def __init__(self, readers, callers, output_vcf):
    """Merge the VCFs behind *readers* and write the result to *output_vcf*.

    The whole pipeline runs from the constructor: sample names are
    collected, the merged header is built step by step, the records are
    merged and finally handed to ``write_merged``.
    """
    self.readers = readers
    self.callers = callers

    # Unique sample names across all readers (order is whatever the set yields)
    collected_names = []
    for reader in self.readers:
        collected_names.extend(reader.header.samples.names)
    self.samples = list(set(collected_names))

    # TODO: multi-sample? using first vcf samples here
    samples_per_reader = [reader.header.samples for reader in self.readers]
    self.merge_header = vcfpy.Header(samples=samples_per_reader[0])

    # Header-building steps are defined elsewhere in the class and run in
    # this fixed order.
    header_steps = (
        self.add_file_format,
        self.merge_contig_header,
        self.add_caller_filter_header,
        self.merge_filter_header,
        self.merge_info_header,
        self.merge_format_header,
    )
    for build_step in header_steps:
        build_step()

    self.records = self.merge_records()
    self.output_vcf = output_vcf
    self.write_merged(self.records)
def main():
    """Write a minimal VCF with one placeholder record to the path in argv[1].

    Returns 1 after printing a usage message to stderr when the script is
    not called with exactly one argument; returns None otherwise.
    """
    argv = sys.argv
    if len(argv) == 2:
        # Header without any sample, record without ALT/FORMAT
        empty_header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
        with vcfpy.Writer.from_path(argv[1], empty_header) as writer:
            placeholder = vcfpy.Record(
                CHROM="1",
                POS=1,
                ID=[],
                REF="N",
                ALT=[],
                QUAL=None,
                FILTER=[],
                INFO={},
                FORMAT=[],
            )
            writer.write_record(placeholder)
        return None

    print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
    return 1
def create_vcf_writer(args, vcf_reader):
    """Return a vcfpy.Writer whose header declares the read-count FORMAT fields.

    The output path is ``args.output_vcf`` when set; otherwise
    ``.readcount.vcf`` is spliced in at the last ``.vcf`` of
    ``args.input_vcf``. For ``args.data_type`` 'DNA' the fields are
    DP/AD/AF, for 'RNA' they are RDP/RAD/RAF: all input header lines are
    copied except existing FORMAT lines with those IDs, which are replaced
    by fresh definitions. Any other data type yields a header that only
    carries the input samples.
    """
    output_file = args.output_vcf
    if not output_file:
        head, _sep, tail = args.input_vcf.rpartition('.vcf')
        output_file = head + '.readcount.vcf' + tail

    new_header = vcfpy.Header(samples=vcf_reader.header.samples)

    # (data type, FORMAT ID prefix, description prefix)
    variants = (('DNA', '', ''), ('RNA', 'R', 'RNA '))
    for data_type, id_prefix, text_prefix in variants:
        if args.data_type == data_type:
            depth_id = id_prefix + 'DP'
            allelic_id = id_prefix + 'AD'
            freq_id = id_prefix + 'AF'
            replaced_ids = [depth_id, allelic_id, freq_id]

            # Copy everything except the FORMAT lines about to be redefined
            for line in vcf_reader.header.lines:
                if line.key != 'FORMAT' or line.id not in replaced_ids:
                    new_header.add_line(line)

            new_header.add_format_line(OrderedDict([
                ('ID', depth_id),
                ('Number', '1'),
                ('Type', 'Integer'),
                ('Description', text_prefix + 'Read depth'),
            ]))
            new_header.add_format_line(OrderedDict([
                ('ID', allelic_id),
                ('Number', 'R'),
                ('Type', 'Integer'),
                ('Description',
                 text_prefix
                 + 'Allelic depths for the ref and alt alleles in the order listed'),
            ]))
            new_header.add_format_line(OrderedDict([
                ('ID', freq_id),
                ('Number', 'A'),
                ('Type', 'Float'),
                ('Description',
                 text_prefix + 'Variant-allele frequency for the alt alleles'),
            ]))

    return vcfpy.Writer.from_path(output_file, new_header)
def create_vcf_writer(args, vcf_reader):
    """Return ``(writer, append_to_existing_sample)`` for the genotype VCF.

    The output path is ``args.output_vcf`` when set; otherwise
    ``.genotype.vcf`` is spliced in at the last ``.vcf`` of
    ``args.input_vcf``. When ``args.sample_name`` is not yet among the
    reader's samples it is appended to the reader's own sample info (the
    object is modified in place) and the flag is False; otherwise the flag
    is True. The header copies every input line except an existing
    FORMAT/GT line, which is replaced by a fresh definition.
    """
    output_file = args.output_vcf
    if not output_file:
        head, _sep, tail = args.input_vcf.rpartition('.vcf')
        output_file = head + '.genotype.vcf' + tail

    sample_info = vcf_reader.header.samples
    append_to_existing_sample = args.sample_name in sample_info.names
    if not append_to_existing_sample:
        # Register the new sample and its column index
        sample_info.names.append(args.sample_name)
        sample_info.name_to_idx[args.sample_name] = len(sample_info.names) - 1

    new_header = vcfpy.Header(samples=sample_info)
    for line in vcf_reader.header.lines:
        if line.key != 'FORMAT' or line.id != 'GT':
            new_header.add_line(line)

    genotype_format = OrderedDict([
        ('ID', 'GT'),
        ('Number', '1'),
        ('Type', 'String'),
        ('Description', 'Genotype'),
    ])
    new_header.add_format_line(genotype_format)

    writer = vcfpy.Writer.from_path(output_file, new_header)
    return (writer, append_to_existing_sample)
def build_header(contigs, species):
    """Return a sample-less VCF header for the given contigs and species.

    *contigs* is iterated as ``(name, length)`` pairs, one contig line each;
    *species* is joined with commas into a single ``species`` header line.
    Five INFO fields (END, UCSC_GENE, EXON, EXON_COUNT, ALIGNMENT) are
    declared, each with Number 1.
    """
    header = vcfpy.Header()
    header.samples = vcfpy.SamplesInfos([])
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))

    for contig_name, contig_length in contigs:
        header.add_contig_line({"ID": contig_name, "length": contig_length})

    header.add_line(vcfpy.HeaderLine("species", ",".join(species)))

    # (ID, Description, Type) of each INFO field, in output order
    info_specs = (
        ("END", "End position of the alignment", "Integer"),
        ("UCSC_GENE", "UCSC gene ID", "String"),
        ("EXON", "Index of exon in transcript", "Integer"),
        ("EXON_COUNT", "Number of exons in transcript", "Integer"),
        ("ALIGNMENT", "Amino acid alignment at this location", "String"),
    )
    for ident, description, value_type in info_specs:
        header.add_info_line({
            "ID": ident,
            "Description": description,
            "Type": value_type,
            "Number": 1,
        })

    return header
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a VCF with the given records and randomly generated genotypes

    The file is written bgzf-compressed. Every record is re-emitted with a
    single FORMAT field (GT) whose value is drawn at random from
    0/0, 0/1 and 1/1 for the one sample.

    Arguments:
        vcffilename - path to generated file
        sample_name - name of the single sample column
        records - iterable of vcfpy.Record describing the variants
    """
    # Lengths of chromosomes 1..22, in chromosome order (declared below as
    # assembly GRCh37).
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753, 81195210, 78077248, 59128983, 63025520,
               48129895, 51304566]

    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # Chromosome numbering starts at 1, so the first length belongs to
    # contig "1" (not "0").
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line({"ID": str(chrom),
                                "assembly": "GRCh37",
                                "length": length})
    header.add_format_line({"ID": "GT", "Number": 1, "Type": "String",
                            "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        # The samples travel inside the header; the third parameter of
        # from_stream is the path, so it is passed by keyword.
        writer = vcfpy.Writer.from_stream(vcffile, header,
                                          path=vcffilename, use_bgzf=True)
        try:
            for record in records:
                genotype = random.choice(['0/0', '0/1', '1/1'])
                newrecord = vcfpy.Record(
                    record.CHROM, record.POS, record.ID, record.REF,
                    record.ALT, record.QUAL, record.FILTER, record.INFO,
                    ["GT"],
                    calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
                writer.write_record(newrecord)
        finally:
            # Close even when a write fails so the writer is always released
            writer.close()
def main():
    """Write a demo VCF with two samples and a single hard-coded SNV record.

    Command line: ``output.vcf``.

    The header declares FILTER PASS, INFO DP and MUT, FORMAT GT and DP, the
    contig chr1 and a SAMPLE line for Sample1.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output", metavar='output.vcf', action='store',
                        help='vcf file.', type=str)
    args = parser.parse_args()
    outvcf = args.output

    # The header can contain fixed-type lines (INFO, FORMAT, FILTER, ...) and
    # general key/value lines. Here it records the program that generated the
    # file and the names of the analysed samples.
    # "fileformat" is listed first: the VCF specification requires it to be
    # the first line of the file, and header lines are kept in this order.
    header = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]),
    )

    # Reference: valid entries for header lines
    #   INFO value types:   "Integer", "Float", "Flag", "Character", "String"
    #   FORMAT value types: "Integer", "Float", "Character", "String"
    #   "Number" entries, besides integers:
    #       "A" number of alleles excluding reference
    #       "R" number of alleles including reference
    #       "G" number of genotypes
    #       "." unbounded number of values
    #   Header lines that contain an "ID" entry:
    #       ALT, contig, FILTER, FORMAT, INFO, META, PEDIGREE, SAMPLE

    # FILTER lines
    header.add_filter_line(OrderedDict([
        ("ID", "PASS"), ("Description", "All filters passed")]))

    # INFO lines
    header.add_info_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(OrderedDict([
        ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "States if the record mutation is supported (1) or not (0).")]))

    # FORMAT lines
    header.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype")]))
    header.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Filtered read depth (MAPQ > 30)")]))

    # contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # SAMPLE lines
    header.add_line(vcfpy.SampleHeaderLine.from_mapping(
        OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # Write the VCF with a single record carrying one call per sample
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        calls = [
            vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")])),
            vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"), ("DP", "31")])),
        ]
        record = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="C",
            ALT=[vcfpy.Substitution(type_="SNV", value="G")],
            QUAL=None,
            FILTER=["PASS"],
            INFO={"DP": "50", "MUT": 0},
            FORMAT=["GT", "DP"],
            calls=calls,
        )
        writer.write_record(record)
def get_header(sample_name_to_header, chromosome_set):
    """
    Builds the header of the output VCF file.

    Contig lines are copied from the header of the first sample (restricted
    to ``chromosome_set`` unless it is None), followed by fixed INFO and
    FORMAT definitions. The samples are the sorted sample names.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis, or None for all
    :return: a vcfpy.Header
    """
    insseq_text = ("Insertion sequence of structural variant, "
                   "not including sequence marked as duplication")
    tranche2_text = ("Quality category of GRIDSS structural variant calls "
                     "determined using FILTER,SRQ,AS,RAS. "
                     "Values are LOW INTERMEDIATE HIGH")

    # (ID, Type, Description); every field is declared with Number=1
    info_fields = (
        ("END", "Integer", "Stop position of the interval"),
        ("SVTYPE", "String", "Type of structural variant"),
        ("INSSEQ", "String", insseq_text),
        ("TRANCHE2", "String", tranche2_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )
    format_fields = (
        ("GT", "String", "Genotype"),
        ("TRANCHE2", "String", tranche2_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as (SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls used to call this SV"),
        ("INSSEQ", "String", insseq_text),
    )

    header = vcfpy.Header()
    header.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers, taken from the first sample only
    first_sample_header = next(iter(sample_name_to_header.values()))
    for candidate in first_sample_header.lines:
        if not isinstance(candidate, vcfpy.ContigHeaderLine):
            continue
        if (chromosome_set is not None
                and candidate.mapping["ID"] not in chromosome_set):
            continue
        header.add_line(candidate)

    # INFO fields
    for ident, value_type, description in info_fields:
        header.add_info_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=value_type,
                              Description=description))

    # FORMAT fields
    for ident, value_type, description in format_fields:
        header.add_format_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=value_type,
                              Description=description))

    # Samples, sorted to ensure determinism
    header.samples = vcfpy.SamplesInfos(sorted(sample_name_to_header.keys()))
    return header
bam_readcount_position = str(entry.POS) ref_base = reference var_base = alt return (bam_readcount_position, ref_base, var_base) (script, vcf_filename, bam_readcount_filenames, samples_list, output_dir) = sys.argv samples = samples_list.split(',') bam_readcount_files = bam_readcount_filenames.split(',') read_counts = parse_bam_readcount_file(bam_readcount_files, samples) vcf_reader = vcfpy.Reader.from_path(vcf_filename) new_header = vcfpy.Header(samples=vcf_reader.header.samples) for line in vcf_reader.header.lines: if not (line.key == 'FORMAT' and line.id in ['DP', 'AD', 'AF']): new_header.add_line(line) new_header.add_format_line( OrderedDict([('ID', 'DP'), ('Number', '1'), ('Type', 'Integer'), ('Description', 'Read depth')])) new_header.add_format_line( OrderedDict([ ('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'Allelic depths for the ref and alt alleles in the order listed') ])) new_header.add_format_line( OrderedDict([('ID', 'AF'), ('Number', 'A'), ('Type', 'Float'), ('Description',
cmd_vt = path_vt+" decompose -s "+path_filteredSort_vcf+" | "+path_vt+" normalize -r "+pathFasta+" -o "+path_normalized_vcf+" -" process = subprocess.Popen([cmd_vt], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) out, err = process.communicate() if process.returncode!=0: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Decompose & Normalize\n "+err.decode('utf-8')) #***** MERGE callers VCFs *****# lst_caller_name = [] lst_contig_line = [] dico_filter_line = {} dico_vcf = {} pathMergeVCF = sample+"_Nk.vcf" pathMergeUnsortedVCF = pathMergeVCF.replace(".vcf","_unsorted.vcf") #***** INIT new vcf header *****# new_header = vcfpy.Header(lines=None, samples=None) new_header.add_line(vcfpy.HeaderLine("fileformat","VCFv4.2")) new_header.add_line(vcfpy.HeaderLine("Nk_version",niourkVersion)) #***** BROWSE caller vcf *****# for path_vcf in lst_vcf_sample: caller_name = os.path.basename(path_vcf).split("_")[2].replace(".vcf","") lst_caller_name.append(caller_name) path_normalized_vcf = path_vcf.replace(".vcf","_normalize.vcf") vcf_tool_reader = vcfpy.Reader.from_path(path_normalized_vcf) vcf_header = vcf_tool_reader.header #***** READ HEADERS *****# # check header sample if new_header.samples==None: new_header.samples = vcf_header.samples # check header filters for filter_line in vcf_header.get_lines("FILTER"): if not filter_line.id in dico_filter_line: dico_filter_line[filter_line.id] = filter_line.description
varCt = 0 header = vcfpy.Header( samples=vcfpy.SamplesInfos(filteredEPIs), lines=[ vcfpy.HeaderLine('fileformat', 'VCFv4.0'), vcfpy.HeaderLine('fileDate', str(datetime.datetime.now())), vcfpy.HeaderLine('source', parser.prog), vcfpy.ContigHeaderLine('contig', '<ID=String,Length=Integer>', { 'ID': 'EPI_ISL_406030', 'length': 29903 }), vcfpy.InfoHeaderLine( 'INFO', '<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">', { 'ID': 'NS', 'Number': 1, 'Type': 'Integer', 'Description': 'Number of Samples With Data' }), vcfpy.FormatHeaderLine( 'FORMAT', '<ID=GT,Number=1,Type=String,Description="Genotype">', { 'ID': 'GT', 'Number': 1, 'Type': 'String', 'Description': 'Genotype' }) ]) with vcfpy.Writer.from_path(args.vcf, header) as writer:
def main():
    """Look up bulk SNVs in a single-cell BAM and write per-cell support as a VCF.

    Command line: ``sample.bam barcodes.list variants.vcf sample1
    outdir/sample [--gt GT]``.

    Every line of the barcode file becomes one sample column of
    ``<out_prefix>.snpseeker.vcf``. For each SNV of the input VCF (optionally
    only those whose first call has genotype ``--gt``) the reads covering
    the position are counted per cell barcode (tag ``CB``), ignoring
    deletions, reference skips and reads with mapping quality below 30.
    The FORMAT values of the input record are carried over as INFO values,
    together with SUPP, the number of cells with at least one ALT read.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument(
        "barcodes", metavar='barcodes.list', action='store',
        help="File containing cell barcodes (the same used in the alignment "
             "file to identify cell reads).",
        type=str)
    parser.add_argument("vcf", metavar='variants.vcf', action='store',
                        help="VCF file storing BULK SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    parser.add_argument(
        "--gt", metavar='1/1 (0/1)', choices=["0/0", "0/1", "1/1"],
        action='store',
        help="Genotype filter: considers only mutations with the specified "
             "GT in the original vcf file.",
        type=str)
    args = parser.parse_args()

    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # Set copy for constant-time membership tests in the per-read loop
    cell_barcodes = set(samples)

    # Header of the output VCF
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(samples),
    )

    # SAMPLE header line
    header_out.add_line(vcfpy.SampleHeaderLine.from_mapping(
        OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # FILTER header lines: one per genotype that --gt can select
    for filter_id in ("1/1", "0/1", "0/0"):
        header_out.add_filter_line(OrderedDict([
            ("ID", filter_id), ("Number", "1"),
            ("Description", "Filtered on such GT")]))

    # FORMAT header lines
    header_out.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
        ("Description",
         "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # The BAM is opened exactly once, and all three handles are released by
    # the context managers even if processing fails midway.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:
        # INFO header lines: SUPP plus one line per FORMAT line of the input
        header_out.add_info_line(OrderedDict([
            ("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
            ("Description", "Number of cells supporting the mutation.")]))
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT',
            #       '<ID=AD,Number=R,Type=Integer,Description="...">', {...})
            #   key = 'FORMAT', value = '<ID=AD,Number=R,...>'
            # str_to_mapping is defined elsewhere in this file; presumably it
            # parses the '<...>' value string into a mapping -- confirm.
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = ("(Info about bulk mutation)"
                                      + mapping["Description"])
            # Register the prefixed mapping; re-parsing the original value
            # here would silently drop the prefix.
            header_out.add_info_line(mapping)

        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            for record_in in reader:
                if gt_filter and record_in.calls[0].data.get('GT') != gt:
                    continue
                # filter out indels: only interested in snvs in this phase
                if not record_in.is_snv():
                    continue

                # samples_dict is defined elsewhere in this file; it is used
                # here as a dict of per-barcode counters with the keys
                # 'dp', 'rd', 'ad', 'gt' and 'af' -- confirm.
                d = samples_dict(samples)
                supp = 0

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pileup takes 0-based positions
                ref = record_in.REF
                # ALT is a list; an SNV record carries exactly one entry
                alt = record_in.ALT[0].value

                # pileup of the reads at position (chrom, pos)
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    for base in pileupcolumn.pileups:
                        # is_del: the read has a deletion at this position
                        # is_refskip: the position is an N in the CIGAR
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        tags = list_to_dict(base.alignment.tags)
                        if "CB" not in tags:
                            # reads with no error-corrected barcode are
                            # discarded
                            continue
                        cb = tags["CB"].split("-")[0]  # 10x barcode
                        if cb not in cell_barcodes:
                            # the barcode was not labelled as belonging to a
                            # cell by cellranger (floating DNA)
                            continue

                        counts = d[cb]
                        counts['dp'] += 1
                        read_base = base.alignment.query_sequence[
                            base.query_position]
                        if read_base == alt:
                            counts['ad'] += 1
                        elif read_base == ref:
                            counts['rd'] += 1

                for counts in d.values():
                    if counts['ad'] > 0:
                        supp += 1
                        # temporary: every supported mutation is set to 0/1
                        counts['gt'] = "0/1"
                        counts['af'] = counts['ad'] / (counts['rd']
                                                       + counts['ad'])

                # one call per sample/cell
                calls = [
                    vcfpy.Call(cb, OrderedDict([
                        ("GT", counts['gt']), ("DP", counts['dp']),
                        ("RD", counts['rd']), ("AD", counts['ad']),
                        ("AF", counts['af'])]))
                    for cb, counts in d.items()
                ]

                # Map each FORMAT entry of the input record to its value in
                # the first call (the input VCF is expected to contain a
                # single sample).
                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=[gt] if gt_filter else [],
                    INFO=info_d,
                    FORMAT=["GT", "DP", "RD", "AD", "AF"],
                    calls=calls,
                )
                writer.write_record(record_out)
def main():
    """Look up the SNVs of a VCF in a BAM file and write their read support as a VCF.

    Command line: ``sample.bam file.vcf sample1 outdir/sample``.

    For each SNV of the input VCF the reads covering the position are
    counted, ignoring deletions, reference skips and reads with mapping
    quality below 30. One record per SNV is written to
    ``<out_prefix>.snpseeker.vcf`` with a single sample column; positions
    without any pileup are reported with zero counts and genotype 0/0. The
    FORMAT values of the input record are carried over as INFO values,
    together with SUPP (1 when at least one read shows the ALT base).
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    args = parser.parse_args()

    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # Header of the output VCF
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos([sample]),
    )

    # sample header line
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # INFO header lines
    header_out.add_info_line(OrderedDict([
        ("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "States if the mutation is supported (1) or not (0).")]))

    # FORMAT header lines
    header_out.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "SDP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
        ("Description",
         "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # The BAM is opened exactly once, and all three handles are released by
    # the context managers even if processing fails midway.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:
        # One INFO header line per FORMAT header line of the input
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT',
            #       '<ID=AD,Number=R,Type=Integer,Description="...">', {...})
            #   key = 'FORMAT', value = '<ID=AD,Number=R,...>'
            # str_to_mapping is defined elsewhere in this file; presumably it
            # parses the '<...>' value string into a mapping -- confirm.
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = (
                "(Info about mutation in the original vcf)"
                + mapping["Description"])
            # Register the prefixed mapping; re-parsing the original value
            # here would silently drop the prefix.
            header_out.add_info_line(mapping)

        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            for record_in in reader:
                # filter out indels: only interested in snvs in this phase
                if not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # pileup takes 0-based positions
                ref = record_in.REF
                # ALT is a list; an SNV record carries exactly one entry
                alt = record_in.ALT[0].value

                # Counters start at zero for every record, so a position
                # with no pileup column never reuses the previous record's
                # values (or hits undefined names on the first record).
                sdp = 0  # number of reads reported by the pileup column
                dp = 0   # reads passing the filters below
                rd = 0   # ... showing the reference base
                ad = 0   # ... showing the alternate base

                # truncate=True restricts the pileup to the requested
                # single-base window [pos, pos + 1).
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    sdp = pileupcolumn.n
                    for base in pileupcolumn.pileups:
                        # is_del: the read has a deletion at this position
                        # is_refskip: the position is an N in the CIGAR
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        dp += 1
                        read_base = base.alignment.query_sequence[
                            base.query_position]
                        if read_base == alt:
                            ad += 1
                        elif read_base == ref:
                            rd += 1

                if ad > 0:
                    af = ad / (rd + ad)
                    supp = 1
                    gt = "0/1"  # temporary: every supported mutation is 0/1
                else:
                    af = 0.0
                    supp = 0
                    gt = "0/0"

                # Map each FORMAT entry of the input record to its value in
                # the first call.
                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=[],
                    INFO=info_d,
                    FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                    calls=[vcfpy.Call(sample, OrderedDict([
                        ("GT", gt), ("SDP", sdp), ("DP", dp),
                        ("RD", rd), ("AD", ad), ("AF", af)]))],
                )
                writer.write_record(record_out)
def main():
    """Collapse a single-cell VCF into one sample column per cell cluster.

    Command line: ``sample.muts.vcf clusters.list out/path/prefix``.

    The cluster list is read as CSV with the columns ``cluster`` and
    ``cellid``. For every record of the input VCF the DP, RD and AD values
    of the cells of each cluster are summed; a cluster with at least one
    ALT read gets genotype 0/1 and AF = AD/(RD+AD). The result is written
    to ``<prefix>_clusters.vcf`` with INFO/SUPP set to 1 when any cluster
    supports the mutation.
    """
    parser = argparse.ArgumentParser(
        description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store",
                        help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store",
                        help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix",
                        action="store", help="Output prefix", type=str)
    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)
    cluster_ids = clusters_df['cluster'].unique()
    clusters = [str(cluster) for cluster in cluster_ids]

    # Cluster membership does not depend on the record, so the cell ids of
    # each cluster are looked up once instead of once per record.
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }

    # Header of the output VCF: one sample per cluster
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(clusters),
    )

    # FORMAT header lines
    header_out.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
        ("Description",
         "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # INFO header lines
    header_out.add_info_line(OrderedDict([
        ("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Whether the mutation is supported or not.")]))

    # Both handles are released by the context managers even if processing
    # fails midway.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf",
                                   header_out) as writer:
        for record_in in reader:
            # samples_dict is defined elsewhere in this file; it is used
            # here as a dict of per-cluster counters with the keys
            # 'dp', 'rd', 'ad', 'gt' and 'af' -- confirm.
            d = samples_dict(cluster_ids)
            supp = 0
            alt = record_in.ALT[0].value

            # For each cluster compute GT:DP:RD:AD:AF from its cells
            for c in cluster_ids:
                totals = d[c]
                cell_calls = [record_in.call_for_sample[cell]
                              for cell in cells_by_cluster[c]]
                # sum total, ref and alt read counts over the cluster's cells
                for call in cell_calls:
                    totals['dp'] = totals['dp'] + call.data.get('DP')
                    totals['rd'] = totals['rd'] + call.data.get('RD')
                    totals['ad'] = totals['ad'] + call.data.get('AD')
                if totals['ad'] > 0:
                    totals['gt'] = "0/1"
                    totals['af'] = totals['ad'] / (totals['rd']
                                                   + totals['ad'])
                    supp = 1

            # one call per cluster
            calls = [
                vcfpy.Call(str(c), OrderedDict([
                    ("GT", totals['gt']), ("DP", totals['dp']),
                    ("RD", totals['rd']), ("AD", totals['ad']),
                    ("AF", totals['af'])]))
                for c, totals in d.items()
            ]

            record_out = vcfpy.Record(
                CHROM=record_in.CHROM,
                POS=record_in.POS,
                ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                QUAL=None,
                FILTER=[],
                INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls,
            )
            writer.write_record(record_out)