def test_reading_parse_nosample(tmpdir, nosample_vcf_file):
    """Read VCF file without samples, write file with samples.

    Every record of the sample-less input is copied to a new file whose
    header declares three samples; each record gets an empty call per sample
    and a ``GT`` FORMAT entry defaulting to ``./.``.  The written file is
    then compared verbatim against the expected text.
    """
    sample_names = ("NA00001", "NA00002", "NA00003")

    # Perform record-wise copying, saving results in records
    path_out = tmpdir.mkdir("output").join("output.vcf")
    with vcfpy.Reader.from_path(nosample_vcf_file) as reader:
        header = reader.header.copy()
        header.samples = vcfpy.SamplesInfos(list(sample_names))
        with vcfpy.Writer.from_path(str(path_out), header) as writer:
            for record in reader:
                record.update_calls(
                    [vcfpy.Call(sample, {}) for sample in sample_names])
                record.add_format("GT", "./.")
                writer.write_record(record)

    # Columns are written tab-separated, hence the explicit ``\t`` escapes.
    # NOTE(review): the species value "H**o sapiens" is reproduced exactly as
    # it appears in this file; it looks like a text-filter artifact, so
    # confirm it against the contig line of the fixture.
    expected = textwrap.dedent("""
        ##fileformat=VCFv4.3
        ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>
        ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
        #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA00001\tNA00002\tNA00003
        20\t14370\t.\tG\tA\t29\t.\t.\tGT\t.\t.\t.
        20\t17330\t.\tT\tA\t3\t.\t.\tGT\t.\t.\t.
        20\t1110696\t.\tA\tG,T\t67\t.\t.\tGT\t.\t.\t.
        20\t1230237\t.\tT\t.\t47\t.\t.\tGT\t.\t.\t.
        20\t1234567\t.\tGTC\tG,GTCT\t50\t.\t.\tGT\t.\t.\t.
    """).lstrip()

    # Close the output file deterministically instead of leaking the handle.
    with path_out.open("rt") as handle:
        assert handle.read() == expected
def _open(self):
    """Assemble the output VCF header and open ``self.vcf_writer``.

    The header holds a ``fileformat`` line, the FORMAT definitions for
    AD/DP/GQ/GT, one contig line per entry of ``CONTIGS_GRCH37`` and the
    samples listed in ``self.members``.  The writer targets the path given
    by ``self.tmp_file.name``.
    """
    # (ID, Number, Type, Description) of every FORMAT field, in header order.
    format_specs = (
        ("AD", "R", "Integer",
         "Allelic depths for the ref and alt alleles in the order listed"),
        ("DP", "1", "Integer", "Approximate read depth at the locus"),
        ("GQ", "1", "Integer", "Phred-scaled genotype quality"),
        ("GT", "1", "String", "Genotype"),
    )

    # Setup header
    header_lines = [vcfpy.HeaderLine("fileformat", "VCFv4.2")]
    header_lines.extend(
        vcfpy.FormatHeaderLine.from_mapping({
            "ID": ident,
            "Number": number,
            "Type": value_type,
            "Description": description,
        })
        for ident, number, value_type, description in format_specs
    )

    # Add header lines for contigs.
    # TODO: switch based on release in case
    header_lines.extend(
        vcfpy.ContigHeaderLine.from_mapping({
            "ID": contig_name,
            "length": contig_length,
        })
        for contig_name, contig_length in CONTIGS_GRCH37
    )

    sample_infos = vcfpy.SamplesInfos(self.members)
    header = vcfpy.Header(lines=header_lines, samples=sample_infos)

    # Open VCF writer
    self.vcf_writer = vcfpy.Writer.from_path(self.tmp_file.name, header)
def main():
    """Write a single placeholder record to the VCF path given on the command line.

    Exactly one argument (the output path) is expected.  With any other
    argument count a usage message goes to stderr and 1 is returned;
    otherwise the file is written and the function returns nothing.
    """
    argv = sys.argv
    if len(argv) != 2:
        print("Usage: vcf_from_scratch.py OUTPUT.vcf", file=sys.stderr)
        return 1

    output_path = argv[1]
    # Header without any meta lines and without samples.
    bare_header = vcfpy.Header(samples=vcfpy.SamplesInfos([]))
    with vcfpy.Writer.from_path(output_path, bare_header) as vcf_out:
        placeholder = vcfpy.Record(
            CHROM="1",
            POS=1,
            ID=[],
            REF="N",
            ALT=[],
            QUAL=None,
            FILTER=[],
            INFO={},
            FORMAT=[],
        )
        vcf_out.write_record(placeholder)
def write_vcf(vcffilename, sample_name, records):
    """
    Generate a BGZF-compressed VCF with the given records and randomly
    generated genotypes.

    Arguments:
        vcffilename - path to generated file
        sample_name - name of the single sample column written to the file
        records - list of vcfpy.Record describing the variants

    Each input record is copied (CHROM, POS, ID, REF, ALT, QUAL, FILTER,
    INFO) and given a single ``GT`` call for ``sample_name`` drawn at random
    from 0/0, 0/1 and 1/1.
    """
    # Contig lengths in chromosome order; the values correspond to the
    # GRCh37 autosomes 1..22 (the header below labels them as GRCh37).
    lengths = [249250621, 243199373, 198022430, 191154276, 180915260,
               171115067, 159138663, 146364022, 141213431, 135534747,
               135006516, 133851895, 115169878, 107349540, 102531392,
               90354753, 81195210, 78077248, 59128983, 63025520,
               48129895, 51304566]
    samples = vcfpy.SamplesInfos([sample_name])
    header = vcfpy.Header(samples=samples)
    header.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.3"))
    header.add_line(vcfpy.HeaderLine("fileDate", "20200901"))
    # Chromosomes are numbered from 1; a zero-based enumerate would emit
    # contigs "0".."21" and leave chromosome "22" undeclared.
    for chrom, length in enumerate(lengths, start=1):
        header.add_contig_line(
            {"ID": str(chrom), "assembly": "GRCh37", "length": length})
    header.add_format_line(
        {"ID": "GT", "Number": 1, "Type": "String", "Description": "Genotype"})

    with open(vcffilename, 'wb') as vcffile:
        # NOTE(review): `samples` is passed as the third positional argument;
        # confirm this matches the `from_stream` signature of the pinned
        # vcfpy version (the samples are also carried by `header`).
        writer = vcfpy.Writer.from_stream(vcffile, header, samples,
                                          use_bgzf=True)
        try:
            for record in records:
                genotype = random.choice(['0/0', '0/1', '1/1'])
                newrecord = vcfpy.Record(
                    record.CHROM, record.POS, record.ID, record.REF,
                    record.ALT, record.QUAL, record.FILTER, record.INFO,
                    ["GT"],
                    calls=[vcfpy.record.Call(sample_name, {"GT": genotype})])
                writer.write_record(newrecord)
        finally:
            # Always finalize the writer, even when a record fails, so the
            # compressed stream is terminated properly.
            writer.close()
def build_header(contigs, species):
    """Create the sample-less VCF header used for alignment output.

    :param contigs: iterable of ``(name, length)`` pairs, one contig line each
    :param species: iterable of strings, joined with commas into the
        ``species`` header line
    :return: a vcfpy.Header with fileformat, contig, species and INFO lines
    """
    # (ID, Description, Type) of the INFO fields, in header order; every
    # field carries exactly one value.
    info_fields = (
        ("END", "End position of the alignment", "Integer"),
        ("UCSC_GENE", "UCSC gene ID", "String"),
        ("EXON", "Index of exon in transcript", "Integer"),
        ("EXON_COUNT", "Number of exons in transcript", "Integer"),
        ("ALIGNMENT", "Amino acid alignment at this location", "String"),
    )

    result = vcfpy.Header()
    result.samples = vcfpy.SamplesInfos([])
    result.add_line(vcfpy.HeaderLine("fileformat", "VCFv4.2"))

    for contig_name, contig_length in contigs:
        result.add_contig_line({"ID": contig_name, "length": contig_length})

    species_value = ",".join(species)
    result.add_line(vcfpy.HeaderLine("species", species_value))

    for ident, description, value_type in info_fields:
        result.add_info_line({
            "ID": ident,
            "Description": description,
            "Type": value_type,
            "Number": 1,
        })
    return result
def get_header(sample_name_to_header, chromosome_set):
    """
    Build the header of the output VCF file.

    Contig lines are copied from the first header in
    ``sample_name_to_header`` (restricted to ``chromosome_set`` when it is
    not None), followed by fixed INFO and FORMAT definitions.  The sample
    columns are the dictionary keys in sorted order.

    :param sample_name_to_header: a dictionary from the sample names to the headers
    :param chromosome_set: the set of chromosomes selected for analysis, or
        None to keep every contig
    :return: a vcfpy.Header
    """
    insseq_text = ("Insertion sequence of structural variant, "
                   "not including sequence marked as duplication")
    tranche_text = ("Quality category of GRIDSS structural variant calls "
                    "determined using FILTER,SRQ,AS,RAS. "
                    "Values are LOW INTERMEDIATE HIGH")

    # (ID, Type, Description); every field has Number=1.
    info_fields = (
        ("END", "Integer", "Stop position of the interval"),
        ("SVTYPE", "String", "Type of structural variant"),
        ("INSSEQ", "String", insseq_text),
        ("TRANCHE2", "String", tranche_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as "
         "(SR+RP+IC+AS)/(REF+SR+RP+IC+AS)"),
    )
    format_fields = (
        ("GT", "String", "Genotype"),
        ("TRANCHE2", "String", tranche_text),
        ("BNDVAF", "Float",
         "VAF of this gridss-called BND calculated as "
         "(SR+RP+IC+AS)/(REFPAIR+SR+RP+IC+AS)"),
        ("VAF", "Float",
         "VAF of this SV call, derived from BNDVAF values of BND calls "
         "used to call this SV"),
        ("INSSEQ", "String", insseq_text),
    )

    result = vcfpy.Header()
    result.add_line(vcfpy.HeaderLine(key="fileformat", value="VCFv4.2"))

    # CONTIG headers
    template_header = next(iter(sample_name_to_header.values()))
    for line in template_header.lines:
        if not isinstance(line, vcfpy.ContigHeaderLine):
            continue
        if chromosome_set is not None and line.mapping["ID"] not in chromosome_set:
            continue
        result.add_line(line)

    # INFO fields
    for ident, value_type, description in info_fields:
        result.add_info_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=value_type,
                              Description=description))

    # FORMAT fields
    for ident, value_type, description in format_fields:
        result.add_format_line(
            vcfpy.OrderedDict(ID=ident, Number=1, Type=value_type,
                              Description=description))

    # Samples, sorted to ensure determinism
    result.samples = vcfpy.SamplesInfos(sorted(sample_name_to_header))
    return result
def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Turn large insertions found in contig alignments into VCF records.

    For every primary alignment (``Hit`` == 0) listed in the alignments
    table, the CIGAR string is walked and each insertion longer than
    ``min_insert_size`` is written as a record to ``vcf_out_path``.  The
    contig carrying the insertion is written once to
    ``selected_contigs_path``, and the inserted sequence with
    ``flank_length`` reference bases on either side is written to
    ``flanked_contigs_path``.

    :param sample_name: name of the single sample column of the output VCF
    :param alignments_path: space-separated table with the columns Hit,
        QName, TStart, QStart, Strand and CIGAR
    :param contigs_path: FASTA with the query contigs, keyed by QName
    :param ref_fasta_path: reference FASTA
    :param vcf_template_path: VCF whose header is reused for the output
    :param vcf_out_path: output VCF path
    :param selected_contigs_path: output FASTA of contigs with insertions
    :param flanked_contigs_path: output FASTA of flanked insertions
    :param flank_length: number of reference bases on each side; values
        <= 0 disable flanking
    :param min_insert_size: insertions must be strictly longer than this
    :return: number of VCF records written
    """
    # Function-scope import: needed only for the resource handling below.
    from contextlib import ExitStack, closing

    # genotype implied by the phase tag; anything else is left unphased
    phase_to_gt = {"1": "1|0", "2": "0|1"}

    n_records = 0
    contig_loci = set()

    # Every handle is registered on the stack so that all of them,
    # including the VCF writer, are closed and flushed on success and on
    # error alike.
    with ExitStack() as stack:
        ref_fasta = stack.enter_context(
            closing(pysam.FastaFile(ref_fasta_path)))
        contig_fasta = stack.enter_context(
            closing(pysam.FastaFile(contigs_path)))
        selected_contig_fasta = stack.enter_context(
            open(selected_contigs_path, "w"))
        flanked_contig_fasta = stack.enter_context(
            open(flanked_contigs_path, "w"))

        alns = pandas.read_csv(alignments_path, sep=" ")

        reader = stack.enter_context(
            closing(vcfpy.Reader.from_path(vcf_template_path)))
        reader.header.samples = vcfpy.SamplesInfos([sample_name])
        writer = stack.enter_context(
            closing(vcfpy.Writer.from_path(vcf_out_path, reader.header)))

        # parse each alignment and look for insertions above min_insert_size
        for _, row in alns.iterrows():
            # skip secondary alignments
            if row["Hit"] > 0:
                continue

            query_name = row["QName"]
            # local alignment window in the reference; the name must consist
            # of exactly six "_"-separated fields
            ref_chrom, ref_start, ref_end, phase_set, phase, _ = \
                query_name.split("_")
            # drop the two-character prefix of the phase-set and phase fields
            phase_set = phase_set[2:]
            phase = phase[2:]
            ref_start, ref_end = int(ref_start), int(ref_end)

            # alignment start on the reference window and on the query
            target_start = row["TStart"]
            query_start = row["QStart"]
            # strand-ness of the query sequence
            strand = row["Strand"]

            # parse cigar for variant extraction
            cig = cigar.Cigar(row["CIGAR"])

            # convert sequences to the positive strand
            query_seq = contig_fasta.fetch(query_name)
            if strand == "-":
                query_seq = str(Bio.Seq.Seq(query_seq).reverse_complement())
            ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

            # cursors into query and reference window while walking the cigar
            query_pos = query_start
            target_pos = target_start

            for length, code in cig.items():
                if code == 'M':
                    # matches advance both sequences
                    query_pos += length
                    target_pos += length
                elif code == 'D':
                    # deletions in the query only advance the reference
                    target_pos += length
                elif code == 'I':
                    # only insertions longer than min_insert_size are reported
                    if length > min_insert_size:
                        # need to check conversion from 0-based coordinates
                        # to 1-based
                        ref_allele = ref_seq[target_pos]
                        alt_allele = ref_allele + \
                            query_seq[query_pos:query_pos + length]
                        gt = phase_to_gt.get(phase, "0/1")
                        break_point = ref_start + target_pos

                        # output VCF record corresponding to the insertion
                        rec = vcfpy.Record(
                            CHROM=ref_chrom,
                            POS=break_point + 1,
                            ID=[query_name],
                            REF=ref_allele,
                            ALT=[vcfpy.Substitution("INS", alt_allele)],
                            QUAL=999,
                            FILTER=["PASS"],
                            INFO={},
                            FORMAT=[
                                "GT", "SVLEN", "PS", "HP", "CIGAR", "STRAND",
                                "CONTIG_START"
                            ],
                            calls=[
                                vcfpy.Call(sample=sample_name,
                                           data=vcfpy.OrderedDict(
                                               GT=gt,
                                               SVLEN=length,
                                               PS=phase_set,
                                               HP=phase,
                                               CIGAR=str(cig),
                                               STRAND=strand,
                                               CONTIG_START=str(query_start)))
                            ])
                        n_records += 1
                        writer.write_record(rec)

                        # FASTA header: contig, sample, hash of the locus and
                        # inserted sequence, insertion length
                        contig_locus = ">" + query_name + "_" + sample_name
                        contig_hash = sha1("_{chrom}_{pos}_{alt}".format(
                            chrom=ref_chrom,
                            pos=ref_start,
                            alt=alt_allele[1:]).encode()).hexdigest()
                        contig_name = contig_locus + "_" + contig_hash + \
                            "_" + str(length)

                        # output the contig that contains this insertion,
                        # once per contig/sample
                        if contig_locus not in contig_loci:
                            selected_contig_fasta.writelines(
                                [contig_name + "\n", query_seq + "\n"])
                            contig_loci.add(contig_locus)

                        # output same insertion, but with flanking sequences
                        # note, the interval is [start, end[
                        if flank_length > 0:
                            left_flank = ref_fasta.fetch(
                                ref_chrom, break_point - flank_length,
                                break_point)
                            right_flank = ref_fasta.fetch(
                                ref_chrom, break_point,
                                break_point + flank_length)
                        else:
                            left_flank = ""
                            right_flank = ""
                        # alt_allele[1:] strips the leading reference base
                        flanked_contig_fasta.writelines([
                            contig_name + "\n",
                            left_flank + alt_allele[1:] + right_flank + "\n"
                        ])
                    # every insertion, reported or not, consumes query bases
                    query_pos += length

    return n_records
def main():
    """Write a one-record example VCF whose INFO header mirrors an input VCF.

    Command line: ``input.vcf output.vcf``.  The output header declares a
    single sample ("Sample1"), the GT and DP FORMAT fields, and one INFO
    line for every FORMAT line found in the input header.  A single
    hard-coded SNV record is then written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("input",
                        metavar='input.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)
    args = parser.parse_args()
    outvcf = args.output
    invcf = args.input

    # ---- creating the header ----
    # The header can contain some fixed type lines (INFO, FORMAT, FILTER,
    # etc.) and some general ones.  Here it stores the program that
    # generated the file, the creation date and the analyzed sample.
    # "fileformat" is listed first because VCF requires it to be the first
    # line of the file and header lines are written in list order.
    header = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                          samples=vcfpy.SamplesInfos(["Sample1"]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # read the input vcf
    with vcfpy.Reader.from_path(invcf) as reader:
        # get the FORMAT header lines of the input file
        # and convert them in INFO header lines of the output file
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   FormatHeaderLine('FORMAT',
            #       '<ID=AD,Number=R,Type=Integer,Description="Allelic depths
            #        for the ref and alt alleles in the order listed">', {...})
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="...">'
            header.add_info_line(str_to_mapping(format_line.value))

    # write the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=[],
                              INFO={},
                              FORMAT=["GT", "DP"],
                              calls=[
                                  vcfpy.Call(
                                      "Sample1",
                                      OrderedDict([("GT", "0/1"),
                                                   ("DP", "47")]))
                              ])
        writer.write_record(record)
def main():
    """Look up bulk SNVs in a single-cell BAM and write per-cell calls.

    Command line: ``sample.bam barcodes.list variants.vcf sample1
    outdir/sample [--gt GT]``.  Every SNV of the input VCF (optionally only
    those whose first call has the requested genotype) is piled up in the
    BAM; reads are assigned to cells through their ``CB`` tag and per-cell
    depth, reference and alternate counts are collected.  One record with a
    call per barcode is written to ``<out_prefix>.snpseeker.vcf``.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)
    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)
    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)
    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)
    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)
    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)
    args = parser.parse_args()

    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # set copy for O(1) membership tests in the per-read loop
    cell_barcodes = set(samples)

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                              samples=vcfpy.SamplesInfos(samples))

    # sample header lines
    header_out.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", sample), ("Description", "Sample name")])))

    # filter header lines: one per genotype that can be used as a filter
    for filter_id in ("1/1", "0/1", "0/0"):
        header_out.add_filter_line(
            OrderedDict([("ID", filter_id), ("Number", "1"),
                         ("Description", "Filtered on such GT")]))

    # format header lines
    header_out.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
             )
        ]))
    header_out.add_format_line(
        OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Reference allele read depth")]))
    header_out.add_format_line(
        OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(
        OrderedDict([
            ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
            ("Description",
             "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
             )
        ]))

    # The BAM is opened once; the context managers close the BAM, the
    # input VCF and the output VCF even when an error occurs.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:
        # info header lines
        header_out.add_info_line(
            OrderedDict([("ID", "SUPP"), ("Number", "1"),
                         ("Type", "Integer"),
                         ("Description",
                          "Number of cells supporting the mutation.")]))
        # Use input FORMAT lines as output INFO lines
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="Allelic
            #            depths for the ref and alt alleles in the order
            #            listed">'
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about bulk mutation)" + mapping[
                "Description"]
            # register the prefixed mapping (not a fresh, unprefixed copy)
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                d = samples_dict(samples)
                supp = 0

                # optional genotype filter on the first (bulk) call
                if gt_filter and record_in.calls[0].data.get('GT') != gt:
                    continue
                # filter out indels: only interested in snvs in this
                # analysis phase
                if not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # to correct on 1-based positions
                ref = record_in.REF
                # record.ALT is a list by construction which contains only
                # one value if the mutation is a SNV
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom,pos)
                for pileupcolumn in samfile.pileup(chrom,
                                                   pos,
                                                   pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    for base in pileupcolumn.pileups:
                        # .is_del -> the base is a deletion
                        # .is_refskip -> the base is a N in the CIGAR string
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        tags = list_to_dict(base.alignment.tags)
                        # reads with no error-corrected barcode are discarded
                        if "CB" not in tags:
                            continue
                        cb = tags["CB"].split("-")[0]  # 10x barcodes
                        # barcode not labeled as belonging to a cell by
                        # cellranger (floating DNA)
                        if cb not in cell_barcodes:
                            continue
                        # update info for the sample identified by CB
                        d[cb]['dp'] += 1
                        observed = base.alignment.query_sequence[
                            base.query_position]
                        if observed == alt:
                            d[cb]['ad'] += 1
                        elif observed == ref:
                            d[cb]['rd'] += 1

                # NOTE(review): the original indentation was lost; the
                # per-record summary below is assumed to run once per input
                # record, after the pileup loop - confirm.
                for counts in d.values():
                    if counts['ad'] > 0:
                        supp += 1
                        # temporary, all the supported mutations are set
                        # to 0/1
                        counts['gt'] = "0/1"
                        counts['af'] = counts['ad'] / (counts['rd'] +
                                                       counts['ad'])

                # generate calls for each sample/cell
                calls = [
                    vcfpy.Call(
                        cb,
                        OrderedDict([("GT", counts['gt']),
                                     ("DP", counts['dp']),
                                     ("RD", counts['rd']),
                                     ("AD", counts['ad']),
                                     ("AF", counts['af'])]))
                    for cb, counts in d.items()
                ]

                # create a mapping between each FORMAT entry and the
                # corresponding value, in the call, in the input vcf file
                # note that the input vcf contains only one sample, so
                # the calls field of each record contains only one entry
                info_d = {'SUPP': supp}
                for f in record_in.FORMAT:
                    info_d[f] = record_in.calls[0].data.get(f)

                filter_l = [gt] if gt_filter else []

                # build and write the output record
                record_out = vcfpy.Record(
                    CHROM=chrom,
                    POS=pos + 1,
                    ID=[],
                    REF=ref,
                    ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                    QUAL=None,
                    FILTER=filter_l,
                    INFO=info_d,
                    FORMAT=["GT", "DP", "RD", "AD", "AF"],
                    calls=calls)
                writer.write_record(record_out)
def main():
    """Write a small two-sample example VCF to the path given on the command line.

    The header is assembled from scratch (general lines, FILTER, INFO,
    FORMAT, contig and SAMPLE lines) and a single hard-coded SNV record
    with calls for "Sample1" and "Sample2" is written.
    """
    parser = argparse.ArgumentParser(description="vcf writer")
    parser.add_argument("output",
                        metavar='output.vcf',
                        action='store',
                        help='vcf file.',
                        type=str)
    args = parser.parse_args()
    outvcf = args.output

    # ---- creating the header ----
    # The header can contain some fixed type lines (INFO, FORMAT, FILTER,
    # etc.) and some general ones.  Here it stores the program that
    # generated the file, the creation date and the analyzed samples.
    # "fileformat" is listed first because VCF requires it to be the first
    # line of the file and header lines are written in list order.
    header = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y"))
    ],
                          samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # Reference notes on valid header values:
    #   INFO_TYPES = ("Integer", "Float", "Flag", "Character", "String")
    #   FORMAT_TYPES = ("Integer", "Float", "Character", "String")
    #   VALID_NUMBERS = ("A", "R", "G", ".")  (besides integers)
    #   LINES_WITH_ID = ("ALT", "contig", "FILTER", "FORMAT", "INFO",
    #                    "META", "PEDIGREE", "SAMPLE")
    # Meaning of the non-integer "Number" entries:
    #   "A" number of alleles excluding reference
    #   "R" number of alleles including reference
    #   "G" number of genotypes
    #   "." unbounded number of values

    # adding filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # adding info lines
    header.add_info_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description",
                      "Raw read depth (without mapping quality filters)")]))
    header.add_info_line(
        OrderedDict([
            ("ID", "MUT"), ("Number", "1"), ("Type", "Integer"),
            ("Description",
             "States if the record mutation is supported (1) or not (0).")
        ]))

    # adding format lines
    header.add_format_line(
        OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                     ("Description", "Genotype")]))
    header.add_format_line(
        OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                     ("Description", "Filtered read depth (MAPQ > 30)")]))

    # adding contig lines
    # NOTE(review): the contig is declared as "chr1" while the record below
    # uses CHROM="1"; confirm which naming is intended.
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # adding sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # writing the vcf
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        # creating one record with one call per sample
        calls = [
            vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"),
                                               ("DP", "47")])),
            vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"),
                                               ("DP", "31")])),
        ]
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={
                                  "DP": "50",
                                  "MUT": 0
                              },
                              FORMAT=["GT", "DP"],
                              calls=calls)
        writer.write_record(record)
def main():
    """Look up the SNVs of a VCF in a BAM file and write the observed support.

    Command line: ``sample.bam file.vcf sample1 outdir/sample``.  Each SNV
    of the input VCF is piled up in the BAM; reads with MAPQ >= 30 that are
    neither deletions nor reference skips at the position are counted as
    reference or alternate support.  The result is written to
    ``<out_prefix>.snpseeker.vcf`` with one call for ``sample_name`` and the
    FORMAT values of the input's first call copied into INFO.
    """
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    args = parser.parse_args()

    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # build the header of the output vcf
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos([sample]))

    # sample header lines
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"), ("Description", "States if the mutation is supported (1) or not (0).")]))

    # adding format lines
    header_out.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "SDP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Samtools read depth (secondary alignments, PCR duplicates, unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"), ("Number", "1"), ("Type", "Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # The BAM is opened once; the context managers close the BAM, the
    # input VCF and the output VCF even when an error occurs.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:
        # Use input FORMAT lines as output INFO lines
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            # Example of a returned line:
            #   key   = 'FORMAT'
            #   value = '<ID=AD,Number=R,Type=Integer,Description="Allelic
            #            depths for the ref and alt alleles in the order
            #            listed">'
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = "(Info about mutation in the original vcf)" + mapping["Description"]
            # register the prefixed mapping (not a fresh, unprefixed copy)
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                # filter out indels: only interested in snvs in this
                # analysis phase
                if not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # to correct on 1-based positions
                ref = record_in.REF
                # record.ALT is a list by construction which contains only
                # one value if the mutation is a SNV
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom,pos)
                # NOTE(review): the original indentation was lost; the
                # record is assumed to be built and written inside this
                # loop, where its counters are defined, so a position
                # without pileup yields no output record - confirm.
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1, stepper='all', truncate=True, max_depth=10000):
                    # number of reads at this position
                    sdp = pileupcolumn.n
                    # supporting reads for the alternate / reference base
                    ad = 0
                    rd = 0
                    dp = 0
                    af = 0.0
                    for base in pileupcolumn.pileups:
                        # .is_del -> the base is a deletion
                        # .is_refskip -> the base is a N in the CIGAR string
                        if base.is_del or base.is_refskip or base.alignment.mapping_quality < 30:
                            continue
                        dp += 1
                        observed = base.alignment.query_sequence[base.query_position]
                        if observed == alt:
                            ad += 1
                        elif observed == ref:
                            rd += 1

                    if ad > 0:
                        af = ad / (rd + ad)
                        supp = 1
                        # temporary, all the supported mutations are set
                        # to 0/1
                        gt = "0/1"
                    else:
                        supp = 0
                        gt = "0/0"

                    # SUPP plus the FORMAT values of the input's first call
                    info_d = {'SUPP': supp}
                    for f in record_in.FORMAT:
                        info_d[f] = record_in.calls[0].data.get(f)

                    record_out = vcfpy.Record(
                        CHROM=chrom, POS=pos + 1, ID=[], REF=ref,
                        ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                        QUAL=None, FILTER=[], INFO=info_d,
                        FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                        calls=[vcfpy.Call(sample, OrderedDict([("GT", gt), ("SDP", sdp), ("DP", dp), ("RD", rd), ("AD", ad), ("AF", af)]))]
                    )
                    writer.write_record(record_out)
def main():
    """Collapse a single-cell VCF into one call per cluster.

    Command line: ``sample.muts.vcf clusters.list out/path/prefix``.  The
    clusters file is a CSV with the columns ``cluster`` and ``cellid``.  For
    every input record the DP, RD and AD values of the cells of each cluster
    are summed, and one record with a call per cluster is written to
    ``<outprefix>_clusters.vcf``.
    """
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store", help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store", help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store", help="Output prefix", type=str)
    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)
    # Cluster ids and the cells of each cluster do not depend on the record,
    # so they are computed once instead of once per record.
    cluster_ids = clusters_df['cluster'].unique()
    clusters = [str(cluster) for cluster in cluster_ids]
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }

    # Create out header
    header_out = vcfpy.Header(lines=[
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileDate", value=date.today().strftime("%d/%m/%Y"))
    ], samples=vcfpy.SamplesInfos(clusters))

    # format header lines
    header_out.add_format_line(OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"), ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([("ID", "DP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"), ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"), ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([("ID", "AF"), ("Number", "1"), ("Type", "Float"), ("Description", "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored.")]))

    # info header lines
    header_out.add_info_line(OrderedDict([("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"), ("Description", "Whether the mutation is supported or not.")]))

    # The context managers close the input and output VCF even when a
    # record cannot be processed.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf", header_out) as writer:
        # for each record in the vcf file
        for record_in in reader:
            d = samples_dict(cluster_ids)
            supp = 0

            # for each cluster compute 'GT:DP:RD:AD:AF' to be provided as
            # call argument
            for c in cluster_ids:
                counts = d[c]
                # sum total read count, alt read count and ref read count
                # of the cells in the cluster
                for cell in cells_by_cluster[c]:
                    cell_call = record_in.call_for_sample[cell]
                    counts['dp'] = counts['dp'] + cell_call.data.get('DP')
                    counts['rd'] = counts['rd'] + cell_call.data.get('RD')
                    counts['ad'] = counts['ad'] + cell_call.data.get('AD')
                if counts['ad'] > 0:
                    counts['gt'] = "0/1"
                    counts['af'] = counts['ad'] / (counts['rd'] + counts['ad'])
                    supp = 1

            # create one call for each cluster
            calls = [
                vcfpy.Call(str(c), OrderedDict([("GT", counts['gt']), ("DP", counts['dp']), ("RD", counts['rd']), ("AD", counts['ad']), ("AF", counts['af'])]))
                for c, counts in d.items()
            ]

            # write new record; only the first ALT allele is carried over
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM, POS=record_in.POS, ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=record_in.ALT[0].value)],
                QUAL=None, FILTER=[], INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"], calls=calls
            )
            writer.write_record(record_out)
def _build_sample_calls(samples, loci, contig):
    """Build one ``vcfpy.Call`` (GT and PS) per sample for the locus ``contig``.

    Samples listed under ``loci[contig]`` get the phased genotype
    ``"<hap1>|<hap2>"`` and their ``ps`` value; every other sample is
    reported as ``"0/0"`` with ``PS`` set to 0.
    """
    calls = []
    for sample in samples:
        sample_gt = "0/0"
        ps = 0
        contig_loci = loci[contig]
        if sample in contig_loci:
            phased = contig_loci[sample]
            sample_gt = phased["1"] + "|" + phased["2"]
            ps = phased["ps"]
        calls.append(
            vcfpy.Call(sample=sample, data=vcfpy.OrderedDict(GT=sample_gt, PS=ps))
        )
    return calls


def _write_contig_insertions(contig, cons_fasta, ref_fasta, samples, loci, writer,
                             flanked_contig_fasta, min_insertion_size, flank_length):
    """Align one consensus contig to its reference window and emit its insertions.

    Returns the number of VCF records written for this contig.
    """
    # Contig names look like "<chrom>_<start>_<end>". Split from the right so
    # that chromosome names which themselves contain "_" still parse.
    chrom, start, end = contig.rsplit("_", 2)
    start, end = int(start), int(end)

    cons_seq = cons_fasta.fetch(contig)
    ref_seq = ref_fasta.fetch(chrom, start, end)

    # The reference window is the alignment target; the consensus is the query.
    aligner = mappy.Aligner(
        seq=ref_seq, preset=None, k=15, w=10, n_threads=1,
        max_join_long=20000, max_join_short=10000,
        min_join_flank_sc=10, min_join_flank_ratio=0.1,
        max_gap=10000, bw=2000, end_bonus=10,
        zdrop=10000, zdrop_inv=1000,
        scoring=(2, 4, 4, 10, 300, 0, 1), extra_flags=0x1,
    )
    alignments = list(aligner.map(cons_seq, seq2=None, cs=True, MD=False))
    if not alignments:
        print("No hits in", contig)
        return 0

    # Only the alignment with the longest alignment block is examined.
    aln = max(alignments, key=lambda hit: hit.blen)
    cig = cigar.Cigar(aln.cigar_str)
    cigar_text = str(cig)

    cons_pos = aln.q_st      # position in the consensus (query)
    target_pos = aln.r_st    # position in the reference window (target)
    strand = "+"
    if aln.strand == -1:
        # NOTE(review): cons_pos still starts at aln.q_st after the sequence is
        # reverse-complemented; confirm whether '-' strand hits should instead
        # be measured from the other end of the consensus.
        cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
        strand = "-"

    n_written = 0
    for length, op in cig.items():
        if op == 'M':
            # matches consume both sequences
            cons_pos += length
            target_pos += length
        elif op == 'D':
            # deletions in the query consume the reference only
            target_pos += length
        elif op == 'I':
            # only large insertions are reported
            if length > min_insertion_size:
                # The anchor base is the reference base just before the
                # insertion (0-based index target_pos - 1 in the window).
                # NOTE(review): target_pos == 0 would wrap to the last base of
                # ref_seq; assumed not to occur for alignments produced here.
                ref_allele = ref_seq[target_pos - 1]
                alt_allele = cons_seq[cons_pos:cons_pos + length]
                # presumably `start` is 0-based, which makes this the 1-based
                # position of the anchor base - TODO confirm against the
                # producer of the contig names.
                break_point = start + target_pos
                record_id = contig + "_" + str(cons_pos)

                rec = vcfpy.Record(
                    CHROM=chrom,
                    POS=break_point,
                    ID=[record_id],
                    REF=ref_allele,
                    ALT=[vcfpy.Substitution("INS", ref_allele + alt_allele)],
                    QUAL=999,
                    FILTER=["PASS"],
                    INFO=vcfpy.OrderedDict(
                        SVLEN=length,
                        CIGAR=[cigar_text],
                        STRAND=strand,
                        CONTIG_START=str(aln.q_st),
                    ),
                    FORMAT=["GT", "PS"],
                    calls=_build_sample_calls(samples, loci, contig),
                )
                writer.write_record(rec)

                # Write the same insertion with reference flanks; fetch
                # intervals are [start, end[.
                if flank_length > 0:
                    # Clamp so an insertion near the chromosome start does not
                    # request a negative coordinate.
                    left_flank = ref_fasta.fetch(
                        chrom, max(0, break_point - flank_length), break_point)
                    right_flank = ref_fasta.fetch(
                        chrom, break_point, break_point + flank_length)
                else:
                    left_flank = ""
                    right_flank = ""
                # NOTE(review): alt_allele[1:] drops the first inserted base
                # although alt_allele does not include the anchor base here;
                # kept as-is, confirm whether this is intended.
                flanked_contig_fasta.writelines([
                    ">" + record_id + "\n",
                    left_flank + alt_allele[1:] + right_flank + "\n"])
                n_written += 1
            # every insertion, reported or not, consumes the query
            cons_pos += length
    return n_written


def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path, vcf_template_path, min_insertion_size, flank_length, flanked_contigs_path):
    """Call large insertions from consensus contigs and write them as VCF and FASTA.

    Each sequence in ``cons_path`` is aligned to the reference window encoded
    in its name (``<chrom>_<start>_<end>``). Every insertion in the best
    alignment that is longer than ``min_insertion_size`` is written as one VCF
    record and one FASTA entry.

    Args:
        contig_path: Passed to ``collect_genotypes`` to obtain the samples and
            the per-contig phased genotypes.
        cons_path: Indexed FASTA with the consensus contigs.
        ref_fasta_path: Indexed reference FASTA.
        vcf_out_path: Path of the VCF file to write.
        vcf_template_path: VCF whose header is reused for the output (its
            sample list is replaced by the collected samples).
        min_insertion_size: Insertions must be strictly longer than this.
        flank_length: Number of reference bases added on each side of the
            inserted sequence in the FASTA output; no flanks when <= 0.
        flanked_contigs_path: Path of the FASTA file to write.

    Returns:
        The number of insertion records written.
    """
    n_records = 0
    # Context managers guarantee that every input is released and, more
    # importantly, that both outputs are flushed and closed even on error.
    with pysam.FastaFile(cons_path) as cons_fasta, \
            pysam.FastaFile(ref_fasta_path) as ref_fasta, \
            open(flanked_contigs_path, "w") as flanked_contig_fasta:
        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        with vcfpy.Reader.from_path(vcf_template_path) as reader:
            reader.header.samples = vcfpy.SamplesInfos(list(samples))
            with vcfpy.Writer.from_path(vcf_out_path, reader.header) as writer:
                for contig in cons_fasta.references:
                    n_records += _write_contig_insertions(
                        contig, cons_fasta, ref_fasta, samples, loci, writer,
                        flanked_contig_fasta, min_insertion_size, flank_length)
    return n_records
logging.info("genotypes collected for : n = %s isolates, at %s", len(list(genoSample.keys())), datetime.datetime.now()) filteredEPIs = list(genoSample.keys()) filteredEPIs.sort() # filter out EPIs contains only low-freq variants #for acc in ['ISL_700228', 'ISL_539719', 'ISL_539706', 'ISL_539708']: # x = genoSample[acc] # print(acc, x) #sys.exit() ##################### # construct VCF records ############################# timePre = datetime.datetime.now() varCt = 0 header = vcfpy.Header( samples=vcfpy.SamplesInfos(filteredEPIs), lines=[ vcfpy.HeaderLine('fileformat', 'VCFv4.0'), vcfpy.HeaderLine('fileDate', str(datetime.datetime.now())), vcfpy.HeaderLine('source', parser.prog), vcfpy.ContigHeaderLine('contig', '<ID=String,Length=Integer>', { 'ID': 'EPI_ISL_406030', 'length': 29903 }), vcfpy.InfoHeaderLine( 'INFO', '<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">', { 'ID': 'NS', 'Number': 1, 'Type': 'Integer',