def main(args_input=sys.argv[1:]):
    """Write a copy of a VCF in which one sample carries a fixed GT value.

    Command-line arguments are parsed with ``define_parser()``; the reader and
    writer come from ``create_vcf_reader`` / ``create_vcf_writer``.  For every
    record, ``GT`` is added to FORMAT when missing, and ``args.genotype_value``
    is either stored on the existing call for ``args.sample_name`` or attached
    as a new call, depending on the flag returned by ``create_vcf_writer``.

    :param args_input: argument list to parse (defaults to the process
        arguments captured when the module was imported).
    """
    parser = define_parser()
    args = parser.parse_args(args_input)

    vcf_reader = create_vcf_reader(args)
    vcf_writer = None
    try:
        (vcf_writer, append_to_existing_sample) = create_vcf_writer(args, vcf_reader)

        for entry in vcf_reader:
            if "GT" not in entry.FORMAT:
                # A tuple FORMAT cannot be modified in place, so it is replaced
                # (presumably the placeholder for a record without FORMAT
                # column -- confirm against the vcfpy version in use).
                if isinstance(entry.FORMAT, tuple):
                    entry.FORMAT = ["GT"]
                else:
                    entry.FORMAT.insert(0, 'GT')

            if append_to_existing_sample:
                entry.call_for_sample[args.sample_name].data['GT'] = args.genotype_value
            else:
                new_sample_call = vcfpy.Call(args.sample_name, data={'GT': args.genotype_value})
                if entry.calls:
                    entry.calls.append(new_sample_call)
                else:
                    entry.calls = [new_sample_call]
                # Keep the sample -> call lookup in sync with the call list.
                entry.call_for_sample = {call.sample: call for call in entry.calls}

            vcf_writer.write_record(entry)
    finally:
        # Close both ends even when parsing or writing fails part-way, so the
        # output is flushed and no file handle is leaked.
        vcf_reader.close()
        if vcf_writer is not None:
            vcf_writer.close()
def test_reading_parse_nosample(tmpdir, nosample_vcf_file): """Read VCF file without samples, write file with samples.""" # Perform record-wise copying, saving results in records path_out = tmpdir.mkdir("output").join("output.vcf") with vcfpy.Reader.from_path(nosample_vcf_file) as reader: header = reader.header.copy() header.samples = vcfpy.SamplesInfos(["NA00001", "NA00002", "NA00003"]) with vcfpy.Writer.from_path(str(path_out), header) as writer: for record in reader: record.update_calls([ vcfpy.Call(sample, {}) for sample in ("NA00001", "NA00002", "NA00003") ]) record.add_format("GT", "./.") writer.write_record(record) expected = textwrap.dedent(""" ##fileformat=VCFv4.3 ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x> ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 14370 . G A 29 . . GT . . . 20 17330 . T A 3 . . GT . . . 20 1110696 . A G,T 67 . . GT . . . 20 1230237 . T . 47 . . GT . . . 20 1234567 . GTC G,GTCT 50 . . GT . . . """).lstrip() assert path_out.open("rt").read() == expected
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    """Merge the per-directory VCF files into one single-sample VCF.

    Every record's per-sample GT/HQ values are collapsed into one call named
    ``SAMPLE`` whose GT lists the observed genotypes joined by ``|`` and whose
    HQ lists the summed counts joined by ``,``.  Records are written grouped
    by chromosome (sorted by name) and ordered by position.

    :param dirs: iterated as a collection of directory paths (despite the
        ``str`` annotation); each is expected to contain ``vcf_filename``.
    :param vcf_filename: name of the VCF file inside each directory.
    :param output: path of the merged VCF; only written when at least one
        input VCF was read.

    NOTE(review): the record/call attributes used here (``r.samples``,
    ``x.data.GT``, ``Call(record, sample, data)``) look like the PyVCF
    interface -- confirm they exist on the vcfpy objects in use.
    """
    # Marker file name; NOTE(review): it is resolved relative to the current
    # working directory, not to each entry of ``dirs`` -- confirm intended.
    no_snp_found_filename = Path(f"{Path(vcf_filename).stem}.NO_SNPS_FOUND")
    snps_by_chrom = defaultdict(list)

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            if not no_snp_found_filename.exists():
                # Was a plain string, which logged the literal "{filename}".
                logger.info(f"VCF file {filename} does not exist. Skipping.")
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader(rf)

            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    if "|" not in x.data.GT:
                        # unphased/single genotype: HQ is a single count
                        c[x.data.GT] += x.data.HQ
                    else:
                        # phased genotype: HQ holds one count per allele
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                genotype = "|".join(str(k) for k in c)
                counts = ",".join(str(c[k]) for k in c)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    if reader is not None:
        # The last reader only serves as the header template for the writer.
        reader.samples = ["SAMPLE"]
        with open(output, "w") as f:
            writer = vcfpy.Writer(f, reader)
            for k in sorted(snps_by_chrom):
                v = snps_by_chrom[k]
                v.sort(key=lambda x: x[0])
                for _, rec in v:
                    writer.write_record(rec)
        print("Output written to:", output)
def get_sample_call(sample_name, records):
    """
    Build the Call of one sample at one location from zero, one or several
    records.

    With no records, the call carries GT, TRANCHE2 and VAF all set to None.
    Otherwise VAF is the value of ``get_average_vaf(records)``, GT is derived
    from that value with ``get_gt`` and TRANCHE2 comes from
    ``maximum_tranche(records)``.

    :param sample_name: name of the sample the call belongs to
    :param records: records found for the sample at this location (may be
        empty or None)
    :return: a ``vcfpy.Call`` for ``sample_name``
    """
    fields = vcfpy.OrderedDict((key, None) for key in ("GT", "TRANCHE2", "VAF"))
    if not records:
        return vcfpy.Call(sample=sample_name, data=fields)

    vaf = get_average_vaf(records)
    fields["GT"] = get_gt(vaf)
    fields["TRANCHE2"] = maximum_tranche(records)
    fields["VAF"] = vaf
    return vcfpy.Call(sample=sample_name, data=fields)
def _write_variants_data(self):
    """Write one VCF record per small variant yielded by ``_yield_smallvars``.

    Each record has a single ALT substitution and one call per entry of
    ``self.members`` carrying GT, GQ, AD and DP taken from the variant's
    ``genotype`` mapping (GT falls back to "./.", the others to None; a
    present AD value is wrapped in a one-element list).
    """
    for variant in self._yield_smallvars():
        # Classify by allele lengths: unequal -> indel, both 1 -> SNV,
        # equal but longer -> MNV.
        ref_len = len(variant.reference)
        alt_len = len(variant.alternative)
        if ref_len != alt_len:
            substitution_type = vcfpy.INDEL
        elif ref_len == 1:
            substitution_type = vcfpy.SNV
        else:
            substitution_type = vcfpy.MNV

        # One call per family member, in member order.
        member_calls = []
        for member in self.members:
            genotype = variant.genotype.get(member, {})
            depth_of_alt = genotype.get("ad", None)
            member_calls.append(
                vcfpy.Call(
                    member,
                    {
                        "GT": genotype.get("gt", "./."),
                        "GQ": genotype.get("gq", None),
                        "AD": None if depth_of_alt is None else [depth_of_alt],
                        "DP": genotype.get("dp", None),
                    },
                ))

        record = vcfpy.Record(
            variant.chromosome,
            variant.start,
            [],
            variant.reference,
            [vcfpy.Substitution(substitution_type, variant.alternative)],
            None,
            [],
            {},
            ["GT", "GQ", "AD", "DP"],
            member_calls,
        )
        self.vcf_writer.write_record(record)
def generate_records(locus, individuals, chrom, offset):
    """Generate VCF records for all mutations of the given locus.

    Reads ``locus["allele frequencies"]``, ``locus["individuals"]`` and
    ``locus["allele coverages"]``.  Each record carries INFO fields AF, DP
    and NS and one call (GT, DP, AD) per entry of ``individuals``.

    :param locus: mapping describing the locus (keys as listed above)
    :param individuals: names of the individuals to emit calls for
    :param chrom: value used as CHROM of every record
    :param offset: passed through to ``parse_mutation`` / ``parse_alleles``
    :return: list of ``vcfpy`` records

    NOTE(review): ``normalized_mutations`` is only bound inside the loop over
    non-reference alleles; the record loop further down reads whatever the
    last iteration left there and would fail with a NameError if no
    non-reference allele exists -- confirm the intended nesting.
    """
    records = []
    mutation_allele_frequencies = dict()
    mutation_sample_count = defaultdict(set)
    # assemble normalized and sorted list of mutations, each of which
    # will later receive one record in the VCF file
    # additionally, create a dictionary that maps mutations to their
    # allele frequency: the sum of all alleles with this specific mutation
    for allele, frequency in locus["allele frequencies"].items():
        # print("allele", allele)
        if allele == 0:
            # skip reference alleles
            continue
        else:
            # collect all mutated positions
            all_mutations = set()
            for ind in individuals:
                mutations_per_individual = set()
                try:
                    mutations_per_individual.update(
                        locus["individuals"][ind][allele]["mutations"])
                    # print(mutations_per_individual)
                    # remember which individuals carry each mutation (for NS)
                    for m in mutations_per_individual:
                        mutation_sample_count[parse_mutation(m, offset)].add (ind)
                    all_mutations.update(mutations_per_individual)
                except KeyError:
                    # this allele is not present in this individual
                    ...
            # normalize them and sort them by position in merged read
            # this is rad seq stacks specific
            normalized_mutations = sorted(
                [
                    parse_mutation(a, offset)
                    for a
                    in all_mutations
                ],
                key=lambda mut: mut.pos,
            )
            # accumulate the frequency of this allele onto each of its mutations
            for mut in normalized_mutations:
                if mut in mutation_allele_frequencies:
                    mutation_allele_frequencies[mut] += frequency
                else:
                    mutation_allele_frequencies[mut] = frequency
    # coverage per (individual, mutation); key (individual, 0) collects the
    # coverage of alleles without mutations
    individual_allele_coverage = defaultdict(lambda: 0)
    for ind_name, alleles in locus["individuals"].items():
        for name, allele in alleles.items():
            if not allele["mutations"]:
                individual_allele_coverage[ind_name, 0] += allele["cov"]
            else:
                for mut in allele["mutations"]:
                    individual_allele_coverage[ind_name, parse_mutation(mut, offset)] += allele["cov"]
    # TODO: make sure that there is no position with two different alt bases
    #       right now, these are not handled properly
    #
    # create one record for each mutation,
    # i.e. each variant at each mutated position
    # print("norm mut", normalized_mutations)
    for mut in normalized_mutations:
        info = OrderedDict()
        locus_calls = []
        # print(f"looking for {mut}")
        # per mutated pos -> record
        # round allele frequencies
        info["AF"] = [round(mutation_allele_frequencies[mut], 3)]
        # coverage of the variant site is the sum of all reads
        # of all individuals
        info["DP"] = sum(locus["allele coverages"].values())
        # number of samples with mutation
        info["NS"] = len(mutation_sample_count[mut])
        # check for each individual, if the reference base
        # or another base is present at this location
        for ind in individuals:
            individual_calls = OrderedDict()
            individual_alleles = parse_alleles(locus["individuals"][ind], offset)
            # print(f"norm individual alleles for {ind}", individual_alleles)
            # coverage for the individual is the sum of all reads covering
            # the site
            individual_calls["DP"] = sum(
                (i.cov for i in individual_alleles.values())
            )
            # get call strings
            allele_presence, allele_str = allele_present(individual_alleles, mut)
            individual_calls["GT"] = allele_str
            # print(individual_calls["GT"])
            # fill individual allele coverage as a tuple of
            # (coverage of ref allele, coverage of alt allele)
            # NOTE(review): for allele_presence == (1, 0) the tuple below is
            # (coverage of mut, coverage of ref), i.e. the reverse of the
            # order stated above -- confirm this is intended.
            try:
                if allele_presence is None:
                    ind_allele_cov = (0, 0)
                elif allele_presence == (0, 0):
                    ind_allele_cov = (individual_allele_coverage[(ind, 0)], 0)
                elif allele_presence == (1, 1):
                    ind_allele_cov = (0, individual_allele_coverage[(ind, mut)])
                elif allele_presence == (1, 0):
                    ind_allele_cov = (individual_allele_coverage[(ind, mut)], individual_allele_coverage[(ind, 0)])
                elif allele_presence == (0, 1):
                    ind_allele_cov = (individual_allele_coverage[(ind, 0)], individual_allele_coverage[(ind, mut)])
                else:
                    raise ValueError("Invalid mutation")
                individual_calls["AD"] = ind_allele_cov
            except KeyError:
                # individual_allele_coverage is a defaultdict, so the lookups
                # above insert 0 for unknown keys instead of raising KeyError
                print(individual_allele_coverage)
                raise
            # TODO: handle different variants of the same base on
            #       different alleles => REF = A, ALT = C,T, GT= 0|1|2
            locus_calls.append(vcfpy.Call(ind, individual_calls))
        rec = vcfpy.record.Record(
            CHROM=chrom,
            POS=mut.pos,
            ID=[""],
            REF=mut.ref,
            ALT=[vcfpy.Substitution("SNP", mut.alt)],
            QUAL="",
            FILTER=["PASS"],
            INFO=info,
            FORMAT=["GT", "DP", "AD"],
            calls=locus_calls
        )
        # print("Record:", rec)
        records.append( rec )
    return records
# Info dico_info = { "CALLNB":[nb_call], "CALLAF":["|".join(numpy.array(lst_af,dtype=str))], "CALLFILTER":["|".join(lst_filter).replace(" ","")], "CALLQUAL":["|".join(numpy.array(lst_qual,dtype=str))] } #***** FORMAT *****# lst_format_id = ['GT', 'DP', 'AF'] # Merge GT field try: set_gt.remove('./.') except: pass if len(set_gt)==1: field_gt = set_gt.pop() else: field_gt = "./." # Merge DP field field_dp = int(round(numpy.median(lst_dp),0)) # Merge AF field while lst_af.count(".")>0: lst_af.remove(".") field_af = float(round(numpy.median(lst_af),2)) # Create call dico_calls = [vcfpy.Call(dico_vcf[var_id]["sample"], {'GT':field_gt, 'DP':field_dp, 'AF':[field_af]})] #***** WRITE VARIANT *****# new_record = vcfpy.Record(chrom, pos, ".", ref, dico_vcf[var_id]["ALT"], field_qual, [field_filter], dico_info, lst_format_id, dico_calls) writer.write_record(new_record) writer.close() #***** POST-PROCESSING *****# # Sort sortVCF(pathMergeUnsortedVCF,pathMergeVCF) # Validate boolvalid,lst_errors = validateVCF(path_vcfvalidator,pathMergeVCF) if boolvalid==False: exit("🅴 🆁 🆁 🅾 🆁\n[Nk_mergeVCF] Validate VCF `"+os.path.basename(pathMergeVCF)+"`\n "+"\n ".join(lst_errors)) # bgzip cmd_bgzip = "bgzip -f "+pathMergeVCF
def _phase_to_genotype(phase):
    """Map the haplotype tag ("1", "2", anything else) to a GT string."""
    if phase == "1":
        return "1|0"
    if phase == "2":
        return "0|1"
    return "0/1"


def extract_vcf_records(
        sample_name,
        # input paths
        alignments_path,
        contigs_path,
        ref_fasta_path,
        vcf_template_path,
        # output paths
        vcf_out_path,
        selected_contigs_path,
        flanked_contigs_path,
        flank_length,
        min_insert_size):
    """Turn large insertions found in contig alignments into VCF records.

    For every primary alignment (``Hit`` == 0) the CIGAR is walked and each
    ``I`` operation longer than ``min_insert_size`` is written as a VCF
    record.  The contig carrying the insertion is written once to
    ``selected_contigs_path`` and the inserted sequence, flanked by
    ``flank_length`` reference bases on each side, to
    ``flanked_contigs_path``.

    The query name is expected to have six ``_``-separated fields:
    chromosome, window start, window end, phase set, phase and one further
    field that is not used.  The first two characters of the phase-set and
    phase fields are dropped.

    All opened files, the template reader and the VCF writer are closed on
    exit (also on error), so buffered output is flushed.

    :param sample_name: sample name used in the VCF header and contig names
    :param alignments_path: space-separated alignment table (columns Hit,
        QName, TStart, TEnd, QStart, QEnd, Strand, CIGAR)
    :param contigs_path: FASTA with the aligned contigs
    :param ref_fasta_path: reference FASTA
    :param vcf_template_path: VCF whose header is reused for the output
    :param vcf_out_path: output VCF path
    :param selected_contigs_path: output FASTA of contigs with insertions
    :param flanked_contigs_path: output FASTA of flanked insertions
    :param flank_length: number of reference bases added on each side
    :param min_insert_size: insertions must be strictly longer than this
    :return: number of VCF records written
    """
    n_records = 0
    contig_loci = set()

    with pysam.FastaFile(ref_fasta_path) as ref_fasta, \
            pysam.FastaFile(contigs_path) as contig_fasta, \
            open(selected_contigs_path, "w") as selected_contig_fasta, \
            open(flanked_contigs_path, "w") as flanked_contig_fasta:
        alns = pandas.read_csv(alignments_path, sep=" ")

        with vcfpy.Reader.from_path(vcf_template_path) as reader:
            reader.header.samples = vcfpy.SamplesInfos([sample_name])

            with vcfpy.Writer.from_path(vcf_out_path, reader.header) as writer:
                # parse each alignment and look for insertions above
                # min_insert_size
                for _, row in alns.iterrows():
                    # skip secondary alignments
                    if row["Hit"] > 0:
                        continue

                    query_name = row["QName"]

                    # local alignment window in the reference
                    (ref_chrom, ref_start, ref_end,
                     phase_set, phase, _unused) = query_name.split("_")
                    phase_set = phase_set[2:]
                    phase = phase[2:]
                    ref_start, ref_end = (int(ref_start), int(ref_end))

                    # alignment start for the reference / query sequence
                    target_start = row["TStart"]
                    query_start = row["QStart"]

                    # strand-ness of the query sequence
                    strand = row["Strand"]

                    # parse cigar for variant extraction
                    cig = cigar.Cigar(row["CIGAR"])

                    # convert sequences to the positive strand
                    query_seq = contig_fasta.fetch(query_name)
                    if strand == "-":
                        query_seq = str(
                            Bio.Seq.Seq(query_seq).reverse_complement())

                    ref_seq = ref_fasta.fetch(ref_chrom, ref_start, ref_end)

                    # cursors into query and reference while walking the cigar
                    query_pos = query_start
                    target_pos = target_start

                    for length, kind in cig.items():
                        if kind == 'M':
                            # matches consume both sequences
                            query_pos += length
                            target_pos += length
                        elif kind == 'D':
                            # deletions consume the reference only
                            target_pos += length
                        elif kind == 'I':
                            # only interested in large insertions
                            if length > min_insert_size:
                                # need to check conversion from 0-based
                                # coordinates to 1-based
                                ref_allele = ref_seq[target_pos]
                                alt_allele = ref_allele + query_seq[
                                    query_pos:query_pos + length]
                                break_point = ref_start + target_pos

                                rec = vcfpy.Record(
                                    CHROM=ref_chrom,
                                    POS=break_point + 1,
                                    ID=[query_name],
                                    REF=ref_allele,
                                    ALT=[vcfpy.Substitution("INS", alt_allele)],
                                    QUAL=999,
                                    FILTER=["PASS"],
                                    INFO={},
                                    FORMAT=[
                                        "GT", "SVLEN", "PS", "HP", "CIGAR",
                                        "STRAND", "CONTIG_START"
                                    ],
                                    calls=[
                                        vcfpy.Call(
                                            sample=sample_name,
                                            data=vcfpy.OrderedDict(
                                                GT=_phase_to_genotype(phase),
                                                SVLEN=length,
                                                PS=phase_set,
                                                HP=phase,
                                                CIGAR=str(cig),
                                                STRAND=strand,
                                                CONTIG_START=str(query_start)))
                                    ])
                                n_records += 1
                                writer.write_record(rec)

                                # output contig that contains this insertion
                                contig_locus = (
                                    ">" + query_name + "_" + sample_name)
                                contig_hash = sha1(
                                    "_{chrom}_{pos}_{alt}".format(
                                        chrom=ref_chrom,
                                        pos=ref_start,
                                        alt=alt_allele[1:]).encode()
                                ).hexdigest()
                                contig_name = (contig_locus + "_" + contig_hash
                                               + "_" + str(length))
                                if contig_locus not in contig_loci:
                                    selected_contig_fasta.writelines(
                                        [contig_name + "\n", query_seq + "\n"])
                                    contig_loci.add(contig_locus)

                                # output same insertion, but with flanking
                                # sequences; the interval is [start, end[
                                if flank_length > 0:
                                    left_flank = ref_fasta.fetch(
                                        ref_chrom, break_point - flank_length,
                                        break_point)
                                    right_flank = ref_fasta.fetch(
                                        ref_chrom, break_point,
                                        break_point + flank_length)
                                else:
                                    left_flank = ""
                                    right_flank = ""
                                flanked_contig_fasta.writelines([
                                    contig_name + "\n",
                                    left_flank + alt_allele[1:] + right_flank
                                    + "\n"
                                ])
                            # insertions of any size consume the query
                            query_pos += length

    return n_records
def write_haplotype_to_vcf(self, fake_genome_mapping_filename, isoform_tally,
                           output_prefix):
    """Write the haplotypes as ``<output_prefix>.vcf`` plus a readable table.

    ``self.get_haplotype_vcf_assignment`` must have been called first.

    Two files are produced: ``<output_prefix>.vcf``, with one record per
    entry of ``self.hap_var_positions`` and one "sample" per isoform, and
    ``<output_prefix>.human_readable.txt``, listing for every haplotype the
    count observed in each isoform.  A scratch file ``template.vcf`` is
    written to the current directory to obtain a VCF header.

    :param fake_genome_mapping_filename: CSV with lines
        ``fake_pos,ref_chr,ref_pos`` (0-based positions)
    :param isoform_tally: dict isoform name --> dict haplotype index --> count
    :param output_prefix: prefix of both output files
    :raises Exception: if the haplotype/VCF assignment has not been computed

    NOTE(review): ``rec.samples``, ``sample_indexes`` and
    ``Call(record, sample, data)`` look like the PyVCF interface -- confirm
    they are supported by the vcfpy version in use.
    """
    if self.haplotype_vcf_index is None or self.alt_at_pos is None:
        raise Exception(
            "Must call self.get_haplotype_vcf_assignment() first!")

    self.sanity_check()

    name_isoforms = sorted(isoform_tally.keys())

    # write a fake VCF example so we can read the headers in
    with open("template.vcf", "w") as f:
        f.write(__VCF_EXAMPLE__)

    # The template handle used to be opened inline and never closed.
    with open("template.vcf") as template_handle:
        reader = vcfpy.Reader(template_handle)
        reader.samples = name_isoforms
        f_vcf = vcfpy.Writer(f"{output_prefix}.vcf", reader)
        try:
            # human readable text:
            # first line: assoc VCF filename
            # second line: haplotype, list of sorted isoforms
            # third line onwards: haplotype and assoc count
            with open(f"{output_prefix}.human_readable.txt", "w") as f_human:
                f_human.write(f"Associated VCF file: {output_prefix}.vcf\n")
                f_human.write("haplotype\t{samples}\n".format(
                    samples="\t".join(name_isoforms)))
                for hap_index, hap_str in enumerate(self.haplotypes):
                    f_human.write(hap_str)
                    for _iso in name_isoforms:
                        if hap_index in isoform_tally[_iso]:
                            f_human.write(
                                f"\t{isoform_tally[_iso][hap_index]}")
                        else:
                            f_human.write("\t0")
                    f_human.write("\n")

            # read fake genome mapping file
            # 0-based position on fake --> (ref chr, 0-based ref position)
            fake_map = {}
            with open(fake_genome_mapping_filename) as f:
                for line in f:
                    fake_pos, ref_chr, ref_pos = line.strip().split(",")
                    fake_map[int(fake_pos)] = (ref_chr, int(ref_pos))

            # for each position, write out the ref and alt bases, then one
            # call per isoform (aka "sample") listing the allele index of each
            # of its haplotypes
            for pos in self.hap_var_positions:
                ref_chr, ref_pos = fake_map[pos]
                total_count = sum(self.count_of_vars_by_pos[pos].values())
                alt_freq = [
                    f"{self.count_of_vars_by_pos[pos][b] * 1.0 / total_count:.2f}"
                    for b in self.alt_at_pos[pos]
                ]
                rec = vcfpy.Record(
                    CHROM=ref_chr,
                    POS=ref_pos + 1,  # VCF positions are 1-based
                    ID=".",
                    REF=self.ref_at_pos[pos],
                    ALT=[vcfpy.Substitution(b) for b in self.alt_at_pos[pos]],
                    QUAL=".",
                    FILTER="PASS",
                    INFO={
                        "AF": alt_freq,
                        "DP": total_count
                    },
                    FORMAT="GT:HQ",
                    sample_indexes=None,
                )

                rec.samples = []
                for _iso in name_isoforms:
                    # isoform_tally[_iso] is a dict of haplotype index -->
                    # count; phases are always shown in sorted haplotype
                    # index order
                    hap_indices = sorted(isoform_tally[_iso].keys())
                    genotype = "|".join(
                        str(self.haplotype_vcf_index[hap_index][pos])
                        for hap_index in hap_indices)
                    counts = ",".join(
                        str(isoform_tally[_iso][hap_index])
                        for hap_index in hap_indices)
                    rec.samples.append(
                        vcfpy.Call(
                            rec, _iso,
                            vcfpy.OrderedDict([("GT", genotype),
                                               ("HQ", counts)])))
                f_vcf.write_record(rec)
        finally:
            # flush and release the output even if a record fails
            f_vcf.close()
def main():
    """Write a one-record demo VCF whose INFO header mirrors an input VCF.

    Command line: ``input.vcf output.vcf``.  The output header holds the
    source, fileformat and fileDate lines, one sample ("Sample1"), FORMAT
    lines for GT and DP, and one INFO line per FORMAT line found in the
    input header.  A single hard-coded record is then written.
    """
    cli = argparse.ArgumentParser(description="vcf writer")
    cli.add_argument("input",
                     metavar='input.vcf',
                     action='store',
                     help='vcf file.',
                     type=str)
    cli.add_argument("output",
                     metavar='output.vcf',
                     action='store',
                     help='vcf file.',
                     type=str)
    args = cli.parse_args()
    outvcf = args.output
    invcf = args.input

    # --- header -----------------------------------------------------------
    # General lines: the program that produced the file, the VCF version and
    # the creation date; the analysed sample is declared alongside.
    general_lines = [
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y")),
    ]
    header = vcfpy.Header(lines=general_lines,
                          samples=vcfpy.SamplesInfos(["Sample1"]))

    # FORMAT lines
    format_specs = (
        ("GT", "String", "Genotype"),
        ("DP", "Integer", "Filtered read depth (MAPQ > 30)"),
    )
    for field_id, field_type, description in format_specs:
        header.add_format_line(
            OrderedDict([("ID", field_id), ("Number", "1"),
                         ("Type", field_type),
                         ("Description", description)]))

    # INFO lines: every FORMAT line of the input header is re-declared as an
    # INFO line of the output header.  Each looked-up line exposes the raw
    # ``<ID=...,Number=...,Type=...,Description="...">`` text as ``.value``,
    # which ``str_to_mapping`` converts for ``add_info_line``.
    with vcfpy.Reader.from_path(invcf) as reader:
        for format_id in reader.header.format_ids():
            source_line = reader.header.get_format_field_info(format_id)
            header.add_info_line(str_to_mapping(source_line.value))

    # --- body -------------------------------------------------------------
    sample_call = vcfpy.Call("Sample1",
                             OrderedDict([("GT", "0/1"), ("DP", "47")]))
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=[],
                              INFO={},
                              FORMAT=["GT", "DP"],
                              calls=[sample_call])
        writer.write_record(record)
def _tally_pileup_by_barcode(samfile, chrom, pos, ref, alt, counts, barcodes):
    """Add the reads covering (chrom, pos) to the per-barcode dp/ad/rd counts.

    Deletions, reference skips and reads with mapping quality below 30 are
    ignored, as are reads without a ``CB`` tag or whose barcode (the part of
    the tag before the first "-") is not in ``barcodes``.
    """
    for pileupcolumn in samfile.pileup(chrom,
                                       pos,
                                       pos + 1,
                                       stepper='all',
                                       truncate=True,
                                       max_depth=10000):
        for base in pileupcolumn.pileups:
            # .is_del -> the base is a deletion
            # .is_refskip -> the base is an N in the CIGAR string
            if (base.is_del or base.is_refskip
                    or base.alignment.mapping_quality < 30):
                continue
            tags = list_to_dict(base.alignment.tags)
            if "CB" not in tags:
                # reads with no error-corrected barcode are discarded
                continue
            cb = tags["CB"].split("-")[0]  # 10x barcodes
            if cb not in barcodes:
                # the barcode has not been labelled as belonging to a cell
                continue
            counts[cb]['dp'] += 1
            observed = base.alignment.query_sequence[base.query_position]
            if observed == alt:
                counts[cb]['ad'] += 1
            elif observed == ref:
                counts[cb]['rd'] += 1


def main():
    """Look up bulk SNVs in a single-cell BAM and write per-cell calls.

    Command line: ``sample.bam barcodes.list variants.vcf sample1
    outdir/sample [--gt GT]``.  Every SNV of the input VCF (optionally only
    those whose first call has the requested GT) becomes one record of
    ``<out_prefix>.snpseeker.vcf`` with one call per barcode (GT, DP, RD, AD,
    AF).  INFO holds ``SUPP`` (number of barcodes with at least one
    alternate-allele read) plus the FORMAT values of the first call of the
    input record.
    """
    parser = argparse.ArgumentParser(
        description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam",
                        metavar='sample.bam',
                        action='store',
                        help='BAM file.',
                        type=str)
    parser.add_argument(
        "barcodes",
        metavar='barcodes.list',
        action='store',
        help=
        "File containing cell barcodes (the same used in the alignment file to identify cell reads).",
        type=str)
    parser.add_argument("vcf",
                        metavar='variants.vcf',
                        action='store',
                        help="VCF file storing BULK SNPs.",
                        type=str)
    parser.add_argument("sample_name",
                        metavar='sample1',
                        action='store',
                        help="Sample identifier.",
                        type=str)
    parser.add_argument("out_prefix",
                        metavar="outdir/sample",
                        action="store",
                        help="Output VCF file prefix.",
                        type=str)
    parser.add_argument(
        "--gt",
        metavar='1/1 (0/1)',
        choices=["0/0", "0/1", "1/1"],
        action='store',
        help=
        "Genotype filter: considers only mutations with the specified GT in the original vcf file.",
        type=str)

    args = parser.parse_args()

    bam = args.bam
    barcodes = args.barcodes
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"
    gt = args.gt
    gt_filter = bool(gt)

    with open(barcodes, "r") as f:
        samples = f.read().splitlines()
    # set for constant-time membership tests in the per-read loop
    barcode_set = set(samples)

    # The BAM was previously opened twice; a single handle is enough and is
    # now closed on every exit path.
    with pysam.AlignmentFile(bam, "rb") as samfile:
        # build the header of the output vcf
        header_out = vcfpy.Header(lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y"))
        ],
                                  samples=vcfpy.SamplesInfos(samples))

        # sample header lines
        header_out.add_line(
            vcfpy.SampleHeaderLine.from_mapping(
                OrderedDict([("ID", sample), ("Description",
                                              "Sample name")])))

        # filter header lines: one per genotype that --gt can select
        for filter_id in ("1/1", "0/1", "0/0"):
            header_out.add_filter_line(
                OrderedDict([("ID", filter_id), ("Number", "1"),
                             ("Description", "Filtered on such GT")]))

        # format header lines
        header_out.add_format_line(
            OrderedDict([("ID", "GT"), ("Number", "1"), ("Type", "String"),
                         ("Description", "Genotype (0/1, 0/0)")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
                ("Description",
                 "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)"
                 )
            ]))
        header_out.add_format_line(
            OrderedDict([("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Reference allele read depth")]))
        header_out.add_format_line(
            OrderedDict([("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
                         ("Description", "Alternate allele read depth")]))
        header_out.add_format_line(
            OrderedDict([
                ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
                ("Description",
                 "Allele frequency: AD/(RD+AD). Other alleles, in case of mutli-allelic regions, are ignored."
                 )
            ]))

        with vcfpy.Reader.from_path(invcf) as reader:
            # info header lines
            header_out.add_info_line(
                OrderedDict([("ID", "SUPP"), ("Number", "1"),
                             ("Type", "Integer"),
                             ("Description",
                              "Number of cells supporting the mutation.")]))

            # Use input FORMAT lines as output INFO lines.  ``.value`` holds
            # the raw ``<ID=...,Number=...,Type=...,Description="...">`` text.
            for format_id in reader.header.format_ids():
                format_line = reader.header.get_format_field_info(format_id)
                mapping = str_to_mapping(format_line.value)
                mapping["Description"] = "(Info about bulk mutation)" + mapping[
                    "Description"]
                # Register the prefixed mapping; the unmodified value used to
                # be re-parsed here, silently dropping the prefix.
                header_out.add_info_line(mapping)

            with vcfpy.Writer.from_path(outvcf, header_out) as writer:
                # for each mutation in the vcf file
                for record_in in reader:
                    if gt_filter:
                        # was ``record.calls`` -- an undefined name
                        if record_in.calls[0].data.get('GT') != gt:
                            continue
                    # filter out indels: only interested in snvs in this
                    # analysis phase
                    if not record_in.is_snv():
                        continue

                    d = samples_dict(samples)
                    supp = 0

                    chrom = record_in.CHROM
                    pos = record_in.POS - 1  # to correct on 1-based positions
                    ref = record_in.REF
                    # ALT is a list which, for a SNV, is expected to hold
                    # a single value
                    alt = record_in.ALT[0].value

                    _tally_pileup_by_barcode(samfile, chrom, pos, ref, alt, d,
                                             barcode_set)

                    for cb in d:
                        if d[cb]['ad'] > 0:
                            supp += 1
                            # temporary, all the supported mutations are set
                            # to 0/1
                            d[cb]['gt'] = "0/1"
                            d[cb]['af'] = d[cb]['ad'] / (d[cb]['rd'] +
                                                         d[cb]['ad'])

                    # generate calls for each sample/cell
                    calls = [
                        vcfpy.Call(
                            cb,
                            OrderedDict([("GT", d[cb]['gt']),
                                         ("DP", d[cb]['dp']),
                                         ("RD", d[cb]['rd']),
                                         ("AD", d[cb]['ad']),
                                         ("AF", d[cb]['af'])]))
                        for cb in d
                    ]

                    # map each FORMAT entry of the input record to the value
                    # of its first call (the input is expected to hold a
                    # single sample)
                    info_d = {'SUPP': supp}
                    for fmt in record_in.FORMAT:
                        info_d[fmt] = record_in.calls[0].data.get(fmt)

                    filter_l = [gt] if gt_filter else []

                    # build and write the output record
                    record_out = vcfpy.Record(
                        CHROM=chrom,
                        POS=pos + 1,
                        ID=[],
                        REF=ref,
                        ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                        QUAL=None,
                        FILTER=filter_l,
                        INFO=info_d,
                        FORMAT=["GT", "DP", "RD", "AD", "AF"],
                        calls=calls)
                    writer.write_record(record_out)
if site in genoSample[acc]: # is mutated # print(genoSample[acc][site]) if genoSample[acc][site] in geno: # alt is valid allele = geno[genoSample[acc][site]] # logging.info("alt assigned for %s at %s: %s", acc, site, allele) else: # alt is singleton/discarded logging.warning( "alt is singleton for %s at %s: assign ref allele", acc, site) # else: # logging.info("ref alleles assigned for %s at %s", acc, site) gt = str(allele) + "|" + str(allele) if args.diploid else str( allele) sampleCall = vcfpy.Call( sample=acc, data={'GT': gt}, # has to be string; diploid # data = {'GT': str(allele) }, # has to be string site=site) genoCalls.append(sampleCall) record = vcfpy.Record( CHROM=refEPI, POS=site, ID=snpInfo[site]['varID'], REF=snpInfo[site]['refNT'], ALT=subs, QUAL=None, FILTER=[], # PASS INFO={}, # consequence calls, locus, etc; a dict FORMAT=['GT'], # a list calls=genoCalls)
def write_snp_to_vcf(
    snp_filename: Path,
    vcf_filename: Path,
    genome_filename: Path,
    genome_d: LazyFastaReader = None,
) -> None:
    """Convert the records of a SNP file into VCF records.

    Consecutive SNP records that share a reference position (insertions) or
    that are both deletions (query base ".") are grouped and written as one
    VCF record; every record gets a single call with GT "0|1" for the query
    name of the first SNP record.  A scratch file ``template.vcf`` is written
    to the current directory to obtain a VCF header.

    :param snp_filename: input file read with ``SNPReader``
    :param vcf_filename: passed to ``vcfpy.Writer`` as the output
    :param genome_filename: FASTA used to build ``genome_d`` when it is None
    :param genome_d: already loaded genome, indexed by reference name

    NOTE(review): a group is only written when the *next* record starts a new
    group, so the final group appears never to be written, and ``f_vcf`` is
    never closed in this function -- confirm against callers.
    """
    # read the genome if genome_d is not given
    if genome_d is None:
        genome_d = LazyFastaReader(genome_filename)

    # read the first SNP record so we know the query name
    snp_reader = SNPReader(snp_filename)
    snp_rec = next(snp_reader)
    sample_name = snp_rec.query_name
    cur_recs = [snp_rec]
    genome_rec = genome_d[snp_rec.ref_name]

    # NOTE(review): the handle is positioned at the end of what was just
    # written when the reader is created (no seek back to the start) --
    # confirm the reader actually sees the template header.
    with open("template.vcf", "w+") as f:
        f.write(f"{__VCF_EXAMPLE__}\n")
        reader = vcfpy.Reader(f)
        reader.samples = [sample_name]
        f_vcf = vcfpy.Writer(vcf_filename, reader)

    for r1 in snp_reader:
        if r1.ref_pos == cur_recs[
                -1].ref_pos:  # multi-nt insertion, keep recording
            cur_recs.append(r1)
        elif (r1.query_base == "." and cur_recs[-1].query_base
              == "."):  # multi-nt deletion, keep recording
            cur_recs.append(r1)
        else:  # time to write out the current set of records
            # multiple records mean it could be:
            # 1. multi-nucleotide insertions
            # 2. multi-nucleotide deletions

            if (len(cur_recs) == 1 and cur_recs[0].ref_base != "."
                    and cur_recs[0].query_base != "."):  # just a SNP record
                pos = cur_recs[0].ref_pos
                ref_base = cur_recs[0].ref_base
                alt_base = cur_recs[0].query_base
            elif cur_recs[0].ref_base == ".":
                # is a single or multi-nt insertions, must retrieve ref base from genome
                # ex: in out.snps_files it is . --> ATG
                # in VCF it should be T --> TATG (meaning insertion of ATG)
                pos = cur_recs[0].ref_pos
                ref_base = genome_rec[cur_recs[0].ref_pos]
                alt_base = ref_base + "".join(r.query_base for r in cur_recs)
            else:
                # is a single multi-nt deletions, we need to get one more ref base before the first deletion
                # ex: in out.snps_files it is GGG --> deletion
                # in VCF it should be TGGG --> T (meaning deletion of GGG)
                pos = cur_recs[0].ref_pos - 1
                ref_base_prev = genome_rec[pos]
                ref_base = ref_base_prev + "".join(r.ref_base
                                                   for r in cur_recs)
                alt_base = ref_base_prev

            # NOTE(review): CHROM is always taken from the first SNP record
            # of the file (snp_rec), not from the group being written --
            # confirm for inputs spanning several references.
            rec = vcfpy.Record(
                CHROM=snp_rec.ref_name,
                POS=pos + 1,
                ID=".",
                REF=ref_base,
                ALT=[vcfpy.Substitution(alt_base)],
                QUAL=".",
                FILTER="PASS",
                INFO={"AF": 0.5},
                FORMAT="GT",
                sample_indexes=None,
            )

            rec.samples.append(
                vcfpy.Call(rec, sample_name,
                           vcfpy.OrderedDict([("GT", "0|1")])))
            f_vcf.write_record(rec)
            # switch to the sequence of the next reference when it changes
            if r1.ref_name != cur_recs[0].ref_name:
                genome_rec = genome_d[r1.ref_name]
            cur_recs = [r1]
def main():
    """Write a demo VCF with a hand-built header and one two-sample record.

    Command line: ``output.vcf``.  The header declares the source,
    fileformat and fileDate lines, the samples "Sample1" and "Sample2", a
    PASS filter, INFO lines DP and MUT, FORMAT lines GT and DP, the contig
    "chr1" and a SAMPLE line for "Sample1".
    """
    cli = argparse.ArgumentParser(description="vcf writer")
    cli.add_argument("output",
                     metavar='output.vcf',
                     action='store',
                     help='vcf file.',
                     type=str)
    outvcf = cli.parse_args().output

    # --- header -----------------------------------------------------------
    # General lines: the program that produced the file, the VCF version and
    # the creation date; the analysed samples are declared alongside.
    general_lines = [
        vcfpy.HeaderLine(key="source", value=sys.argv[0]),
        vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
        vcfpy.HeaderLine(key="fileDate",
                         value=date.today().strftime("%d/%m/%Y")),
    ]
    header = vcfpy.Header(lines=general_lines,
                          samples=vcfpy.SamplesInfos(["Sample1", "Sample2"]))

    # filter lines
    header.add_filter_line(
        OrderedDict([("ID", "PASS"), ("Description", "All filters passed")]))

    # info lines
    info_specs = (
        ("DP", "Raw read depth (without mapping quality filters)"),
        ("MUT", "States if the record mutation is supported (1) or not (0)."),
    )
    for info_id, description in info_specs:
        header.add_info_line(
            OrderedDict([("ID", info_id), ("Number", "1"),
                         ("Type", "Integer"),
                         ("Description", description)]))

    # format lines
    format_specs = (
        ("GT", "String", "Genotype"),
        ("DP", "Integer", "Filtered read depth (MAPQ > 30)"),
    )
    for format_id, value_type, description in format_specs:
        header.add_format_line(
            OrderedDict([("ID", format_id), ("Number", "1"),
                         ("Type", value_type),
                         ("Description", description)]))

    # contig lines
    header.add_contig_line(
        OrderedDict([("ID", "chr1"), ("length", "248956422")]))

    # sample lines
    header.add_line(
        vcfpy.SampleHeaderLine.from_mapping(
            OrderedDict([("ID", "Sample1"), ("Description", "Tumor")])))

    # --- body -------------------------------------------------------------
    calls = [
        vcfpy.Call("Sample1", OrderedDict([("GT", "0/1"), ("DP", "47")])),
        vcfpy.Call("Sample2", OrderedDict([("GT", "0/1"), ("DP", "31")])),
    ]
    with vcfpy.Writer.from_path(outvcf, header) as writer:
        record = vcfpy.Record(CHROM="1",
                              POS=1,
                              ID=[],
                              REF="C",
                              ALT=[vcfpy.Substitution(type_="SNV", value="G")],
                              QUAL=None,
                              FILTER=["PASS"],
                              INFO={
                                  "DP": "50",
                                  "MUT": 0
                              },
                              FORMAT=["GT", "DP"],
                              calls=calls)
        writer.write_record(record)
def main():
    """Check which SNVs of a VCF file are supported by reads of a BAM file.

    Command-line arguments: the BAM file, the VCF file storing the SNPs, a
    sample identifier and an output prefix. The result is written to
    ``<out_prefix>.snpseeker.vcf``.

    For every SNV record of the input VCF the pileup column at that position
    is inspected. Reads that are deletions or reference skips at the site, or
    whose mapping quality is below 30, are ignored; the remaining reads are
    counted as reference (RD) or alternate (AD) support. One output record is
    written per pileup column returned by pysam. The FORMAT values of the
    first call of the input record are copied into the INFO column of the
    output record.
    """
    parser = argparse.ArgumentParser(description="Looks for a given set of SNPs whithin a bam file.")
    parser.add_argument("bam", metavar='sample.bam', action='store',
                        help='BAM file.', type=str)
    parser.add_argument("vcf", metavar='file.vcf', action='store',
                        help="VCF file storing SNPs.", type=str)
    parser.add_argument("sample_name", metavar='sample1', action='store',
                        help="Sample identifier.", type=str)
    parser.add_argument("out_prefix", metavar="outdir/sample", action="store",
                        help="Output VCF file prefix.", type=str)
    args = parser.parse_args()

    bam = args.bam
    invcf = args.vcf
    sample = args.sample_name
    outvcf = args.out_prefix + ".snpseeker.vcf"

    # build the header of the output vcf
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos([sample]),
    )

    # sample header lines
    header_out.add_line(vcfpy.HeaderLine(key="SampleName", value=sample))

    # info header lines
    header_out.add_info_line(OrderedDict([
        ("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "States if the mutation is supported (1) or not (0).")]))

    # adding format lines
    header_out.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "SDP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Samtools read depth (secondary alignments, PCR duplicates, "
         "unppammed reads and reads not passing vendor QC are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
        ("Description",
         "Allele frequency: AD/(RD+AD). Other alleles, in case of "
         "mutli-allelic regions, are ignored.")]))

    # The BAM file is opened once, before any output is created, and every
    # handle is closed even when an error interrupts the processing.
    with pysam.AlignmentFile(bam, "rb") as samfile, \
            vcfpy.Reader.from_path(invcf) as reader:
        # Every FORMAT line of the input VCF is re-declared as an INFO line
        # of the output, with a prefix added to its description.
        # Example of a format line returned by get_format_field_info:
        #   FormatHeaderLine('FORMAT',
        #       '<ID=AD,Number=R,Type=Integer,Description="Allelic depths ...">',
        #       {'ID': 'AD', 'Number': 'R', 'Type': 'Integer',
        #        'Description': 'Allelic depths ...'})
        for format_id in reader.header.format_ids():
            format_line = reader.header.get_format_field_info(format_id)
            mapping = str_to_mapping(format_line.value)
            mapping["Description"] = ("(Info about mutation in the original vcf)"
                                      + mapping["Description"])
            # register the modified mapping (not a freshly parsed copy), so
            # that the prefixed description actually reaches the header
            header_out.add_info_line(mapping)

        # open the output vcf
        with vcfpy.Writer.from_path(outvcf, header_out) as writer:
            # for each mutation in the vcf file
            for record_in in reader:
                # only SNVs are analysed in this phase; records without any
                # ALT allele are skipped as well since ALT[0] is needed below
                if not record_in.ALT or not record_in.is_snv():
                    continue

                chrom = record_in.CHROM
                pos = record_in.POS - 1  # to correct on 1-based positions
                ref = record_in.REF
                # only the first alternate allele is considered
                alt = record_in.ALT[0].value

                # look for the pileup in the samfile at position (chrom, pos)
                for pileupcolumn in samfile.pileup(chrom, pos, pos + 1,
                                                   stepper='all',
                                                   truncate=True,
                                                   max_depth=10000):
                    # number of reads at this position
                    sdp = pileupcolumn.n
                    ad = 0    # reads supporting the alternate base
                    rd = 0    # reads supporting the reference base
                    dp = 0    # reads passing the filters below
                    af = 0.0

                    for base in pileupcolumn.pileups:
                        # .is_del     -> the base is a deletion
                        # .is_refskip -> the base is a N in the CIGAR string
                        if (base.is_del or base.is_refskip
                                or base.alignment.mapping_quality < 30):
                            continue
                        dp += 1
                        read_base = base.alignment.query_sequence[base.query_position]
                        if read_base == alt:
                            ad += 1
                        elif read_base == ref:
                            rd += 1

                    if ad > 0:
                        af = ad / (rd + ad)
                        supp = 1
                        # temporary, all the supported mutations are set to 0/1
                        gt = "0/1"
                    else:
                        supp = 0
                        gt = "0/0"

                    info_d = {'SUPP': supp}
                    for f in record_in.FORMAT:
                        info_d[f] = record_in.calls[0].data.get(f)

                    record_out = vcfpy.Record(
                        CHROM=chrom,
                        POS=pos + 1,
                        ID=[],
                        REF=ref,
                        ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                        QUAL=None,
                        FILTER=[],
                        INFO=info_d,
                        FORMAT=["GT", "SDP", "DP", "RD", "AD", "AF"],
                        calls=[vcfpy.Call(sample, OrderedDict([
                            ("GT", gt), ("SDP", sdp), ("DP", dp),
                            ("RD", rd), ("AD", ad), ("AF", af)]))],
                    )
                    writer.write_record(record_out)
def main():
    """Aggregate the per-cell calls of a single-cell VCF into per-cluster calls.

    Command-line arguments: the single-cell VCF file, a CSV file with the
    columns ``cluster`` and ``cellid``, and an output prefix. The result is
    written to ``<outprefix>_clusters.vcf`` with one sample per cluster.

    For every input record the DP, RD and AD values of the cells belonging
    to a cluster are summed. A cluster with at least one alternate read gets
    genotype 0/1 and allele frequency AD/(RD+AD); the record's SUPP flag is 1
    as soon as one cluster supports the mutation.
    """
    parser = argparse.ArgumentParser(description="From single cell VCF to clones vcf.")
    parser.add_argument("input1", metavar="sample.muts.vcf", action="store",
                        help="Single cell VCF file.", type=str)
    parser.add_argument("input2", metavar="clusters.list", action="store",
                        help="Clusters list.", type=str)
    parser.add_argument("outprefix", metavar="out/path/prefix", action="store",
                        help="Output prefix", type=str)
    args = parser.parse_args()

    input1 = args.input1
    input2 = args.input2
    prefix = args.outprefix

    clusters_df = pd.read_csv(input2)
    # The cluster ids and the cells of each cluster do not depend on the
    # record being processed, so they are computed once up front.
    cluster_ids = clusters_df['cluster'].unique()
    cells_by_cluster = {
        c: list(clusters_df['cellid'][clusters_df['cluster'] == c])
        for c in cluster_ids
    }
    clusters = [str(cluster) for cluster in cluster_ids]

    # Create out header
    header_out = vcfpy.Header(
        lines=[
            vcfpy.HeaderLine(key="fileformat", value="VCFv4.3"),
            vcfpy.HeaderLine(key="source", value=sys.argv[0]),
            vcfpy.HeaderLine(key="fileDate",
                             value=date.today().strftime("%d/%m/%Y")),
        ],
        samples=vcfpy.SamplesInfos(clusters),
    )

    # format header lines
    header_out.add_format_line(OrderedDict([
        ("ID", "GT"), ("Number", "1"), ("Type", "String"),
        ("Description", "Genotype (0/1, 0/0)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "DP"), ("Number", "1"), ("Type", "Integer"),
        ("Description",
         "Filtered read depth (reads with MAPQ < 30, indels and gaps are filtered)")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "RD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Reference allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AD"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Alternate allele read depth")]))
    header_out.add_format_line(OrderedDict([
        ("ID", "AF"), ("Number", "1"), ("Type", "Float"),
        ("Description",
         "Allele frequency: AD/(RD+AD). Other alleles, in case of "
         "mutli-allelic regions, are ignored.")]))

    # info header lines
    header_out.add_info_line(OrderedDict([
        ("ID", "SUPP"), ("Number", "1"), ("Type", "Integer"),
        ("Description", "Whether the mutation is supported or not.")]))

    # Both files are closed even when an error interrupts the processing.
    with vcfpy.Reader.from_path(input1) as reader, \
            vcfpy.Writer.from_path(prefix + "_clusters.vcf", header_out) as writer:
        # for each record in the vcf file
        for record_in in reader:
            # fresh per-cluster accumulators; presumably a mapping
            # cluster -> {'gt', 'dp', 'rd', 'ad', 'af'} with neutral start
            # values -- verify against samples_dict
            d = samples_dict(cluster_ids)
            supp = 0
            # only the first alternate allele is considered
            alt = record_in.ALT[0].value

            # for each cluster compute 'GT:DP:RD:AD:AF'
            for c in cluster_ids:
                totals = d[c]
                # sum total, ref and alt read counts of the cluster's cells
                for cell in cells_by_cluster[c]:
                    data = record_in.call_for_sample[cell].data
                    # a cell without a value yields None here (presumably
                    # also for "." entries); count it as zero reads instead
                    # of failing on the addition
                    totals['dp'] = totals['dp'] + (data.get('DP') or 0)
                    totals['rd'] = totals['rd'] + (data.get('RD') or 0)
                    totals['ad'] = totals['ad'] + (data.get('AD') or 0)
                if totals['ad'] > 0:
                    totals['gt'] = "0/1"
                    totals['af'] = totals['ad'] / (totals['rd'] + totals['ad'])
                    supp = 1

            # create one call for each cluster
            calls = [
                vcfpy.Call(str(c), OrderedDict([
                    ("GT", d[c]['gt']), ("DP", d[c]['dp']),
                    ("RD", d[c]['rd']), ("AD", d[c]['ad']),
                    ("AF", d[c]['af'])]))
                for c in d.keys()
            ]
            # NOTE(review): looks like leftover debug output; kept so that
            # the program's stdout is unchanged
            print(calls)

            # write new record
            record_out = vcfpy.Record(
                CHROM=record_in.CHROM,
                POS=record_in.POS,
                ID=[],
                REF=record_in.REF,
                ALT=[vcfpy.Substitution(type_="SNV", value=alt)],
                QUAL=None,
                FILTER=[],
                INFO={"SUPP": supp},
                FORMAT=["GT", "DP", "RD", "AD", "AF"],
                calls=calls,
            )
            writer.write_record(record_out)
def extract_consensus_insertions(contig_path, cons_path, ref_fasta_path, vcf_out_path,
                                 vcf_template_path, min_insertion_size, flank_length,
                                 flanked_contigs_path):
    """Call large insertions from consensus contigs and write them as VCF.

    Every sequence of the consensus FASTA is aligned against the reference
    interval encoded in its name (``<chrom>_<start>_<end>``). For the
    alignment with the largest ``blen`` the CIGAR is walked, and each
    insertion longer than ``min_insertion_size`` becomes one VCF record plus
    one entry in the flanked-contig FASTA.

    Args:
        contig_path: input handed to ``collect_genotypes`` to obtain the
            samples and the phased genotypes per locus.
        cons_path: indexed FASTA file with the consensus sequences.
        ref_fasta_path: indexed reference FASTA file.
        vcf_out_path: path of the VCF file to write.
        vcf_template_path: VCF file whose header is reused for the output
            (its sample list is replaced).
        min_insertion_size: insertions must be strictly longer than this to
            be reported.
        flank_length: number of reference bases added on each side of the
            insertion in the flanked FASTA; no flanks if not positive.
        flanked_contigs_path: path of the flanked-contig FASTA to write.

    Returns:
        The number of insertion records written.
    """
    n_records = 0

    # All files are closed (and thereby flushed) when the function returns
    # or fails; previously the VCF writer was never closed.
    with pysam.FastaFile(cons_path) as cons_fasta, \
            pysam.FastaFile(ref_fasta_path) as ref_fasta, \
            open(flanked_contigs_path, "w") as flanked_contig_fasta:
        (samples, loci) = collect_genotypes(contig_path)
        print("Found", len(samples), "samples for", len(loci), "phased loci")

        # only the header of the template is needed
        with vcfpy.Reader.from_path(vcf_template_path) as reader:
            header = reader.header
            header.samples = vcfpy.SamplesInfos(list(samples))

        with vcfpy.Writer.from_path(vcf_out_path, header) as writer:
            for contig in cons_fasta.references:
                # parse coordinates; split from the right so that chromosome
                # names containing underscores are handled as well
                (chrom, start, end) = contig.rsplit("_", 2)
                (start, end) = int(start), int(end)

                cons_seq = cons_fasta.fetch(contig)
                ref_seq = ref_fasta.fetch(chrom, start, end)

                aligner = mappy.Aligner(seq=ref_seq, preset=None, k=15, w=10,
                                        n_threads=1, max_join_long=20000,
                                        max_join_short=10000,
                                        min_join_flank_sc=10,
                                        min_join_flank_ratio=0.1,
                                        max_gap=10000, bw=2000, end_bonus=10,
                                        zdrop=10000, zdrop_inv=1000,
                                        scoring=(2, 4, 4, 10, 300, 0, 1),
                                        extra_flags=0x1)
                alignments = list(aligner.map(cons_seq, seq2=None, cs=True, MD=False))
                if not alignments:
                    print("No hits in", contig)
                    continue

                # keep the alignment with the largest blen
                aln = max(alignments, key=lambda x: x.blen)
                cig = cigar.Cigar(aln.cigar_str)

                cons_pos = aln.q_st
                target_pos = aln.r_st
                strand = "+"
                if aln.strand == -1:
                    cons_seq = str(Bio.Seq.Seq(cons_seq).reverse_complement())
                    strand = "-"
                    # q_st/q_en refer to the original query strand while the
                    # CIGAR follows the reference, i.e. the reverse-complemented
                    # query; the walk therefore starts at len - q_en
                    cons_pos = len(cons_seq) - aln.q_en

                for (length, code) in cig.items():
                    if code == 'M':
                        # matches advance both sequences
                        cons_pos += length
                        target_pos += length
                    elif code == 'D':
                        # deletions in the query advance the reference only
                        target_pos += length
                    elif code == 'I':
                        # only interested in large insertions
                        if length > min_insertion_size:
                            # need to check conversion from 0-based
                            # coordinates to 1-based
                            ref_allele = ref_seq[target_pos - 1]
                            alt_allele = cons_seq[cons_pos:cons_pos + length]
                            break_point = start + target_pos

                            rec = vcfpy.Record(
                                CHROM=chrom,
                                POS=break_point,
                                ID=[contig + "_" + str(cons_pos)],
                                REF=ref_allele,
                                ALT=[vcfpy.Substitution("INS", ref_allele + alt_allele)],
                                QUAL=999,
                                FILTER=["PASS"],
                                INFO=vcfpy.OrderedDict(SVLEN=length,
                                                       CIGAR=[str(cig)],
                                                       STRAND=strand,
                                                       CONTIG_START=str(aln.q_st)),
                                FORMAT=["GT", "PS"],
                                calls=_insertion_calls(samples, loci[contig]))
                            writer.write_record(rec)

                            # output same insertion, but with flanking
                            # sequences; note, the interval is [start, end[
                            if flank_length > 0:
                                left_flank = ref_fasta.fetch(
                                    chrom, break_point - flank_length, break_point)
                                right_flank = ref_fasta.fetch(
                                    chrom, break_point, break_point + flank_length)
                            else:
                                left_flank = ""
                                right_flank = ""

                            # NOTE(review): alt_allele holds only the inserted
                            # bases, so [1:] drops the first inserted base --
                            # confirm whether this is intended
                            flanked_contig_fasta.writelines([
                                ">" + contig + "_" + str(cons_pos) + "\n",
                                left_flank + alt_allele[1:] + right_flank + "\n"])

                            n_records += 1
                        cons_pos += length

    return n_records


def _insertion_calls(samples, contig_loci):
    """Build one call per sample for an insertion found on a contig.

    Samples present in ``contig_loci`` get the phased genotype made of their
    entries "1" and "2" and their "ps" value; all others get 0/0 with PS 0.
    """
    calls = []
    for sample in samples:
        sample_gt = "0/0"
        ps = 0
        if sample in contig_loci:
            sample_gt = contig_loci[sample]["1"] + "|" + contig_loci[sample]["2"]
            ps = contig_loci[sample]["ps"]
        calls.append(vcfpy.Call(sample=sample,
                                data=vcfpy.OrderedDict(GT=sample_gt, PS=ps)))
    return calls