# merge_sv(raw, SVs, minOvr): return the entries of `raw` that survive merging.
#
# raw    -- iterable of 4-item records unpacked as (chrom, start, end, svtype).
# SVs    -- mapping of locus -> object exposing `.med_alt`; collected into `non`
#           and handed to merging().
# minOvr -- passed through unchanged to first_merge() and check_merge().
#
# Flow visible in the code:
#   1. Records whose svtype contains 'DEL' / 'DUP' are loaded into two separate
#      BedTool collections, with the chromosome passed through format_chrom().
#   2. For each collection, first_merge() is called once, then merging() is
#      applied to the output of check_merge() for as long as check_merge()
#      returns a non-empty result. check_merge() is therefore called twice per
#      iteration (once in the loop test, once for the merge input).
#   3. Coordinates from dels, dels_nomerge, dups, dups_nomerge and premerge are
#      passed through tokenize_sv() and stored as keys of `result`.
#   4. Each record of `raw` is kept when (format_chrom(chrom), start, end,
#      svtype) is a key of `result`.
#
# NOTE(review): the source formatting has been collapsed onto one line, so it is
# not possible to tell here whether `for x in dels: premerge.append(x)` (and the
# DUP equivalent) runs inside the `while` loop or once after it -- confirm
# against the original layout before restructuring.
# NOTE(review): `result` is keyed by whatever tokenize_sv() returns, while the
# lookup key `_sv` is a 4-tuple that includes svtype. If the tokenize_sv() in
# scope returns only (chrom, start, end), the lookup can never match and this
# function always returns [] -- TODO confirm.
def merge_sv(raw, SVs, minOvr): non, result, merged, premerge = {}, {}, [], [] for locus in SVs: non[locus] = SVs[locus].med_alt dels = BedTool([(format_chrom(x[0]), x[1], x[2], x[3]) for x in raw if 'DEL' in x[3]]) dups = BedTool([(format_chrom(x[0]), x[1], x[2], x[3]) for x in raw if 'DUP' in x[3]]) dels_nomerge = first_merge(dels, minOvr) while len(check_merge(dels, minOvr)) > 0: dels = merging(check_merge(dels, minOvr), non) for x in dels: premerge.append(x) dups_nomerge = first_merge(dups, minOvr) while len(check_merge(dups, minOvr)) > 0: dups = merging(check_merge(dups, minOvr), non) for x in dups: premerge.append(x) premerge = set(premerge) for sv in list( itertools.chain(dels, dels_nomerge, dups, dups_nomerge, premerge)): sv = tokenize_sv(tuple(sv)) result[sv] = 1 for sv in raw: chrom, start, end, svtype = sv _sv = (format_chrom(chrom), start, end, svtype) if result.get(_sv) != None: merged.append(sv) return merged
def output(Structural_Variant, SVs, Ped, ids, gen, ofh, anno_flag, tmp_dir):
    """Write every record of ``Structural_Variant.raw`` to a VCF file.

    Args:
        Structural_Variant: object exposing ``raw`` (iterable of
            (chrom, start, end, svtype) records), ``variant_id`` (mapping of
            record -> identifier, or None) and ``par`` (mapping of locus ->
            flag, read only for chrX/chrY loci).
        SVs: mapping of locus -> genotyped variant, forwarded to
            ``VCF.load_genotypes``.
        Ped: object exposing ``males``, a mapping keyed by sample id.
        ids: ordered sample identifiers; one genotype column is written each.
        gen: genome build label, forwarded to the VCF helper.
        ofh: output path; ``.vcf`` is appended when it is missing.
        anno_flag: forwarded to ``VCF.load_genotypes`` as its ``no_anno``
            argument.
        tmp_dir: scratch directory forwarded to ``VCF.load_genotypes``.
    """
    Ofh = VCF(ofh)
    Ofh.init_header(datetime.date.today(), ids, Structural_Variant, gen)
    Ofh.load_genotypes(Structural_Variant, SVs, Ped, ids, gen, anno_flag,
                       tmp_dir)
    if not ofh.endswith('.vcf'):
        ofh = ofh + '.vcf'

    # variant_id defaults to None on the container; treat that as "no ids"
    # instead of failing on None.get().
    variant_ids = Structural_Variant.variant_id
    if variant_ids is None:
        variant_ids = {}

    # The context manager guarantees the handle is closed even when a record
    # fails to format part-way through the file.
    with open(ofh, 'w') as vcf_ofh:
        vcf_ofh.write('\n'.join(Ofh.head) + '\n')
        for entry in Structural_Variant.raw:
            variant_id = variant_ids.get(entry)
            if variant_id is None:
                variant_id = '.'
            locus = (format_chrom(entry[0]), int(entry[1]), int(entry[2]),
                     str(entry[3]))
            genotypes = Ofh.genotypes.get(locus)
            if genotypes is None:
                # No genotypes were loaded for this locus: emit a missing
                # call per sample.
                genotypes = []
                on_sex_chrom = locus[0] == 'chrX' or locus[0] == 'chrY'
                for sample_id in ids:
                    gt = './.'
                    # Single-allele missing call for males on chrX/chrY when
                    # the PAR flag is False. The `== False` comparison and
                    # the short-circuit order are kept as in the original.
                    if (on_sex_chrom
                            and Ped.males.get(sample_id) is not None
                            and Structural_Variant.par[locus] == False):
                        gt = '.'
                    genotypes.append(gt)
            out = Ofh.init_row(entry)
            # POS is written as start + 1; the REF column is always '.'.
            vcf_ofh.write('{}\t{}\t{}\t.\t{}\t{}\n'.format(
                entry[0], int(entry[1]) + 1, variant_id, out,
                '\t'.join(genotypes)))
def tmp_chrom_file(self, tmp_dir=None, genome=True, chrom=None):
    """Write the entries of ``self.refs`` to a temporary file and return its path.

    Args:
        tmp_dir: directory prefix for the file. It is concatenated directly
            with the file name, so it must already end with a path separator.
        genome: when equal to True, write two-column ``name<TAB>value`` lines;
            when equal to False, write three-column ``name<TAB>0<TAB>value``
            lines. Any other value leaves the file empty.
        chrom: with ``genome`` True, restrict the output to this single key
            of ``self.refs``; None writes every entry.

    Returns:
        The path ``tmp_dir + '<self.id>.genome'``.
    """
    make_dir(tmp_dir)
    tmp_genome = tmp_dir + '{}.genome'.format(self.id)
    # `with` closes the handle even if a lookup such as self.refs[chrom]
    # raises while writing.
    with open(tmp_genome, 'w') as tmpfh:
        # Equality (not truthiness) is kept so non-boolean arguments behave
        # exactly as before.
        if genome == True:
            if chrom is None:
                for ref in self.refs:
                    tmpfh.write('{}\t{}\n'.format(format_chrom(ref),
                                                  self.refs[ref]))
            else:
                tmpfh.write('{}\t{}\n'.format(format_chrom(chrom),
                                              self.refs[chrom]))
        if genome == False:
            for ref in self.refs:
                tmpfh.write('{}\t0\t{}\n'.format(format_chrom(ref),
                                                 self.refs[ref]))
    return tmp_genome
def __init__(self, sv=None, variant_id=None, gen=None):
    """Store the de-duplicated, sorted SV calls and flag sex-chromosome loci.

    ``sv`` is de-duplicated with ``set``, sorted through ``BedTool`` and kept
    in ``self.raw`` as (str, int, int, str) tuples. For every record whose
    formatted chromosome is chrX or chrY, ``self.par`` receives the result
    of ``check_PAR`` for that locus and genome build ``gen``.
    """
    sorted_calls = BedTool(list(set(sv))).sort()
    self.raw = []
    for rec in sorted_calls:
        self.raw.append((str(rec[0]), int(rec[1]), int(rec[2]), str(rec[3])))
    self.variant_id = variant_id
    self.par = {}  # [locus]=True/False; true if intersects PAR
    sex_chroms = ('chrX', 'chrY')
    for raw_chrom, start, end, svtype in self.raw:
        chrom = format_chrom(raw_chrom)
        if chrom in sex_chroms:
            # Fields of self.raw are already typed, so they key directly.
            key = (chrom, start, end, svtype)
            region = '{} {} {}'.format(chrom, start, end)
            self.par[key] = check_PAR(region, gen)
def __init__(self, data=None):
    """Create an empty per-variant record for the call described by ``data``.

    ``data`` is indexed as (chrom, start, end, type); the chromosome is
    passed through ``format_chrom`` and the coordinates are cast to int.
    All likelihood, genotype and filter fields start out empty or default.
    """
    chrom, start, end, svtype = data[0], data[1], data[2], data[3]
    # (chrom,start,end,type)
    self.locus = (format_chrom(chrom), int(start), int(end), str(svtype))
    # [classifier_name]=[reference genotype likelihoods]
    self.ref = dict()
    self.med_ref = 'NA'
    # [classifier_name]=[alternate genotype likelihoods]
    self.alt = dict()
    self.med_alt = 'NA'
    # [locus+id]='GT:CN:PE:SR:SC:SP:AR:HT:SQ:GL'
    self.format = dict()
    # gt[locus+id]=genotype
    self.gt = dict()
    # gq[locus+id]=(ref,het,hom) likelihoods
    self.gq = dict()
    # breakpoint features: discordant paired-ends + split-reads
    self.breakpoint_feats = 0
    # classifiers
    self.clf = list()
    self.standard_filter = 'PASS'
    self.denovo_filter = 'PASS'
def load_1kgp(self, raw=None, svtype=None, gen=None, tmp_bed=None):
    """Record, per locus, the best-overlapping 1000 Genomes annotation entry.

    Records of ``raw`` whose type contains ``svtype`` are intersected
    (f=0.8, F=0.8, wao) with the ``<gen>_1000Genomes_<svtype>.bed`` resource
    file, with the result written to ``tmp_bed``. For each output row with a
    non-zero last column, the overlap is divided by the call length and
    formatted to two decimals; ``self._1kgp[locus]`` keeps the
    (second-to-last column, overlap) pair with the largest overlap.
    ``tmp_bed`` is deleted afterwards.
    """
    selected = [(format_chrom(rec[0]), rec[1], rec[2], rec[3])
                for rec in raw if svtype in str(rec[3])]
    sv = BedTool(selected).sort()
    reference_bed = '{}annotation_files/{}_1000Genomes_{}.bed'.format(
        Config().resource_path(), gen, svtype)
    sv.intersect(reference_bed, f=0.8, F=0.8, wao=True, output=tmp_bed)
    with open(tmp_bed, 'r') as handle:
        for line in handle:
            fields = tuple(line.rstrip().split('\t'))
            locus = tokenize_sv(fields) + (str(fields[3]),)
            if int(fields[-1]) == 0:
                continue
            span = int(fields[2]) - int(fields[1])
            ovr = format(float(fields[-1]) / span, '.2f')
            current = self._1kgp.get(locus)
            # Keep the first hit, then replace it only with a strictly
            # larger overlap.
            if current is None or float(ovr) > float(current[1]):
                self._1kgp[locus] = (fields[-2], ovr)
    os.remove(tmp_bed)
def load_genotypes(self,
                   Structural_Variant=None,
                   SVs=None,
                   Ped=None,
                   ids=None,
                   gen=None,
                   no_anno=None,
                   tmp_dir=None):
    """Collect qualities, filters, genotypes and allele tallies per locus.

    When ``no_anno`` equals False, an ``Annotation`` object is built from the
    sorted, de-duplicated coordinates and stored on ``self.Annotations``.
    For every locus in ``SVs`` and every sample in ``ids`` the genotype
    string is appended to ``self.genotypes[locus]``; samples without a call
    get './.', or '.' for males on chrX/chrY when the PAR flag is False.
    ``self.allele_freq[locus]`` accumulates [sum of alleles, allele count]
    over the non-missing alleles of the GT field.
    """
    coords = set([(format_chrom(x[0]), x[1], x[2])
                  for x in Structural_Variant.raw])
    svs = BedTool(list(coords)).sort()
    if no_anno == False:
        Annot = Annotation()
        Annot.check_overlap(svs, Structural_Variant.raw, gen, tmp_dir)
        self.Annotations = Annot

    sex_chroms = ('chrX', 'chrY')
    for locus in SVs:
        Variant = SVs[locus]
        self.quals[locus] = (Variant.med_ref, Variant.med_alt)
        self.filters[locus] = (Variant.standard_filter,
                               Variant.denovo_filter)
        for sample_id in ids:
            gt = './.'
            if (locus[0] in sex_chroms
                    and Ped.males.get(sample_id) is not None
                    and Structural_Variant.par[locus] == False):
                gt = '.'
            called = Variant.gt.get(locus + (sample_id, ))
            if called is not None:
                gt = called

            # Only the GT field (before the first ':') contributes alleles;
            # the default missing genotypes contain nothing but '.'.
            for allele in gt.split(':')[0].split('/'):
                if allele == '.':
                    continue
                tally = self.allele_freq.get(locus)
                if tally is None:
                    self.allele_freq[locus] = [int(allele), 1]
                else:
                    self.allele_freq[locus] = [
                        tally[0] + int(allele), tally[1] + 1
                    ]

            per_locus = self.genotypes.get(locus)
            if per_locus is None:
                self.genotypes[locus] = [gt]
            else:
                per_locus.append(gt)
def init_header(self,
                date=None,
                ids=None,
                Structural_Variant=None,
                gen=None):
    """Build the VCF header lines and store them in ``self.head``.

    Args:
        date: value written to the ``##fileDate`` line.
        ids: ordered sample identifiers appended to the ``#CHROM`` line.
        Structural_Variant: object exposing ``raw``; the first field of each
            record is the chromosome name used for the ``##contig`` lines.
        gen: genome build label; ``<resource_path><gen>.genome`` is read as
            two tab-separated columns (chromosome, length).

    Contig lines are emitted, in order of first appearance in ``raw``, for
    chromosomes found in the genome file.
    """
    chroms, refs, contigs = {}, OrderedDict(), []
    from sv2Config import Config
    with open('{}{}.genome'.format(Config().resource_path(), gen), 'r') as f:
        for l in f:
            chrom, leng = l.rstrip().split('\t')
            chroms[format_chrom(chrom)] = leng
    for x in Structural_Variant.raw:
        length = chroms.get(format_chrom(x[0]))
        if length is not None:
            # Keyed by the name as it appears in the calls, not the
            # formatted one, so the header matches the #CHROM column.
            refs[x[0]] = length
    for chrom in refs:
        contigs.append('##contig=<ID={},length={}>'.format(
            chrom, refs[chrom]))

    # With no matching contigs the original emitted an empty string here,
    # which became a blank line inside the header; omit the element instead.
    contig_block = ['\n'.join(contigs)] if contigs else []

    meta = [
        '##fileformat=VCFv4.1',
        '##fileDate={}'.format(date),
        '##SV2_CMD="{}"'.format(' '.join(map(str, sys.argv))),
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variant">',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">',
        '##INFO=<ID=DENOVO_FILTER,Number=1,Type=String,Description="Stringent filter status, recommended for de novo mutation discovery">',
        '##INFO=<ID=REF_GTL,Number=1,Type=Float,Description="Median Phred-adjusted REF genotype likelihood">',
        '##INFO=<ID=AF,Number=1,Type=Float,Description="Alternate allele frequency,in the range (0,1)">',
        '##INFO=<ID=CYTOBAND,Number=.,Type=String,Description="Cytoband(s) overlapping the variant">',
        '##INFO=<ID=REPEATMASKER,Number=2,Type=String,Description="Name and reciprocal overlap of RepeatMasker variant">',
        '##INFO=<ID=1000G_ID,Number=1,Type=String,Description="1000 Genomes Phase 3 integrated SV callset variant identifier">',
        '##INFO=<ID=1000G_OVERLAP,Number=1,Type=Float,Description="Overlap to 1000 Genomes Phase 3 variant, in the range (0,1)">',
        # Fixed: the closing '>' of this meta line was missing.
        '##INFO=<ID=DESCRIPTION,Number=1,Type=String,Description="Verbose description of SV, 1-based coordinates">',
        # Fixed: the closing '>' was inside the quoted description.
        '##INFO=<ID=GENES,Number=1,Type=String,Description="Genes overlapping the variant, pipe-separated by transcripts">',
        '##INFO=<ID=ABPARTS,Number=1,Type=Float,Description="Overlap to antibody parts, in the range (0,1)">',
        '##INFO=<ID=CENTROMERE,Number=1,Type=Float,Description="Centromere overlap, in the range (0,1)">',
        '##INFO=<ID=GAP,Number=1,Type=Float,Description="Overlap to gaps in the reference, in the range (0,1)">',
        '##INFO=<ID=SEGDUP,Number=1,Type=Float,Description="Segmental duplication overlap, in the range (0,1)">',
        '##INFO=<ID=STR,Number=1,Type=Float,Description="Short tandem repeat overlap, in the range (0,1)">',
        '##INFO=<ID=UNMAPPABLE,Number=1,Type=Float,Description="Overlap to DAC Blacklisted Regions, in the range (0,1)">',
        '##FILTER=<ID=ABPARTS,Description="Variant overlaps to antibody parts >50%">',
        '##FILTER=<ID=CENTROMERE,Description="Variant overlaps to centromere >50%">',
        # Fixed: a stray '>' after the ID closed the meta line early.
        '##FILTER=<ID=GAP,Description="Variant overlaps to reference gaps >50%">',
        '##FILTER=<ID=GENOTYPEFAIL,Description="Variant was unable to be genotyped">',
        '##FILTER=<ID=NOALT,Description="No alternate allele detected">',
        '##FILTER=<ID=SEGDUP,Description="Variant overlaps to segmental duplications >50%">',
        '##FILTER=<ID=STR,Description="Variant overlaps to short tandem repeats >50%">',
        '##FILTER=<ID=UNMAPPABLE,Description="Variant overlaps to DAC Blacklisted Regions regions >50%">',
        '##FILTER=<ID=FAIL,Description="Variant failed standard filters">',
        '##FILTER=<ID=PASS,Description="Variant passed standard filters">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=CN,Number=1,Type=Float,Description="Copy number estimate">',
        '##FORMAT=<ID=PE,Number=1,Type=Float,Description="Normalized discordant paired-end count">',
        '##FORMAT=<ID=SR,Number=1,Type=Float,Description="Normalized split-read count">',
        '##FORMAT=<ID=SC,Number=1,Type=Float,Description="SNV normalized coverage">',
        '##FORMAT=<ID=NS,Number=1,Type=Integer,Description="Number of SNVs within locus">',
        '##FORMAT=<ID=HA,Number=1,Type=Float,Description="Heterozygous allele ratio">',
        '##FORMAT=<ID=NH,Number=1,Type=Integer,Description="Number of heterozygous SNVs">',
        '##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Phred-scaled genotype likelihood">',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Phred-scaled genotype likelihoods in the order, REF:(0/0), HET:(0/1), HOM:(1/1)">',
        '##ALT=<ID=DEL,Description="Deletion, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
        '##ALT=<ID=DUP,Description="Duplication, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
    ]
    column_header = [
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
            '\t'.join(ids)),
    ]
    self.head = meta + contig_block + column_header
def tokenize_sv(x):
    """Return ``x`` as a (formatted chromosome, int start, int end) tuple.

    Only the first three fields of ``x`` are read; the chromosome is cast to
    ``str`` before being passed to ``format_chrom``.
    """
    chrom = format_chrom(str(x[0]))
    start = int(x[1])
    end = int(x[2])
    return (chrom, start, end)