Ejemplo n.º 1
0
def merge_sv(raw, SVs, minOvr):
    """Return the calls from ``raw`` that are kept after overlap merging.

    Deletions and duplications (entries whose fourth field contains 'DEL'
    or 'DUP') are handled independently. For each class, ``first_merge``
    is applied once, and then ``merging`` is applied repeatedly for as long
    as ``check_merge`` reports anything left to merge.

    Args:
        raw: iterable of (chrom, start, end, svtype) calls.
        SVs: mapping of locus -> object exposing a ``med_alt`` attribute.
        minOvr: threshold forwarded unchanged to ``first_merge`` and
            ``check_merge``.

    Returns:
        list of the original ``raw`` entries, in input order, whose
        chromosome-normalized key is present among the kept intervals.
    """
    # Per-locus median ALT value, handed to merging().
    alt_scores = {locus: SVs[locus].med_alt for locus in SVs}

    def _subset(tag):
        # BedTool holding only the calls whose type string contains `tag`.
        return BedTool([(format_chrom(call[0]), call[1], call[2], call[3])
                        for call in raw if tag in call[3]])

    intermediates = set()
    dels = _subset('DEL')
    dups = _subset('DUP')

    dels_nomerge = first_merge(dels, minOvr)
    while len(check_merge(dels, minOvr)) > 0:
        dels = merging(check_merge(dels, minOvr), alt_scores)
        # Remember every intermediate round, not just the final one.
        for interval in dels:
            intermediates.add(interval)

    dups_nomerge = first_merge(dups, minOvr)
    while len(check_merge(dups, minOvr)) > 0:
        dups = merging(check_merge(dups, minOvr), alt_scores)
        for interval in dups:
            intermediates.add(interval)

    # NOTE(review): the keys stored here come from tokenize_sv(); if that
    # helper returns a 3-tuple (chrom, start, end), the 4-tuple lookup in
    # the loop below can never match -- confirm against tokenize_sv.
    kept = {
        tokenize_sv(tuple(interval))
        for interval in itertools.chain(dels, dels_nomerge, dups,
                                        dups_nomerge, intermediates)
    }

    merged = []
    for call in raw:
        chrom, start, end, svtype = call
        if (format_chrom(chrom), start, end, svtype) in kept:
            merged.append(call)
    return merged
Ejemplo n.º 2
0
def output(Structural_Variant, SVs, Ped, ids, gen, ofh, anno_flag, tmp_dir):
    """Write every call in ``Structural_Variant.raw`` to a VCF file.

    A ``VCF`` helper builds the header and per-locus genotypes; this
    function then writes the header followed by one row per raw entry.

    Args:
        Structural_Variant: object exposing ``raw`` (iterable of
            (chrom, start, end, svtype) entries), ``variant_id`` (mapping
            entry -> ID) and ``par`` (mapping locus -> bool).
        SVs: genotyped variants, forwarded to ``VCF.load_genotypes``.
        Ped: object exposing ``males``, a mapping keyed by sample ID.
        ids: ordered sample IDs; one genotype column is written per ID.
        gen: genome build label, forwarded to the ``VCF`` helper.
        ofh: output path; '.vcf' is appended when it is missing.
        anno_flag: forwarded to ``VCF.load_genotypes``.
        tmp_dir: forwarded to ``VCF.load_genotypes``.
    """
    Ofh = VCF(ofh)
    Ofh.init_header(datetime.date.today(), ids, Structural_Variant, gen)
    Ofh.load_genotypes(Structural_Variant, SVs, Ped, ids, gen, anno_flag,
                       tmp_dir)
    if not ofh.endswith('.vcf'):
        ofh = ofh + '.vcf'
    # Context manager guarantees the handle is closed even if a row fails.
    with open(ofh, 'w') as vcf_ofh:
        vcf_ofh.write('\n'.join(Ofh.head) + '\n')
        for entry in Structural_Variant.raw:
            variant_id = Structural_Variant.variant_id.get(entry)
            if variant_id is None:
                variant_id = '.'
            locus = (format_chrom(entry[0]), int(entry[1]), int(entry[2]),
                     str(entry[3]))
            genotypes = Ofh.genotypes.get(locus)
            if genotypes is None:
                # No genotypes were loaded for this locus: emit missing
                # calls, using a single '.' for males on chrX/chrY loci
                # whose ``par`` flag is False.
                genotypes = []
                for sample_id in ids:
                    gt = './.'
                    if (locus[0] in ('chrX', 'chrY')
                            and Ped.males.get(sample_id) is not None
                            and Structural_Variant.par[locus] == False):
                        gt = '.'
                    genotypes.append(gt)
            out = Ofh.init_row(entry)
            # Columns: CHROM, POS (start + 1), ID, REF ('.'), then the
            # text returned by init_row, then the sample genotypes.
            vcf_ofh.write('{}\t{}\t{}\t.\t{}\t{}\n'.format(
                entry[0],
                int(entry[1]) + 1,
                variant_id,
                out,
                '\t'.join(genotypes)))
Ejemplo n.º 3
0
 def tmp_chrom_file(self, tmp_dir=None, genome=True, chrom=None):
     """Write a temporary chromosome-size file and return its path.

     The file is created at ``tmp_dir + '<self.id>.genome'``.

     Args:
         tmp_dir: directory prefix; it is concatenated directly with the
             file name and passed to ``make_dir`` first.
         genome: when equal to True, write two-column
             ``chrom<TAB>length`` lines; when equal to False, write
             three-column ``chrom<TAB>0<TAB>length`` lines for every
             reference. Any other value leaves the file empty.
         chrom: with ``genome`` True, restrict the output to this single
             key of ``self.refs``; ``None`` writes all references.

     Returns:
         The path of the file that was written.
     """
     make_dir(tmp_dir)
     tmp_genome = tmp_dir + '{}.genome'.format(self.id)
     # ``with`` closes the handle even if a lookup in self.refs raises.
     with open(tmp_genome, 'w') as tmpfh:
         if genome == True:
             if chrom is None:
                 for ref in self.refs:
                     tmpfh.write('{}\t{}\n'.format(format_chrom(ref),
                                                   self.refs[ref]))
             else:
                 tmpfh.write('{}\t{}\n'.format(format_chrom(chrom),
                                               self.refs[chrom]))
         elif genome == False:
             for ref in self.refs:
                 tmpfh.write('{}\t0\t{}\n'.format(format_chrom(ref),
                                                  self.refs[ref]))
     return tmp_genome
Ejemplo n.º 4
0
 def __init__(self, sv=None, variant_id=None, gen=None):
     """Store the de-duplicated, sorted calls and flag chrX/chrY loci.

     Args:
         sv: iterable of hashable (chrom, start, end, svtype) records.
         variant_id: stored unchanged on ``self.variant_id``.
         gen: forwarded to ``check_PAR``.
     """
     unique_sorted = BedTool(list(set(sv))).sort()
     self.raw = [(str(iv[0]), int(iv[1]), int(iv[2]), str(iv[3]))
                 for iv in unique_sorted]
     self.variant_id = variant_id
     # Maps (chrom, start, end, svtype) -> the value returned by check_PAR;
     # only loci on chrX or chrY are recorded.
     self.par = {}
     for raw_chrom, start, end, svtype in self.raw:
         chrom = format_chrom(raw_chrom)
         if chrom not in ('chrX', 'chrY'):
             continue
         region = '{} {} {}'.format(chrom, start, end)
         self.par[(chrom, start, end, svtype)] = check_PAR(region, gen)
Ejemplo n.º 5
0
	def __init__(self,data=None):
		"""Set up an empty record for one variant.

		``data`` is indexed positionally as (chrom, start, end, type); the
		chromosome goes through ``format_chrom`` and the coordinates are
		cast to int.
		"""
		chrom,start,end,svtype = format_chrom(data[0]),int(data[1]),int(data[2]),str(data[3])
		self.locus=(chrom,start,end,svtype)

		# Likelihoods keyed by classifier name, with 'NA' placeholders for
		# the medians.
		self.ref={} # [classifier_name]=[reference genotype likelihoods]
		self.alt={} # [classifier_name]=[alternate genotype likelihoods]
		self.med_ref=self.med_alt='NA'

		# Per-sample containers, keyed by locus+id.
		self.format={} # 'GT:CN:PE:SR:SC:SP:AR:HT:SQ:GL'
		self.gt={} # genotype
		self.gq={} # (ref,het,hom) likelihoods

		self.clf=[] # classifiers
		self.breakpoint_feats=0 # discordant paired-ends + split-reads
		self.standard_filter=self.denovo_filter='PASS'
Ejemplo n.º 6
0
	def load_1kgp(self,raw=None,svtype=None,gen=None,tmp_bed=None):
		"""Record the best-overlapping 1000 Genomes entry for each call.

		Calls whose type string contains ``svtype`` are intersected with
		``<resource_path>annotation_files/<gen>_1000Genomes_<svtype>.bed``;
		the result is written to ``tmp_bed``, parsed, and then deleted.
		For every locus, ``self._1kgp`` keeps the pair
		(second-to-last column, overlap fraction formatted to 2 decimals)
		with the largest overlap fraction.
		"""
		calls = BedTool([(format_chrom(c[0]),c[1],c[2],c[3]) for c in raw if svtype in str(c[3])]).sort()
		reference_bed = '{}annotation_files/{}_1000Genomes_{}.bed'.format(Config().resource_path(),gen,svtype)
		calls.intersect(reference_bed, f=0.8, F=0.8, wao=True,output=tmp_bed)
		with open(tmp_bed,'r') as handle:
			for line in handle:
				fields = tuple(line.rstrip().split('\t'))
				locus = tokenize_sv(fields)+(str(fields[3]),)
				# The last column is read as the overlap size; zero means no hit.
				if int(fields[-1])==0: continue
				# Overlap as a fraction of the call's own length (end - start).
				frac = format(float(fields[-1])/(int(fields[2])-int(fields[1])),'.2f')
				best = self._1kgp.get(locus)
				if best is None or float(frac) > float(best[1]):
					self._1kgp[locus]=(fields[-2],frac)
		os.remove(tmp_bed)
Ejemplo n.º 7
0
Archivo: Vcf.py Proyecto: sebatlab/SV2
 def load_genotypes(self,
                    Structural_Variant=None,
                    SVs=None,
                    Ped=None,
                    ids=None,
                    gen=None,
                    no_anno=None,
                    tmp_dir=None):
     """Collect qualities, filters, genotypes and allele tallies per locus.

     For every locus in ``SVs`` this fills ``self.quals``,
     ``self.filters``, ``self.genotypes`` (one entry per sample in
     ``ids`` order) and ``self.allele_freq`` ([allele sum, allele count]).
     When ``no_anno`` equals False, an ``Annotation`` object is built
     first and stored on ``self.Annotations``.
     """
     unique_regions = set([(format_chrom(rec[0]), rec[1], rec[2])
                           for rec in Structural_Variant.raw])
     svs = BedTool(list(unique_regions)).sort()
     if no_anno == False:
         Annot = Annotation()
         Annot.check_overlap(svs, Structural_Variant.raw, gen, tmp_dir)
         self.Annotations = Annot
     for locus in SVs:
         Variant = SVs[locus]
         self.quals[locus] = (Variant.med_ref, Variant.med_alt)
         self.filters[locus] = (Variant.standard_filter,
                                Variant.denovo_filter)
         on_sex_chrom = locus[0] == 'chrX' or locus[0] == 'chrY'
         for sample_id in ids:
             # Default missing call: a single '.' for males on chrX/chrY
             # loci whose ``par`` flag is False, './.' otherwise.
             gt = './.'
             if (on_sex_chrom and Ped.males.get(sample_id) is not None
                     and Structural_Variant.par[locus] == False):
                 gt = '.'
             called = Variant.gt.get(locus + (sample_id, ))
             if called is not None:
                 gt = called
                 # The first ':'-separated field holds the '/'-separated
                 # alleles; '.' alleles are left out of the tally.
                 for allele in gt.split(':')[0].split('/'):
                     if allele == '.':
                         continue
                     tally = self.allele_freq.get(locus)
                     if tally is None:
                         self.allele_freq[locus] = [int(allele), 1]
                     else:
                         self.allele_freq[locus] = [tally[0] + int(allele),
                                                    tally[1] + 1]
             per_sample = self.genotypes.get(locus)
             if per_sample is None:
                 self.genotypes[locus] = [gt]
             else:
                 per_sample.append(gt)
Ejemplo n.º 8
0
Archivo: Vcf.py Proyecto: sebatlab/SV2
 def init_header(self,
                 date=None,
                 ids=None,
                 Structural_Variant=None,
                 gen=None):
     """Build the VCF header lines and store them on ``self.head``.

     Chromosome lengths are read from ``<resource_path><gen>.genome``
     (tab-separated ``chrom<TAB>length`` lines). A ``##contig`` line is
     emitted for each chromosome name in ``Structural_Variant.raw`` whose
     normalized form appears in that file, in first-seen order.

     Args:
         date: value written to ``##fileDate``.
         ids: sample IDs appended to the ``#CHROM`` column line.
         Structural_Variant: object exposing ``raw``, whose entries carry
             the chromosome name at index 0.
         gen: genome build label used to locate the ``.genome`` file.
     """
     chroms, refs = {}, OrderedDict()
     from sv2Config import Config
     with open('{}{}.genome'.format(Config().resource_path(), gen),
               'r') as f:
         for l in f:
             chrom, leng = l.rstrip().split('\t')
             chroms[format_chrom(chrom)] = leng
     for x in Structural_Variant.raw:
         length = chroms.get(format_chrom(x[0]))
         if length is not None:
             # Keyed by the name as it appears in the calls, not the
             # normalized one.
             refs[x[0]] = length
     contigs = ['##contig=<ID={},length={}>'.format(chrom, refs[chrom])
                for chrom in refs]
     head = [
         '##fileformat=VCFv4.1',
         '##fileDate={}'.format(date),
         '##SV2_CMD="{}"'.format(' '.join(map(str, sys.argv[:]))),
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variant">',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">',
         '##INFO=<ID=DENOVO_FILTER,Number=1,Type=String,Description="Stringent filter status, recommended for de novo mutation discovery">',
         '##INFO=<ID=REF_GTL,Number=1,Type=Float,Description="Median Phred-adjusted REF genotype likelihood">',
         '##INFO=<ID=AF,Number=1,Type=Float,Description="Alternate allele frequency,in the range (0,1)">',
         '##INFO=<ID=CYTOBAND,Number=.,Type=String,Description="Cytoband(s) overlapping the variant">',
         '##INFO=<ID=REPEATMASKER,Number=2,Type=String,Description="Name and reciprocal overlap of RepeatMasker variant">',
         '##INFO=<ID=1000G_ID,Number=1,Type=String,Description="1000 Genomes Phase 3 integrated SV callset variant identifier">',
         '##INFO=<ID=1000G_OVERLAP,Number=1,Type=Float,Description="Overlap to 1000 Genomes Phase 3 variant, in the range (0,1)">',
         # Fixed: this line was missing its closing '>'.
         '##INFO=<ID=DESCRIPTION,Number=1,Type=String,Description="Verbose description of SV, 1-based coordinates">',
         # Fixed: the closing '>' used to sit inside the quoted text.
         '##INFO=<ID=GENES,Number=1,Type=String,Description="Genes overlapping the variant, pipe-separated by transcripts">',
         '##INFO=<ID=ABPARTS,Number=1,Type=Float,Description="Overlap to antibody parts, in the range (0,1)">',
         '##INFO=<ID=CENTROMERE,Number=1,Type=Float,Description="Centromere overlap, in the range (0,1)">',
         '##INFO=<ID=GAP,Number=1,Type=Float,Description="Overlap to gaps in the reference, in the range (0,1)">',
         '##INFO=<ID=SEGDUP,Number=1,Type=Float,Description="Segmental duplication overlap, in the range (0,1)">',
         '##INFO=<ID=STR,Number=1,Type=Float,Description="Short tandem repeat overlap, in the range (0,1)">',
         '##INFO=<ID=UNMAPPABLE,Number=1,Type=Float,Description="Overlap to DAC Blacklisted Regions, in the range (0,1)">',
         '##FILTER=<ID=ABPARTS,Description="Variant overlaps to antibody parts >50%">',
         '##FILTER=<ID=CENTROMERE,Description="Variant overlaps to centromere >50%">',
         # Fixed: a stray '>' after the ID closed the entry too early.
         '##FILTER=<ID=GAP,Description="Variant overlaps to reference gaps >50%">',
         '##FILTER=<ID=GENOTYPEFAIL,Description="Variant was unable to be genotyped">',
         '##FILTER=<ID=NOALT,Description="No alternate allele detected">',
         '##FILTER=<ID=SEGDUP,Description="Variant overlaps to segmental duplications >50%">',
         '##FILTER=<ID=STR,Description="Variant overlaps to short tandem repeats >50%">',
         '##FILTER=<ID=UNMAPPABLE,Description="Variant overlaps to DAC Blacklisted Regions regions >50%">',
         '##FILTER=<ID=FAIL,Description="Variant failed standard filters">',
         '##FILTER=<ID=PASS,Description="Variant passed standard filters">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=CN,Number=1,Type=Float,Description="Copy number estimate">',
         '##FORMAT=<ID=PE,Number=1,Type=Float,Description="Normalized discordant paired-end count">',
         '##FORMAT=<ID=SR,Number=1,Type=Float,Description="Normalized split-read count">',
         '##FORMAT=<ID=SC,Number=1,Type=Float,Description="SNV normalized coverage">',
         '##FORMAT=<ID=NS,Number=1,Type=Integer,Description="Number of SNVs within locus">',
         '##FORMAT=<ID=HA,Number=1,Type=Float,Description="Heterozygous allele ratio">',
         '##FORMAT=<ID=NH,Number=1,Type=Integer,Description="Number of heterozygous SNVs">',
         '##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Phred-scaled genotype likelihood">',
         '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Phred-scaled genotype likelihoods in the order, REF:(0/0), HET:(0/1), HOM:(1/1)">',
         '##ALT=<ID=DEL,Description="Deletion, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
         '##ALT=<ID=DUP,Description="Duplication, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
     ]
     if contigs:
         # Kept as one joined element; skipped when empty so that joining
         # ``head`` with newlines never produces a blank header line.
         head.append('\n'.join(contigs))
     head.append(
         '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
             '\t'.join(ids)))
     self.head = head
Ejemplo n.º 9
0
def tokenize_sv(x):
    """Return the (chrom, start, end) key for a record.

    The first field is stringified and passed through ``format_chrom``;
    the second and third fields are cast to int. Any further fields are
    ignored.
    """
    chrom = format_chrom(str(x[0]))
    start = int(x[1])
    end = int(x[2])
    return (chrom, start, end)