def integrator(self, target, splts) :
    """Build a Variant holding one JointCall from the single row in splts.

    target is accepted for interface compatibility and is not read here.
    splts must contain exactly one row (a list of column values).  The
    row's first column is overwritten in place with chromNum() of itself.
    The first four columns, keyed by the first four entries of headers,
    become the variant fields; the remaining columns become the
    JointCall fields, plus a "GT" entry computed from the last nine
    columns.
    """
    #this is a one to many match with SeattleSeq
    assert len(splts) == 1
    row = splts[0]
    row[0] = chromNum( row[0] )

    #transition from variant fields to JointCall fields
    boundary = 4
    variant_fields = dict( zip( headers[:boundary], row[:boundary] ) )

    joint_call = JointCall( self.pat_name )
    joint_call.fields = dict( zip( headers[boundary:], row[boundary:] ) )
    joint_call.fields["GT"] = convertGT( row[-9:] )

    return Variant( variant_fields, [joint_call] )
def integrator( self, target, splts ) :
    """Turn the single split VCF row in splts into a variant.Variant.

    target is accepted for interface compatibility and is not read here.
    splts must hold exactly one row; anything else raises AssertionError.

    One variant.BaseCall is kept for every call column (from CALL_START
    onward) whose genotype is mutated or carries no information.  The
    variant fields are gathered for every key of COLUMN_MAP:
      * "chrom"  is passed through globes.chromNum
      * "info"   contributes only its "AF" entry, stored under "AF"
      * "dbSNP"  is stored as the integer of the first rs id, or left
                 out entirely when the column is '.'
      * anything else is copied verbatim
    """
    #the old check asserted a non-empty string, which is always true, so
    #a wrong row count was never actually detected
    if len(splts) != 1 :
        raise AssertionError( "len isn't right" )
    splt = splts[0]

    base_calls = []
    for pat_ix,c in enumerate( splt[ CALL_START: ] ) :
        pat_name = self.patients[pat_ix]
        sc = splitCall(c)
        gt = convertGT( sc )
        if isMutated( gt ) or noInf( gt ) :
            base_calls.append( variant.BaseCall(sc,pat_name) )

    fields = {}
    for k in COLUMN_MAP :
        value = splt[ self.indexOf[k] ]
        if k == "chrom" :
            fields[k] = globes.chromNum( value )
        elif k == "info" :
            #this information is useless, will get globally updated
            fields["AF"] = makeInfoDict( value )["AF"]
        elif k == "dbSNP" :
            #when we getFields, 'dbsnp' will be missing and yield null
            if value == '.' :
                continue
            if not value.startswith('rs') :
                #explicit raise so the check survives python -O
                raise AssertionError( "malformed rs number? %r" % (splt,) )
            #right now just take the first rs number if multiple
            fields[k] = int( value.split(';')[0].strip()[2:] )
        else :
            fields[k] = value

    #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
    #ref and alt are always given for the forward strand
    fields['strand'] = True

    #TODO
    #This is abusing that fact that VCFSource goes first in the collimator
    #o/w will overwrite what has been put in target
    return variant.Variant( fields, base_calls )
def integrator( self, target, splts ) :
    """Turn the single split VCF row in splts into a Variant.

    target is accepted for interface compatibility and is not read here.
    splts must hold exactly one row; anything else raises AssertionError.

    One BaseCall (tagged with the call's column index) is kept for every
    call column, from broad.CALL_START onward, whose genotype is mutated
    or carries no information.  The variant fields are gathered for
    every key of broad.COLUMN_MAP:
      * "chrom"  is passed through globes.chromNum
      * "info"   contributes only its "AF" entry, stored under "AF"
      * "dbSNP"  is stored as the integer of the first rs id, or left
                 out entirely when the column is '.'
      * anything else is copied verbatim
    """
    #the old check asserted a non-empty string, which is always true, so
    #a wrong row count was never actually detected
    if len(splts) != 1 :
        raise AssertionError( "len isn't right" )
    splt = splts[0]

    ##TODO generalize this to make it vendor independent, call start column is feature of VCF, not broad???
    base_calls = []
    for pat_ix,c in enumerate( splt[ broad.CALL_START: ] ) :
        sc = broad.splitCall(c)
        gt = broad.convertGT( sc )
        if broad.isMutated( gt ) or broad.noInf( gt ) :
            base_calls.append( BaseCall(sc,pat_ix) )

    fields = {}
    for k in broad.COLUMN_MAP :
        value = splt[ self.indexOf[k] ]
        if k == "chrom" :
            fields[k] = globes.chromNum( value )
        elif k == "info" :
            ##TODO generalize this to make it vendor independent
            fields["AF"] = broad.makeInfoDict( value )["AF"]
        elif k == "dbSNP" :
            #when we getFields, 'dbsnp' will be missing and yield null
            if value == '.' :
                continue
            if not value.startswith('rs') :
                #explicit raise so the check survives python -O
                raise AssertionError( "malformed rs number? %r" % (splt,) )
            #right now just take the first rs number if multiple
            fields[k] = int( value.split(';')[0].strip()[2:] )
        else :
            fields[k] = value

    #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
    #ref and alt are always given for the forward strand
    fields['strand'] = True
    return Variant( fields, base_calls )
def eqkey( self, out_splt ) :
    """Compute the matching key for one split output row.

    Columns are located through self.indexOf.  For self.switch == 'snp'
    the result is (chromNum(chrom), position, reference, change).  For
    'indel' the row is handed to globes.compareVariants according to its
    "change type" column ('INS' or 'DEL').

    Raises AssertionError when self.switch is neither 'snp' nor 'indel',
    or when the change type of an indel row is neither 'INS' nor 'DEL'.
    """
    if self.switch == 'snp' :
        keys = ['chrom','position','reference','change']
        (chrom,pos,ref,mut) = [ out_splt[ self.indexOf[k] ] for k in keys ]
        return (globes.chromNum(chrom),pos,ref,mut)

    #was 'sel.switch', which raised NameError before reaching this branch
    elif self.switch == 'indel' :
        keys = ['chrom','position','reference','change', "change type"]
        (chrom2,pos2,ref2,mut2,change_type) = \
            [ out_splt[ self.indexOf[k] ] for k in keys ]

        # NOTE(review): chrom1, pos1, ref1 and mut1 are not defined in
        # this function, so both calls below raise NameError unless
        # those names exist at module level.  They look like the other
        # side of a pairwise comparison; the intended source cannot be
        # determined from here -- confirm and fix.
        if change_type == 'INS' :
            #example:
            # 1 866511 rs60722469 C CCCCT
            # 1 866512 * +CCCT INS
            return globes.compareVariants( chrom1, pos1, ref1, mut1[1:], \
                                           chrom2, int(pos2)-1, ref2, mut2[1:] )
        elif change_type == 'DEL' :
            #1 874864 . CT C
            #1 874865 * -T DEL Het
            return globes.compareVariants( chrom1, pos1, ref1[1:], mut1, \
                                           chrom2, int(pos2)-1, mut2[1:], ref2 )
        else :
            #explicit raise so the check survives python -O
            raise AssertionError( "change_type %s is not INS or DEL" % change_type )
    else :
        raise AssertionError( 'switch must be snp or indel' )
def sortHelper( it ) :
    """Sort key for a record: (chromNum of item 0, integer of item 1)."""
    chrom_rank = chromNum( it[0] )
    position = int( it[1] )
    return (chrom_rank, position)
def getPosition( self, out_splt ) :
    """Return (chromNum(chromosome), position, ref, mut) for one split row.

    Columns are located through self.indexOf.  position is returned
    exactly as read from the row.

    self.switch == 'snp':
        mut is whichever of the two sampleAlleles differs from the
        reference base, the lone allele when only one is listed, or '*'
        when neither allele matches the reference.
    self.switch == 'indel':
        the reference contains '-' for an insertion; each comma-separated
        sample genotype is scanned for the first allele that identifies
        the change.  '*' is the wildcard used when nothing matches
        (globes.compareVariants will know how to use it).

    Raises AssertionError for any other switch value, or when
    sampleAlleles does not split into one or two alleles.
    """
    if self.switch == 'snp' :
        #if it was called with parsed input, there will be only one thing in
        #the sampleGenotype column, rather than info for everyone; that path
        #was disabled (guarded by 'if False'), so sampleAlleles is always used
        keys = ["chromosome","position","referenceBase","sampleAlleles"]
        (chrom2,pos2,ref2,als) = [out_splt[self.indexOf[k]] for k in keys]
        sp = als.split('/')
        #split up the sample alleles, find which one matches the ref
        if len(sp) == 2 :
            (a1,a2) = sp
            if a1 == ref2 :
                mut2 = a2
            elif a2 == ref2 :
                mut2 = a1
            else :
                #neither allele is the reference; becomes '*' below
                mut2 = 'N'
        elif len(sp) == 1 :
            mut2 = sp[0]
        else :
            #explicit raise so the check survives python -O
            raise AssertionError( "length of sampleAlleles is not 2 or 1" )

    #what is going on with the *'s, exactly?
    elif self.switch == 'indel' :
        keys = ['chromosome','position','referenceBase','sampleGenotype']
        (chrom2,pos2,ref2,sg) = [out_splt[self.indexOf[k]] for k in keys]
        samples = sg.split(',')
        mut2 = ""
        isInsertion = '-' in ref2
        # go through each sample, find the first allele different
        # from the ref
        # when we can't match, use wildcard '*',
        # globes.compareVariants will know how to use
        if isInsertion :
            ref2 = ref2[0]
            for sample in samples :
                (one,two) = sample.split('/')
                if ref2 != one :
                    mut2 = one
                    break
                elif ref2 != two :
                    mut2 = two
                    break
            if mut2 == 'N' or mut2 == '' :
                mut2 = '*'
        else :
            for sample in samples :
                (one,two) = sample.split('/')
                #an allele whose tail equals the reference becomes the
                #new ref, and its first character becomes mut
                if one[1:] == ref2 :
                    ref2 = one
                    mut2 = one[0]
                    break
                elif two[1:] == ref2 :
                    ref2 = two
                    mut2 = two[0]
                    break
            if mut2 == "" :
                ref2 = '*'
                mut2 = '*'
    else :
        raise AssertionError( 'switch must be snp or indel' )

    if mut2 == 'N' :
        mut2 = '*'
    return (globes.chromNum(chrom2),pos2,ref2,mut2)
def sortHelper( splt ) :
    """Sort key for a split row: (chromNum of column 1, integer of column 2)."""
    chrom_rank = globes.chromNum( splt[1] )
    position = int( splt[2] )
    return (chrom_rank, position)
# Look up every (chrom, pos) listed in the preliminary file against the
# Variants/Isoforms/Genes tables and count the positions that return rows.
# NOTE(review): fin and fout are not closed in this span -- confirm they
# are closed further down, or move this into 'with' blocks.
fin = open("../preliminary/%s" % fname)
fout = open("../preliminary/%s.functional.tsv" % patient,'w')
csvout = csv.writer( fout,
                     delimiter='\t',
                     quoting=csv.QUOTE_MINIMAL )
csvout.writerow( queries.column_headers )

hits = 0
already_called = 0
total = 0
#iterate the file lazily instead of loading every line with readlines()
for line in fin :
    splt = line.split()
    #a blank line has no columns to index
    if not splt :
        continue
    chrom, pos = splt[0], splt[1]
    #print chrom, pos
    chrom = globes.chromNum( chrom )
    #skip rows whose chromosome chromNum reports as falsy
    if not chrom :
        continue

    # NOTE(review): the statement is built by string interpolation with
    # 'pos' taken straight from the input file; switch to bound
    # parameters if conn.query supports them.
    query = '''
            select %s,%s,%s
            from Variants as v inner join Isoforms as i on i.var_id = v.id
                 inner join Genes as g on g.id = i.gene_id
            where chrom = %s and pos = %s and AF < .1 and (%s)''' % \
            (queries.vcols_string, queries.icols_string, queries.gcols_string, \
             chrom, pos, queries.gvs)

    rows = conn.query(query)
    num_rows = len(rows)
    if num_rows > 0 :
        hits += 1
def sortHelper( line ) :
    """Sort key for a raw line: (chromNum(chrom), int(loc)).

    chrom and loc are the first two comma-separated items of the line's
    first tab-separated column.
    """
    first_column = line.strip().split('\t')[0]
    chrom, loc = first_column.split(',')[0:2]
    chrom_rank = globes.chromNum( chrom )
    return ( chrom_rank, int(loc) )
def getPosition( self, out_splt ) :
    """Return (chromNum(chromosome), position, ref, mut) for one split row.

    Columns are located through self.indexOf.  position is returned
    exactly as read from the row.

    self.switch == 'snp':
        mut is whichever of the two sampleAlleles differs from the
        reference base, or the lone allele when only one is listed.
    self.switch == 'indel':
        the reference contains '-' for an insertion; each comma-separated
        sample genotype is scanned for the first allele that identifies
        the change.  '*' is used for ref and/or mut when nothing matches.

    Raises AssertionError for any other switch value, when sampleAlleles
    does not split into one or two alleles, or when neither of two
    alleles equals the reference base.
    """
    if self.switch == 'snp' :
        #if it was called with parsed input, there will be only one thing in
        #the sampleGenotype column, rather than info for everyone; that path
        #was disabled (guarded by 'if False'), so sampleAlleles is always used
        keys = ["chromosome","position","referenceBase","sampleAlleles"]
        (chrom2,pos2,ref2,als) = [out_splt[self.indexOf[k]] for k in keys]
        sp = als.split('/')
        if len(sp) == 2 :
            (a1,a2) = sp
            if a1 == ref2 :
                mut2 = a2
            elif a2 == ref2 :
                mut2 = a1
            else :
                #explicit raise so the check survives python -O
                raise AssertionError( "Have a problem with figuring out mut2" )
        elif len(sp) == 1 :
            mut2 = sp[0]
        else :
            raise AssertionError( "length of sampleAlleles is not 2 or 1" )

    #what is going on with the *'s, exactly?
    elif self.switch == 'indel' :
        keys = ['chromosome','position','referenceBase','sampleGenotype']
        (chrom2,pos2,ref2,sg) = [out_splt[self.indexOf[k]] for k in keys]
        #print "readiung: ", chrom2, pos2, ref2, sg
        samples = sg.split(',')
        mut2 = ""
        isInsertion = '-' in ref2
        if isInsertion :
            ref2 = ref2[0]
            #first allele that differs from the reference is the mutation
            for sample in samples :
                (one,two) = sample.split('/')
                if ref2 != one :
                    mut2 = one
                    break
                elif ref2 != two :
                    mut2 = two
                    break
            if mut2 == 'N' or mut2 == '' :
                mut2 = '*'
        else :
            for sample in samples :
                (one,two) = sample.split('/')
                #an allele whose tail equals the reference becomes the
                #new ref, and its first character becomes mut
                if one[1:] == ref2 :
                    ref2 = one
                    mut2 = one[0]
                    break
                elif two[1:] == ref2 :
                    ref2 = two
                    mut2 = two[0]
                    break
            if mut2 == "" :
                ref2 = '*'
                mut2 = '*'
    else :
        raise AssertionError( 'switch must be snp or indel' )

    #not ideal, but when SeattleSeq reports 'N' a wildcard is
    #the best we can do
    if mut2 == 'N' :
        mut2 = '*'
    #print "returning: ", chrom2, pos2, ref2, mut2
    return (globes.chromNum(chrom2),pos2,ref2,mut2)