Example #1
0
    def integrator(self, target, splts) :
        """Build a Variant carrying one JointCall from a single split row.

        The first four columns of the row are paired with the first four
        entries of `headers` to form the variant-level fields; the remaining
        columns are paired with the remaining headers to form the JointCall
        fields, and "GT" is set from convertGT applied to the last nine
        columns.  `target` is not used.

        Note that the first element of the row is replaced in place with the
        result of chromNum.
        """
        #this is a one to many match with SeattleSeq
        assert len(splts) == 1
        row = splts[0]

        #column at which variant fields stop and JointCall fields start
        boundary = 4
        row[0] = chromNum( row[0] )
        variant_fields = dict( zip( headers[:boundary], row[:boundary] ) )

        call = JointCall( self.pat_name )
        call.fields = dict( zip( headers[boundary:], row[boundary:] ) )
        call.fields["GT"] = convertGT( row[-9:] )

        return Variant( variant_fields, [call] )
Example #2
0
    def integrator( self, target, splts ) :
        """Convert the single split row in `splts` into a variant.Variant.

        Every column from CALL_START onward is treated as one patient's call;
        a BaseCall is kept for each call whose genotype is mutated or carries
        no information.  The remaining fields are filled from the columns
        named in COLUMN_MAP, located through self.indexOf.  `target` is not
        used.

        Raises:
            AssertionError: if `splts` does not contain exactly one row, or if
                the dbSNP column is neither '.' nor starts with 'rs'.
        """
        #the previous guard asserted a non-empty string, which is always true,
        #so a wrong length was never detected
        if len(splts) != 1 :
            raise AssertionError( "len isn't right" )
        splt = splts[0]

        base_calls = []
        for pat_ix,c in enumerate( splt[ CALL_START: ] ) :
            pat_name = self.patients[pat_ix]
            sc = splitCall(c)
            gt = convertGT( sc )
            if isMutated( gt ) or noInf( gt ) :
                base_calls.append( variant.BaseCall(sc,pat_name) )

        fields = {}
        for k in COLUMN_MAP :
            value = splt[ self.indexOf[k] ]
            if k == "chrom" :
                fields[k] = globes.chromNum( value )
            elif k == "info" :
                #this information is useless, will get globally updated
                dinfo = makeInfoDict( value )
                fields["AF"] = dinfo["AF"]
            elif k == "dbSNP" :
                #when we getFields, 'dbsnp' will be missing and yield null
                if value == '.' : pass
                #right now just take the first rs number if multiple;
                #later entries are no longer parsed, so they cannot fail
                elif value.startswith('rs') :
                    fields[k] = int( value.split(';')[0].strip()[2:] )
                else :
                    raise AssertionError( "malformed rs number? %s" % (splt,) )
            else :
                fields[k] = value

        #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
        #ref and alt are always given for the forward strand
        fields['strand'] = True

        #TODO
        #This is abusing that fact that VCFSource goes first in the collimator
        #o/w will overwrite what has been put in target
        return variant.Variant( fields, base_calls )
Example #3
0
    def integrator( self, target, splts ) :
        """Convert the single split row in `splts` into a Variant.

        Every column from broad.CALL_START onward is treated as one call; a
        BaseCall (tagged with the call's column index) is kept for each call
        whose genotype is mutated or carries no information.  The remaining
        fields are filled from the columns named in broad.COLUMN_MAP, located
        through self.indexOf.  `target` is not used.

        Raises:
            AssertionError: if `splts` does not contain exactly one row, or if
                the dbSNP column is neither '.' nor starts with 'rs'.
        """
        #the previous guard asserted a non-empty string, which is always true,
        #so a wrong length was never detected
        if len(splts) != 1 :
            raise AssertionError( "len isn't right" )
        splt = splts[0]

        ##TODO generalize this to make it vendor independent, call start column is feature of VCF, not broad???
        base_calls = []
        for pat_ix,c in enumerate( splt[ broad.CALL_START: ] ) :
            sc = broad.splitCall(c)
            gt = broad.convertGT( sc )
            if broad.isMutated( gt ) or broad.noInf( gt ) :
                base_calls.append( BaseCall(sc,pat_ix) )

        fields = {}
        for k in broad.COLUMN_MAP :
            value = splt[ self.indexOf[k] ]
            if k == "chrom" :
                fields[k] = globes.chromNum( value )
            elif k == "info" :
                ##TODO generalize this to make it vendor independent
                dinfo = broad.makeInfoDict( value )
                fields["AF"] = dinfo["AF"]
            elif k == "dbSNP" :
                #when we getFields, 'dbsnp' will be missing and yield null
                if value == '.' : pass
                #right now just take the first rs number if multiple;
                #later entries are no longer parsed, so they cannot fail
                elif value.startswith('rs') :
                    fields[k] = int( value.split(';')[0].strip()[2:] )
                else :
                    raise AssertionError( "malformed rs number? %s" % (splt,) )
            else :
                fields[k] = value

        #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
        #ref and alt are always given for the forward strand
        fields['strand'] = True

        return Variant( fields, base_calls )
Example #4
0
    def eqkey( self, out_splt ) :
        """Return the matching key for one split output row.

        For self.switch == 'snp' the key is
        (globes.chromNum(chrom), position, reference, change), with the last
        three values taken unchanged from the columns located through
        self.indexOf.

        For self.switch == 'indel' the row's position is shifted back by one
        and the leading character of the change is dropped before the values
        are handed to globes.compareVariants (see the inline examples).

        Raises:
            AssertionError: if self.switch is neither 'snp' nor 'indel', or
                if the "change type" column is neither 'INS' nor 'DEL'.
        """
        if self.switch == 'snp' :
            keys = ['chrom','position','reference','change']
            (chrom,pos,ref,mut) = [ out_splt[ self.indexOf[k] ] for k in keys ]
            return (globes.chromNum(chrom),pos,ref,mut)

        #was `sel.switch`, which raised NameError for every non-snp call
        if self.switch != 'indel' :
            raise AssertionError( 'switch must be snp or indel' )

        keys = ['chrom','position','reference','change', "change type"]
        (chrom2,pos2,ref2,mut2,change_type) = \
                          [ out_splt[ self.indexOf[k] ] for k in keys ]

        #NOTE(review): chrom1, pos1, ref1 and mut1 are not defined anywhere in
        #this method, so both calls below raise NameError as written.  They
        #look like the other variant's fields from a comparison routine --
        #confirm where they should come from before relying on this branch.
        if change_type == 'INS' :
            #example:
            # 1   866511  rs60722469  C   CCCCT
            # 1   866512              *   +CCCT   INS
            return globes.compareVariants( chrom1, pos1, ref1, mut1[1:], \
                                           chrom2, int(pos2)-1, ref2, mut2[1:] )
        elif change_type == 'DEL' :
            #1   874864  .   CT  C
            #1   874865       * -T  DEL Het
            return globes.compareVariants( chrom1, pos1, ref1[1:], mut1, \
                                           chrom2, int(pos2)-1, mut2[1:], ref2 )

        raise AssertionError( "change_type %s is not INS or DEL" % change_type )
Example #5
0
def sortHelper( it ) :
    """Sort key: (chromNum of the first element, second element as an int)."""
    chrom_key = chromNum( it[0] )
    pos_key = int( it[1] )
    return (chrom_key, pos_key)
Example #6
0
    def getPosition( self, out_splt ) :
        """Return (chromosome number, position, reference, mutation) for a row.

        Columns are located through self.indexOf.  The chromosome is passed
        through globes.chromNum; the position is returned exactly as read.
        '*' is used as a wildcard whenever the mutated allele (or, for
        deletions, the reference as well) cannot be determined.

        Raises:
            AssertionError: if self.switch is neither 'snp' nor 'indel', or
                if a snp row's sampleAlleles has more than two '/'-separated
                parts.
        """
        if self.switch == 'snp' :
            #sampleAlleles is used rather than sampleGenotype; the branch
            #that read sampleGenotype was disabled (`if False`) and is gone
            keys = ["chromosome","position","referenceBase","sampleAlleles"]
            (chrom2,pos2,ref2,als) = [out_splt[self.indexOf[k]] for k in keys]
            sp = als.split('/')
            #split up the sample alleles, find which one matches the ref
            if len(sp) == 2 :
                (a1,a2) = sp
                if   a1 == ref2 : mut2 = a2
                elif a2 == ref2 : mut2 = a1
                #neither allele is the reference: becomes the wildcard below
                else : mut2 = 'N'
            elif len(sp) == 1 :
                mut2 = sp[0]
            else :
                #explicit raise: a bare assert is stripped under -O, which
                #would leave mut2 unbound
                raise AssertionError( "length of sampleAlleles not == 2 or 1" )

        #what is going on with the *'s, exactly?
        elif self.switch == 'indel' :
            keys = ['chromosome','position','referenceBase','sampleGenotype']
            (chrom2,pos2,ref2,sg) = [out_splt[self.indexOf[k]] for k in keys]
            samples = sg.split(',')
            mut2 = ""

            # go through each sample, find the first allele different
            # from the ref
            # when we can't match, use wildcard '*',
            # globes.compareVariants will know how to use
            if '-' in ref2 :
                #insertion: only the first character of the reference is kept
                ref2 = ref2[0]
                for sample in samples :
                    (one,two) = sample.split('/')
                    if ref2 != one :
                        mut2 = one
                        break
                    elif ref2 != two :
                        mut2 = two
                        break
                if mut2 in ('N', '') : mut2 = '*'

            else :
                #deletion: look for an allele equal to the reference once its
                #first character is dropped
                for sample in samples :
                    (one,two) = sample.split('/')
                    if one[1:] == ref2 :
                        ref2 = one
                        mut2 = one[0]
                        break
                    elif two[1:] == ref2 :
                        ref2 = two
                        mut2 = two[0]
                        break
                if mut2 == "" :
                    ref2 = '*'
                    mut2 = '*'

        else :
            raise AssertionError( 'switch must be snp or indel' )

        if mut2 == 'N' : mut2 = '*'
        #print 'seattle pos: ',chrom2, pos2, ref2, mut2
        return (globes.chromNum(chrom2),pos2,ref2,mut2)
Example #7
0
def sortHelper( splt ) :
    """Sort key built from columns 1 and 2 of a split row.

    Column 1 goes through globes.chromNum and column 2 is converted to int.
    """
    chrom_key = globes.chromNum( splt[1] )
    pos_key = int( splt[2] )
    return (chrom_key, pos_key)
Example #8
0
# Input file: only the first two whitespace-separated tokens of each line
# (chromosome, position) are read below.
# NOTE(review): neither file handle is closed in the code visible here --
# confirm they are closed further down.
fin = open("../preliminary/%s" % fname)
# Output file: tab-separated, named after `patient`; the header row comes from
# queries.column_headers.
fout = open("../preliminary/%s.functional.tsv" % patient,'w')
csvout = csv.writer( fout,\
                     delimiter='\t', \
                     quoting=csv.QUOTE_MINIMAL )
csvout.writerow( queries.column_headers )

# Running counters.  NOTE(review): only `hits` is updated in the visible part
# of the loop; `already_called` and `total` are presumably used further down.
hits = 0
already_called = 0
total = 0

for line in fin.readlines() :
    splt = line.split()
    chrom, pos = splt[0], splt[1]
    #print chrom, pos
    chrom = globes.chromNum( chrom )
    # skip lines for which globes.chromNum returns a falsy value
    if not chrom : continue

    # Look up variants at this chrom/pos with AF < .1, joined to their
    # isoforms and genes; the column lists and the extra filter come from the
    # queries module.
    # NOTE(review): values are interpolated straight into the SQL text; if the
    # input file is not trusted, switch to a parameterized query.
    query = '''
select %s,%s,%s
from Variants as v inner join Isoforms as i on i.var_id = v.id
                   inner join Genes as g on g.id = i.gene_id
where chrom = %s and pos = %s and AF < .1 and (%s)''' % \
    (queries.vcols_string, queries.icols_string, queries.gcols_string, \
     chrom, pos, queries.gvs)


    rows = conn.query(query)
    num_rows = len(rows)
    # a position counts as a hit when the query returns at least one row
    if num_rows > 0 :
        hits += 1
Example #9
0
def sortHelper( line ) :
    """Sort key for a raw text line.

    The first tab-separated column of the stripped line is split on commas;
    its first two parts are read as chromosome and location.  Returns
    (globes.chromNum(chromosome), int(location)).
    """
    first_column = line.strip().split('\t')[0]
    chrom, loc = first_column.split(',')[0:2]
    return ( globes.chromNum(chrom), int(loc) )
Example #10
0
    def getPosition( self, out_splt ) :
        """Return (chromosome number, position, reference, mutation) for a row.

        Columns are located through self.indexOf.  The chromosome is passed
        through globes.chromNum; the position is returned exactly as read.
        '*' is used as a wildcard whenever the mutated allele (or, for
        deletions, the reference as well) cannot be determined.

        Raises:
            AssertionError: if self.switch is neither 'snp' nor 'indel'; if a
                snp row's sampleAlleles has more than two '/'-separated
                parts; or if it has two parts and neither equals the
                reference base.
        """
        if self.switch == 'snp' :
            #sampleAlleles is used rather than sampleGenotype; the branch
            #that read sampleGenotype was disabled (`if False`) and is gone
            keys = ["chromosome","position","referenceBase","sampleAlleles"]
            (chrom2,pos2,ref2,als) = [out_splt[self.indexOf[k]] for k in keys]
            sp = als.split('/')
            #explicit raises below: a bare assert is stripped under -O, which
            #would leave mut2 unbound
            if len(sp) == 2 :
                (a1,a2) = sp
                if a1 == ref2 : mut2 = a2
                elif a2 == ref2 : mut2 = a1
                else :
                    raise AssertionError(
                        "Have a problem with figuring out mut2" )
            elif len(sp) == 1 :
                mut2 = sp[0]
            else :
                raise AssertionError( "length of sampleAlleles not == 2 or 1" )

        #what is going on with the *'s, exactly?
        elif self.switch == 'indel' :
            keys = ['chromosome','position','referenceBase','sampleGenotype']
            (chrom2,pos2,ref2,sg) = [out_splt[self.indexOf[k]] for k in keys]
            #print "readiung: ", chrom2, pos2, ref2, sg
            samples = sg.split(',')
            mut2 = ""

            if '-' in ref2 :
                #insertion: only the first character of the reference is
                #kept; take the first sample allele that differs from it
                ref2 = ref2[0]
                for sample in samples :
                    (one,two) = sample.split('/')
                    if ref2 != one :
                        mut2 = one
                        break
                    elif ref2 != two :
                        mut2 = two
                        break
                if mut2 in ('N', '') : mut2 = '*'
            else :
                #deletion: look for an allele equal to the reference once its
                #first character is dropped
                for sample in samples :
                    (one,two) = sample.split('/')
                    if one[1:] == ref2 :
                        ref2 = one
                        mut2 = one[0]
                        break
                    elif two[1:] == ref2 :
                        ref2 = two
                        mut2 = two[0]
                        break
                if mut2 == "" :
                    ref2 = '*'
                    mut2 = '*'

        else :
            raise AssertionError( 'switch must be snp or indel' )

        #not ideal, but an 'N' allele carries no information, so the
        #wildcard is the best we can do
        if mut2 == 'N' : mut2 = '*'
        #print "returning: ", chrom2, pos2, ref2, mut2
        return (globes.chromNum(chrom2),pos2,ref2,mut2)