Exemple #1
0
def parseForINDEL( indel_file ) :
    """Write one CSV line per indel record of a tab-separated variant file.

    The output goes to
    ``<globes.DATA_DIR>/intermediate_data/sift/input/<name>_sift_input.csv``
    where ``<name>`` is the base name of ``indel_file`` up to its first '.'.
    Each output line is ``chrom,start,end,strand,allele``:

    * insertion (ref is one base, mut is longer): start == end == pos and
      allele is the full ``mut`` column
    * deletion (ref is longer, mut is one base): end is start plus the number
      of removed bases and allele is the literal ``-/``

    strand is 1 or -1, read from the ``refseq.transcriptStrand`` (or
    ``refseq.transcriptStrand_1``) info entry; '+' is assumed when neither
    entry is present.

    indel_file -- path of the input file; header lines are consumed by
                  ``broad.getPatients`` before the data rows are read.

    Raises Exception if the strand entry is neither '+' nor '-', and
    AssertionError if a row is neither a simple insertion nor a simple
    deletion.
    """
    # Single-argument parenthesised print: same output under Python 2.
    print(indel_file)

    # Take the base name first and only then cut at the first '.', so that a
    # dot in a directory name (or a leading "./") cannot corrupt the output
    # name, and a name without any extension no longer fails to unpack.
    fname = indel_file.split('/')[-1].split('.',1)[0]
    out_path = "%s/intermediate_data/sift/input/%s_sift_input.csv" \
                    % (globes.DATA_DIR, fname)

    indexOf = broad.COLUMN_MAP
    col_keys = ['chrom','pos','mut','ref']
    strand_keys = ["refseq.transcriptStrand", "refseq.transcriptStrand_1"]

    # Context managers guarantee both files are closed even when a row raises.
    with open( indel_file ) as fin :
        with open( out_path, 'wb' ) as fout :
            print(fout)

            #fast-forward through header lines (return value is not needed)
            broad.getPatients( fin )

            for dataline in fin :
                stripped = dataline.strip()
                if not stripped :
                    #tolerate blank lines, e.g. a trailing newline at EOF
                    continue

                splt = stripped.split('\t')
                chrom,pos,mut,ref = [ splt[ indexOf[k] ] for k in col_keys ]

                dinfo = broad.makeInfoDict( splt[ indexOf["info"] ] )
                strand = '+' #dont have it guess '+'
                for key in strand_keys :
                    try :
                        strand = dinfo[key]
                    except KeyError :
                        continue
                    break

                if strand == "+" : strand = 1
                elif strand == "-" : strand = -1
                else : raise Exception("Strand is not + or - ??")

                isInsertion = len(ref) == 1 and len(mut) > 1
                isDeletion = len(ref) > 1 and len(mut) == 1
                if isInsertion :
                    start = int(pos)
                    end = start
                    # NOTE(review): mut is written unchanged, including its
                    # first base -- confirm that is what the consumer expects.
                    allele = mut
                elif isDeletion :
                    start = int(pos)
                    end = start + (len(ref)-len(mut))
                    allele = '-/'
                else :
                    # Raised explicitly (same exception type as the former
                    # `assert False`) so the check survives `python -O`;
                    # otherwise the previous row's values would be written.
                    raise AssertionError(
                        "not a simple insertion or deletion: ref=%r mut=%r"
                        % (ref, mut) )

                fout.write( "%s,%d,%d,%d,%s\n" % (chrom,start,end,strand,allele) )
    def integrator( self, target, splts ) :
        """Build a Variant from exactly one already-split data row.

        target -- not used by this method; kept so the signature is unchanged.
        splts  -- sized collection holding exactly one row, the row being a
                  sequence of column strings.

        Every column named in ``broad.COLUMN_MAP`` is copied into the fields
        dict (looked up through ``self.indexOf``), with three special cases:

        * "chrom" is converted with ``globes.chromNum``
        * "info" contributes only its "AF" entry, stored under "AF"
        * "dbSNP" is stored as the integer of the first rs id, or left out
          entirely when the column is '.'

        Calls from column ``broad.CALL_START`` onwards become BaseCall objects
        when their genotype is mutated or carries no information.

        Returns ``Variant( fields, base_calls )``.

        Raises AssertionError if ``splts`` does not hold exactly one row or
        if the dbSNP column is neither '.' nor starts with 'rs'.
        """
        # The former `assert "len isn't right"` asserted a non-empty string
        # and therefore never fired: zero rows ended in a NameError at the
        # return, several rows silently kept only the last one.
        if len(splts) != 1 :
            raise AssertionError("len isn't right")
        (splt,) = splts

        ##TODO generalize this to make it vendor independent, call start column is feature of VCF, not broad???
        base_calls = []
        for pat_ix,c in enumerate( splt[ broad.CALL_START: ] ) :
            sc = broad.splitCall(c)
            gt = broad.convertGT( sc )
            if broad.isMutated( gt ) or broad.noInf( gt ) :
                base_calls.append( BaseCall(sc,pat_ix) )

        fields = {}
        for k in broad.COLUMN_MAP :
            if k == "chrom" :
                fields[k] = globes.chromNum( splt[self.indexOf[k]] )
            elif k == "info" :
                ##TODO generalize this to make it vendor independent
                dinfo = broad.makeInfoDict( splt[ self.indexOf[k] ] )
                fields["AF"] = dinfo["AF"]
            elif k == "dbSNP" :
                value = splt[self.indexOf[k]]

                #when we getFields, 'dbsnp' will be missing and yield null
                if value == '.' : pass
                #right now just take the first rs number if multiple;
                #only that one is parsed, so later ids cannot break the row
                elif value.startswith('rs') :
                    fields[k] = int( value.split(';')[0].strip()[2:] )
                else :
                    # Single-argument print: same output as the Python 2
                    # statement `print "malformed rs number?", splt`.
                    print("%s %s" % ("malformed rs number?", splt))
                    # Explicit raise so the check is not stripped by -O.
                    raise AssertionError("malformed dbSNP value: %r" % (value,))
            else :
                fields[k] = splt[ self.indexOf[k] ]

        #according to: http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper's_VCF_files
        #ref and alt are always given for the forward strand
        fields['strand'] = True

        return Variant( fields, base_calls )