Ejemplo n.º 1
0
def pep_seqs (cursor, gene_id, exons):
    

    for exon in exons:
        #####################################                
        if (not exon.is_coding):
            if verbose: print exon.exon_id,  "is not coding "
            continue
        if (exon.covering_exon > 0):
            if verbose: print exon.exon_id,  "has covering exon"
            continue 
        exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
        if (not exon_seqs):
            if verbose: print exon.exon_id,  "no exon_seqs"
            continue                   
        [exon_seq_id, pepseq, pepseq_transl_start, 
         pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs
        if len(dna_seq)<4:
            if verbose: print exon.exon_id,  "short dna"
            continue

        #####################################                
        mitochondrial        = is_mitochondrial(cursor, gene_id)
        [seq_start, seq_end] = translation_bounds (cursor, exon.exon_id, verbose)
        if verbose: print " ** ", seq_start, seq_end
        dna_cropped          = crop_dna (seq_start, seq_end, dna_seq)
        if verbose: print " ** ", dna_cropped
        [offset, length_translated, pepseq, phase_corrected] = translate (dna_cropped, exon.phase, mitochondrial, verbose)

        if ( offset < 0): #  translation failure; usually some short pieces (end in pos 4 and such)
            if verbose: 
                print exon.exon_id,  "translation failure"
                print "mitochondrial:", mitochondrial
                print seq_start, seq_end
            continue

        if seq_start is None: seq_start = 1
        if seq_start == 0: seq_start = 1
        start = seq_start+offset-1
        end   = start + length_translated

        dnaseq  = Seq (dna_seq[start:end], generic_dna)
        if (mitochondrial):
            pepseq2 = dnaseq.translate(table="Vertebrate Mitochondrial").tostring()
        else:
            pepseq2 = dnaseq.translate().tostring()

        if (not pepseq == pepseq2):
            start = -10
            end   = -10
            
        if verbose: 
            print exon.exon_id
            print "pep from translate:", pepseq
            print "dna transl:", pepseq2
            print "start:" , start
            print "end:",  end
            print

        if True:
            qry  = "update exon_seq "
            qry += " set protein_seq   = '%s',  " %  pepseq
            qry += " pepseq_transl_start =  %d, " %  start
            qry += " pepseq_transl_end   =  %d  " %  end
            qry += " where exon_seq_id =  %d    " %  exon_seq_id
            rows = search_db (cursor, qry)
            if (rows):
                rows = search_db (cursor, qry, verbose = True)
                continue
Ejemplo n.º 2
0
def alt_splice_almt (cursor, cfg, acg, species, ensembl_db_name):

    flank_length = 10

    print "############################"
    print 'checking alt splicing in ', species

    qry = "use " + ensembl_db_name[species]
    search_db(cursor, qry)
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    if species == 'homo_sapiens':
        spec_short = 'HSA'
    else:
        spec_short = 'MMU'

    outdir   = "{0}/alt/{1}".format(cfg.dir_path['afs_dumps'], spec_short)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ########################################
    ########################################
    ########################################

    #gene_ids.reverse()

    for gene_id in gene_ids:
    #for gene_id in [429349]:
    #for count in range(1000):
        #gene_id = choice (gene_ids)

        stable_gene_id = gene2stable(cursor, gene_id)
        if verbose: print  gene_id, stable_gene_id, get_description (cursor, gene_id)
        transcript_ids = get_transcript_ids(cursor, gene_id)

        tr_w_ccds = []
        for [tr_id, tr_stable] in transcript_ids:
            ccds = check_ccds (cursor, tr_stable)
            if not ccds: continue
            tr_w_ccds.append([tr_id, tr_stable])

        if not tr_w_ccds: continue

        # get all exons for this gene
        all_exons    = gene2exon_list (cursor, gene_id)
        
        exons_w_ccds = set([]) # get the unique_ids

        # find exons which are on the ccds list
        for [tr_id, tr_stable] in tr_w_ccds:
            exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            exons_w_ccds.update( set(exon_ids))
           
        # for these exons check sequence
        is_known = 1
        bad_exon = set([])
        for exon_id in exons_w_ccds:
            exon = get_exon      (cursor, exon_id, is_known)
            seq  = get_exon_seqs (cursor, exon_id, is_known)
            if not seq:
                bad_exon.add(exon_id)
                continue
            [exon_seq_id, protein_seq, pepseq_transl_start, 
             pepseq_transl_end, left_flank, right_flank, dna_seq] = seq

            if exon.covering_exon < 0:
                if not dna_seq:
                     bad_exon.add(exon_id)
            else:
                if exon.covering_exon_known and exon.covering_exon in exons_w_ccds:
                    pass
                else:
                    all_exon_ids =  map(lambda exon: exon.exon_id, all_exons)
                    if not exon.covering_exon in all_exon_ids:
                        bad_exon.add(exon_id)
                        
        # which transcripts seem to be completely ok?
        if verbose: print  "reconstructing alt splice almts for "
        if verbose: print  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)
        if verbose: print "there are ", len(tr_w_ccds), " transscripts with ccds"

        # get the gene_sequence
        ret = get_gene_seq(acg, cursor, gene_id, species)
        [gene_seq, canonical_exon_pepseq, file_name, seq_name, seq_region_start, seq_region_end]  = ret
        output_seq    = {}
        global_boundaries = []
        local_boundaries  = {}



        # sort exons by the order in which they appear in the gene
        all_exons.sort(key=lambda exon: exon.start_in_gene)


        # a bit of a cleanup
        for exon in all_exons:
            cleanup_endphase (cursor, exon)


        # check if any of the translations is complete:
        no_ok_transcripts = True
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue

            if verbose: print tr_stable, " ok "
            no_ok_transcripts = False

        if no_ok_transcripts:
            if verbose: print " no ok transcripts found"
            continue

        # main loop
        cary = "" # for patching up codons split by intron
        for [tr_id, tr_stable] in tr_w_ccds:
            tr_exon_ids =  transcript_id2exon_ids (cursor, tr_id)
            if bad_exon & set(tr_exon_ids): continue

            # translation is from where to where?
            ret = get_translation_coords (cursor, tr_id)
            if not ret:
                continue
            [seq_start, start_exon_id, seq_end, end_exon_id] = ret
            for exon in all_exons:
                if exon.exon_id == start_exon_id: start_exon=exon
                if exon.exon_id == end_exon_id:   end_exon=exon
           
            transl_start_in_gene = start_exon.start_in_gene + seq_start
            transl_end_in_gene   =   end_exon.start_in_gene + seq_end
                

            local_boundaries[tr_stable] = []
            output_seq[tr_stable] = "-"*len(gene_sequence)
            output_seq[tr_stable+"_pep"] = "-"*len(gene_sequence)
            transl_end = ""

            for exon in all_exons:
                if not exon.exon_id in tr_exon_ids: continue
                
                start       = exon.start_in_gene
                start_flank = exon.start_in_gene - flank_length
                if start_flank  < 0: 
                    start_flank  = 0
                else:
                    if not start_flank-1 in global_boundaries: global_boundaries.append(start_flank-1)
                    local_boundaries[tr_stable].append(start_flank)

                end       = exon.end_in_gene
                end_flank = exon.end_in_gene + flank_length
                if end_flank > len(gene_sequence): 
                    end_flank = len(gene_sequence)
                else:
                    if not end_flank in global_boundaries: global_boundaries.append(end_flank)
                    local_boundaries[tr_stable].append(end_flank)
                
                tmp_dna  = output_seq[tr_stable][:start_flank]  + gene_sequence[start_flank:start].lower()
                tmp_dna += gene_sequence[start:end]
                tmp_dna += gene_sequence[end:end_flank].lower() + output_seq[tr_stable][end_flank:]

                output_seq[tr_stable] = tmp_dna


                #################################################
                # now try and handle translation to protein
                prev_transl_end = transl_end

                # where does translation start:
                if exon.end_in_gene < transl_start_in_gene:
                    transl_start = -1
                elif exon.exon_id == start_exon_id:
                    # if this is the first exon, the transl start given above
                    transl_start =  exon.start_in_gene+seq_start-1
                else:
                    # otherwise it is the exon start - except that if this is not the
                    # first exon and the codon is split, we want to start with the
                    # translation of the stitched up exon
                    transl_start = exon.start_in_gene
                    start_flank  = exon.phase

                # where does translation end: 
                if exon.start_in_gene >  transl_end_in_gene:
                    transl_end  = -1
                elif exon.exon_id == end_exon_id:
                    # if this is the first exon, the transl start given above
                    transl_end = exon.start_in_gene+seq_end
                else:
                    # otherwise it is the exon start - except that if this is not the
                    # first exon and the codon is split, we want to start with the
                    # translation of the stitched up exon
                    transl_end  = exon.end_in_gene - exon.end_phase+1
                    end_flank   = exon.end_phase


                if transl_start < 0 or transl_end < 0 :
                     continue

                if exon.phase > 0 and  prev_transl_end:
                    cary = gene_sequence[prev_transl_end:prev_transl_end+exon.phase]
                else:
                    cary = ""

                [phase, pepseq] = translate (cary + gene_sequence[transl_start:transl_end], 
                                             0,  mitochondrial, strip_stop = False)
                prev_transl_end = transl_end

                pepseq_padded = ""
                for aa in pepseq:
                    pepseq_padded += "-"+aa+"-"

                pepseq_name = tr_stable+"_pep"

                tmp_pep  = output_seq[pepseq_name][:transl_start-len(cary)] 
                tmp_pep += pepseq_padded
                tmp_pep += output_seq[pepseq_name][transl_end:]

                output_seq[pepseq_name] = tmp_pep



        global_boundaries.sort()
        for [tr_id, tr_stable] in tr_w_ccds:
            seq =  output_seq[tr_stable]
            tmp_seq   = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry] 
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]:
                    marker = "-Z-"
                else:
                    marker = "---" 
                tmp_seq += marker 

                prev_bdry = bdry

            output_seq[tr_stable] = tmp_seq

            pepseq_name = tr_stable+"_pep"
            seq =  output_seq[pepseq_name]
            tmp_seq   = ""
            prev_bdry = 0
            for bdry in global_boundaries:
                tmp_seq += seq[prev_bdry:bdry] 
                if bdry >= len(seq): continue
                if bdry in local_boundaries[tr_stable]: # note here
                    marker = "-Z-"
                else:
                    marker = "---" 
                tmp_seq += marker 

                prev_bdry = bdry

            output_seq[pepseq_name] = tmp_seq


        output_seq = strip_gaps(output_seq)


        # define the order in which we  want the sequences output
        name_order = []
        for [tr_id, tr_stable] in tr_w_ccds:
           pepseq_name = tr_stable+"_pep"
           name_order.append (pepseq_name)
           name_order.append (tr_stable)


        afa_fnm  = "{0}/{1}.afa".format(outdir, stable_gene_id)
        ret = output_fasta (afa_fnm, name_order, output_seq)

        print afa_fnm

    return True