def translate_alignment_exons_for_protein(protein_id, exon_number):
    '''
    Builds and stores the translated protein of every species returned by
    get_species_list for the given protein, using the "sw_gene" exon
    alignments, and then creates the protein alignment for that species.

    Species for which the exon container has no "sw_gene" entry are logged,
    collected, written to the status file and returned.

    @param protein_id: id used to look up the reference protein, the exons
                       and the assembled protein directory
    @param exon_number: handed over unchanged to chop_off_start_utr and
                        chop_off_end_utr
    @return: list of species for which no exons were available
    '''
    algorithm = "sw_gene"

    # utilities
    logger = Logger.Instance()
    dc = DirectoryCrawler()
    translation_logger = logger.get_logger("translator")

    # containers
    eec = EnsemblExonContainer.Instance()
    ec = ExonContainer.Instance()
    pc = ProteinContainer.Instance()

    failed_species = []
    assembled_protein_path = dc.get_assembled_protein_path(protein_id)

    for species in get_species_list(protein_id, assembled_protein_path):

        # per-species inputs: output fasta path and the reference sequence
        assembled_protein_fasta = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
        target_prot_seq = pc.get(protein_id).get_sequence_record().seq

        try:
            aligned_exons = ec.get((protein_id, species, algorithm))
        except KeyError:
            translation_logger.error("%s,%s,%s" % (protein_id, species, "No exons available"))
            failed_species.append(species)
            continue

        # NOTE: the original author marked this UTR trimming loop as temporary
        translated_exons = []
        stop_translating = False
        for aligned in aligned_exons.get_ordered_exons():

            candidate = Exon_translation(eec.get(aligned.ref_exon_id), aligned)

            # once chop_off_start_utr has raised the flag, every later exon
            # is marked as not viable
            if stop_translating:
                candidate.viability = False

            if candidate.viability:
                candidate, stop_translating = chop_off_start_utr(
                    aligned.ref_exon_id, candidate, target_prot_seq, exon_number)
                candidate = chop_off_end_utr(
                    aligned.ref_exon_id, candidate, target_prot_seq, exon_number, protein_id)

            translated_exons.append(candidate)

        assemble_and_store_protein(protein_id, species, translated_exons,
                                   target_prot_seq, assembled_protein_fasta)
        create_protein_alignment(protein_id, species)

    write_failed_species_to_status(failed_species, assembled_protein_path)
    return failed_species
def set_frames_to_coding_exons_batch(protein_list):
    '''
    Calls set_coding_exon_frames() on the "ensembl" exons of every species
    (as returned by get_species_list) of every protein in the list.

    The operation is best effort: a failure for one (protein, species) pair
    never aborts the batch. Unlike before, such failures are no longer
    swallowed silently - they are reported through the logging module so
    that missing exons or real errors can be noticed.

    @param protein_list: iterable of protein ids
    '''
    # function-scope import so the block does not depend on the file header
    import logging

    log = logging.getLogger(__name__)
    exon_container = ExonContainer.Instance()

    for protein_id in protein_list:
        for species in get_species_list(protein_id, None):
            try:
                exons = exon_container.get((protein_id, species, "ensembl"))
                exons.set_coding_exon_frames()
            except Exception as exc:
                # keep going with the next species, but leave a trace
                log.warning("Could not set coding exon frames for %s, %s: %r",
                            protein_id, species, exc)
# Example no. 3
# 0
def create_msa_alignments ():
    '''
    For every protein from get_protein_list whose status file check passes,
    writes two multi-fasta files into the protein's mafft directory and runs
    the generated mafft command on each of them:

      msa_exoloc.fa  - the reference protein followed by every fasta found in
                       the assembled protein directory except Homo_sapiens.fa
      msa_ensembl.fa - the reference protein followed by the protein-container
                       record of every species other than Homo_sapiens

    The id of each species record taken from the protein container is
    overwritten in place with the species name.
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, _exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = pc.get(prot_id).get_sequence_record()

        # reference + assembled proteins read back from disk
        assembled_dir = dc.get_assembled_protein_path(prot_id)
        exoloc_proteins = [reference_record] + [
            load_fasta_single_record("%s/%s" % (assembled_dir, fasta_name), IUPAC.protein)
            for fasta_name in sorted(os.listdir(assembled_dir))
            if fasta_name != "Homo_sapiens.fa"
        ]

        # reference + proteins held in the protein container, labelled by species
        ensembl_proteins = [reference_record]
        for species in get_species_list(prot_id, None):
            if species == "Homo_sapiens":
                continue
            species_record = pc.get(dmc.get((prot_id, species)).protein_id).get_sequence_record()
            species_record.id = species
            ensembl_proteins.append(species_record)

        msa_exoloc_path = "%s/msa_exoloc.fa" % dc.get_mafft_path(prot_id)
        msa_ensembl_path = "%s/msa_ensembl.fa" % dc.get_mafft_path(prot_id)

        write_seq_records_to_file(msa_exoloc_path, exoloc_proteins)
        write_seq_records_to_file(msa_ensembl_path, ensembl_proteins)

        # align both inputs, exoloc first
        mafft_jobs = (
            (msa_exoloc_path, "%s/msa_exoloc.afa"),
            (msa_ensembl_path, "%s/msa_ensembl.afa"),
        )
        for unaligned_path, aligned_template in mafft_jobs:
            cmd = acg.generate_mafft_command(unaligned_path, aligned_template % dc.get_mafft_path(prot_id))
            print(cmd)
            os.system(cmd)
# Example no. 4
# 0
def create_species_msa_alignments ():
    '''
    For every protein from get_protein_list whose status file check passes,
    creates one mafft alignment per species other than Homo_sapiens.

    Each alignment input consists of:
      1. the reference protein record (id set to "Homo_sapiens"),
      2. the species' record from the protein container (id set to the
         species name),
      3. the assembled protein read from <assembled_dir>/<species>.fa, if
         that file exists.

    The input is written to <mafft_dir>/<species>.fa, the generated mafft
    command (output <mafft_dir>/<species>.afa) is printed and executed, and
    the input file is removed afterwards.

    Changes compared to the previous version:
      - the record list is only built for species that are actually
        processed (it used to be created before the Homo_sapiens skip);
      - the "only one record" guard was removed: it could never trigger,
        because the list always holds the reference and the species record;
      - the assembled fasta is detected with a direct file check instead of
        listing the whole directory once per species.
    '''
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"

        assembled_dir = dc.get_assembled_protein_path(prot_id)
        species_list = get_species_list(prot_id, None)

        for species in species_list:

            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            prot_rec = pc.get(data_map.protein_id).get_sequence_record()
            prot_rec.id = species
            protein_recs = [ref_prot_rec, prot_rec]

            # the assembled protein is optional
            assembled_fasta = "%s/%s.fa" % (assembled_dir, species)
            if os.path.isfile(assembled_fasta):
                protein_recs.append(load_fasta_single_record(assembled_fasta, IUPAC.protein))

            mafft_dir = dc.get_mafft_path(prot_id)
            msa_species_path = "%s/%s.fa" % (mafft_dir, species)
            write_seq_records_to_file(msa_species_path, protein_recs)

            cmd = acg.generate_mafft_command(msa_species_path, "%s/%s.afa" % (mafft_dir, species))
            print(cmd)
            os.system(cmd)

            # the unaligned input is only a temporary file
            os.remove(msa_species_path)
 def get_SW_exon_targets(self, protein_id):
     '''
     Returns whatever get_species_list yields for the protein when it is
     given the SW exon path obtained from this object's crawler.

     @param protein_id: protein whose SW exon path is looked up
     @return: result of get_species_list - presumably the species still
              lacking a SW exon alignment for that protein (per the
              original note; confirm against get_species_list)
     '''
     return get_species_list(protein_id, self.crawler.get_SW_exon_path(protein_id))
# Example no. 6
# 0
def main ():
    '''
    For every protein from get_protein_list with at most 15 exons, and for
    every species from get_species_list, decides on the best exon alignments
    (BestProteinProduct), registers them in the BestExonAlignmentContainer,
    prints a per-exon report and writes the assembled species protein to
    <assembled_protein_path>/<species>.fa in fasta format.

    Proteins without "Homo_sapiens" / "ensembl" exons are reported and
    skipped. Any exception raised while processing a (protein, species) pair
    is caught; the pair is printed and written to the error file.
    '''
    #ERROR FILE:::
    # NOTE(review): hard-coded absolute path; the file is not closed anywhere
    # in the lines visible here.
    err_f = open('/home/marioot/err_status_monday.txt', 'w')

    fill_all_containers(True)
    
    protein_tuples = get_protein_list()
    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()
    
    for (protein_id, exon_num) in protein_tuples:
        
        # proteins with more than 15 exons are skipped
        if int(exon_num) > 15:
            print "too big"
            continue
        
        species_list = get_species_list(protein_id, None)
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue
        
        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                # written for every pair before processing and without a
                # trailing newline; failing pairs are written again (with a
                # newline) in the except branch below
                err_f.write("%s, %s" % (protein_id, species))
                
                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                #bpp.patch_interexon_AAS()
                
                # register and report the best alignment of each coding exon
                for ref_exon in ref_exons.get_coding_exons():
                    
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)
                        if best_exon_alignment.sw_gene_alignment:
                            # translation of the reference exon starting at its frame offset
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded  %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))
                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                # trailing comma: the line is continued by the prints below
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start,
                                                                            al_piece.ref_protein_stop,
                                                                            al_piece.genomic_start, 
                                                                            al_piece.genomic_stop, 
                                                                            al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    # only terminates the line started by the trailing-comma print
                                    print
                                    
                # store the assembled species protein as a single-record fasta
                whole_prot =  bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")
                
                # NOTE(review): lookup of one hard-coded exon id - looks like
                # leftover debugging output; confirm before relying on it
                print beac.get("ENSE00002199725", species)
            # NOTE(review): every exception is caught and the exception object
            # itself is never printed or logged, only the failing pair
            except Exception, e:
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))