def translate_alignment_exons():
    """
    Run exon translation for every protein returned by get_protein_list().

    For each (protein_id, exon_num) pair:
        - if check_status_file() reports a problem, EXON_TRANSLATION is
          recorded as FAILED and the protein is skipped
        - if the status file already records EXON_TRANSLATION as OK, the
          protein is skipped (a missing entry counts as "not translated yet")
        - otherwise translate_alignment_exons_for_protein() is called; a
          truthy return value (the failed species) is recorded as PARTIAL,
          anything else as OK
    """
    for protein_id, exon_num in get_protein_list():

        # Upstream resources failed: mark the translation as failed as well.
        if not check_status_file(protein_id):
            print("ABORTING {0} TRANSLATION: some resources have FAILED stats!".format(protein_id))
            update_entry_in_status_file(protein_id, "EXON_TRANSLATION", "FAILED")
            continue

        # A KeyError simply means the translation has never been recorded.
        try:
            already_translated = read_status_file(protein_id)["EXON_TRANSLATION"] == "OK"
        except KeyError:
            already_translated = False

        if already_translated:
            print("SKIPPING {0} TRANSLATION: .status file -> OK!".format(protein_id))
            continue

        print("TRANSLATING EXONS: {0}".format(protein_id))
        failed_species = translate_alignment_exons_for_protein(protein_id, exon_num)
        outcome = "PARTIAL" if failed_species else "OK"
        update_entry_in_status_file(protein_id, "EXON_TRANSLATION", outcome)
def create_msa_alignments():
    """
    Write two multi-record fasta files per protein and align each with mafft.

    For every protein whose status file check passes:
        - msa_exoloc.fa:  the reference protein record followed by every
          record found in the assembled-protein directory (sorted by file
          name, "Homo_sapiens.fa" excluded)
        - msa_ensembl.fa: the reference protein record followed by the
          protein record of every non-human species from get_species_list(),
          with the record id set to the species name
    Both files are written to the protein's mafft directory; the generated
    mafft command for each is printed and run through os.system(), producing
    msa_exoloc.afa and msa_ensembl.afa respectively.
    """
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    def mafft_file(protein, file_name):
        # Path of file_name inside the mafft directory of the given protein.
        return "%s/%s" % (dc.get_mafft_path(protein), file_name)

    for prot_id, _exon_num in get_protein_list():

        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()

        # Reference first, then every assembled (non-human) protein on disk.
        assembled_dir = dc.get_assembled_protein_path(prot_id)
        exoloc_proteins = [ref_prot_rec]
        exoloc_proteins.extend(
            load_fasta_single_record("%s/%s" % (assembled_dir, fasta), IUPAC.protein)
            for fasta in sorted(os.listdir(assembled_dir))
            if fasta != "Homo_sapiens.fa"
        )

        # Reference first, then the container's protein for each other species.
        ensembl_proteins = [ref_prot_rec]
        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                ortholog_id = dmc.get((prot_id, species)).protein_id
                ortholog_rec = pc.get(ortholog_id).get_sequence_record()
                ortholog_rec.id = species
                ensembl_proteins.append(ortholog_rec)

        msa_exoloc_path = mafft_file(prot_id, "msa_exoloc.fa")
        msa_ensembl_path = mafft_file(prot_id, "msa_ensembl.fa")

        write_seq_records_to_file(msa_exoloc_path, exoloc_proteins)
        write_seq_records_to_file(msa_ensembl_path, ensembl_proteins)

        alignment_jobs = (
            (msa_exoloc_path, "msa_exoloc.afa"),
            (msa_ensembl_path, "msa_ensembl.afa"),
        )
        for fasta_path, aligned_name in alignment_jobs:
            cmd = acg.generate_mafft_command(fasta_path, mafft_file(prot_id, aligned_name))
            print(cmd)
            os.system(cmd)
def create_species_msa_alignments():
    """
    Create one mafft alignment per (protein, species) pair.

    For every protein whose status file check passes, and for every species
    from get_species_list() other than Homo_sapiens, the following records
    are aligned together:
        - the reference protein record (id set to "Homo_sapiens")
        - the species' protein record from the protein container (id set to
          the species name)
        - the assembled protein "<species>.fa" from the assembled-protein
          directory, when that file exists
    The records are written to "<mafft dir>/<species>.fa", the generated
    mafft command is printed and run with os.system() to produce
    "<mafft dir>/<species>.afa", and the temporary input fasta is removed.
    """
    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"

        # Both directories depend only on the protein, so look them up once.
        assembled_dir = dc.get_assembled_protein_path(prot_id)
        mafft_dir = dc.get_mafft_path(prot_id)

        for species in get_species_list(prot_id, None):

            # The reference species is never aligned against itself.
            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            prot_rec = pc.get(data_map.protein_id).get_sequence_record()
            prot_rec.id = species

            # There are always at least two records here, so no "nothing to
            # align" check is needed before running mafft.
            protein_recs = [ref_prot_rec, prot_rec]

            # Test for the single file instead of listing the whole directory
            # on every iteration; a missing file just means no assembled
            # protein is added for this species.
            exoloc_fasta = "%s/%s.fa" % (assembled_dir, species)
            if os.path.isfile(exoloc_fasta):
                protein_recs.append(load_fasta_single_record(exoloc_fasta, IUPAC.protein))

            msa_species_path = "%s/%s.fa" % (mafft_dir, species)
            write_seq_records_to_file(msa_species_path, protein_recs)

            cmd = acg.generate_mafft_command(msa_species_path, "%s/%s.afa" % (mafft_dir, species))
            print(cmd)
            os.system(cmd)

            # The unaligned input is only a temporary file for mafft.
            os.remove(msa_species_path)
def main():

    '''
    Run the mutual-best (reciprocal best hit) ortholog search for every
    protein of the reference species.

    For each protein in the protein list:
        - the protein's directory tree is generated
        - if a non-empty mutual-best status file already exists, the protein
          is skipped; the stored MUTUAL_BEST value is only logged (anything
          other than OK also adds the protein to the failed list)
        - otherwise the description file is (re)created, the reference
          protein fasta is extracted with the generated fastacmd command and
          find_ortholog_by_RBH() is called for every default species
        - finally the description file is parsed: when no ab initio entries
          exist and the known entries are empty or contain only the reference
          species, MUTUAL_BEST is recorded as FAILED, otherwise as OK

    The ids of all failed proteins are printed at the end.
    '''

    reference_species = "Homo_sapiens"

    dc = DirectoryCrawler()
    acg = AlignmentCommandGenerator()

    mutual_best_logger = Logger.Instance().get_logger('mutual_best')

    protein_list = get_protein_list()
    species_list = get_default_species_list()
    failed_proteins = []

    for (protein_id, num_of_exons) in protein_list:

        print(protein_id)

        # generate all the directories for the protein
        dc.generate_directory_tree(protein_id)

        descr_file_path = dc.get_protein_description_file_path(protein_id)
        status_file_path = dc.get_mutual_best_status_file_path(protein_id)

        # A non-empty status file means this protein was already processed.
        if os.path.isfile(status_file_path) and os.path.getsize(status_file_path):
            print(DescriptionParser().get_protein_ids(protein_id))

            status_dict = read_status_file(protein_id)
            if 'MUTUAL_BEST' in status_dict:
                if status_dict['MUTUAL_BEST'] == 'OK':
                    mutual_best_logger.info('-,%s,mutual_best already exists for this protein - moving to the next one' % protein_id)
                else:
                    mutual_best_logger.error('-,%s,mutual_best has failed for this protein (no orthologs found) - moving on the next one' % protein_id)
                    failed_proteins.append(protein_id)
            continue

        # reference protein file
        ref_species_pep = dc.get_protein_path(protein_id) + "/" + reference_species + ".fasta"
        fastacmd = acg.generate_fastacmd_protein_command(protein_id, reference_species, "all", ref_species_pep)

        # create the description file; the context manager guarantees it is
        # closed even if the ortholog search raises
        with open(descr_file_path, 'w') as descr_file:

            p = Popen(fastacmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            # communicate() drains the merged stdout/stderr and waits for the
            # child to exit, so no unreaped process is left behind.
            output = p.communicate()[0]
            # Any output at all from the command is treated as an error.
            if output:
                mutual_best_logger.error("%s,fastacmd error" % protein_id)

            # find orthologues for all species
            for species in species_list:
                find_ortholog_by_RBH(reference_species, species, ref_species_pep, protein_id, descr_file, mutual_best_logger)

        mutual_best_logger.info("\n\n")

        # check what we've found out, whether this protein has any orthologs
        (known_dict, abinitio_dict) = DescriptionParser().get_protein_ids(protein_id)
        if not abinitio_dict and (not known_dict or list(known_dict) == [reference_species]):
            mutual_best_logger.info("-,%s, mutual best failed for this protein." % protein_id)
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "FAILED")
            failed_proteins.append(protein_id)
        else:
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "OK")

    print("Failed proteins: ")
    for failed_protein_id in failed_proteins:
        print(failed_protein_id)
def main():
    """
    Call fill_all_containers(True) and then create_statistics() on the
    protein list returned by get_protein_list().
    """
    fill_all_containers(True)
    # The exon container instance is still requested, as before, but its
    # handle is not needed in this function.
    ExonContainer.Instance()
    # translate_alignment_exons() is currently disabled here.

    proteins = get_protein_list()
    create_statistics(proteins)
def _print_alignment_pieces(sw_gene_alignment):
    """Print the number of alignment pieces and one report line per piece."""
    print("\tAdded  %2d alignment pieces" % (len(sw_gene_alignment.alignment_pieces)))
    for al_piece in sw_gene_alignment.alignment_pieces:
        piece_label = "\t\t%s:" % (al_piece.type)
        if al_piece.type in ["coding", "insertion"]:
            # Label and coordinates share one output line.
            print("%s PROT: %d-%d, GENOME: %d-%d, %s" % (piece_label,
                                                         al_piece.ref_protein_start,
                                                         al_piece.ref_protein_stop,
                                                         al_piece.genomic_start,
                                                         al_piece.genomic_stop,
                                                         al_piece.sequence_id))
            print("\t\t\tHUMAN: %s" % (al_piece.ref_protein_seq,))
            print("\t\t\tSPEC : %s" % (al_piece.spec_protein_seq,))
        else:
            print(piece_label)


def _assemble_species_protein(protein_id, species, ref_exons, beac, dc):
    """Pick the best exon alignments for one species and write its assembled protein fasta."""
    bpp = BestProteinProduct(protein_id, species, "Homo_sapiens")
    bpp.load_alignments()
    bpp.decide_on_best_exons()
    # bpp.patch_interexon_AAS() is currently disabled.

    for ref_exon in ref_exons.get_coding_exons():

        best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
        if not best_exon_alignment:
            continue

        beac.add(ref_exon.exon_id, species, best_exon_alignment)
        print("%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id))

        sw_gene_alignment = best_exon_alignment.sw_gene_alignment
        if sw_gene_alignment:
            print(ref_exon.sequence[ref_exon.frame:].translate())
            sw_gene_alignment.create_cDNA()
            _print_alignment_pieces(sw_gene_alignment)

    whole_prot = bpp.get_spec_protein_translation()
    whole_prot_rec = SeqRecord(whole_prot, id=species, description="assembled_protein")
    file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
    SeqIO.write(whole_prot_rec, file_name, "fasta")

    # NOTE(review): hard-coded exon id, looks like leftover debugging output.
    # If this lookup raises for a species, the caller records the pair as
    # failed even though the fasta above was already written - confirm
    # whether this line can be removed.
    print(beac.get("ENSE00002199725", species))


def main(err_file_path='/home/marioot/err_status_monday.txt'):
    """
    Assemble the best protein product for every (protein, species) pair.

    Proteins with more than 15 exons, and proteins without a
    (protein_id, "Homo_sapiens", "ensembl") entry in the exon container, are
    skipped. For every remaining protein and each of its species the best
    exon alignments are chosen, stored in the best exon alignment container
    and reported on stdout, and the assembled protein is written to
    "<assembled protein dir>/<species>.fa".

    err_file_path -- file that receives "<protein>, <species>" before each
                     pair is processed and "<protein> <species> " plus a
                     newline when processing that pair raised. Defaults to
                     the previously hard-coded location.

    NOTE(review): this file contains other definitions named main(); when
    they live in the same module only the last one stays reachable by name -
    confirm that this is intended.
    """
    # The context manager closes (and flushes) the error file on every exit
    # path; previously the handle was never closed.
    with open(err_file_path, 'w') as err_f:

        fill_all_containers(True)

        protein_tuples = get_protein_list()
        ec = ExonContainer.Instance()
        beac = BestExonAlignmentContainer.Instance()
        dc = DirectoryCrawler()

        for (protein_id, exon_num) in protein_tuples:

            if int(exon_num) > 15:
                print("too big")
                continue

            species_list = get_species_list(protein_id, None)
            try:
                ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
            except KeyError:
                print("ERROR: No protein %s" % protein_id)
                continue

            for species in species_list:
                try:
                    print("\nBest_exon_al: %s, %s" % (protein_id, species))
                    err_f.write("%s, %s" % (protein_id, species))
                    _assemble_species_protein(protein_id, species, ref_exons, beac, dc)
                except Exception:
                    # Best effort: record the failing pair and carry on with
                    # the next species instead of aborting the whole run.
                    print('{0} {1} \n'.format(protein_id, species))
                    err_f.write('{0} {1} \n'.format(protein_id, species))