Example #1
0
def translate_alignment_exons():
    """
    Run exon translation for every protein returned by get_protein_list()
    and record the outcome under EXON_TRANSLATION in the status file.

    For each protein:
        - if check_status_file fails, EXON_TRANSLATION is set to FAILED and
          the protein is skipped
        - if the status file already holds EXON_TRANSLATION == OK, the
          protein is skipped (a missing entry counts as "not translated yet")
        - otherwise translate_alignment_exons_for_protein is called; a
          non-empty result (presumably the species that failed) is recorded
          as PARTIAL, an empty one as OK
    """
    status_key = "EXON_TRANSLATION"

    for (protein_id, exon_num) in get_protein_list():

        # resources the translation depends on have failed: give up on this protein
        if not check_status_file(protein_id):
            print("ABORTING {0} TRANSLATION: some resources have FAILED stats!".format(protein_id))
            update_entry_in_status_file(protein_id, status_key, "FAILED")
            continue

        # no EXON_TRANSLATION entry simply means translation never ran
        try:
            already_translated = read_status_file(protein_id)[status_key] == "OK"
        except KeyError:
            already_translated = False

        if already_translated:
            print("SKIPPING {0} TRANSLATION: .status file -> OK!".format(protein_id))
            continue

        print("TRANSLATING EXONS: {0}".format(protein_id))
        failed_species = translate_alignment_exons_for_protein(protein_id, exon_num)

        outcome = "PARTIAL" if failed_species else "OK"
        update_entry_in_status_file(protein_id, status_key, outcome)
Example #2
0
def create_statistics(protein_list):
    """
    Generate the stats.csv file for every protein that passes the status
    file check.

    @param protein_list: iterable of (protein_id, exon_num) pairs; only the
                         protein id is used
    """
    crawler = DirectoryCrawler()

    for (protein_id, _exon_num) in protein_list:

        # the output path is resolved first, the status check comes second
        root_path = crawler.get_root_path(protein_id)
        stat_file = "%s/stats.csv" % root_path

        if check_status_file(protein_id):
            create_protein_statistics(protein_id, stat_file)
def create_msa_alignments ():
    """
    For every protein that passes the status file check, write two multi-FASTA
    files into the protein's mafft directory and run the generated MAFFT
    command on each of them:
        - msa_exoloc.fa  : reference protein + every record found in the
                           assembled protein directory (Homo_sapiens.fa excluded)
        - msa_ensembl.fa : reference protein + the protein mapped to each
                           species in the species list (Homo_sapiens excluded)

    The aligned outputs are msa_exoloc.afa and msa_ensembl.afa.
    Note that the id of each species record is overwritten with the species name.
    """
    crawler           = DirectoryCrawler()
    protein_container = ProteinContainer.Instance()
    data_maps         = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, _exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = protein_container.get(prot_id).get_sequence_record()

        # proteins assembled from the alignments, in file name order
        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        exoloc_records = [reference_record]
        for fasta_name in sorted(os.listdir(assembled_dir)):
            if fasta_name != "Homo_sapiens.fa":
                fasta_path = "%s/%s" % (assembled_dir, fasta_name)
                exoloc_records.append(load_fasta_single_record(fasta_path, IUPAC.protein))

        # proteins as mapped in the data map container
        ensembl_records = [reference_record]
        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                mapped_protein_id = data_maps.get((prot_id, species)).protein_id
                species_record = protein_container.get(mapped_protein_id).get_sequence_record()
                species_record.id = species
                ensembl_records.append(species_record)

        exoloc_input  = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        ensembl_input = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)

        write_seq_records_to_file(exoloc_input, exoloc_records)
        write_seq_records_to_file(ensembl_input, ensembl_records)

        # align the exoloc set first, then the ensembl set
        alignment_jobs = ((exoloc_input, "%s/msa_exoloc.afa"),
                          (ensembl_input, "%s/msa_ensembl.afa"))
        for (msa_input, output_template) in alignment_jobs:
            cmd = command_generator.generate_mafft_command(msa_input,
                                                           output_template % crawler.get_mafft_path(prot_id))
            print(cmd)
            os.system(cmd)
def load_exon_configuration (ref_protein_id, ref_species_dict, exon_type):
    '''
    Load exons of a particular type for all known species of a protein and
    add them to the exon container.

    For exon type "genewise" only the status check and the reference species
    lookup are performed; no exons are loaded.

    @param ref_protein_id: referent protein id
    @param ref_species_dict: maps a species to its reference species; if empty
                             or None, it is fetched with
                             FileUtilities.get_reference_species_dictionary()
    @param exon_type: ensembl, genewise, blastn, tblastn, sw_gene, sw_exon
    @return: False if the status file check fails, None otherwise
    '''

    description_parser  = DescriptionParser()
    exon_container      = ExonContainer.Instance()
    # not used below; the call is kept in case Instance() initialises the singleton
    EnsemblExonContainer.Instance()

    logger              = Logger.Instance()
    containers_logger   = logger.get_logger('containers')

    # ensembl and genewise exons are checked with the "no alignment" variant
    if exon_type in ("ensembl", "genewise"):
        status_ok = check_status_file_no_alignment(ref_protein_id)
    else:
        status_ok = check_status_file(ref_protein_id)

    if not status_ok:
        containers_logger.info ("{0},exon_type:{1},check status file -> failed".format(ref_protein_id, exon_type))
        return False

    if not ref_species_dict:
        ref_species_dict = FileUtilities.get_reference_species_dictionary()

    (known_species, _abinitio_species) = description_parser.get_separated_species(ref_protein_id)

    for species in known_species:

        ref_species = ref_species_dict[species]
        if exon_type == "genewise":
            continue

        if exon_type == "ensembl":
            exons = EnsemblExons ((ref_protein_id, species), ref_species)
        else:
            exons = Exons((ref_protein_id, species), ref_species, exon_type)

        # load exactly once per species (ensembl exons used to be loaded twice)
        try:
            exon_dict = exons.load_exons()
        except Exception:
            # best effort: log the species that failed and carry on with the rest
            containers_logger.error("{0},{1},{2},error loading exons".format(ref_protein_id, species, exon_type))
            continue

        if not exon_dict:
            continue

        if exon_type != "ensembl":
            exons.set_exon_ordinals()

        data_map_key = [ref_protein_id, species]
        exon_container.add(exon_type, data_map_key, exons)
def create_species_msa_alignments ():
    """
    For every protein that passes the status file check, create one MAFFT
    alignment per species (Homo_sapiens excluded). Each alignment contains:
        - the reference protein record (its id is set to "Homo_sapiens")
        - the protein mapped to the species in the data map container
          (its id is set to the species name)
        - the record from <assembled dir>/<species>.fa, if that file exists

    The input is written to <mafft dir>/<species>.fa, aligned into
    <mafft dir>/<species>.afa and the input file is removed afterwards.
    """

    dc = DirectoryCrawler()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()
    acg = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        ref_prot_rec = pc.get(prot_id).get_sequence_record()
        ref_prot_rec.id = "Homo_sapiens"

        assembled_dir = dc.get_assembled_protein_path(prot_id)
        species_list = get_species_list(prot_id, None)

        for species in species_list:

            if species == "Homo_sapiens":
                continue

            data_map = dmc.get((prot_id, species))
            prot_rec = pc.get(data_map.protein_id).get_sequence_record()
            prot_rec.id = species

            # always at least the reference and the mapped protein, so there
            # is no "nothing to align" case to guard against
            protein_recs = [ref_prot_rec, prot_rec]

            if "%s.fa" % species in os.listdir(assembled_dir):
                exoloc_protein_rec = load_fasta_single_record("%s/%s.fa" % (assembled_dir, species), IUPAC.protein)
                protein_recs.append(exoloc_protein_rec)

            mafft_dir = dc.get_mafft_path(prot_id)
            msa_species_path = "%s/%s.fa" % (mafft_dir, species)
            write_seq_records_to_file(msa_species_path, protein_recs)

            cmd = acg.generate_mafft_command(msa_species_path, "%s/%s.afa" % (mafft_dir, species))
            print(cmd)
            os.system(cmd)

            # the unaligned input is only a temporary file
            os.remove(msa_species_path)
def annotate_spurious_alignments(exons_key):
    '''
    Annotates the alignment exons which are not in the correct order:
    the best ordered subset is computed against the reference (ensembl) exons
    and the viabilities of the alignment exons are set accordingly.
    The exon container entry for exons_key is replaced with the result.
    @param exons_key: (reference protein id, species, alignment type)
    @return: updated alignment exons, None if the status file check fails
             or no exons are stored for the alignment
    '''

    (ref_protein_id, species, alignment_type) = exons_key

    print("Annotating spurious alignments %s,%s,%s" % (ref_protein_id, species, alignment_type))

    # nothing to do for a protein with a bad status file
    if not check_status_file(ref_protein_id):
        return None

    exon_container         = ExonContainer.Instance()
    reference_species_dict = FileUtilities.get_reference_species_dictionary()
    containers_logger      = Logger.Instance().get_logger("containers")

    # reference exons are stored under (ref_prot_id, ref_species, ensembl)
    reference_key   = (ref_protein_id, reference_species_dict[species], "ensembl")
    reference_exons = exon_container.get(reference_key)

    # exons produced by the requested alignment may be missing altogether
    alignment_key = (ref_protein_id, species, alignment_type)
    try:
        alignment_exons = exon_container.get(alignment_key)
    except KeyError:
        containers_logger.error ("{0},{1},{2},No exons available for alignment".format(ref_protein_id, species, alignment_type))
        return None

    ordered_subset = _find_best_orderred_subset (alignment_exons, reference_exons)
    annotated_exons = _set_viabilities (alignment_exons, ordered_subset)

    # store the annotated exons back under the original key
    exon_container.update(exons_key, annotated_exons)
    return annotated_exons
def translate_ensembl_exons(protein_list):
    """
    Sanity check of the ensembl exons: for every protein and species, the
    cDNA of the ensembl exons is translated in frames 0, 1 and 2, and the
    longest substring shared with the species protein is kept. If it does not
    equal the whole species protein, both sequences are printed.

    Proteins failing the status file check and (protein, species) pairs
    without ensembl exons in the exon container are skipped.

    @param protein_list: iterable of reference protein ids
    """

    ec = ExonContainer.Instance()
    pc = ProteinContainer.Instance()
    dmc = DataMapContainer.Instance()

    for protein_id in protein_list:

        if not check_status_file(protein_id):
            continue

        species_list = DescriptionParser().get_species(protein_id)
        for species in species_list:

            data_map = dmc.get((protein_id, species))
            species_protein = pc.get(data_map.protein_id)
            species_protein_seq = species_protein.get_sequence_record()

            exon_key = (protein_id, species, "ensembl")
            try:
                exons = ec.get(exon_key)
            except Exception:
                # deliberate best effort: no ensembl exons, nothing to compare
                continue
            (cdna, locations) = exons.get_cDNA()

            # reset for every species, otherwise the result of the previous
            # species leaks in (or the name is unbound) when no frame gives
            # a non-empty common substring
            longest_common_translation = ""
            translation_len = 0
            for frame in range(3):
                translated_protein = cdna[frame:].seq.translate()
                common_translation = LongestCommonSubstring(translated_protein, species_protein_seq.seq)
                if len(common_translation) > translation_len:
                    longest_common_translation = common_translation
                    translation_len = len(common_translation)

            if str(longest_common_translation) != str(species_protein_seq.seq):
                print("not OK")
                print("Original:   " + str(species_protein_seq.seq))
                print("Translated: " + str(longest_common_translation))
def remove_overlapping_alignments (exons_key):
    '''
    Marks overlapping alignment exons as not viable.
    For every reference exon id, the alignment exons are walked in their stored
    order while a "toplevel" range (start, stop on the subject) is tracked.
    Exons overlapping that range get viability False; an exon that strictly
    encloses the range replaces the toplevel exon (which becomes not viable).
    The exon container entry for exons_key is updated in the end.
    @param exons_key: (reference protein id, species, alignment type)
    @return: None in every case (also when the status file check fails or
             no exons are stored under exons_key)
    '''
    (ref_protein_id, 
     species, 
     alignment_type)            = exons_key
    # debugging switch: set to True to trace the decisions on stdout
    printin = False
    if printin: 
        print "Removing blastn overlaps (%s,%s,%s)..." % (ref_protein_id, species, alignment_type)

    # if something is wrong with the protein, return
    if not check_status_file(ref_protein_id):
        return None
    
    exon_container              = ExonContainer.Instance()
    reference_species_dict      = FileUtilities.get_reference_species_dictionary()
    
    # load logging utilities
    logger                      = Logger.Instance()
    containers_logger           = logger.get_logger("containers")
    
    # get the reference exons: (ref_prot_id, ref_species, ensembl)
    # NOTE(review): reference_exons is not used below; the lookup itself can
    # still raise if the species or the key is missing
    reference_exons     = exon_container.get((ref_protein_id, 
                                              reference_species_dict[species], 
                                              "ensembl"))
    # try to get the exons which are the product of specified alignment
    try:
        alignment_exons = exon_container.get(exons_key)
    except KeyError:
        containers_logger.error ("{0},{1},{2}".format(ref_protein_id, species, alignment_type))
        return None
    
    for ref_exon_id in alignment_exons.alignment_exons:
        al_exons = alignment_exons.alignment_exons[ref_exon_id]
        if printin:
            print ref_exon_id
        # the toplevel range is tracked separately for each reference exon;
        # 0 doubles as "not set yet"
        toplevel_start = 0
        toplevel_stop = 0
        # alternative iteration order (by fitness, best first), currently disabled:
        #for al_exon in sorted(al_exons, key = lambda al_exon: al_exon.get_fitness(), reverse = True):
        for al_exon in al_exons:
            
            # stop is computed as start + alignment length
            exon_start = al_exon.alignment_info["sbjct_start"]
            exon_stop = exon_start + al_exon.alignment_info["length"]
            
            # if exon is already marked as not viable, just discard it
            if hasattr(al_exon, "viability"):
                if not al_exon.viability:
                    continue
                     
            
            # NOTE(review): assumes sbjct_start is never 0, otherwise the
            # range would be re-initialized here - confirm
            if not toplevel_start:
                # if toplevel locations haven't been set, set them
                toplevel_start = exon_start
                toplevel_stop = exon_stop
                toplevel_exon = al_exon
                al_exon.set_viability(True)
                if printin:
                    print "First exon: %d - %d" % (exon_start, exon_stop)
                
            elif exon_start < toplevel_start and exon_stop > toplevel_stop:
                # the exon strictly encloses the toplevel range: it becomes
                # the new toplevel exon and the previous one is marked not viable
                toplevel_exon.set_viability(False)
                toplevel_exon = al_exon
                toplevel_start = exon_start
                toplevel_stop = exon_stop
                al_exon.set_viability(True)
                if printin:
                    print "  New toplevel: %d - %d" % (exon_start, exon_stop)
                
            else:
                # the condition below is true in one of the following cases:
                # the exon is contained within the toplevel range
                #          |----------------------|
                #               |------|
                # or the start is to the left of the toplevel, but they are still overlapping
                #      |----------|
                # or the end is to the right of the toplevel, but they are still overlapping
                #                        |--------------|
                if (exon_start >=toplevel_start and exon_stop <= toplevel_stop) or \
                (exon_start <= toplevel_start and (exon_stop >= toplevel_start and exon_stop <= toplevel_stop)) or \
                ((exon_start >= toplevel_start and exon_start <= toplevel_stop) and exon_stop >= toplevel_stop):
                    if printin:
                        print "   Bad exon: %d - %d" % (exon_start, exon_stop)
                    al_exon.set_viability(False)
                else:
                    # no overlap: the tracked range is widened to cover this
                    # exon as well (any gap in between becomes part of the
                    # range); set_viability is not called and toplevel_exon
                    # stays the same
                    if exon_start < toplevel_start:
                        toplevel_start = exon_start
                    if exon_stop > toplevel_stop:
                        toplevel_stop = exon_stop
                    if printin:
                        print "  Good exon: %d - %d" % (exon_start, exon_stop)
                        
    # store the exons with their updated viabilities back into the container
    exon_container.update(exons_key, alignment_exons)