コード例 #1
0
def generate_referenced_species_database(protein_id, referenced_species):
    '''
        Exports the coding ensembl exons of the referenced species to a fasta
        file in the protein's database directory and runs the formatdb command
        on that file.
        @param protein_id: protein id used to look up the exons and the database directory
        @param referenced_species: species whose exons are exported
        @return: True if the formatdb command printed nothing, False if it
                 printed anything (that output is logged as a warning)
    '''
    logger_factory      = Logger.Instance()
    log                 = logger_factory.get_logger('alignment')

    cmd_builder         = CommandGenerator()
    dir_crawler         = DirectoryCrawler()

    exon_key            = (protein_id, referenced_species, "ensembl")
    reference_exons     = ExonContainer.Instance().get(exon_key)

    db_fasta            = "{0}/{1}.fa".format(dir_crawler.get_database_path(protein_id), referenced_species)
    reference_exons.export_coding_exons_to_fasta(db_fasta)

    formatdb_cmd        = cmd_builder.generate_formatdb_command(db_fasta, "Nucleotide")
    process             = Popen(formatdb_cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
    tool_output         = process.stdout.read()

    # stderr is merged into stdout, so an empty read means nothing was reported
    if tool_output == "":
        return True

    log.warning("{0}, {1}, REF SPECIES DB, {2}".format(protein_id, referenced_species.strip(), tool_output.strip()))
    return False
コード例 #2
0
def generate_SW_exon_alignments2 (protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
    Prepares the SW exon alignment run for a protein.
    @param protein_id
    @param species_list: species to process; when empty or None the targets
                         are taken from AlignmentTargetGenerator.get_SW_exon_targets
    @param referenced_species: species whose exon fasta in the database
                               directory is used as the reference
    @return: False if the description file cannot be read (IOError);
             otherwise the visible body falls through and returns None
    '''

    # utilities
    alignment_generator = AlignmentTargetGenerator()
    crawler             = DirectoryCrawler()
    command_generator   = CommandGenerator()

    logger              = Logger.Instance()
    alignment_logger    = logger.get_logger('alignment')

    exon_container      = ExonContainer.Instance()

    # NOTE(review): the locals below are not used by the visible part of the
    # function; they are kept untouched in case the remainder relies on them.
    tmp_fasta_target_path   = "tmp_target.fa"
    ref_exons_fasta = "%s/%s.fa" % (crawler.get_database_path(protein_id), referenced_species)

    failed_species_list = []

    if (not species_list):
        species_list    = alignment_generator.get_SW_exon_targets(protein_id)

    try:
        (proteins_known, proteins_abinitio) = DescriptionParser().parse_descr_file(protein_id)
    except IOError as e:
        # Only two arguments are passed to format(), so the error text is
        # field {1}; the former {2} raised IndexError inside the handler.
        alignment_logger.error("{0}, , SW cDNA_EXONS, {1}".format(protein_id, e))
        return False
コード例 #3
0
def create_protein_alignment(protein_id, species):
    '''
    Generates the alignment of three protein sequences:
    reference species protein, the assembled protein and the ensembl species protein.
    The three records are written to <mafft path>/<species>.fa and the mafft
    command is run to produce <mafft path>/<species>.afa.
    @param protein_id: referent protein id
    @param species: species (latin)
    '''

    dc                  = DirectoryCrawler()
    pc                  = ProteinContainer.Instance()
    dmc                 = DataMapContainer.Instance()
    acg                 = AlignmentCommandGenerator()
    tpc                 = TranslatedProteinContainer.Instance()

    data_map            = dmc.get((protein_id, species))

    # get all the proteins
    ref_protein         = pc.get(protein_id)
    species_protein     = pc.get(data_map.protein_id)
    assembled_protein   = tpc.get(protein_id, species)

    # order in the fasta file: reference, assembled, ensembl species protein
    sequences_for_fasta = [
        ref_protein.get_sequence_record(),
        assembled_protein.get_sequence_record(),
        species_protein.get_sequence_record(),
    ]

    mafft_path      = dc.get_mafft_path(protein_id)
    msa_fasta       = "%s/%s.fa" % (mafft_path, species)
    msa_afa         = "%s/%s.afa" % (mafft_path, species)

    # the context manager closes the file even if writing the records fails
    with open(msa_fasta, "w") as msa_fasta_file:
        SeqIO.write(sequences_for_fasta, msa_fasta_file, "fasta")

    mafft_cmd = acg.generate_mafft_command(msa_fasta, msa_afa)
    os.system(mafft_cmd)
コード例 #4
0
def update_entry_in_status_file (protein_id, status_entry, status_entry_value):
    '''
    Updates the status entry to a new value.
    If there is no .status file yet, it is created by this update.
    If the status file exists, it is read first.
    If the status entry is already present with the same value, nothing is done.
    If it is present with a different value, the whole file is rewritten with
    the updated value. If it is not present, the entry is appended to the file.
    @param protein_id: protein whose status file is updated
    @param status_entry: name of the status entry
    @param status_entry_value: value to store for the entry
    '''

    dc = DirectoryCrawler()
    status_file_path = dc.get_mutual_best_status_file_path(protein_id)
    status_dict = {}

    if os.path.isfile(status_file_path):
        status_dict = read_status_file(protein_id)

    if status_entry not in status_dict:
        # new entry: append it, creating the file when it does not exist
        with open(status_file_path, 'a+') as status_file:
            status_file.write("%s %s\n" % (status_entry, status_entry_value))
        return

    if status_dict[status_entry] == status_entry_value:
        return

    # changed entry: rewrite the whole file with the updated dictionary
    status_dict[status_entry] = status_entry_value
    with open(status_file_path, 'w') as status_file:
        for entry, value in status_dict.items():
            status_file.write("%s %s\n" % (entry, value))
コード例 #5
0
 def load_exons(self):
     '''
     Loads the genewise exons of self.species for self.ref_protein_id from
     <genewise exon path>/<species>.fa into self.exons, keyed by exon number.
     Each fasta description is expected to consist of four whitespace
     separated fields, the first being the exon number and the last being
     "length|start|stop".
     @return: self.exons on success, False if the fasta file does not exist,
              None if it cannot be opened
     '''
     dc = DirectoryCrawler()
     logger = Logger.Instance()
     # NOTE(review): the logger name is spelled 'containters'; confirm it
     # matches the name the logger is configured under.
     container_logger = logger.get_logger('containters')

     exon_file_path = "%s/%s.fa" % (dc.get_exon_genewise_path(self.ref_protein_id), self.species)

     if not os.path.isfile(exon_file_path):
         container_logger.error ("{0},{1},genewise,no fasta file for genewise exons.".format(self.ref_protein_id, self.species))
         return False
     try:
         exon_file = open(exon_file_path, 'r')
     except IOError:
         container_logger.error("%s,%s,%s" % (self.ref_protein_id, self.species, "No genewise exon file."))
         return None

     # SeqIO.parse reads lazily, so the records are consumed before the file
     # is closed; the finally clause releases the handle on errors as well.
     try:
         for seq_record in SeqIO.parse(exon_file, "fasta", unambiguous_dna):
             (num, ir1, ir2, data) = seq_record.description.split()
             num = int(num)
             (length, start, stop) = data.split('|')

             self.exons[num] = GenewiseExon((self.ref_protein_id, self.species), num, start, stop, seq_record.seq)
     finally:
         exon_file.close()

     return self.exons
コード例 #6
0
def fill_all_containers (load_alignments):
    '''
    Generates the directory tree for every protein in the protein list and
    fills the containers. The protein configuration is loaded first; only if
    that load returns a truthy value are the ensembl and genewise exons
    loaded.
    @param load_alignments: when true, the blastn, tblastn, sw_gene and
                            sw_exon exons are loaded as well, followed by
                            frame setting, overlap removal and annotation
                            of spurious alignments
    '''
    directory_crawler = DirectoryCrawler()

    raw_entries = FileUtilities.get_protein_list()
    # every second element of the flattened list is a protein id
    flattened = list(chain.from_iterable(raw_entries))
    protein_list = flattened[0::2]
    alignment_algorithms = ["blastn", "tblastn", "sw_gene", "sw_exon"]

    for protein_id in protein_list:
        directory_crawler.generate_directory_tree(protein_id)

    if not load_protein_configuration_batch(protein_list):
        return

    for exon_source in ("ensembl", "genewise"):
        load_exon_configuration_batch(protein_list, exon_source)

    if not load_alignments:
        return

    for algorithm in alignment_algorithms:
        load_exon_configuration_batch(protein_list, algorithm)

    set_frames_to_coding_exons_batch(protein_list)
    remove_overlapping_alignments_batch(protein_list, ["blastn", "tblastn"])
    annotate_spurious_alignments_batch(protein_list, alignment_algorithms)
コード例 #7
0
def translate_alignment_exons_for_protein(protein_id, exon_number):
    '''
    For every species returned by get_species_list for the assembled protein
    directory, translates the "sw_gene" alignment exons, assembles and stores
    the protein and creates its protein alignment.
    @param protein_id: reference protein id
    @param exon_number: passed through to the UTR chopping helpers
    @return: list of species for which no exons were available; the same list
             is written to the status of the assembled protein directory
    '''
    algorithm = "sw_gene"

    # utilities
    logger_factory      = Logger.Instance()
    crawler             = DirectoryCrawler()
    log                 = logger_factory.get_logger("translator")

    # containers
    ensembl_exons       = EnsemblExonContainer.Instance()
    alignment_exons     = ExonContainer.Instance()
    proteins            = ProteinContainer.Instance()

    species_without_exons = []
    assembled_dir         = crawler.get_assembled_protein_path(protein_id)

    for species in get_species_list(protein_id, assembled_dir):

        assembled_fasta = "%s/%s.fa" % (crawler.get_assembled_protein_path(protein_id), species)
        reference_seq   = proteins.get(protein_id).get_sequence_record().seq

        try:
            exons = alignment_exons.get((protein_id, species, algorithm))
        except KeyError:
            log.error("%s,%s,%s" % (protein_id, species, "No exons available"))
            species_without_exons.append(species)
            continue

        # THIS PART WILL NOT EXIST IN THE NEAR FUTURE
        translations = []
        end_utr_seen = False
        for aligned_exon in exons.get_ordered_exons():

            translation = Exon_translation(ensembl_exons.get(aligned_exon.ref_exon_id), aligned_exon)

            # once an exon with UTR on its end was met, none of the later exons is viable
            if end_utr_seen:
                translation.viability = False

            if translation.viability:
                (translation, end_utr_seen) = chop_off_start_utr(aligned_exon.ref_exon_id, translation, reference_seq, exon_number)
                translation = chop_off_end_utr(aligned_exon.ref_exon_id, translation, reference_seq, exon_number, protein_id)

            translations.append(translation)
        # up to here - this will get trashed

        assemble_and_store_protein(protein_id, species, translations, reference_seq, assembled_fasta)
        create_protein_alignment(protein_id, species)

    write_failed_species_to_status(species_without_exons, assembled_dir)
    return species_without_exons
コード例 #8
0
def create_statistics(protein_list):
    '''
    Creates the statistics file "<root path>/stats.csv" for every protein
    whose status file check succeeds.
    @param protein_list: iterable of (protein_id, exon_num) pairs
    '''
    crawler = DirectoryCrawler()

    for protein_id, _exon_num in protein_list:
        stats_path = "%s/stats.csv" % crawler.get_root_path(protein_id)
        if check_status_file(protein_id):
            create_protein_statistics(protein_id, stats_path)
コード例 #9
0
def create_species_msa_alignments ():
    '''
    For every protein that passes the status file check and every species
    other than Homo_sapiens, writes the reference protein, the ensembl species
    protein and (when <assembled dir>/<species>.fa exists) the assembled
    protein to a temporary fasta file, runs the mafft command on it to
    produce <mafft path>/<species>.afa and removes the temporary fasta file.
    '''
    crawler           = DirectoryCrawler()
    proteins          = ProteinContainer.Instance()
    data_maps         = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = proteins.get(prot_id).get_sequence_record()
        reference_record.id = "Homo_sapiens"

        assembled_dir = crawler.get_assembled_protein_path(prot_id)

        for species in get_species_list(prot_id, None):

            if species == "Homo_sapiens":
                continue

            species_map = data_maps.get((prot_id, species))
            ensembl_record = proteins.get(species_map.protein_id).get_sequence_record()
            ensembl_record.id = species
            records = [reference_record, ensembl_record]

            assembled_name = "%s.fa" % species
            if assembled_name in os.listdir(assembled_dir):
                records.append(load_fasta_single_record("%s/%s.fa" % (assembled_dir, species), IUPAC.protein))

            species_fasta = "%s/%s.fa" % (crawler.get_mafft_path(prot_id), species)

            # nothing to align against when only the reference is present
            if len(records) == 1:
                continue
            write_seq_records_to_file(species_fasta, records)

            species_afa = "%s/%s.afa" % (crawler.get_mafft_path(prot_id), species)
            mafft_cmd = command_generator.generate_mafft_command(species_fasta, species_afa)
            print(mafft_cmd)
            os.system(mafft_cmd)

            # the input fasta is only a temporary file for mafft
            os.remove(species_fasta)
コード例 #10
0
def populate_sequence_protein (protein_id):
    '''
    Populates the "/PROTEIN_ID/sequence/protein/<species>.fa" 
    folder with fasta files containing protein sequence for
    all the species registered by the Reciprocal Best Search.
    The visible part only resolves the protein ids from the description
    file; if that file cannot be read (IOError) the error is logged and the
    function returns None.
    @param protein_id: reference protein id
    '''
    logger                      = Logger.Instance()
    alignment_logger            = logger.get_logger('data_retrieval')

    alignment_command_generator = AlignmentCommandGenerator()
    directory_crawler           = DirectoryCrawler()
    protein_path                = directory_crawler.get_protein_path(protein_id)
    try:
        (proteins_known, proteins_abinitio) = DescriptionParser().get_protein_ids(protein_id)
    except IOError as e:
        # Only two arguments are passed to format(), so the error text is
        # field {1}; the former {2} raised IndexError inside the handler.
        alignment_logger.error("{0}, PROTEIN, , {1}".format(protein_id, e))
        return
コード例 #11
0
def read_status_file (protein_id):
    '''
    Reads the status file of the protein. Every non-blank line holds an entry
    name followed by whitespace and its value; blank lines are skipped.
    @param protein_id: protein whose status file is read
    @return: status_dict dictionary of mapped status entries to their values
    Status entries may be:
        MUTUAL_BEST:    OK/FAILED
        DATA_RETRIEVAL: OK/PARTIAL/FAILED
    @raise IOError: if the status file cannot be opened
    @raise ValueError: if a line holds an entry name without a value
    '''

    dc = DirectoryCrawler()
    status_file_path = dc.get_mutual_best_status_file_path(protein_id)
    try:
        status_file = open(status_file_path, 'r')
    except IOError:
        raise IOError('No .status file for protein %s' % protein_id)

    status_dict = {}
    try:
        for line in status_file:
            # split on the first whitespace run only, so that a value
            # containing spaces is read back in one piece
            fields = line.split(None, 1)
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError('Malformed line in %s: %r' % (status_file_path, line))
            status_dict[fields[0]] = fields[1].strip()
    finally:
        status_file.close()

    return status_dict
コード例 #12
0
def populate_sequence_exon_genewise(protein_id):
    '''
    Populates the "/PROTEIN_ID/sequence/exon/genewise/<species>.fa"
    folder with fasta files listing all the exons of a particular
    transcript, acquired with the genewise program. This is meant for
    proteins found with an ab_initio method, which have no exon list
    on ensembl.
    The visible part only parses the description file; if that file cannot
    be read (IOError) the error is logged and the function returns None.
    @param protein_id: reference protein id
    '''
    data_retrieval_log  = Logger.Instance().get_logger('data_retrieval')
    crawler             = DirectoryCrawler()
    cmd_builder         = CommandGenerator()
    genewise_exon_dir   = crawler.get_exon_genewise_path(protein_id)

    parser = DescriptionParser()
    try:
        known_proteins, abinitio_proteins = parser.parse_descr_file(protein_id)
    except IOError as error:
        data_retrieval_log.error("{0}, {1}, , {2}".format(protein_id, 'GENEWISE', error))
        return
コード例 #13
0
def generate_blastn_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
        Runs the blastn program for a specified protein and list of species
        @param protein_id
        @param species_list: if provided, runs blastn for this list of species, \
                             otherwise runs for species that are missing the blastn output \
                             who are determined by .status file in the blastn folder.
        @param referenced_species: species whose fasta file in the database \
                             directory is used as the blast database
        @return: True if no run printed anything, False otherwise (the failed \
                             species are then stored as failed blastn targets)
    '''
    logger              = Logger.Instance()
    alignment_logger    = logger.get_logger('alignment')

    crawler             = DirectoryCrawler()

    command_generator   = CommandGenerator()
    alignment_generator = AlignmentTargetGenerator()

    failed_species_list = []

    # retrieve the blastn targets
    if (not species_list):
        species_list    = alignment_generator.get_blastn_targets(protein_id)

    # the database does not depend on the target species
    database            = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)

    for species in species_list:
        species_name    = species.strip()

        output_file     = "{0}/{1}.blastout".format(crawler.get_blastn_path(protein_id), species_name)
        input_file      = "{0}/{1}.fa".format(crawler.get_expanded_gene_path(protein_id), species_name)

        command         = command_generator.generate_blastn_command(database, input_file, output_file)
        command_return  = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        # communicate() drains the merged stdout/stderr and waits for the
        # child, so the process is reaped before the next one is started
        output          = command_return.communicate()[0]
        if output:
            # any output is treated as a failure; the output file may not
            # have been created at all, so only remove it when it exists
            if os.path.isfile(output_file):
                os.remove(output_file)
            alignment_logger.warning("{0}, {1}, BLASTN, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)

    if failed_species_list:
        alignment_generator.set_failed_blastn_targets(protein_id, failed_species_list)
        return False
    return True
コード例 #14
0
class AlignmentTargetGenerator(object):
    '''
    Looks up and records alignment targets per alignment type. Every getter
    returns what get_species_list reports for the output directory of that
    alignment type; every setter writes the failed species to the status of
    that same directory.
    '''
    def __init__(self):
        self.crawler = DirectoryCrawler()
        self.description_parser = DescriptionParser()

    # ---- blastn ----
    def get_blastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose blastn directory is inspected
        @return: species list for the blastn directory (the species not yet aligned with blastn)
        '''
        return get_species_list(protein_id, self.crawler.get_blastn_path(protein_id))

    def set_failed_blastn_targets(self, protein_id, failed_species_list):
        '''
        Writes the failed species to the status of the blastn directory.
        '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_blastn_path(protein_id))

    # ---- tblastn ----
    def get_tblastn_targets(self, protein_id):
        '''
        @param protein_id: protein whose tblastn directory is inspected
        @return: species list for the tblastn directory (the species not yet aligned with tblastn)
        '''
        return get_species_list(protein_id, self.crawler.get_tblastn_path(protein_id))

    def set_failed_tblastn_targets(self, protein_id, failed_species_list):
        '''
        Writes the failed species to the status of the tblastn directory.
        '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_tblastn_path(protein_id))

    # ---- SW gene ----
    def get_SW_gene_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW gene directory is inspected
        @return: species list for the SW gene directory (the species not yet aligned with SW_gene)
        '''
        return get_species_list(protein_id, self.crawler.get_SW_gene_path(protein_id))

    def set_failed_SW_gene_targets(self, protein_id, failed_species_list):
        '''
        Writes the failed species to the status of the SW gene directory.
        '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_SW_gene_path(protein_id))

    # ---- SW exon ----
    def get_SW_exon_targets(self, protein_id):
        '''
        @param protein_id: protein whose SW exon directory is inspected
        @return: species list for the SW exon directory (the species not yet aligned with SW_exon)
        '''
        return get_species_list(protein_id, self.crawler.get_SW_exon_path(protein_id))

    def set_failed_SW_exon_targets(self, protein_id, failed_species_list):
        '''
        Writes the failed species to the status of the SW exon directory.
        '''
        write_failed_species_to_status(failed_species_list, self.crawler.get_SW_exon_path(protein_id))
コード例 #15
0
def generate_SW_gene_alignments(protein_id, species_list = None, referenced_species = "Homo_sapiens"):
    '''
        Runs the SW program for a specified protein and list of species, using the expanded gene region.
        @param protein_id
        @param species_list: if provided, runs SW for this list of species, \
                             otherwise runs for species that are missing the SW output \
                             who are determined by .status file in the /SW/gene folder.
        @param referenced_species: species whose fasta file in the database \
                             directory is used as the SW target
        @return: True if no run printed anything, False otherwise (the failed \
                             species are then stored as failed SW gene targets)
    '''
    logger                   = Logger.Instance()
    alignment_logger         = logger.get_logger('alignment')

    alignment_generator      = AlignmentTargetGenerator()
    crawler                  = DirectoryCrawler()
    command_generator        = CommandGenerator()

    if (not species_list):
        species_list         = alignment_generator.get_SW_gene_targets(protein_id)

    # the target database does not depend on the query species
    target_fasta_db_file     = "{0}/{1}.fa".format(crawler.get_database_path(protein_id), referenced_species)

    failed_species_list = []
    for species in species_list:
        species_name         = species.strip()

        output_file          = "{0}/{1}.swout".format(crawler.get_SW_gene_path(protein_id), species_name)
        query_sequence_file  = "{0}/{1}.fa".format(crawler.get_expanded_gene_path(protein_id), species_name)

        command              = command_generator.generate_SW_command(query_sequence_file, target_fasta_db_file, output_file)
        command_return       = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
        # communicate() drains the merged stdout/stderr and waits for the
        # child, so the process is reaped before the next one is started
        output               = command_return.communicate()[0]
        if output:
            alignment_logger.warning("{0}, {1}, SW GENE, {2}".format(protein_id, species_name, output.strip()))
            failed_species_list.append(species_name)

    # The scratch file is absent when no SW run took place (e.g. empty
    # species list), so removing it unconditionally raised OSError.
    if os.path.isfile(".sw_stdout_supressed"):
        os.remove(".sw_stdout_supressed")

    if failed_species_list:
        alignment_generator.set_failed_SW_gene_targets(protein_id, failed_species_list)
        return False
    return True
コード例 #16
0
def create_msa_alignments ():
    '''
    For every protein that passes the status file check, builds two fasta
    files in the mafft directory and runs the mafft command on each:
      msa_exoloc.fa  - reference protein plus every fasta in the assembled
                       protein directory except Homo_sapiens.fa
      msa_ensembl.fa - reference protein plus the ensembl protein of every
                       species other than Homo_sapiens
    The alignments are written to msa_exoloc.afa and msa_ensembl.afa.
    '''
    crawler           = DirectoryCrawler()
    proteins          = ProteinContainer.Instance()
    data_maps         = DataMapContainer.Instance()
    command_generator = AlignmentCommandGenerator()

    fill_all_containers(False)

    for (prot_id, exon_num) in get_protein_list():

        if not check_status_file(prot_id):
            continue

        reference_record = proteins.get(prot_id).get_sequence_record()
        exoloc_records   = [reference_record]
        ensembl_records  = [reference_record]

        assembled_dir = crawler.get_assembled_protein_path(prot_id)
        for fasta_name in sorted(os.listdir(assembled_dir)):
            if fasta_name != "Homo_sapiens.fa":
                fasta_path = "%s/%s" % (assembled_dir, fasta_name)
                exoloc_records.append(load_fasta_single_record(fasta_path, IUPAC.protein))

        for species in get_species_list(prot_id, None):
            if species != "Homo_sapiens":
                species_map = data_maps.get((prot_id, species))
                species_record = proteins.get(species_map.protein_id).get_sequence_record()
                species_record.id = species
                ensembl_records.append(species_record)

        exoloc_fasta  = "%s/msa_exoloc.fa" % crawler.get_mafft_path(prot_id)
        ensembl_fasta = "%s/msa_ensembl.fa" % crawler.get_mafft_path(prot_id)

        write_seq_records_to_file(exoloc_fasta, exoloc_records)
        write_seq_records_to_file(ensembl_fasta, ensembl_records)

        # run mafft on the exoloc set first, then on the ensembl set
        for stem, fasta_path in (("msa_exoloc", exoloc_fasta), ("msa_ensembl", ensembl_fasta)):
            mafft_cmd = command_generator.generate_mafft_command(fasta_path, "%s/%s.afa" % (crawler.get_mafft_path(prot_id), stem))
            print(mafft_cmd)
            os.system(mafft_cmd)
コード例 #17
0
 def __init__(self):
     '''
     Creates the helper objects kept on the instance: a DirectoryCrawler
     as self.crawler and a DescriptionParser as self.description_parser.
     '''
     self.crawler = DirectoryCrawler()
     self.description_parser = DescriptionParser()
コード例 #18
0
def parse_blast_output (ref_protein_id, species, blast):
    '''
    Parses the blast XML output <blast path>/<species>.blastout and creates
    an Exon for every accepted hsp. At most 5 hsps are taken per alignment;
    for blastn, hsps with a query or hit frame of -1 are skipped.
    @param ref_protein_id: reference protein id
    @param species: species whose blast output is parsed
    @param blast: "blastn" selects the blastn directory, any other value the tblastn directory
    @return: Dictionary where key is reference species exon_id, and
    the value is list of corresponding alignments. None if the output file
    does not exist or NCBIXML.read raises ValueError.
    '''

    logger              = Logger.Instance()
    containers_logger   = logger.get_logger('containers')
    dc                  = DirectoryCrawler()

    if blast == "blastn":
        blast_file = "{0}/{1}.blastout".format(dc.get_blastn_path(ref_protein_id), species)
    else:
        blast_file = "{0}/{1}.blastout".format(dc.get_tblastn_path(ref_protein_id), species)

    if not os.path.isfile(blast_file):
        containers_logger.error ("{0}, {1}, {2}, no blastout file".format(ref_protein_id, species, blast))
        return None

    # The handle is only needed while the record is read; the context manager
    # also closes it on the early return below.
    with open(blast_file, 'r') as file_handle:
        try:
            blastn_record = NCBIXML.read(file_handle)
        except ValueError:
            containers_logger.error("%s,%s,%s,No hits found" % (ref_protein_id, species, blast))
            return None

    max_hsps = 5
    exon_dict = {}
    # title fields: start|end|ENS id|ENS id|strand; the second ENS id is
    # used as the reference exon id
    exon_pattern = re.compile(r"(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")

    for alignment in blastn_record.alignments:
        (blast_info, exon_info) = alignment.title.split()
        pattern_match = exon_pattern.match(exon_info)
        exon_start  = int(pattern_match.group(1))
        exon_end    = int(pattern_match.group(2))
        ref_exon_id = pattern_match.group(4)
        exon_length = abs(exon_end - exon_start) + 1

        num_of_hsps = 0

        for hsp in alignment.hsps:
            if blast == "blastn":
                (query_frame, hit_frame) = hsp.frame
                if query_frame == -1 or hit_frame == -1:
                    continue
            # limit the number of hsps taken per alignment
            if num_of_hsps == max_hsps:
                break
            num_of_hsps += 1

            # hsp.gaps is either a plain number or a tuple whose first element
            # is the number of gaps (None when not reported). Computing it
            # afresh for every hsp avoids an unbound or stale value.
            if isinstance(hsp.gaps, tuple):
                gaps = hsp.gaps[0] or 0
            elif isinstance(hsp.gaps, int):
                gaps = hsp.gaps
            else:
                gaps = 0

            aligned_length = len(hsp.sbjct)

            exon = Exon(blast, ref_exon_id, ref_protein_id, species)
            exon.set_alignment_info ( hsp.identities,
                                      hsp.positives,
                                      gaps,
                                      hsp.sbjct_start,
                                      hsp.sbjct_start + aligned_length - 1,
                                      hsp.query_start,
                                      hsp.query_start + aligned_length - 1,
                                      aligned_length,
                                      hsp.sbjct,
                                      hsp.query,
                                      hsp.score)
            exon_dict.setdefault(ref_exon_id, []).append(exon)

            # means we covered the whole exon
            if aligned_length == exon_length and aligned_length == hsp.identities:
                break

    return exon_dict
コード例 #19
0
 def get_exon_file_path (self):
     '''
     Builds the path of the fasta file with the ensembl exons:
     "<ensembl exon path of self.ref_protein_id>/<self.species>.fa".
     The path is only composed here; the file is not opened or checked.
     @return: the file path as a string
     '''
     dc = DirectoryCrawler()
     return "{0}/{1}.fa".format(dc.get_exon_ensembl_path(self.ref_protein_id), self.species)
コード例 #20
0
        else:
            break
        i += 1
    
    pattern = re.compile("lcl\|(.*)\spep::*")
    for title in best_alignments:
        prot_match = re.match(pattern, title)
        if prot_match.groups()[0] == protein_id:
            return title
    return best_alignments[0]
    
    
    
if __name__ == '__main__':
    # Script entry point: runs the ortholog search for one hard-coded protein
    # against every species of the default species list.
    protein_id = "ENSP00000311134"
    acg = AlignmentCommandGenerator()
    dc = DirectoryCrawler()

    dc.generate_directory_tree(protein_id)
    descr_file_path = dc.get_protein_description_file_path(protein_id)
    # Opening with 'w' creates the description file or empties an existing one.
    # Nothing is written through this handle in this block.
    # NOTE(review): presumably find_ortholog_by_RBH fills the file - confirm.
    descr_file = open(descr_file_path, 'w')

    output_file_path = dc.get_protein_path(protein_id) + "/" + "Homo_sapiens.fasta"

    # run the fastacmd command built for the Homo_sapiens protein
    fastacmd = acg.generate_fastacmd_protein_command(protein_id, "Homo_sapiens", "all", output_file_path)
    os.system(fastacmd)

    for species in get_default_species_list():
        find_ortholog_by_RBH("Homo_sapiens", species, output_file_path, protein_id)

    descr_file.close()
コード例 #21
0
def reset_action (protein_id, key):
    '''
    Sets the status entry `key` of the protein to 'FAILED' and clears the
    directories that belong to that step. For a key without a known
    directory only the status entry is updated.
    @param protein_id: protein whose step is reset
    @param key: name of the status entry / step
    '''
    update_entry_in_status_file(protein_id, key, 'FAILED')
    crawler = DirectoryCrawler()

    # step name -> names of the crawler getters whose directories are cleared, in order
    path_getters_by_key = {
        'GENE_RETRIEVAL'          : ('get_gene_path',),
        'EXP_GENE_RETRIEVAL'      : ('get_expanded_gene_path',),
        'PROTEIN_RETRIEVAL'       : ('get_protein_path',),
        'ENSEMBL_EXON_RETRIEVAL'  : ('get_exon_ensembl_path',),
        'GENEWISE_EXON_RETRIEVAL' : ('get_exon_genewise_path', 'get_genewise_path'),
        'REF_SP_DB_FORMATTING'    : ('get_database_path',),
        'BLASTN_ALIGNMENT'        : ('get_blastn_path',),
        'TBLASTN_ALIGNMENT'       : ('get_tblastn_path',),
        'SW_GENE_ALIGNMENT'       : ('get_SW_gene_path',),
        'SW_EXON_ALIGNMENT'       : ('get_SW_exon_path',),
    }

    for getter_name in path_getters_by_key.get(key, ()):
        clear_directory(getattr(crawler, getter_name)(protein_id))
コード例 #22
0
class DescriptionParser:
    """
    Reads the description file of a protein ("<root path>/<protein_id>.descr") and
    offers several views on its content (protein ids, gene locations, species, strands).
    The root path is obtained from DirectoryCrawler.get_root_path.
    """

    def __init__(self):
        self.crawler = DirectoryCrawler()

    def _open_descr_file(self, protein_id):
        """
        Opens the description file of the protein for reading.
        @param protein_id: protein whose description file should be opened
        @return: open file object (the caller is responsible for closing it)
        @raise IOError: in case there is no description file present for the protein_id
        """
        descr_file_path = "{0}/{1}.descr".format(self.crawler.get_root_path(protein_id), protein_id)
        try:
            return open(descr_file_path, "r")
        except IOError:
            raise IOError("There is no description file present for protein: %s" % protein_id)

    def get_gene_regions(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves the information about protein locations for every species.
        Locations are stored as lists [location_type, assembly, location_id, seq_begin, seq_end, strand]
        @param protein_id: protein_id for which gene locations of other species should be retrieved
        @return: (genes_known, genes_abinitio) - two dictionaries (key is species name, value is gene location data as described)
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)

        genes_known = {}
        genes_abinitio = {}

        # known entries carry (protein id, gene id, transcript id) before the location,
        # abinitio entries only the protein id
        for key, value in proteins_known.items():
            genes_known[key] = list(value)[3:]
        for key, value in proteins_abinitio.items():
            genes_abinitio[key] = list(value)[1:]

        return genes_known, genes_abinitio

    def get_protein_ids(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves only the protein ids for every species
        @param protein_id: protein_id for which protein ids of other species should be retrieved
        @return: (prot_ids_known, prot_ids_abinitio) - two dictionaries (key is species name, value is orthologous protein id)
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)

        prot_ids_known = {}
        prot_ids_abinitio = {}

        for key, value in proteins_known.items():
            prot_ids_known[key] = value[0]
        for key, value in proteins_abinitio.items():
            prot_ids_abinitio[key] = value[0]

        return prot_ids_known, prot_ids_abinitio

    def get_species(self, protein_id):
        """
        Parses the description file for the protein_id, and retrieves the list of all species that have
        an entry in it (known and abinitio together).
        @param protein_id: protein_id for which the species should be retrieved
        @return: species_list - sorted list of whitespace-stripped species names
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)

        # list() makes this work both where keys() returns a list and where it returns a view
        species_list = list(proteins_known.keys())
        species_list.extend(proteins_abinitio.keys())

        # the previous loop rebound its loop variable and never stripped anything
        return sorted(species.strip() for species in species_list)

    def get_separated_species(self, protein_id):
        """
        Same species as get_species, but kept apart by entry type.
        @param protein_id: protein_id for which the species should be retrieved
        @return: (species_known, species_abinitio) - two lists of species names
        """
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        species_known = list(proteins_known.keys())
        species_abinitio = list(proteins_abinitio.keys())
        return (species_known, species_abinitio)

    def get_strand_information(self, protein_id):
        """
        Parses the description file for the protein_id and retrieves the strand recorded for every species.
        If a species has both a known and an abinitio entry, the abinitio strand wins.
        @param protein_id: protein_id for which the strands should be retrieved
        @return: strands: dictionary (species:strand), strand converted to int
        """
        strands = {}
        (proteins_known, proteins_abinitio) = self.parse_descr_file(protein_id)
        # strand is the last element of both tuple layouts produced by parse_descr_file
        for species, data in proteins_known.items():
            strands[species] = int(data[-1])
        for species, data in proteins_abinitio.items():
            strands[species] = int(data[-1])
        return strands

    def parse_descr_file(self, protein_id):
        """
        Function for parsing the description file associated with the protein_id.
        Description file contains two different types of entries: for known protein and for abinitio.
        Lines are split on whitespace; the number of fields decides the entry type, other lines are ignored.
        known_format (5 fields):     species protein_id gene_id transcript_id location_type:assembly:location_id:seq_begin:seq_end:strand
        abinitio_format (3 fields):  species protein_id location_type:assembly:location_id:seq_begin:seq_end:strand
        @param protein_id: protein for which the description file will be parsed.
        @return: (proteins_known_data, proteins_abinitio_data) - two dictionaries (key is species), values are the tuples
                 known:    (spec_protein_id, gene_id, transcript_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
                 abinitio: (spec_protein_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
                 All values are strings.
        @raise IOError: in case there is no description file present for the protein_id
        @raise ValueError: in case the location field does not consist of exactly six colon separated parts
        """

        proteins_known_data = {}
        proteins_abinitio_data = {}

        # the with block closes the file even when a malformed line raises
        with self._open_descr_file(protein_id) as descr_file:
            for line in descr_file:
                species_data = line.strip().split()

                if len(species_data) == 5:
                    species_name = species_data[0]
                    spec_protein_id, gene_id, transcript_id = species_data[1:4]
                    (location_type, assembly, location_id,
                     seq_begin, seq_end, strand) = species_data[-1].split(":")
                    proteins_known_data[species_name] = (
                        spec_protein_id,
                        gene_id,
                        transcript_id,
                        location_type,
                        assembly,
                        location_id,
                        seq_begin,
                        seq_end,
                        strand,
                    )

                elif len(species_data) == 3:
                    species_name = species_data[0]
                    (location_type, assembly, location_id,
                     seq_begin, seq_end, strand) = species_data[-1].split(":")
                    spec_protein_id = species_data[1]
                    proteins_abinitio_data[species_name] = (
                        spec_protein_id,
                        location_type,
                        assembly,
                        location_id,
                        seq_begin,
                        seq_end,
                        strand,
                    )

        return proteins_known_data, proteins_abinitio_data

    def parse_description_file_general_info(self, protein_id):
        """
        Similar to parse_descr_file, but the entries are recognised with regular expressions (tab delimited
        lines, protein id starting with ENS for known and GEN for abinitio entries) and returned as lists
        which also contain the species name. Required for generating a model structure.
        @param protein_id: protein for which the description file will be parsed.
        @return: (proteins_known_data, proteins_abinitio_data) - two lists of tuples
                 known: (species_name, spec_protein_id, gene_id, transcript_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
                 abinitio: (species_name, spec_protein_id, location_type, assembly, location_id, seq_begin, seq_end, strand)
        @raise IOError: in case there is no description file present for the protein_id
        """
        proteins_known_data = []
        proteins_abinitio_data = []

        pattern_known = re.compile("(.*)\t(ENS.*)\t(.*)\t(.*)\t(.*):(.*):(.*):(.*):(.*):(.*)")
        pattern_abinitio = re.compile("(.*)\t(GEN.*)\t(.*):(.*):(.*):(.*):(.*):(.*)")

        with self._open_descr_file(protein_id) as descr_file:
            for line in descr_file:
                line = line.strip()

                # groups() already yields the tuple in the documented order
                match = pattern_known.match(line)
                if match:
                    proteins_known_data.append(match.groups())

                match = pattern_abinitio.match(line)
                if match:
                    proteins_abinitio_data.append(match.groups())

        return proteins_known_data, proteins_abinitio_data
コード例 #23
0
def main ():
    #ERROR FILE:::
    err_f = open('/home/marioot/err_status_monday.txt', 'w')

    fill_all_containers(True)
    
    protein_tuples = get_protein_list()
    ec = ExonContainer.Instance()
    beac = BestExonAlignmentContainer.Instance()
    dc = DirectoryCrawler()
    
    for (protein_id, exon_num) in protein_tuples:
        
        if int(exon_num) > 15:
            print "too big"
            continue
        
        species_list = get_species_list(protein_id, None)
        try:
            ref_exons = ec.get((protein_id, "Homo_sapiens", "ensembl"))
        except KeyError:
            print "ERROR: No protein %s" % protein_id
            continue
        
        for species in species_list:
            try:
                print "\nBest_exon_al: %s, %s" % (protein_id, species)
                err_f.write("%s, %s" % (protein_id, species))
                
                bpp = BestProteinProduct (protein_id, species, "Homo_sapiens")
                bpp.load_alignments()
                bpp.decide_on_best_exons()
                #bpp.patch_interexon_AAS()
                
                for ref_exon in ref_exons.get_coding_exons():
                    
                    best_exon_alignment = bpp.best_exons[ref_exon.exon_id]
                    if best_exon_alignment:
                        beac.add(ref_exon.exon_id, species, best_exon_alignment)
                        print "%d. Exon status: %s (%s)" % (ref_exon.ordinal, best_exon_alignment.status, ref_exon.exon_id)
                        if best_exon_alignment.sw_gene_alignment:
                            print ref_exon.sequence[ref_exon.frame:].translate()
                            best_exon_alignment.sw_gene_alignment.create_cDNA()
                            print "\tAdded  %2d alignment pieces" % (len(best_exon_alignment.sw_gene_alignment.alignment_pieces))
                            for al_piece in best_exon_alignment.sw_gene_alignment.alignment_pieces:
                                print "\t\t%s:" % (al_piece.type),
                                if al_piece.type in ["coding", "insertion"]:
                                    print "PROT: %d-%d, GENOME: %d-%d, %s" % (al_piece.ref_protein_start,
                                                                            al_piece.ref_protein_stop,
                                                                            al_piece.genomic_start, 
                                                                            al_piece.genomic_stop, 
                                                                            al_piece.sequence_id)
                                    print "\t\t\tHUMAN:", al_piece.ref_protein_seq
                                    print "\t\t\tSPEC :", al_piece.spec_protein_seq
                                else:
                                    print
                                    
                whole_prot =  bpp.get_spec_protein_translation()
                whole_prot_rec = SeqRecord(whole_prot, id = species, description = "assembled_protein")
                file_name = "%s/%s.fa" % (dc.get_assembled_protein_path(protein_id), species)
                SeqIO.write(whole_prot_rec, file_name, "fasta")
                
                print beac.get("ENSE00002199725", species)
            except Exception, e:
                print '{0} {1} \n'.format(protein_id, species)
                err_f.write('{0} {1} \n'.format(protein_id, species))
コード例 #24
0
 def __init__(self):
     """Create the DirectoryCrawler instance that is kept as self.crawler."""
     self.crawler = DirectoryCrawler()
コード例 #25
0
def main():
    '''
    Retrieves the list of all the proteins from reference species.
    For each ref species protein, it tries to find orthologues for all the species (from the species list)
    and generates the description file accordingly.
    If a non-empty mutual best status file already exists for a protein, the protein is not
    processed again: its MUTUAL_BEST status is only logged (anything other than OK counts as failed).
    The ids of all failed proteins are printed at the end.
    '''
    
    reference_species = "Homo_sapiens"
    
    dc = DirectoryCrawler()
    acg = AlignmentCommandGenerator()
    
    logger = Logger.Instance()
    mutual_best_logger = logger.get_logger('mutual_best')
    
    protein_list = get_protein_list()
    species_list = get_default_species_list()
    failed_proteins = []
    
    for (protein_id, num_of_exons) in protein_list:
        
        print(protein_id)
        
        # generate all the directories for the protein
        dc.generate_directory_tree(protein_id)
        
        descr_file_path = dc.get_protein_description_file_path(protein_id)
        status_file_path = dc.get_mutual_best_status_file_path(protein_id)
        
        if os.path.isfile(status_file_path) and os.path.getsize(status_file_path):
            print(DescriptionParser().get_protein_ids(protein_id))
            
            status_dict = read_status_file(protein_id)
            if 'MUTUAL_BEST' in status_dict:
                if status_dict['MUTUAL_BEST'] == 'OK':
                    mutual_best_logger.info('-,%s,mutual_best already exists for this protein - moving to the next one' % protein_id)
                else:
                    mutual_best_logger.error('-,%s,mutual_best has failed for this protein (no orthologs found) - moving on the next one' % protein_id)
                    failed_proteins.append(protein_id)
            # a protein with a status file is never recomputed, whatever the file says
            continue
        
        # create the description file; the with block closes it even if the search raises
        with open(descr_file_path, 'w') as descr_file:
            # reference protein file
            ref_species_pep = dc.get_protein_path(protein_id) + "/" + reference_species + ".fasta"
            fastacmd = acg.generate_fastacmd_protein_command(protein_id, reference_species, "all", ref_species_pep)
            
            # stderr is merged into stdout, so any output at all is treated as an error
            p = Popen(fastacmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=STDOUT, close_fds=True)
            output = p.stdout.read()
            if output:
                mutual_best_logger.error("%s,fastacmd error" % protein_id)
            
            # find orthologues for all species
            for species in species_list:
                find_ortholog_by_RBH(reference_species, species, ref_species_pep, protein_id, descr_file, mutual_best_logger)
        
        mutual_best_logger.info("\n\n")
        
        # check what we've found out, whether this protein has any orthologs
        (known_dict, abinitio_dict) = DescriptionParser().get_protein_ids(protein_id)
        # an entry for the reference species alone does not count as an ortholog
        only_reference_found = list(known_dict.keys()) == [reference_species]
        if not abinitio_dict and (not known_dict or only_reference_found):
            mutual_best_logger.info ("-,%s, mutual best failed for this protein." % protein_id)
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "FAILED")
            failed_proteins.append(protein_id)
        else:
            update_entry_in_status_file(protein_id, "MUTUAL_BEST", "OK")
    
    print("Failed proteins: ")
    for failed_protein_id in failed_proteins:
        print(failed_protein_id)
コード例 #26
0
def _set_sw_alignment_info (exon, stats, sbjct_sequence, query_sequence):
    '''
    Converts the raw values collected for one alignment to numbers and stores
    them, together with the two sequences, in the exon via set_alignment_info.
    
    @param exon: object whose set_alignment_info method is called
    @param stats: dictionary with the keys identities, positives, gaps, sbjct_start,
                  sbjct_end, query_start, query_end, length (converted with int)
                  and score (converted with float)
    '''
    exon.set_alignment_info(int(stats["identities"]), 
                            int(stats["positives"]), 
                            int(stats["gaps"]), 
                            int(stats["sbjct_start"]), 
                            int(stats["sbjct_end"]), 
                            int(stats["query_start"]), 
                            int(stats["query_end"]), 
                            int(stats["length"]), 
                            sbjct_sequence,
                            query_sequence,
                            float(stats["score"]))


def parse_SW_output (ref_protein_id, species, sw_type):
    '''
    Parses the output from the SW# command line application.
    (suitable for version as it was distributed on May 1st, 2012)
    
    @param ref_protein_id: reference protein whose swout file is parsed
    @param species: species name, the parsed file is <swout directory>/<species>.swout
    @param sw_type: sw_exon/sw_gene (case insensitive)
    @return: dictionary of alignment exons. The keys are referent exon IDs, and 
    values are lists of all the alignment exons which correspond to the certain
    reference exon. False is returned if the swout file does not exist.
    @raise KeyError: if sw_type is neither sw_gene nor sw_exon
    '''
    
    logger              = Logger.Instance()
    containers_logger   = logger.get_logger('containers')
    dc                  = DirectoryCrawler()
    
    # determine the swout file path
    if sw_type.lower() == "sw_gene":
        swout_file_path = dc.get_SW_gene_path(ref_protein_id)
    elif sw_type.lower() == "sw_exon":
        swout_file_path = dc.get_SW_exon_path(ref_protein_id)
    else:
        raise KeyError ("There is no known swout path for type %s" % sw_type)
    swout_file_path += "/%s.swout" % species
    
    if not os.path.isfile(swout_file_path):
        containers_logger.error ("{0}, {1}, {2}, no swout file".format(ref_protein_id, species, sw_type))
        return False
    
    # patterns for matching (raw strings, so the backslashes reach the regex engine untouched)
    header_pattern      = re.compile (r"Name: >(\d+)\|(\d+)\|(ENS\w+)\|(ENS\w+)\|([-]*1)")
    #Intervals: 1207047 1207087 30 69 (+) strand 
    intervals_pattern   = re.compile (r"Intervals:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\([+-]\)\s+strand")
    #Identity: 31/41 (75.6%)
    identity_pattern    = re.compile (r"Identity:\s+(\d+)/(\d+).*")
    #Similarity: 40/41 (97.6%)
    similarity_pattern  = re.compile (r"Similarity:\s+(\d+)/.*")
    #Gaps: 1/41 (2.4%)
    gaps_pattern        = re.compile (r"Gaps:\s+(\d+)/\d+.*")
    #Score: 2828.000
    score_pattern       = re.compile (r"Score:.*")
    # sequence pattern
    sequence_pattern    = re.compile (r"\s*(\d+)\s+([ATCGN-]+)\s+(\d+).*")
    
    exon_dict = {}
    ref_exon_id = ""
    # values of the alignment currently being read; a field that is missing in an
    # alignment keeps the value of the previous one, because nothing is reset per header
    stats = {"identities"  : 0,
             "positives"   : 0,
             "gaps"        : 0,
             "score"       : 0.,
             "sbjct_start" : 0,
             "sbjct_end"   : 0,
             "query_start" : 0,
             "query_end"   : 0,
             "length"      : 0}
    query_sequence  = ""
    sbjct_sequence  = ""
    # sequence lines alternate: query first, then subject
    parsing_query_seq = True
    exon = Exon(sw_type, "", ref_protein_id, species)
    
    # the with block closes the file even if a value cannot be converted
    with open(swout_file_path, 'r') as swout_file:
        for line in swout_file:
            
            line = line.strip()
            header_match = header_pattern.match(line)
            if header_match:
                #add the current exon and start a new one
                if ref_exon_id:
                    _set_sw_alignment_info(exon, stats, sbjct_sequence, query_sequence)
                    exon_dict.setdefault(ref_exon_id, []).append(exon)
                
                # the fourth header field is used as the reference exon id
                ref_exon_id = header_match.groups()[3]
                exon = Exon(sw_type, ref_exon_id, ref_protein_id, species)
                parsing_query_seq = True
                query_sequence = ""
                sbjct_sequence = ""
            
            # intervals    
            intervals_match = intervals_pattern.match(line)
            if intervals_match:
                (stats["query_start"], stats["query_end"],
                 stats["sbjct_start"], stats["sbjct_end"]) = intervals_match.groups()
                
            # identities
            identity_match = identity_pattern.match(line)
            if identity_match:
                (stats["identities"], stats["length"]) = identity_match.groups()
                
            # similarities
            similarity_match = similarity_pattern.match(line)
            if similarity_match:
                stats["positives"] = similarity_match.groups()[0]
                
            # gaps
            gaps_match = gaps_pattern.match(line)
            if gaps_match:
                stats["gaps"] = gaps_match.groups()[0]
                
            # score is the last whitespace separated token of the line
            if score_pattern.match(line):
                stats["score"] = line.split()[-1]
                
            # sequence
            sequence_match = sequence_pattern.match(line)
            if sequence_match:
                sequence_to_append = sequence_match.groups()[1].strip()
                if parsing_query_seq:
                    query_sequence += sequence_to_append
                else:
                    sbjct_sequence += sequence_to_append
                parsing_query_seq = not parsing_query_seq
    
    # the last alignment has no following header, so it is stored here;
    # it is only kept if a query sequence was actually read for it
    _set_sw_alignment_info(exon, stats, sbjct_sequence, query_sequence)
    if query_sequence:
        exon_dict.setdefault(ref_exon_id, []).append(exon)
         
    return exon_dict