def filter_gene_names_present_in_database(gene_names_of_interest):
    """Return the gene names that are not yet stored in the database.

    Args:
        gene_names_of_interest: Iterable of gene names. The names are
            handled as a set, so duplicates are collapsed.

    Returns:
        list: The names for which no ``Gene`` row with that ``gene_name``
        was returned by the database. Ordering is not preserved.

    Raises:
        Exception: Any error raised while filtering is logged with its
        traceback and re-raised unchanged.
    """
    _session = db.create_scoped_session()

    _log.info(
        "Filtering gene names that are already present in the database ...")
    try:
        # Work on a private set: deduplicates the input and never mutates
        # the collection handed in by the caller.
        candidate_names = set(gene_names_of_interest)
        n_gene_names = len(candidate_names)

        present_names = set()
        if candidate_names:
            # Only hit the database when there is something to look up;
            # an empty IN () clause is a wasted round trip.
            present_rows = _session.query(distinct(Gene.gene_name)).filter(
                Gene.gene_name.in_(candidate_names)).all()
            # Each result row carries the gene name at index 0.
            present_names = {row[0] for row in present_rows}

        # Set difference instead of set.remove(): a name returned by the
        # database that does not exactly equal one of the candidates (for
        # instance under a case-insensitive collation) must not raise
        # KeyError and abort the whole filtering step.
        remaining_names = candidate_names - present_names
        n_filtered_gene_names = n_gene_names - len(remaining_names)

        _log.info("Filtered '" + str(n_filtered_gene_names) + "' out of '" +
                  str(n_gene_names) +
                  "' gene names that are already present in the database ...")
        return list(remaining_names)
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def add_gene_mapping_to_database(gene_mapping):
    """Stage the genes, proteins and mappings of ``gene_mapping`` and commit.

    Args:
        gene_mapping: Dict with the keys ``"genes"``, ``"proteins"`` and
            ``"mappings"``, each keyed by transcription id. A transcript
            without an entry in ``"proteins"`` is stored as a gene only;
            a transcript with a protein must also have an entry in
            ``"mappings"``.

    Side effects:
        When no stored ``Protein`` with the same ``uniprot_ac`` exists, the
        protein is popped from ``gene_mapping["proteins"]`` and added to
        the session. When one exists, the stored protein is reused and the
        entry in ``gene_mapping["proteins"]`` is left in place.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    _session = db.create_scoped_session()
    try:
        for transcription_id, gene_translation in gene_mapping["genes"].items():
            with _session.no_autoflush:
                candidate_proteins = gene_mapping["proteins"]

                if transcription_id not in candidate_proteins:
                    # No protein for this transcript: store the gene on its own
                    _session.add(gene_translation)
                    continue

                # Look for a stored protein with the same uniprot accession
                matching_protein = _session.query(Protein).filter_by(
                    uniprot_ac=candidate_proteins[transcription_id].uniprot_ac
                ).first()

                protein_already_present = matching_protein is not None
                if not protein_already_present:
                    # Protein is not in the database yet: take the new object
                    # out of the to-be-added proteins and use it from here on
                    matching_protein = candidate_proteins.pop(transcription_id)

                transcript_mappings = gene_mapping["mappings"][transcription_id]

                # Attach every mapping to both the gene translation and the protein
                for mapping in transcript_mappings:
                    gene_translation.mappings.append(mapping)
                    matching_protein.mappings.append(mapping)

                # Link the protein to the gene transcript
                matching_protein.genes.append(gene_translation)

                # Stage the gene translation
                _session.add(gene_translation)

                # Stage the protein only when it is new
                if not protein_already_present:
                    _session.add(matching_protein)

                # Stage the mapping objects themselves
                _session.add_all(transcript_mappings)

        # Commit the changes of this mapping
        # NOTE(review): placed once after the loop; confirm against the
        # intended transaction granularity (per mapping vs. per transcript).
        _session.commit()
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_all_transcript_ids_with_mappings():
    """Retrieve the transcription ids of all genes that reference a protein.

    Returns:
        list: The result rows of selecting ``Gene.gencode_transcription_id``
        for genes whose ``protein_id`` is not NULL. Entries are query rows
        rather than bare values (presumably one-element tuples; confirm
        with callers).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        transcript_query = _session.query(Gene.gencode_transcription_id)
        # isnot(None) renders the same IS NOT NULL test as "!= None"
        linked_to_protein = transcript_query.filter(Gene.protein_id.isnot(None))
        return list(linked_to_protein.all())
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def get_mappings_for_gene(_gene):
    """Retrieve every ``Mapping`` whose ``gene_id`` equals ``_gene.id``.

    Args:
        _gene: Object exposing an ``id`` attribute that identifies the gene.

    Returns:
        list: The matching ``Mapping`` objects (empty when there are none).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        mappings_of_gene = _session.query(Mapping).filter(
            Mapping.gene_id == _gene.id)
        return list(mappings_of_gene.all())
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def get_domains_for_protein(protein_id):
    """Retrieve every ``Interpro`` entry stored for ``protein_id``.

    Args:
        protein_id: Value compared against ``Interpro.protein_id``.

    Returns:
        list: The matching ``Interpro`` objects (empty when there are none).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        domains_of_protein = _session.query(Interpro).filter(
            Interpro.protein_id == protein_id)
        return list(domains_of_protein.all())
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_all_transcript_ids(gene_name):
    """Retrieve all transcripts of a gene, matched case-insensitively.

    Args:
        gene_name: Gene name to look up; it is lower-cased and compared with
            the lower-cased ``Gene.gene_name``.

    Returns:
        list: The matching ``Gene`` objects (full rows, not only their
        transcript ids).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        same_name_ignoring_case = func.lower(Gene.gene_name) == gene_name.lower()
        transcripts = _session.query(Gene).filter(same_name_ignoring_case)
        return list(transcripts.all())
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_all_gene_names_from_db():
    """Retrieve the distinct gene names present in the database.

    Returns:
        list: The result rows of selecting ``Gene.gene_name`` with a
        DISTINCT on that column. Entries are query rows rather than bare
        strings (presumably one-element tuples; confirm with callers).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        name_query = _session.query(Gene.gene_name)
        distinct_names = name_query.distinct(Gene.gene_name)
        return list(distinct_names.all())
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_transcript_id_for_multiple_protein_ids(_protein_ids):
    """Retrieve the gencode transcripts of the genes linked to several proteins.

    Args:
        _protein_ids: Collection of protein ids matched against
            ``Gene.protein_id``.

    Returns:
        dict: ``{gene.id: gene.gencode_transcription_id}`` for every
        matching ``Gene``.

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        genes_of_proteins = _session.query(Gene).filter(
            Gene.protein_id.in_(_protein_ids)).all()
        return {
            gene.id: gene.gencode_transcription_id
            for gene in genes_of_proteins
        }
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_protein_id_for_multiple_protein_acs(_protein_acs):
    """Retrieve the database ids of the proteins with the given accessions.

    Args:
        _protein_acs: Collection of uniprot accession codes matched against
            ``Protein.uniprot_ac``.

    Returns:
        dict: ``{protein.uniprot_ac: protein.id}`` for every matching
        ``Protein``.

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        matching_proteins = _session.query(Protein).filter(
            Protein.uniprot_ac.in_(_protein_acs)).all()
        return {protein.uniprot_ac: protein.id for protein in matching_proteins}
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def get_mappings_for_multiple_protein_ids(_protein_ids):
    """Retrieve the mappings of several proteins, grouped per protein.

    Args:
        _protein_ids: Collection of protein ids matched against
            ``Mapping.protein_id``.

    Returns:
        dict: ``{protein_id: [Mapping, ...]}``. Proteins without any mapping
        do not appear as keys.

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        grouped_mappings = {}
        found_mappings = _session.query(Mapping).filter(
            Mapping.protein_id.in_(_protein_ids)).all()
        for found in found_mappings:
            # setdefault creates the bucket the first time a protein is seen
            grouped_mappings.setdefault(found.protein_id, []).append(found)
        return grouped_mappings
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_protein(protein_id):
    """Retrieve the single ``Protein`` whose ``id`` equals ``protein_id``.

    Args:
        protein_id: Value compared against ``Protein.id``.

    Returns:
        The matching ``Protein``, or ``None`` when zero or more than one
        row matched (both cases are logged as errors, not raised).

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        Exception: Any other error is logged with its traceback and
            re-raised.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        protein_query = _session.query(Protein).filter(Protein.id == protein_id)
        return protein_query.one()
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except MultipleResultsFound as e:
        # Ambiguous id: report it and signal "no usable result" to the caller
        message = ("ProteinRepository.retrieve_protein(protein_id): Multiple results found while expecting uniqueness for protein_id '"
                   + str(protein_id) + "'. " + str(e))
        _log.error(message)
        return None
    except NoResultFound as e:
        # Unknown id: report it and signal "no usable result" to the caller
        message = ("ProteinRepository.retrieve_protein(protein_id): Expected results but found none for protein_id '"
                   + str(protein_id) + "'. " + str(e))
        _log.error(message)
        return None
    except:
        _log.error(traceback.format_exc())
        raise
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()
def retrieve_gene(transcription_id):
    """Retrieve the single ``Gene`` with the given gencode transcription id.

    Args:
        transcription_id: Value compared against
            ``Gene.gencode_transcription_id``.

    Returns:
        The matching ``Gene`` object.

    Raises:
        RecoverableError: Raised in place of AlchemyResourceClosedError,
            AlchemyOperationalError or PsycopOperationalError.
        RepositoryException: When zero or more than one row matched, or when
            any other ``Exception`` occurred; the message is logged first.
    """
    # Open a session
    _session = db.create_scoped_session()
    try:
        gene_query = _session.query(Gene).filter(
            Gene.gencode_transcription_id == transcription_id)
        return gene_query.one()
    except (AlchemyResourceClosedError, AlchemyOperationalError,
            PsycopOperationalError) as e:
        raise RecoverableError(str(e))
    except MultipleResultsFound as e:
        error_message = (
            "GeneRepository.retrieve_gene(transcription_id): Multiple results found while expecting uniqueness for transcription_id '"
            + str(transcription_id) + "'. " + str(e))
        _log.error(error_message)
        raise RepositoryException(error_message)
    except NoResultFound as e:
        error_message = (
            "GeneRepository.retrieve_gene(transcription_id): Expected results but found none for transcription_id '"
            + str(transcription_id) + "'. " + str(e))
        _log.error(error_message)
        raise RepositoryException(error_message)
    except Exception as e:
        # Everything else is reported to callers as a RepositoryException
        error_message = (
            "GeneRepository.retrieve_gene(transcription_id): Unexpected exception for transcription_id '"
            + str(transcription_id) + "'. " + str(e))
        _log.error(error_message)
        raise RepositoryException(error_message)
    finally:
        # Dispose of the scoped session so its loaded objects are released
        _session.remove()