def merge_dsorftranscriptasso(self):

        Logger.get_instance().info(
            'Starting to merge the entries of the DSORFTranscriptAsso table.')

        # Get all the existing ( ORF, Transcript ) couples for which it
        # is necessary to merge all the related DSORFTranscriptAsso entries.
        # If they have not yet been computed, then do it
        if (self.resume_at_stage in [
                ResumeMergeStrategy.RESUME_AFTER_CONSERVED,
                ResumeMergeStrategy.RESUME_AFTER_ORF,
                ResumeMergeStrategy.RESUME_AFTER_TRANSCRIPT
        ]):
            self.get_dsorftranscriptasso_to_merge()

        # Otherwise import them from the .dcorf file
        elif (self.resume_at_stage in [
                ResumeMergeStrategy.RESUME_AFTER_OTA_ID_ASSO,
                ResumeMergeStrategy.RESUME_DURING_OTA
        ]):
            self.import_dsorftranscriptasso_to_merge()

        # Any other case that is not handled should raise a programming exception
        else:
            raise DenCellORFException(
                'ResumeMergeStrategy.merge_dsorftranscriptasso(): The case ' +
                self.resume_at_stage +
                ' is not properly handled in the method!' +
                ' Please contact the developer if you see this message.')

        # Merge the entries of the DSORFTranscriptAsso table
        self.merge_dsota()
    def execute(self):

        # For each database model, if the strategy selected by the user requires
        # the database to be checked / built, instantiate the appropriate
        # CheckDatabase class and execute the CheckDatabase strategy.
        for db_model in self.DATABASE_MODELS:

            strategies_checking_db = eval(
                'OptionConstants.STRATEGIES_CHECKING_' + db_model +
                '_DATABASE')

            if (self.called_strategy in strategies_checking_db):

                Logger.get_instance().info('Checking the ' + db_model +
                                           ' database...')

                try:
                    check_db = eval('Check' + db_model + 'Database()')
                except Exception as e:
                    raise DenCellORFException(
                        'DatabaseCheckStrategy: An exception occurred trying to'
                        + ' instantiate Check' + db_model + 'Database.', e)

                try:
                    check_db.execute()
                except DenCellORFException as e:
                    raise DenCellORFException(
                        'DatabaseCheckStrategy: An exception occurred trying to'
                        + ' check the ' + db_model + ' database.', e)
    def get_obj_from_file( input_folder, filename ):

        file_path = os.path.join( input_folder, filename ) + Constants.DENCELLORF_FILES_EXTENSION
        
        # Check the file exists
        if not os.path.exists( file_path ):
            raise DenCellORFException( 'FileHandlerUtil.get_obj_from_file(): No file has been found' +
                                       ' at the path provided (' + file_path + ').' +
                                       ' Please note that some strategies may only be run in certain' +
                                       ' particular cases (e.g. the Restore strategy may only be run' +
                                       ' after a successful run of the Backup strategy).' +
                                       ' Please see the documentation for more information.' )
        
        # Get the list of objects from the file
        Logger.get_instance().debug( 'FileHandlerUtil.get_obj_from_file(): Importing data from ' + 
                                     file_path + '...' )
        
        try:
            with open( file_path, 'rb' ) as saved_objects_file:
                list_of_obj_unpickler = pickle.Unpickler( saved_objects_file )
                list_of_objects = list_of_obj_unpickler.load()
        except Exception as e:
            raise DenCellORFException( 'FileHandlerUtil.get_obj_from_file(): An error occurred trying' +
                                        ' to get the objects saved in ' + file_path + 
                                        '. Hence, the data from this file will not be loaded.' +
                                        ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
        else:
            Logger.get_instance().debug( 'FileHandlerUtil.get_obj_from_file(): ' + 
                                         str( len( list_of_objects ) ) + 
                                         ' objects have been successfully loaded from ' + 
                                         file_path + '.' )
        
        return list_of_objects
    def save_obj_to_file( objects_to_save, filename, output_folder=DefaultOutputFolder.OUTPUT_FOLDER ):

        # Create the output folder if it does not yet exist 
        # (and its parent folders if necessary)
        if not os.path.isdir( output_folder ):
            os.makedirs( output_folder )
        
        file_path = os.path.join( output_folder, filename ) + Constants.DENCELLORF_FILES_EXTENSION

        Logger.get_instance().debug( 'FileHandlerUtil.save_obj_to_file(): ' + 
                                     str( len( objects_to_save ) ) + 
                                     ' objects will be saved in ' + file_path + '.' )
        
        # Save the objects in the file
        try:
            with open( file_path, 'wb' ) as objects_to_save_file:
                obj_to_insert_pickler = pickle.Pickler( objects_to_save_file )
                obj_to_insert_pickler.dump( objects_to_save )
        except Exception as e:
            raise DenCellORFException( 'FileHandlerUtil.save_obj_to_file(): An error occurred trying' +
                                        ' to save the objects in ' + file_path + 
                                        '. Hence, these data will not be saved.' +
                                        ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
        else:
            Logger.get_instance().debug( 'FileHandlerUtil.save_obj_to_file(): ' + 
                                         str( len( objects_to_save ) ) + 
                                         ' objects have been successfully saved in ' + file_path + 
                                         '. This file may be used to recover data later.' +
                                         ' Please see the documentation for more information.')
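
    # Usage sketch (illustration only): assuming FileHandlerUtil exposes the two
    # functions above as static methods, a typical save / load round trip would
    # look like the following. The folder name, file name and the parsed_entries
    # variable (any list of picklable objects) are hypothetical.
    #
    #     FileHandlerUtil.save_obj_to_file( objects_to_save = parsed_entries,
    #                                       filename = 'objects_from_MySource',
    #                                       output_folder = 'parsed_data' )
    #     restored_entries = FileHandlerUtil.get_obj_from_file( input_folder = 'parsed_data',
    #                                                           filename = 'objects_from_MySource' )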
    def execute(self):

        # Create a session to the DS database
        SQLManagerDS.get_instance().set_db_settings(self.db_settings)
        try:
            SQLManagerDS.get_instance().get_session()
        except Exception as e:
            raise DenCellORFException(
                'ForceInsertionStrategy.execute(): An error occurred while trying to'
                + ' create a session to the database.' + '\n Error code: ' +
                LogCodes.ERR_SQL_SESSION + '.', e)

        # Get the list of entries in the DataSource table and store them in the DataManager
        DataManager.get_instance().store_DS_query_result(
            Constants.DM_ALL_DATASOURCES, 'query(DataSource).all()')
        SQLManagerDS.get_instance().close_session()

        # Proceed with the insertion of the data
        Logger.get_instance().info('Starting the insertion of data.')

        for data_source in self.datasource:
            try:
                self.insert_datasource(data_source)
            except Exception as e:
                raise DenCellORFException(
                    'An error occurred trying to insert the data related to ' +
                    data_source + '.', e)

        # Log the end of the insertion
        Logger.get_instance().info('The insertion of data has finished.')
    def pandas_df_to_csv( output_folder, filename, df, file_desc='', sep=',', ext='.csv', \
                          hdr=True, idx=False, mode='w', encoding='utf-8' ):

        # Create the output folder if it does not yet exist
        # (and its parent folders if necessary)
        if not os.path.isdir( output_folder ):
            os.makedirs( output_folder )
        
        file_path = os.path.join( output_folder, filename ) + ext

        Logger.get_instance().debug( 'FileHandlerUtil.pandas_df_to_csv(): ' + 
                                     ' The data frame (' + file_desc + ') will be saved in ' + 
                                     file_path + '.' )
        
        # Save the data frame in the file
        try:
            df.to_csv( file_path, 
                       sep = sep, 
                       header = hdr, 
                       index = idx,
                       mode = mode,
                       encoding = encoding )
        except Exception as e:
            raise DenCellORFException( 'FileHandlerUtil.pandas_df_to_csv(): An error occurred trying to' +
                                       ' save the pandas dataframe in ' + file_path + '.' +
                                       ' Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
        else:
            Logger.get_instance().debug( 'FileHandlerUtil.pandas_df_to_csv(): ' +
                                         ' The data frame (' + file_desc + 
                                         ') has been successfully saved in ' + file_path + '.' )
    def import_dsorftranscriptasso_to_merge(self):

        Logger.get_instance().debug(
            'ResumeMergeStrategy.import_dsorftranscriptasso_to_merge():' +
            ' Getting the dictionary that associates each existing' +
            ' ( ORF ID, Transcript ID ) couple with the list of DSORFTranscriptAsso'
            + ' (DS) IDs, and converting the lists of IDs into lists of objects.')

        # Get the dictionary that associates each existing unique ( ORF ID (PRO),
        # Transcript ID (PRO) ) couple with the list of IDs of all the
        # DSORFTranscriptAsso (DS) entries related to it
        all_existing_orf_tr_asso_ids = FileHandlerUtil.get_obj_from_file(
            input_folder=Constants.MERGED_DATA_FOLDER,
            filename=Constants.ALL_EXISTING_ORF_TR_ASSO_IDS_FILENAME)

        # Convert the dictionary so that the lists of IDs are replaced
        # by lists of the corresponding DSORFTranscriptAsso objects
        all_dsota = SQLManagerDS.get_instance().get_session().query(
            DSORFTranscriptAsso).all()
        SQLManagerDS.get_instance().close_session()

        all_dsota_dict = {}
        for dsota in all_dsota:
            all_dsota_dict[dsota.id] = dsota

        existing_orf_tr_asso_all = {}
        for ((orf_id, tr_id),
             dsota_ids_list) in all_existing_orf_tr_asso_ids.items():
            existing_orf_tr_asso_all[(orf_id, tr_id)] = [
                all_dsota_dict.get(dsota_id) for dsota_id in dsota_ids_list
            ]

        # If the merging of DSORFTranscriptAsso entries has already
        # been started once and failed, then remove from the dictionary
        # all the ( ORF ID, Transcript ID ) couples already processed
        if (self.resume_at_stage == ResumeMergeStrategy.RESUME_DURING_OTA):
            all_processed_ota = SQLManagerPRO.get_instance().get_session(
            ).query(ORFTranscriptAsso).all()
            SQLManagerPRO.get_instance().close_session()

            all_processed_ids = [(ota.orf_id, ota.transcript_id)
                                 for ota in all_processed_ota]

            existing_orf_tr_asso_all_to_process = { key : val \
                                                    for ( key, val ) in existing_orf_tr_asso_all.items() \
                                                    if ( ( int( key[0] ), int( key[1] ) ) not in all_processed_ids ) }

            existing_orf_tr_asso_all = existing_orf_tr_asso_all_to_process

            Logger.get_instance().debug(
                'ResumeMergeStrategy.import_dsorftranscriptasso_to_merge(): ' +
                str(len(all_processed_ota)) + ' couples have already been' +
                ' processed and ' + str(len(existing_orf_tr_asso_all.keys())) +
                ' remain to be processed.')

        # Store the dictionary in the DataManager main dictionary
        DataManager.get_instance().store_data(
            Constants.DM_ALL_EXISTING_ORF_TR_ASSO_DICT,
            existing_orf_tr_asso_all)
    def execute(self):

        # Set the connection to the database
        self.get_sqlmanager_instance().set_db_settings(self.db_settings)

        # Check the integrity of the database
        str_ok = self.get_sqlmanager_instance().check_database_str_integrity()
        if (not str_ok):
            raise DenCellORFException(
                'The schema of the database provided does not follow' +
                ' the expected model. Please make sure that the model (' +
                self.db_model + ') and the database (' +
                self.db_settings[Constants.DB_SETTINGS_DB_NAME] +
                ') provided are the right ones.')

        # Get the declarative base corresponding to the database
        base = self.get_sqlmanager_instance().get_declarative_base()

        # Build a dictionary of the classes defined in the model
        # where the keys are the classes, and the values their names
        dict_model_classes = {}
        for (cl_name, cl_object) in base._decl_class_registry.items():
            if (not str(cl_name) == '_sa_module_registry'):
                dict_model_classes[cl_object] = str(cl_name)

        # For each table, get the list of all entries and save them in a file
        for table in dict_model_classes.keys():

            # Get the name of the table
            table_name = str(dict_model_classes[table])
            Logger.get_instance().debug(
                'Starting to save the entries of the ' + table_name +
                ' table.')

            # Get all the entries to save
            objects_to_save = self.get_sqlmanager_instance().get_session(
            ).query(table).all()

            # Expunge the session to detach the queried objects from it
            self.get_sqlmanager_instance().get_session().expunge_all()
            self.get_sqlmanager_instance().close_session()

            if self.file_prefix:
                filename = self.file_prefix + table_name
            else:
                filename = table_name

            try:
                FileHandlerUtil.save_obj_to_file(
                    objects_to_save=objects_to_save,
                    filename=filename,
                    output_folder=self.output_folder)

            except Exception as e:
                raise DenCellORFException(
                    'BackupStrategy.execute(): An error occurred trying to' +
                    ' save data in the file.'
                    '\n Error code: ' + LogCodes.ERR_FILEHAND + '.', e)
    def initialize(self):

        # Get the main keyword that defines the strategy
        self.strategy = sys.argv[1]

        # If the strategy is not known, check whether the user asked for help.
        # Otherwise, raise a DenCellORFException.
        if (self.strategy not in OptionConstants.STRATEGIES_LIST):

            # Display help on the console if necessary and exit the program
            if (self.strategy in ['-h', '--help']):
                print(
                    'To run a strategy, you need to type a command such as: \n'
                    +
                    'python $PYTHONPATH/fr/tagc/uorf/uorf.py [StrategyKeyword] [Options] \n'
                    + 'or DenCellORF [StrategyKeyword] [Options]. \n'
                    'The following strategies are available: ' +
                    ', '.join(OptionConstants.STRATEGIES_LIST) + '.\n'
                    ' You may find more information about the options available for each strategy'
                    + ' using the command DenCellORF [StrategyKeyword] -h' +
                    ' or DenCellORF [StrategyKeyword] --help. \n' +
                    ' For extensive information, please read the user manual (PDF file).'
                )
                exit()

            else:
                raise DenCellORFException(
                    'The strategy selected (' + self.strategy +
                    ') is not correct.' + ' It must be one of ' +
                    ', '.join(OptionConstants.STRATEGIES_LIST) +
                    '. Please see the documentation for more information.')

        Logger.get_instance().info('---')
        Logger.get_instance().info('Selected strategy: ' + self.strategy)

        # Build an option parser to collect the option values
        self.optionParser = OptionParser()
        for current_prop_list in OptionConstants.OPTION_LIST[self.strategy]:
            self.optionParser.add_option(current_prop_list[0],
                                         current_prop_list[1],
                                         action=current_prop_list[2],
                                         type=current_prop_list[3],
                                         dest=current_prop_list[4],
                                         default=current_prop_list[5],
                                         help=current_prop_list[6])

        # Get the various option values into a dictionary
        (opts, args) = self.optionParser.parse_args()
        self.optionDict = vars(opts)
        self.args = args

        # Log the settings
        Logger.get_instance().info('Settings:')
        for opt in self.optionDict.items():
            Logger.get_instance().info("-" + str(opt[0]) + ": '" +
                                       str(opt[1]) + "'")
        Logger.get_instance().info('---')
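
    # Illustration (hypothetical values): given the indices used in the add_option()
    # call above, each entry of OptionConstants.OPTION_LIST[ strategy ] is expected
    # to be a 7-element sequence of the form
    # ( short flag, long flag, action, type, dest, default, help ), e.g.:
    #
    #     [ '-c', '--configfile', 'store', 'string', 'configfile', None,
    #       'Path to the configuration file.' ]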
 def execute( self ):
             
     # Set the connection to the database
     self.get_sqlmanager_instance().set_db_settings( self.db_settings )
     
     try:
         self.get_sqlmanager_instance().get_session()
     except Exception as e:
         raise DenCellORFException( 'AddReleaseVersionStrategy.execute(): An error occurred trying to' +
                                    ' create a session to the database.' +
                                     '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
     self.get_sqlmanager_instance().close_session()
     
     # If there is already an annotation version information registered
     # in the metadata table, get it
     release_metadata_query = self.get_sqlmanager_instance().get_session().query(
             self.get_metadata_class()
         ).filter(
             self.get_metadata_class().parameter == Constants.METATABLE_DATABASE_VERSION_NUMBER
         )
     
     # If a version has already been registered, update it if necessary
     # (i.e. if the forceOverwrite option has been selected) or log a
     # critical message.
     if ( release_metadata_query.count() != 0 ):
     
         # Get the entry
         metadata_release_entry = release_metadata_query.one()
         
         if ( self.force_overwrite ):
             metadata_release_entry.value = self.db_release
             metadata_release_entry.description = self.db_desc
             # Commit the updates
             self.get_sqlmanager_instance().commit()
             
         else:
             if ( ( metadata_release_entry.value != self.db_release )
                  or ( metadata_release_entry.description != self.db_desc ) ):
                 Logger.get_instance().critical( 'A different version has already been registered in the metadata' +
                                                 ' table for this database (' + metadata_release_entry.value +
                                                 ', ' + metadata_release_entry.description + 
                                                 ') and the forceOverwrite option has not been selected.' +
                                                 ' Hence, the version number and/or description will not' +
                                                 ' be updated.' )
          
     # Otherwise, create a new entry in the metadata table   
     else:
         metadata_release = self.get_metadata_class()( parameter = Constants.METATABLE_DATABASE_VERSION_NUMBER,
                                                       value = self.db_release,
                                                       description = self.db_desc )
         self.get_sqlmanager_instance().get_session().add( metadata_release )
         self.get_sqlmanager_instance().commit()
             
     self.get_sqlmanager_instance().close_session()
    def dict_to_csv( output_folder, filename, dict, file_desc='', sort=False, sep=',', ext='.csv', \
                     hdr=None, key_func=lambda k: k, val_func=lambda v: v, unlist_key=False, unlist_val=False ):

        # Create the output folder if it does not yet exist 
        # (and its parent folders if necessary)
        if not os.path.isdir( output_folder ):
            os.makedirs( output_folder )
        
        file_path = os.path.join( output_folder, filename ) + ext
        
        Logger.get_instance().debug( 'FileHandlerUtil.dict_to_csv(): The content of the dictionary (' + 
                                     file_desc + ') will be saved in ' + file_path + '.' )
        
        # Save the dictionary in the file
        try:
            with open( file_path, 'wb' ) as csv_file:
                writer = csv.writer( csv_file, delimiter = sep )
                
                # Write the header if necessary
                if hdr:
                    writer.writerow( hdr )
                
                # Write the dictionary as key, value
                if sort:
                    key_list = sorted( dict.keys() )
                else:
                    key_list = dict.keys() 

                for k in key_list:
                    # Get the value and apply the functions to transform the key 
                    # and value if necessary
                    key = key_func( k )
                    val = val_func( dict.get( k ) )
                    
                    # Write the new row in the file
                    if ( ( not unlist_key ) or ( not isinstance( key, list ) ) ):
                        key = [ key ]
                        
                    if ( ( not unlist_val ) or ( not isinstance( val, list ) ) ):
                        val = [ val ]
                    
                    writer.writerow( key + val )                        
        
        except Exception as e:
            raise DenCellORFException( 'FileHandlerUtil.dict_to_csv(): An error occurred trying to save' +
                                        ' the content of the dictionary in ' + file_path + '.' +
                                        '\n Error code: ' + LogCodes.ERR_FILEHAND + '.', e )
        
        else:
            Logger.get_instance().debug( 'FileHandlerUtil.dict_to_csv(): The content of the' +
                                         ' dictionary (' + file_desc + 
                                         ') has been successfully saved in ' + file_path + 
                                         '. Please see the documentation for more information.' )
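
    # Usage sketch (illustration only, hypothetical names): writing a dictionary whose
    # keys are ( ORF ID, Transcript ID ) tuples and whose values are lists of IDs,
    # with both the key and the value spread over several columns of each row:
    #
    #     FileHandlerUtil.dict_to_csv( output_folder = 'output',
    #                                  filename = 'orf_tr_asso',
    #                                  dict = orf_tr_asso_dict,
    #                                  file_desc = 'ORF / transcript associations',
    #                                  hdr = [ 'orf_id', 'transcript_id', 'dsota_ids' ],
    #                                  key_func = lambda k: list( k ),
    #                                  unlist_key = True,
    #                                  unlist_val = True )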
            
    def __init__( self ):
        
        configfile = OptionManager.get_instance().get_option( OptionConstants.OPTION_CONFIG_FILE_PATH, 
                                                              not_none = True )
            
        if configfile:
            self.configfile = configfile
            if ( not os.path.exists( configfile ) ):
                raise DenCellORFException( 'No config file can be found at the path provided (' + 
                                           self.configfile + ').' )
            
        else:
            raise DenCellORFException( 'A config file has to be provided.' +
                                       ' See the documentation for more information.' )

        # Check if the forceOverwrite option has been selected
        if OptionManager.get_instance().get_option( OptionConstants.OPTION_FORCE_OVERWRITE, not_none = False ):
            self.force_overwrite = True
        else:
            self.force_overwrite = False
        
        self.species = None
        self.ensembl_release_version = None
        
        
        # Get the number of threads available
        self.thread_nb = OptionManager.get_instance().get_option( OptionConstants.OPTION_THREAD_NB, 
                                                                  not_none = False )
        available_thread_nb = cpu_count()
        if self.thread_nb:
            try:
                self.thread_nb = int( self.thread_nb )
            except:
                raise DenCellORFException( 'ComputeRelCoordStrategy: The value provided for the number'
                                           ' of threads needs to be an integer (provided value: ' + 
                                           str( self.thread_nb ) + ').' )
            else:
                if ( self.thread_nb < 1 ):
                    raise DenCellORFException( 'ComputeRelCoordStrategy: The value provided for the number' +
                                               ' of threads needs to be a strictly positive integer (provided value: ' + 
                                               str( self.thread_nb ) + ').' )
                    
                if ( self.thread_nb > available_thread_nb ):
                    Logger.get_instance().info( 'The number of threads provided (' + str( self.thread_nb ) +
                                                ') is greater than the number of threads actually' +
                                                ' available (' + str( available_thread_nb ) +
                                                '). Hence, ' + str( available_thread_nb ) +
                                                ' threads will be used for the computation.' )
                    self.thread_nb = available_thread_nb
        else:
            self.thread_nb = available_thread_nb
            
        Logger.get_instance().debug( 'ComputeRelCoordStrategy: ' + str( self.thread_nb ) + ' threads' +
                                     ' will be used for the computation of relative coordinates.' )
    def __init__(self,
                 log_path=Constants.PATH_GENEREF_LOG,
                 writing_mode=Constants.GENEREF_LOG_DEFAULT):

        self.logg = GeneRefLogger.set_logger(log_path, writing_mode)

        # Log the instantiation of this logger in the main logger
        Logger.get_instance().warning(
            'A warning related to gene references has been raised' +
            ' during the execution of the program. All warnings related' +
            ' to the gene references will be logged in the file "' +
            str(log_path) + '". Please see the documentation for' +
            ' more information.')
    def init_log_file(self, ext='.tsv'):

        # Create the output folder if it does not yet exist
        # (and its parent folders if necessary)
        if (not os.path.isdir(self.output_folder)):
            os.makedirs(self.output_folder)

        file_path = os.path.join(self.output_folder, self.file_name + ext)

        Logger.get_instance().info(' The logs will be saved in ' + file_path +
                                   '.')

        # Create (or truncate) the log file
        with open(file_path, mode='w'):
            pass
    def build_sqlite_database(self, force_overwrite):

        # If there is already a database at the path
        if os.path.exists(self.db_path):

            # And the forceOverwrite option has been selected,
            # then remove the database file
            if force_overwrite:
                self.remove_sqlite_db()
                return True

            else:
                # Check if the existing database contains the
                # appropriate model
                if not self.check_database_str_integrity():

                    # Ask the user to confirm the deletion of the database
                    confirm_deletion = None
                    Logger.get_instance().info(
                        'The database provided does not use the appropriate' +
                        ' model. Hence, the database will be removed and built'
                        + ' again using the right model.')

                    while (confirm_deletion not in ['Y', 'N']):
                        print(
                            ' Do you want to confirm the deletion of the database? (Y/N)'
                        )
                        confirm_deletion = raw_input().upper()

                    # If the user refuses the deletion of the database, log a critical error
                    if (confirm_deletion == 'N'):
                        Logger.get_instance().critical(
                            'As the database does not use the appropriate' +
                            ' model and the deletion has been canceled by the'
                            +
                            ' user, the program will be stopped. Please see' +
                            ' the documentation for more information.')

                    # Otherwise delete the database and create a new one
                    self.remove_sqlite_db()
                    return True

                else:
                    return False

        else:
            return True
    def add_and_commit(self, objects_to_add, process='Undefined process'):

        # Get the number of objects that are expected to be inserted in the database
        total_count = len(objects_to_add)

        # Add the objects to the session
        try:
            self.get_session().add_all(objects_to_add)
        except Exception as e:
            # Get the number of objects of each type in the list
            types_dict = GeneralUtil.get_type_counts_in_list(objects_to_add)
            types_dict_str = ', '.join([
                str(tp) + ': ' + str(val) for (tp, val) in types_dict.items()
            ])

            raise DenCellORFException(
                self.classname + '.add_and_commit():' +
                ' An error occurred trying to add ' + str(total_count) +
                ' objects (from ' + process + ') to the session.' +
                ' The list contained the following objects: ' +
                types_dict_str + '.', e)

        # Commit changes
        try:
            self.commit()
        except Exception as e:
            # Get the number of objects of each type in the list
            types_dict = GeneralUtil.get_type_counts_in_list(objects_to_add)
            types_dict_str = ', '.join([
                str(tp) + ': ' + str(val) for (tp, val) in types_dict.items()
            ])

            raise DenCellORFException(
                self.classname + '.add_and_commit():' +
                ' An error occurred trying to commit changes after addition of '
                + str(total_count) + ' objects (from ' + process +
                ') to the session.' +
                ' The list contained the following objects: ' +
                types_dict_str + '.', e)

        # Log in debug mode the number of objects successfully inserted
        Logger.get_instance().debug(
            self.classname + '.add_and_commit(): ' + str(total_count) +
            ' objects (from ' + process +
            ') have been successfully added to the database.')
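
    # Usage sketch (illustration only): assuming the SQLManager singletons used
    # elsewhere in this module expose add_and_commit(), a batch of parsed entries
    # (hypothetical variable) could be inserted in a single call, with the process
    # label reused in the debug / error messages:
    #
    #     SQLManagerDS.get_instance().add_and_commit( objects_to_add = parsed_entries,
    #                                                  process = 'parsing of MySource' )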
    def store_DS_query_result(self, keyword, query_string):

        # Perform the query
        Logger.get_instance().debug('DataManager.store_DS_query_result():' +
                                    ' Querying the DS database: "' +
                                    query_string + '".')
        try:
            query_result = eval('SQLManagerDS.get_instance().get_session().' +
                                query_string)
        except Exception as e:
            raise DenCellORFException(
                'DataManager.store_DS_query_result(): The query "' +
                query_string + '" failed.', e)

        # Convert the result of the query into a dictionary where each element
        # of the result list is used both as key and as value
        query_result = GeneralUtil.list_to_dict(query_result)

        # Store the dictionary in the data dictionary
        self.data[keyword] = query_result
    def remove_sqlite_db(self):

        if os.path.exists(self.db_path):
            try:
                remove(self.db_path)
            except Exception as e:
                raise DenCellORFException('The database located at ' +
                                          str(self.db_path) +
                                          ' cannot be deleted.', e)
            else:
                Logger.get_instance().info('The database file located at ' +
                                           str(self.db_path) +
                                           ' has been deleted.')

        else:
            Logger.get_instance().error(
                self.classname + '.remove_sqlite_db(): There is no file' +
                ' located at ' + str(self.db_path) + '.' + ' Error code: ' +
                LogCodes.ERR_SQL_FILE + '.',
                ex=False)
    def build_database(self,
                       db_settings,
                       species,
                       sp_mandatory=True,
                       force_overwrite=False):

        # Store the settings necessary to establish the connection
        self.set_db_settings(db_settings)

        # Check that a species is provided
        if (sp_mandatory and ((species is None) or (len(species) == 0))):
            raise DenCellORFException(
                self.classname +
                '.build_database(): A species needs to be provided!')

        # Get the engine to the dedicated database
        self.create_engine()

        # Check and / or remove the existing database if necessary
        if (self.db_type == SQLConstants.DB_TYPE_SQLITE):
            reset_model = self.build_sqlite_database(force_overwrite)

        # Check and / or remove the existing database if necessary
        # and / or create the database on the server if necessary
        elif (self.db_type == SQLConstants.DB_TYPE_MYSQL):
            reset_model = self.build_mysql_database(force_overwrite)
        # Any other database type is not handled and would leave reset_model undefined
        else:
            raise DenCellORFException(self.classname + '.build_database(): The database' +
                                      ' type ' + str(self.db_type) + ' is not handled.')

        # Open a session
        self.create_session()

        # If the model does not yet exist, create all the required tables
        if reset_model:
            self.BASE.metadata.create_all(self.engine)
            Logger.get_instance().info('The database ' + self.db_path +
                                       ' has been created.')

        else:
            Logger.get_instance().info('The database ' + self.db_path +
                                       ' will be used.')

        self.session.close()
 def get_ensembl_db( sp, annotation_version ):
     
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading and indexing the Ensembl' +
                                  ' database release ' + str( annotation_version ) + 
                                  ' for ' + sp + '.' )
     
     ensembl_db = EnsemblRelease( release = annotation_version,
                                  species = sp )
     
     # Download and index the database if not yet in the temporary folder
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Downloading the Ensembl' +
                                  ' database release ' + str( annotation_version) + 
                                  ' for ' + sp + '.' )
     try:
         ensembl_db.download()
     except Exception as e:
         raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                    ' download the Ensembl database using pyensembl.', e )
         
     
     Logger.get_instance().debug( 'EnsemblUtil.get_ensembl_db(): Indexing the Ensembl' +
                                  ' database release ' + str( annotation_version) + 
                                  ' for ' + sp + '.' )
     try:
         ensembl_db.index()
     except Exception as e:
         raise DenCellORFException( 'EnsemblUtil.get_ensembl_db(): An error occurred trying to' +
                                    ' index the Ensembl database using pyensembl.', e )
     
     return ensembl_db
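
 # Usage sketch (illustration only; the species name, release number and transcript
 # ID below are hypothetical): the EnsemblRelease object returned by get_ensembl_db()
 # can then be queried through the pyensembl API, e.g.:
 #
 #     ensembl_db = EnsemblUtil.get_ensembl_db( 'homo_sapiens', 95 )
 #     transcript = ensembl_db.transcript_by_id( 'ENST00000335137' )
 #     print( transcript.gene_name, transcript.contig, transcript.start,
 #            transcript.end, transcript.strand )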
    def batch_insert_to_db(self, objects_to_insert, source):

        Logger.get_instance().debug('Starting the insertion of data from ' +
                                    source + '.')

        # Save the data that should be inserted into a temporary file.
        # This allows the data to be recovered later if an exception is raised
        # during the insertion, thus saving the parsing time.
        try:
            FileHandlerUtil.save_obj_to_file(
                objects_to_save=objects_to_insert,
                filename='objects_from_' + source,
                output_folder=Constants.PARSED_DATA_FOLDER)
        except Exception as e:
            Logger.get_instance().error(
                'InsertionStrategy.batch_insert_to_db():' +
                ' An error occurred trying to save data from ' + source +
                ': \n' + str(e) + ' Error code: ' + LogCodes.ERR_FILEHAND +
                '.',
                ex=False)

        # Insert the objects into the database
        SQLManagerDS.get_instance().batch_insert_to_db(
            objects_to_insert=objects_to_insert, process=source)

        Logger.get_instance().debug('The insertion of data from ' + source +
                                    ' has finished.')
 def prepare_r_annotation_package( species_short_name, species_full_name, species_common_name, \
                                   ensembl_release_version ):
     
     # Define temporary folder where to install the packages
     annot_package_dir = os.path.join( DefaultTemporaryFolder.TEMPORARY_FOLDER, 
                                       'R_ensembl_annot_packages',
                                       species_common_name + str( ensembl_release_version ) )
     if ( not os.path.exists( annot_package_dir ) ):
         os.makedirs( annot_package_dir )            
     
     # Run the R script (as a Python subprocess) 
     # to prepare the packages
     r_args = [ '--ensemblRelease=' + str( ensembl_release_version ),
                '--speciesFullName=' + species_full_name,
                '--speciesShortName=' + species_short_name,
                '--speciesCommonName=' + species_common_name,
                '--annotPackageDir=' + annot_package_dir ]
     r_command = [ 'Rscript', ComputeRelCoordStrategy.R_SCRIPT_BUILD_ANNOT_PACKAGE_PATH ] + r_args
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package(): The R script' +
                                  ' will be run with the following arguments ' + ' '.join( r_args ) )
     
     r_process = subprocess.Popen( r_command, stdout = subprocess.PIPE, stderr = subprocess.PIPE )
     
     ( stdout, stderr ) = r_process.communicate()
     if ( stdout != '' ):
         Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package():' +
                                      ' The R script returned the following standard output: \n' + 
                                      stdout )
     if ( stderr != '' ):
         Logger.get_instance().debug( 'ComputeRelCoordStrategy.prepare_r_annotation_package():' +
                                      ' The R script returned the following error output: \n' + 
                                      stderr )
    def execute(self):

        strategy_command = OptionManager.get_instance().get_strategy()

        if (strategy_command is not None):
            try:
                strategy = eval(strategy_command + 'Strategy()')
            except Exception as e:
                raise DenCellORFException(
                    'DenCellORF.execute(): An error occurred during the' +
                    ' instantiation of the strategy: ' + str(e))
        else:
            Logger.get_instance().critical(
                'DenCellORF.execute(): A strategy must be provided!' +
                ' The following strategies are available: ' +
                ', '.join(OptionConstants.STRATEGIES_LIST) +
                '. See the documentation for more information.')

        try:
            strategy.execute()
        except Exception as e:
            raise DenCellORFException(
                'DenCellORF.execute(): An error occurred during the execution'
                + ' of the program.', e)
    def insert_datasource(data_source):

        # For an easier manipulation of data stored in the DataManager,
        # assign a new variable to access the list of data sources
        all_datasources = DataManager.get_instance().get_data(
            Constants.DM_ALL_DATASOURCES)

        # Make sure the source is not already in the database
        ds = DataSource(name=data_source)

        # If the source is in the database, do not proceed to the insertion
        if ds in all_datasources:
            Logger.get_instance().info(
                'The source "' + data_source +
                '" has been found in the database.' +
                ' Hence, the data from this source will not be inserted again.'
                +
                ' If for some reason you need to perform again the insertion,'
                + ' please first use the Deletion strategy.' +
                ' Please see the documentation for more information.')

        # Proceed with the insertion of the data
        else:
            Logger.get_instance().debug(
                'Starting the insertion of data from ' + data_source + '.')

            # Get the list of objects to insert from the file
            try:
                objects_to_insert = FileHandlerUtil.get_obj_from_file(
                    input_folder=Constants.PARSED_DATA_FOLDER,
                    filename='objects_from_' + data_source)
            except Exception as e:
                raise DenCellORFException(
                    'An error occurred trying to import the data for the source '
                    + data_source + ' from its file.' + '\n Error code: ' +
                    LogCodes.ERR_FILEHAND + '.', e)

            else:
                try:
                    SQLManagerDS.get_instance().batch_insert_to_db(
                        objects_to_insert=objects_to_insert,
                        process=data_source)
                except DenCellORFException as e:
                    raise DenCellORFException(
                        'An error occurred trying to insert the data from ' +
                        data_source + '.' + '\n Error code: ' +
                        LogCodes.ERR_SQL_SESSION + '.', e)

                Logger.get_instance().info('The insertion of data from ' +
                                           data_source + ' finished.')
    def execute( self ):
        
        # Create a session to the "PRO-like" database
        SQLManagerPRO.get_instance().set_db_settings( self.db_settings )

        try:
            SQLManagerPRO.get_instance().get_session()
        except Exception as e:
            raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred trying to' +
                                       ' create a session to the database.' +
                                        '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.', e)
        SQLManagerPRO.get_instance().close_session()
        
        
        Logger.get_instance().info( 'Starting to build the track file.' )
        
        
        # Track header and settings
        # -------------------------        
        # Define track labels
        sp = SQLManagerPRO.get_instance().get_session().query( PROSpeciesCatalog.name ).one()[0]
        current_annotation = SQLManagerPRO.get_instance().get_session().query( 
                PROMetadata.value 
            ).filter( 
                PROMetadata.parameter == Constants.METATABLE_CURRENT_ANNOTATION 
            ).one()[0]
        current_ucsc_annot = Constants.CORRESPONDING_UCSC_FROM_NCBI[ current_annotation ]
        SQLManagerPRO.get_instance().close_session()
        
        track_track = '{project_name}_{species}_{annotation}'.format( project_name = Constants.PROJECT_NAME,
                                                                      species = sp,
                                                                      annotation = current_ucsc_annot )
        track_track = 'track ' + track_track
                                                                                        
        track_shortLabel = '{project_name}_{species}_{annotation}'.format( project_name = Constants.PROJECT_NAME,
                                                                           species = sp,
                                                                           annotation = current_ucsc_annot )
        track_shortLabel = 'shortLabel ' + track_shortLabel
        
        track_longLabel = ( '{project_name} {species} track hub ({annotation}),' +
                            ' See {db_url} for more information regarding this track' ).format( 
                                project_name = Constants.PROJECT_NAME,
                                species = sp,
                                annotation = current_ucsc_annot,
                                db_url = self.WEBSITE_URL )
        track_longLabel = 'longLabel ' + track_longLabel
        
        # Path to HTML descriptive file
        track_html = 'html ' + self.TRACK_HTML_DESC_PATH
        
        # Path to BigBed file
        track_bigDataUrl = ( 'bigDataUrl ' + GenerateTrackDbFileStrategy.BIGBED_FILENAME + 
                             GenerateBEDFileStrategy.BIGBED_FILE_EXTENSION )
        track_type = 'type bigBed 12 +'
        
        # Track visualization
        track_default_vis = ( 'visibility full\n' +
                              'thickDrawItem on\n' +
                              'itemRgb on\n' +
                              'maxItems 100000\n' +
                              'exonArrows on\n' +
                              'exonNumbers on' )
            
        
        # Track filters
        # -------------
        
        # Transcript IDs
        all_transcript_ids = SQLManagerPRO.get_instance().get_session().query( Transcript.transcript_id.distinct() ).all()
        all_transcript_ids = sorted( GeneralUtil.query_result_to_list( all_transcript_ids ) )
        SQLManagerPRO.get_instance().close_session()
        
        transcripts_filter_values = ',\\\n'.join( all_transcript_ids )        
        track_filter_transcripts = ( 'filterType.transcripts multipleListOr\n' +
                                     'filterText.transcripts *\n' +
                                     'filterLabel.transcripts Transcript IDs\n' +
                                     'filterValues.transcripts ' + transcripts_filter_values )
        
        # RNA biotypes
        all_rna_biotypes = SQLManagerPRO.get_instance().get_session().query( 
                Transcript.rna_biotype.distinct()
            ).filter(
                Transcript.rna_biotype != None
            ).all()
        all_rna_biotypes = sorted( GeneralUtil.query_result_to_list( all_rna_biotypes ) )
        SQLManagerPRO.get_instance().close_session()
        
        rnabiotypes_filter_values = ',\\\n'.join( all_rna_biotypes )        
        track_filter_rnabiotypes = ( 'filterType.rna_biotypes multipleListOr\n' +
                                     'filterText.rna_biotypes *\n' +
                                     'filterLabel.rna_biotypes RNA biotypes\n' +
                                     'filterValues.rna_biotypes ' + rnabiotypes_filter_values )
                                    
        # Cell contexts
        all_cell_contexts = SQLManagerPRO.get_instance().get_session().query( CellContextCatalog.context ).all()
        all_cell_contexts = sorted( GeneralUtil.query_result_to_list( all_cell_contexts ) )
        SQLManagerPRO.get_instance().close_session()
        
        celltypes_filter_values = ',\\\n'.join( all_cell_contexts )
        track_filter_celltypes = ( 'filterType.cell_types multipleListOr\n' +
                                   'filterText.cell_types *\n' +
                                   'filterLabel.cell_types Cell types (cell lines, tissues...)\n' +
                                   'filterValues.cell_types ' + celltypes_filter_values )
                                  
        # ORF Annotations
        all_orfannotations = SQLManagerPRO.get_instance().get_session().query( ORFAnnotationCatalog.annotation ).all()
        all_orfannotations = sorted( GeneralUtil.query_result_to_list( all_orfannotations ) )
        
        orfannotations_filter_values = ',\\\n'.join( all_orfannotations )
        track_filter_orfannotations = ( 'filterType.orf_annotations multipleListOr\n' +
                                        'filterText.orf_annotations *\n' +
                                        'filterLabel.orf_annotations ORF Annotations\n' +
                                        'filterValues.orf_annotations ' + orfannotations_filter_values )
                                       
        # Kozak contexts
        all_kozak_ctxt_comp = SQLManagerPRO.get_instance().get_session().query( 
                ORFTranscriptAsso.kozak_context_comp.distinct() 
            ).filter(
                ORFTranscriptAsso.kozak_context_comp != None
            ).all()
        all_kozak_ctxt_comp = sorted( GeneralUtil.query_result_to_list( all_kozak_ctxt_comp ) )
        
        kozakcontexts_filter_values = ',\\\n'.join( all_kozak_ctxt_comp )
        track_filter_kozakcontexts = ( 'filterType.kozak_contexts multipleListOr\n' +
                                       'filterText.kozak_contexts *\n' +
                                       'filterLabel.kozak_contexts Computed Kozak context\n' +
                                       'filterValues.kozak_contexts ' + kozakcontexts_filter_values )
                          
                                       
        # URLs
        # ----
        url_name = 'name="' + self.WEBSITE_URL + '/ORF/' + self.WEBSITE_URL_SPECIES[ sp ] + '/$$' + '"'
        url_transcript = 'transcripts="' + self.WEBSITE_URL + '/transcript/' + self.WEBSITE_URL_SPECIES[ sp ] + '/$$' + '"'
        track_urls = 'urls ' + '\\\n'.join( [ url_name, url_transcript ] )
                          
                                       
        # Additional lines
        # ----------------
        track_labelFields = 'labelFields name, transcripts, rna_biotypes, cell_types, orf_annotations, kozak_contexts'
        
                                       
        # Write the trackDb file
        # ----------------------
        track_content = [ track_track,
                          track_shortLabel,
                          track_longLabel,
                          track_html,
                          track_bigDataUrl,
                          track_type,
                          track_default_vis,
                          track_filter_transcripts,
                          track_filter_rnabiotypes,
                          track_filter_celltypes,
                          track_filter_orfannotations,
                          track_filter_kozakcontexts,
                          track_labelFields,
                          track_urls ]
        track_content = '\n'.join( track_content ) + '\n'       
        
        # Create the output folder if necessary
        if ( not os.path.isdir( self.output_folder ) ):
            os.makedirs( self.output_folder )
        
        track_db_file_path = os.path.join( self.output_folder, self.filename + self.TRACK_DB_FILE_EXTENSION )
        with open( track_db_file_path, 'w' ) as track_db_file:
            track_db_file.write( track_content )
            
        Logger.get_instance().info( 'The trackDb file has been successfully created and saved at ' +
                                    track_db_file_path + '.' )
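
        # For illustration only (all values are hypothetical and depend on the
        # project constants and on the species / annotation stored in the
        # database), the generated stanza roughly looks like:
        #
        #     track <PROJECT_NAME>_<species>_<ucsc_annotation>
        #     shortLabel <PROJECT_NAME>_<species>_<ucsc_annotation>
        #     longLabel <PROJECT_NAME> <species> track hub (<ucsc_annotation>), See <website URL> ...
        #     html <path to the HTML description file>
        #     bigDataUrl <BigBed file name and extension>
        #     type bigBed 12 +
        #     visibility full
        #     ...
        #     filterType.transcripts multipleListOr
        #     ...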
            
            
        # Create BigBed file if necessary
        # -------------------------------
        
        if self.generate_bigbed:
            
            # Overwrite / define some options necessary 
            # to run the GenerateBEDFile strategy
            OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_CONVERT_TO_BIGBED,
                                                     option_value = True )
            # The output folder is the same as the one of the trackDb file
            OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_OUTPUT_FOLDER,
                                                     option_value = self.output_folder )
            # Define the BigBed filename
            OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_BED_FILENAME,
                                                     option_value = GenerateTrackDbFileStrategy.BIGBED_FILENAME )
            # Set the BigBed format to 12 + 5
            OptionManager.get_instance().set_option( option_name = OptionConstants.OPTION_BED_EXTENDED,
                                                     option_value = True )
            
            try:
                generatebedfilestrategy = GenerateBEDFileStrategy()
            except Exception as e:
                raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred' +
                                           ' trying to instantiate a GenerateBEDFileStrategy.', e )
            
            try:
                generatebedfilestrategy.execute()
            except Exception as e:
                raise DenCellORFException( 'GenerateTrackDbFileStrategy.execute(): An error occurred' +
                                           ' during the execution of the GenerateBEDFile strategy.', e )
 def compute_tr_cds_relative_coordinates( self ):
     
     Logger.get_instance().info( 'Starting the computation of relative CDS transcript start and stop' +
                                 ' coordinates (registered in the Transcript table).')
     
      # Get all the transcripts for which CDS start and stop positions
      # are provided
      # NB: The query is performed using a raw SQL statement for better efficiency
     transcript_info_sql_statement = 'SELECT Transcript.id, Transcript.transcript_id AS tr_id, \
                                             Transcript.gene_id, PROGene.chromosome, \
                                             Transcript.cds_start_pos AS start_pos, \
                                             Transcript.cds_stop_pos AS end_pos \
                                      FROM Transcript \
                                      INNER JOIN PROGene ON PROGene.gene_id = Transcript.gene_id \
                                      WHERE ( Transcript.cds_start_pos IS NOT NULL ) \
                                            AND ( Transcript.cds_stop_pos IS NOT NULL )'
     if ( not self.force_overwrite ):
         transcript_info_sql_statement += ' AND ( ( Transcript.rel_cds_start_pos IS NULL ) \
                                                  OR ( Transcript.rel_cds_stop_pos IS NULL ) )'
                                             
     transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_tr_cds_relative_coordinates(): ' +
                                  str( transcript_info_df.shape[0] ) + ' Transcript entries are' +
                                  ' expected to be processed.')
             
      # As the conversion of coordinates in R may be highly time-consuming,
      # split the data frame into smaller data frames that can be processed
      # independently from each other and multi-process the computation
     subset_data_frames = [ transcript_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                            for min_bound in xrange( 0,
                                                     transcript_info_df.shape[ 0 ],
                                                     Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
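      # Illustrative example (hypothetical value): if MAX_ENTRIES_PER_DATAFRAME
      # were 1000 and the data frame contained 2500 rows, the subsets would
      # cover the row ranges [0:1000], [1000:2000] and [2000:2500].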
     
     
      # For each of the subset data frames, process it with R in order
     # to build a dataset containing the start and stop relative
     # coordinates.
     # Instantiate the list of tuple-embedded arguments necessary to
     # compute the relative coordinates
     args_to_run_r = []
     filename_prefix = self.TRANSCRIPT_CSV_FILE_PREFIX
     filename_suffix = 0
     for df in subset_data_frames:
         args_to_run_r.append( ( df,
                                 self.species, 
                                 self.ensembl_release_version, 
                                 filename_prefix,
                                 filename_suffix ) )
         filename_suffix += 1
         
     # Instantiate the pool of processes
     p = Pool( self.thread_nb )
     messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
     p.close()
     # Wait for all processes to be completed
     p.join()
     
     # Log the messages generated by the processes
     for messages in messages_to_log:
         
         ( debug_messages_to_log,
           stdout,
           stderr ) = messages
           
         for message in debug_messages_to_log:
             Logger.get_instance().debug( message )
         
         if ( stdout != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following standard output: \n' + 
                                          stdout )
         
          # NB: As the R function may write messages that are not error-related 
          #     to stderr, these messages are also logged at the 
          #     debug level
         if ( stderr != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following error output: \n' + 
                                          stderr )
     
     # Sequentially open CSV files to get the relative positions
      # Instantiate a dictionary that associates each Transcript ID with
      # the relative CDS start and stop positions
     rel_positions_dict = {}
     for file_nb in range( filename_suffix ):
         
         df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                         filename_prefix + str( file_nb ) + '.csv' ),
                           sep = ',',
                           encoding = 'utf-8' )
         
         for ( index, row ) in df.iterrows():
             rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
     
     
      # Add the relative CDS start and stop positions to all the Transcript entries concerned
      all_transcripts = SQLManagerPRO.get_instance().get_session().query( Transcript ).filter( 
                            Transcript.id.in_( rel_positions_dict.keys() ) ).all()
                                                                                 
     for transcript in all_transcripts:
         
         # Get the start and stop positions
         positions = rel_positions_dict.get( transcript.id )
         rel_cds_start_pos = positions[ 0 ] 
         rel_cds_stop_pos = positions[ 1 ] 
         
         if not pd.isna( rel_cds_start_pos ):
             transcript.rel_cds_start_pos = int( rel_cds_start_pos )
         
         if not pd.isna( rel_cds_stop_pos ):
             transcript.rel_cds_stop_pos = int( rel_cds_stop_pos )
     
     # Commit the updates and close the session
     SQLManagerPRO.get_instance().commit()
     SQLManagerPRO.get_instance().close_session()
     
     # Delete the pool instance
     p.clear()
 def compute_ota_relative_coordinates( self ):
     
     Logger.get_instance().info( 'Starting the computation of relative ORF start and stop coordinates' +
                                 ' (registered in the ORFTranscriptAsso table).')
     
     # Get information related to the ORF
     # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
     # - Its unique ID in the database
     # - The ID of its ORF-related entry, as well as the chromosome, 
     #   start and stop positions of the ORF
     # NB: Query is performed using raw SQL statement for better efficiency
     orf_info_sql_statement = 'SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.orf_id,\
                                      ORF.chromosome, ORF.start_pos, ORF.stop_pos AS end_pos \
                               FROM ORF \
                               INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.orf_id = ORF.id'
     if ( not self.force_overwrite ):
         orf_info_sql_statement += ' WHERE ( ORFTranscriptAsso.rel_start_pos IS NULL ) \
                                           OR ( ORFTranscriptAsso.rel_stop_pos IS NULL)'
     orf_info_df = pd.read_sql( orf_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     
     # Get information related to the transcript
     # Query the database in order to get, for each unique entry of the ORFTranscriptAsso table:
     # - Its unique ID in the database
     # - The ID of its Transcript-related entry
     # NB: All "UNKNOWN_TRANSCRIPT" entries are excluded as an official ID is needed to perform
     #     the conversion.
     # NB: Query is performed using raw SQL statement for better efficiency
     transcript_info_sql_statement = "SELECT ORFTranscriptAsso.id, ORFTranscriptAsso.transcript_id, \
                                             Transcript.transcript_id AS tr_id \
                                      FROM Transcript \
                                      INNER JOIN ORFTranscriptAsso ON ORFTranscriptAsso.transcript_id = Transcript.id \
                                      WHERE Transcript.transcript_id != '" + Constants.UNKNOWN_TRANSCRIPT + "'"        
     transcript_info_df = pd.read_sql( transcript_info_sql_statement, SQLManagerPRO.get_instance().get_engine() )
     SQLManagerPRO.get_instance().close_session()
     
     
     # Merge information from the two data frames in order to get
     # a data frame with the following columns:
     # - id: The ORFTranscriptAsso unique ID
     # - orf_id: The ORF unique ID
     # - chromosome: The ORF chromosome name
     # - start_pos: The ORF start position
     # - end_pos: The ORF stop position
     # - transcript_id: The Transcript unique ID
     # - tr_id: The transcript official ID (e.g. Ensembl ID)
     ota_info_df = orf_info_df.merge( transcript_info_df, 
                                      on='id', 
                                      how = 'inner', 
                                      validate = 'one_to_one' )
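      # NB: validate = 'one_to_one' makes pandas raise a MergeError
      #     if the 'id' keys are not unique on both sides of the merge.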
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_ota_relative_coordinates(): ' +
                                  str( ota_info_df.shape[0] ) + ' ORFTranscriptAsso entries are' +
                                  ' expected to be processed.')
     
      # As the conversion of coordinates in R may be highly time-consuming,
      # split the data frame into smaller data frames that can be processed
      # independently from each other and multi-process the computation
     subset_data_frames = [ ota_info_df[ min_bound : min_bound + Constants.MAX_ENTRIES_PER_DATAFRAME ] \
                            for min_bound in xrange( 0, 
                                                     ota_info_df.shape[ 0 ], 
                                                     Constants.MAX_ENTRIES_PER_DATAFRAME ) ]
     
      # For each of the subset data frames, process it with R in order
     # to build a dataset containing the start and stop relative
     # coordinates.
     # Instantiate the list of tuple-embedded arguments necessary to
     # compute the relative coordinates
     args_to_run_r = []
     filename_prefix = self.OTA_CSV_FILE_PREFIX
     filename_suffix = 0
     for df in subset_data_frames:
         args_to_run_r.append( ( df,
                                 self.species, 
                                 self.ensembl_release_version, 
                                 filename_prefix,
                                 filename_suffix ) )
         filename_suffix += 1
             
     # Instantiate the pool of processes
     p = Pool( self.thread_nb )
     messages_to_log = p.map( self.compute_relative_coord_r, args_to_run_r )
     p.close()
     # Wait for all processes to be completed
     p.join()
     
     # Log the messages generated by the processes
     for messages in messages_to_log:
         
         ( debug_messages_to_log,
           stdout,
           stderr ) = messages
           
         for message in debug_messages_to_log:
             Logger.get_instance().debug( message )
         
         if ( stdout != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following standard output: \n' + 
                                          stdout )
         
          # NB: As the R function may write messages that are not error-related 
          #     to stderr, these messages are also logged at the 
          #     debug level
         if ( stderr != '' ):
             Logger.get_instance().debug( 'ComputeRelCoordStrategy.compute_relative_coord_r():' +
                                          ' The R script returned the following error output: \n' + 
                                          stderr )
     
     # Sequentially open CSV files to get the relative positions
      # Instantiate a dictionary that associates each ORFTranscriptAsso ID with
      # the relative start and stop positions of the ORF
     rel_positions_dict = {}
     for file_nb in range( filename_suffix ):
         
         df = pd.read_csv( os.path.join( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER,
                                         filename_prefix + str( file_nb ) + '.csv' ),
                           sep = ',',
                           encoding = 'utf-8' )
         
         for ( index, row ) in df.iterrows():
             rel_positions_dict[ row[ 'id' ] ] = ( row[ 'rel_start_pos' ], row[ 'rel_end_pos' ] )
     
     
      # Add the relative start and stop positions to all the ORFTranscriptAsso entries concerned
      all_ota = SQLManagerPRO.get_instance().get_session().query( ORFTranscriptAsso ).filter( 
                    ORFTranscriptAsso.id.in_( rel_positions_dict.keys() ) ).all()
     for ota in all_ota:
         
         # Get the start and stop positions
         positions = rel_positions_dict.get( ota.id )
         rel_start_pos = positions[ 0 ] 
         rel_stop_pos = positions[ 1 ] 
         
         if not pd.isna( rel_start_pos ):
             ota.rel_start_pos = int( rel_start_pos )
         
         if not pd.isna( rel_stop_pos ):
             ota.rel_stop_pos = int( rel_stop_pos )
     
     # Commit the updates and close the session
     SQLManagerPRO.get_instance().commit()
     SQLManagerPRO.get_instance().close_session()
     
     # Delete the pool instance
     p.clear()
 def execute( self ):
     
      Logger.get_instance().info( 'IMPORTANT: This strategy has been built to convert' +
                                  ' exclusively coordinates related to Transcript entries that' +
                                  ' have an Ensembl transcript ID as "transcript_id" attribute.' +
                                  ' Hence, if the database contains IDs related to another database,' +
                                  ' then the source code of this strategy has to be modified in order to' +
                                  ' convert these IDs into Ensembl IDs.' )
     
     # Run DatabaseCheck in order to check PRO database is reachable and use
     # the appropriate models prior to the merging of data.
      Logger.get_instance().info( 'Checking the PRO database prior to computing missing information...' )
     try:
         DatabaseCheckStrategy().execute()
     except Exception as e:
          raise DenCellORFException( 'An error occurred whilst checking the database prior to' +
                                     ' computing missing information.' +
                                    '\n Error code: ' + LogCodes.ERR_DBCHECK + '.', e )
     
     # Get the name of the species used in the database
     self.species = DataManager.get_instance().get_data( Constants.SPECIES_SHORT )
     
     # Get the Ensembl release version used in the database
      prometadata_ensembl_release = SQLManagerPRO.get_instance().get_session().query( PROMetadata ).filter( 
                                        PROMetadata.parameter == Constants.METATABLE_CURRENT_ENSEMBL_RELEASE ).one()
     self.ensembl_release_version = prometadata_ensembl_release.value
     
     
     # Check there is at least one ORFTranscriptAsso entry in the database prior 
     # to try to convert the absolute coordinates into relative coordinates.
      # NB: The presence of entries in the Transcript table will obviously be
      #     implicitly checked at the same time.
     orftranscriptasso_count = SQLManagerPRO.get_instance().get_session().query( ORFTranscriptAsso ).count()
     if ( orftranscriptasso_count == 0 ):
          raise DenCellORFException( 'There is no entry in the ORFTranscriptAsso table of the ' + 
                                    SQLManagerPRO.get_instance().db_name + ' database (PRO database).' +
                                    ' Hence, the conversion of absolute coordinates into relative' +
                                    ' coordinates will be stopped.' )
     SQLManagerPRO.get_instance().close_session()
     
     
      # Set the R_LIBS_USER environment variable to install new R 
      # packages in a folder where the user has write permission
     if ( not os.path.exists( Constants.CUSTOM_R_LIBRARY_FOLDER ) ):
         os.makedirs( Constants.CUSTOM_R_LIBRARY_FOLDER )
     os.environ['R_LIBS_USER'] = Constants.CUSTOM_R_LIBRARY_FOLDER
     
     
     # As the computation of relative coordinates is performed
      # using R scripts relying on the ensembldb package and
      # annotation packages, first make sure the appropriate 
      # annotation package is available. If not, build it.
     Logger.get_instance().debug( 'ComputeRelCoordStrategy.execute(): Preparing the R annotation' +
                                  ' package to perform the computation of relative coordinates' +
                                  ' (ensembl release: ' + str( self.ensembl_release_version ) + ')...' )
     self.prepare_r_annotation_package( species_short_name = self.species,
                                        species_full_name = Constants.SPECIES_CATALOG_FULL_NAMES_WITH_CAPS[ self.species ],
                                        species_common_name = Constants.SPECIES_CATALOG_COMMON_NAMES[ self.species ], 
                                        ensembl_release_version = self.ensembl_release_version )  
     
      # Create a new folder that will be used to store the temporary 
      # CSV files necessary for the computation of relative coordinates
     if ( not os.path.exists( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER ) ):
         os.makedirs( ComputeRelCoordStrategy.RELATIVE_COORD_CSV_FOLDER )
     
     
     # ================================================================================
     # INFORMATION ABOUT THE MULTI-PROCESSING
     #
      # In order to reduce the computation time as much as possible, the computation 
      # of relative coordinates is multi-processed (concurrent R script subprocesses
      # run in parallel).
      # 
      # Important information regarding the multi-processing:
      # - Multi-processing has been chosen instead of multi-threading, in particular 
      #   to side-step the GIL (Global Interpreter Lock).
      # - The processes use all available / provided CPUs to run.
      # - The pathos package has been chosen as it allows to serialize functions which 
      #   are not top-level, such as class static methods (contrary to the 
      #   multiprocessing built-in package for instance).
      # - The processes are run in pools, which is one of the most convenient means to 
      #   parallelize the execution of a function across multiple inputs. The Pool 
      #   map() method is used to do so.
      # - As accessing objects shared by the several processes (using locks and 
      #   semaphores for instance) can considerably slow down execution when the
      #   processes regularly need to access these variables, it has been decided not
      #   to access any shared resource. As a consequence, the progression bar is not 
      #   displayed on screen for this step.
      # - In order to use the Pool map() method efficiently, the arguments needed by the
      #   forked function are embedded into fixed-size tuples.
     #
     # ================================================================================
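      # Minimal illustrative sketch of the pattern used in
      # compute_ota_relative_coordinates() and compute_tr_cds_relative_coordinates(),
      # assuming (as in this module) that Pool is the pathos process pool and that
      # the worker is a static method taking a single tuple of arguments:
      #
      #     args_to_run_r = [ ( df, species, ensembl_release, prefix, suffix )
      #                       for ( suffix, df ) in enumerate( subset_data_frames ) ]
      #     p = Pool( thread_nb )
      #     results = p.map( worker, args_to_run_r )   # one result per argument tuple
      #     p.close()
      #     p.join()     # wait for all the processes to complete
      #     p.clear()    # drop the cached pool instance (pathos-specific)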
     
      # Compute the relative start and stop coordinates 
      # in the ORFTranscriptAsso table
     self.compute_ota_relative_coordinates()
     
     
      # Compute the relative CDS start and stop coordinates 
      # in the Transcript table
     self.compute_tr_cds_relative_coordinates()
    def execute(self):

        # Create a session to the "PRO-like" database
        SQLManagerPRO.get_instance().set_db_settings(self.db_settings)

        try:
            SQLManagerPRO.get_instance().get_session()
        except Exception as e:
            raise DenCellORFException(
                'GenerateFastaFileStrategy.execute(): An error occurred trying to'
                + ' create a session to the database.' + '\n Error code: ' +
                LogCodes.ERR_SQL_SESSION + '.', e)
        SQLManagerPRO.get_instance().close_session()

        Logger.get_instance().info('Starting to build the FASTA file.')
        Logger.get_instance().info(
            'The fasta file will be created by querying the ' + self.table_type +
            ' table and using the ' + self.seq_type + ' sequences.')

        # Create the output folder if it does not yet exist
        # (and its parent folders if necessary )
        if (not os.path.isdir(self.output_folder)):
            os.makedirs(self.output_folder)

        file_path = os.path.join(self.output_folder,
                                 self.filename) + self.FASTA_FILE_EXTENSION

        # Get the name of the species
        sp = SQLManagerPRO.get_instance().get_session().query(
            PROSpeciesCatalog).one().name
        SQLManagerPRO.get_instance().close_session()

        # Get the information related to the species
        # NB: This information will be used in the headers
        taxon_sc_name = Constants.SPECIES_CATALOG_FULL_NAMES_WITH_CAPS[sp]
        taxon_code = Constants.SPECIES_CATALOG_CODE[sp]
        taxon_id = Constants.SPECIES_CATALOG_TAXON_ID[sp]

        # Get the version number of the database
        db_release = SQLManagerPRO.get_instance().get_session().query(
            PROMetadata.value).filter(PROMetadata.parameter == Constants.
                                      METATABLE_DATABASE_VERSION_NUMBER).all()
        db_release = GeneralUtil.query_result_to_list(db_release)
        if (len(db_release) == 1):
            db_release = db_release[0]
        else:
            db_release = ''

        # Create the FASTA file
        # ---------------------
        # Get the information necessary to compute the file content
        if (self.table_type == self.ORF_TABLE):
            # Get the necessary information from the ORF table
            all_orfs_query = SQLManagerPRO.get_instance().get_session().query(
                ORF.id, ORF.chromosome, ORF.strand, ORF.start_pos,
                ORF.stop_pos, ORF.spliced_parts_count,
                eval('ORF.' + self.seq_attribute_name)).filter(
                    eval('ORF.' + self.seq_attribute_name) != None)

        else:
            # Get the necessary information from the ORFTranscriptAsso table
            all_orfs_query = SQLManagerPRO.get_instance().get_session().query(
                ORFTranscriptAsso.id, ORFTranscriptAsso.orf_id,
                ORFTranscriptAsso.transcript_id,
                ORFTranscriptAsso.rel_start_pos,
                ORFTranscriptAsso.rel_stop_pos,
                eval('ORFTranscriptAsso.' + self.seq_attribute_name)).filter(
                    eval('ORFTranscriptAsso.' +
                         self.seq_attribute_name) != None)
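        # NB: In SQLAlchemy, the "!= None" comparison on a column is rendered
        #     as an "IS NOT NULL" clause in the generated SQL.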

        # Run the query and get the results as a Pandas data frame
        all_orfs_df = pd.read_sql(all_orfs_query.statement,
                                  SQLManagerPRO.get_instance().get_engine())
        SQLManagerPRO.get_instance().close_session()

        # Check the query returned a result
        total_sqce_count = all_orfs_df.shape[0]
        if (total_sqce_count == 0):
            raise DenCellORFException(
                'It seems that the database you are querying does not contain any'
                + ' entry with a sequence (' + self.seq_type + ') in its ' +
                self.table_type +
                ' table. Hence, the generation of the fasta file' +
                ' has been stopped.')

        all_orfs_df = all_orfs_df.astype(str)

        # If the excludeSqcesWithStop option has been selected,
        # then exclude from the data frame all the sequences
        # that contain at least one stop codon
        if self.exclude_sqce_with_stops:
            contains_stop_codon = all_orfs_df.apply(
                self.check_stop_codons_in_sqce,
                seq_type=self.seq_type,
                seq_attribute_name=self.seq_attribute_name,
                axis=1).to_frame()
            contains_stop_codon = contains_stop_codon.rename(
                columns={0: 'contains_stop_codon'})
            all_orfs_df = pd.concat([all_orfs_df, contains_stop_codon], axis=1)

            # Extract from the data frame the ORFs for which the sequence
            # does not contain stop codons
            all_orfs_df = all_orfs_df[all_orfs_df.contains_stop_codon == False]

            Logger.get_instance().info(
                str(total_sqce_count - all_orfs_df.shape[0]) +
                ' sequences (/' + str(total_sqce_count) +
                ') have been removed as they contained stop codons.')

        # For each row, build a string that will be used
        # as header line in the FASTA file
        header = all_orfs_df.apply(self.generate_header,
                                   axis=1,
                                   taxon_sc_name=taxon_sc_name,
                                   taxon_code=taxon_code,
                                   taxon_id=str(taxon_id),
                                   table=self.table_type,
                                   db_release=db_release,
                                   long_header=self.long_header).to_frame()
        header = header.rename(columns={0: 'header'})
        all_orfs_df = pd.concat([all_orfs_df, header], axis=1)

        # Write the FASTA file one line at a time
        with open(file_path, 'w') as fasta_file:

            for (index, row) in all_orfs_df.iterrows():

                # Write the header line
                fasta_file.write('>' + row['header'] + '\n')

                # Write the sequence line(s)
                # Split the sequence if it has to be written on several lines
                full_seq = row[self.seq_attribute_name]
                seq = '\n'.join([
                    full_seq[k:k + self.MAX_SEQ_LINE_LENGTH]
                    for k in range(0, len(full_seq), self.MAX_SEQ_LINE_LENGTH)
                ])
                # Write the sequence line(s)
                fasta_file.write(seq + '\n')
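
                # For illustration only (the exact header content is produced by
                # self.generate_header()), each record written above has the form:
                #     >header_string
                #     ATGGCC...   (sequence wrapped every MAX_SEQ_LINE_LENGTH characters)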

        Logger.get_instance().info('The fasta file has been created at ' +
                                   file_path + '.')
    def execute(self):

        # Set the connection to the database
        self.get_sqlmanager_instance().set_db_settings(self.db_settings)
        try:
            self.get_sqlmanager_instance().get_instance().get_session()
            self.get_sqlmanager_instance().get_instance().close_session()
        except DenCellORFException as e:
            raise DenCellORFException(
                'RestoreStrategy.execute(): An error occurred while trying to'
                + ' create a session to the database.' + '\n Error code: ' +
                LogCodes.ERR_SQL_SESSION + '.', e)

        # Check if the database already exists.
        # If it exists, then ask the user to confirm the deletion of the database.
        if ((not self.force_overwrite)
                and (self.get_sqlmanager_instance().db_exists())):

            confirm_deletion = None
            Logger.get_instance().info(
                'A database already exists at the provided connection settings.'
                +
                ' Hence, any existing data needs to be removed prior to the insertion'
                + ' of the data to restore.')
            while (confirm_deletion not in ['Y', 'N']):
                print(
                    'Do you want to confirm the deletion of the database? (Y/N)'
                )
                confirm_deletion = raw_input().upper()

            if (confirm_deletion == 'N'):
                Logger.get_instance().critical(
                    'As a database already exists at the provided connection' +
                    ' settings and as the deletion of existing data has been' +
                    ' canceled by the user, the program will be stopped.' +
                    ' Please see the documentation for more information.')

        # (Re-)create the empty database
        self.get_sqlmanager_instance().build_database(
            db_settings=self.db_settings,
            species=None,
            sp_mandatory=False,
            force_overwrite=True)

        # Get the appropriate order in which the tables need to be filled in
        order_of_insertion = eval('self.' + self.db_model +
                                  '_ORDER_OF_INSERTION')

        # For each table of the list, get the corresponding file,
        # upload the content and insert the data in the database
        for tablename in order_of_insertion:

            Logger.get_instance().debug(
                'Starting to load and insert the data saved from the table ' +
                tablename + '.')

            # Get the name of the file (without its extension)
            if self.file_prefix:
                filename = self.file_prefix + tablename
            else:
                filename = tablename

            # Get the content of the file
            try:
                objects_to_insert = FileHandlerUtil.get_obj_from_file(
                    input_folder=self.input_folder, filename=filename)
            except Exception as e:
                raise DenCellORFException(
                    'An error occurred trying to import the objects to insert into the '
                    + tablename + ' table.', e)

            Logger.get_instance().debug(
                str(len(objects_to_insert)) + ' entries are expected' +
                ' to be inserted into the ' + tablename + ' table.')

            # Insert the data
            # NB: Using the add_all() method of the session does not work (probably because
            #     the objects saved in the file were mapped to the session). Hence, it is
            #     necessary to add the objects one at a time using the merge method.
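            #     (For reference: in SQLAlchemy, merge() is the standard way to
            #     re-attach detached instances, as it copies the state of each
            #     object onto an instance owned by the current session.)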

            # Get the total number of elements expected to be processed and
            # reset the ProgressionBar instance to follow the progression
            ProgressionBar.get_instance().reset_instance(
                total=len(objects_to_insert))

            for entry in objects_to_insert:

                # Update and display the progression bar on the console
                ProgressionBar.get_instance().increase_and_display()

                try:
                    self.get_sqlmanager_instance().get_session().merge(entry)
                except Exception as e:
                    raise DenCellORFException(
                        'An error occurred trying to insert the data into the '
                        + tablename +
                        ' table. Please make sure the backup occurred' +
                        ' successfully.', e)

            # Commit the session
            try:
                self.get_sqlmanager_instance().commit()
            except Exception as e:
                raise DenCellORFException(
                    'An error occurred trying to commit changes after insertion'
                    + ' of data in the ' + tablename + ' table.' +
                    '\n Error code: ' + LogCodes.ERR_SQL_SESSION + '.')

            entry_count = self.get_sqlmanager_instance().get_session().query(
                eval(tablename)).count()
            Logger.get_instance().debug(
                str(entry_count) + ' entries have been successfully added' +
                ' to the ' + tablename + ' table.')
            self.get_sqlmanager_instance().close_session()

        # Log the end of the restoration
        Logger.get_instance().info('Restoration of the database has finished.')