def _before(self):
    # Set the pipeline session working directory
    self.__pipeline_result_object.file_path_pipeline_session = \
        config_manager.get_app_config_manager().get_session_working_dir()
    # Add this pipeline session's log files to the final report
    self.__pipeline_result_object.add_log_files(
        config_manager.get_app_config_manager().get_session_log_files())
    # TODO Check that the trackhub URL is valid
    return True

def modules_bootstrap():
    ensembl_config_file = config_manager.get_app_config_manager() \
        .get_file_name_config_modules_ensembl_service()
    __logger.debug("Setting Ensembl configuration file -- {}".format(ensembl_config_file))
    ensembl.service.set_configuration_file(ensembl_config_file)
    # TODO - Should I delegate this to a main entry point for every module?
    # TODO - REFACTOR THIS IN THE FUTURE, WHEN MODULE FUNCTIONALITY HAS BEEN TESTED
    __logger.debug(
        "Setting Ensembl Data Downloader configuration file -- {}".format(
            config_manager.get_app_config_manager()
            .get_file_name_config_modules_ensembl_data_downloader()))
    ensembl.data_downloader.set_configuration_file(
        config_manager.get_app_config_manager()
        .get_file_name_config_modules_ensembl_data_downloader())

def __init__(self, configuration_object, configuration_file, pipeline_arguments):
    super(DirectorConfigurationManager, self).__init__(configuration_object,
                                                       configuration_file)
    self.__pipeline_arguments = pipeline_arguments
    self.__pipeline_arguments_object = None
    # Get a logger the Pythonic way
    self._logger = config_manager.get_app_config_manager().get_logger_for(
        "{}.{}".format(__name__, type(self).__name__))

class TestCommandLineRunner(unittest.TestCase):
    __logger = config_manager.get_app_config_manager().get_logger_for(__name__)

    def test_success_on_running_simple_command_without_timeout(self):
        command = "echo Successful_run"
        runner = CommandLineRunnerFactory.get_command_line_runner()
        runner.command = command
        runner.start()
        runner.wait()
        self.assertTrue(runner.command_success, "Command finishes with success")
        self.__logger.debug(
            "Command '{}', STDOUT - '{}', STDERR - '{}'".format(
                command,
                runner.get_stdout().decode('utf8'),
                runner.get_stderr().decode('utf8')))

    def test_simple_commands_with_parallel_runner_manager(self):
        commands = ["echo Successful_run-{:03}".format(i) for i in range(16)]
        parallel_runner_manager = ParallelRunnerManagerFactory.get_parallel_runner_manager()
        for command in commands:
            runner = CommandLineRunnerFactory.get_command_line_runner()
            runner.command = command
            parallel_runner_manager.add_runner(runner)
        parallel_runner_manager.start_runners()
        parallel_runner_manager.wait_all()
        for runner in parallel_runner_manager.get_finished_runners():
            self.assertTrue(runner.is_done(), "Runner is Done")
            self.assertTrue(runner.command_success, "Run command was successful")

def __init__(self, configuration_object, configuration_file):
    super(ConfigurationManager, self).__init__(configuration_object,
                                               configuration_file)
    self.__logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    # Local Ensembl repo parent folder name
    self.__local_folder_ensembl_repo = 'ensembl'

def __runmode_test_run_cluster_file_exporter(self):
    """
    This is a helper method I'm using while building this pipeline. It is not yet clear whether the
    pipeline will keep a "testing / development mode" where its most expensive parts are dummied
    out, so this code may not stay, and I'm not spending much time on fitting it into the software
    in a more sensible way.
    :return: True on success preparing the dummy data, False otherwise
    """
    cluster_file_exporter_destination_folder = self \
        ._get_configuration_manager() \
        .get_cluster_file_exporter_destination_folder()
    rsync_source_folder = os.path.join(
        config_manager.get_app_config_manager().get_folder_resources(),
        "tests", "cluster-file-exporter")
    # Rsync the dummy data into the destination folder
    rsync_command = "rsync -vah --progress --stats {}/ {}/" \
        .format(rsync_source_folder, cluster_file_exporter_destination_folder)
    rsync_subprocess = subprocess.Popen(rsync_command, shell=True)
    try:
        # TODO - WARNING - OMG! Magic number there!
        stdout, stderr = rsync_subprocess.communicate(timeout=600)
    except subprocess.TimeoutExpired:
        self._get_logger().error(
            "TIMEOUT error while rsyncing dummy cluster-file-exporter data, KILLING subprocess")
        rsync_subprocess.kill()
        # Reap the killed subprocess; note that communicate() returns the (stdout, stderr) pair,
        # while wait() only returns the exit code
        rsync_subprocess.communicate()
        return False
    return True

def __init__(self, username, password):
    self.logger = config_manager.get_app_config_manager().get_logger_for(
        "{}.{}".format(__name__, type(self).__name__))
    self.username = username
    self.password = password
    self.trackhub_registry_base_url = 'https://www.trackhubregistry.org'
    self.__auth_token = None

def test_gunzip_files(self):
    file_url = 'ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.abinitio.gtf.gz'
    file_name = file_url[file_url.rfind('/') + 1:]
    file_name_uncompressed = file_name[:file_name.rfind('.')]
    # Download the file to the session working directory
    destination_folder = config_manager.get_app_config_manager().get_session_working_dir()
    destination_file_path = os.path.join(destination_folder, file_name)
    destination_file_path_uncompressed = os.path.join(destination_folder,
                                                      file_name_uncompressed)
    self.__logger.info(
        "Using test file '{}', from '{}', for testing gunzip functionality at folder '{}'"
        .format(file_name, file_url, destination_folder))
    download_manager = DownloadManager([file_url], destination_folder, self.__logger)
    download_manager.start_downloads()
    download_manager.wait_all()
    self.assertTrue(download_manager.is_success(),
                    "Test files for gunzip unit test downloaded successfully")
    errors = general_toolbox.gunzip_files([destination_file_path])
    self.assertTrue(not errors,
                    "No errors uncompressing test files for unit testing gunzip feature")
    self.assertTrue(os.path.isfile(destination_file_path_uncompressed),
                    "The test file has been uncompressed, '{}'".format(
                        destination_file_path_uncompressed))
    self.assertTrue(os.path.getsize(destination_file_path_uncompressed) > 0,
                    "The uncompressed test file '{}' is not empty".format(
                        destination_file_path_uncompressed))

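# A minimal sketch (an assumption for illustration, not the project's actual implementation) of a
# gunzip helper with the contract exercised by the test above: it takes a list of '.gz' file paths,
# writes each uncompressed file alongside its source, and returns a list of error messages that is
# empty on success.
import gzip
import shutil

def gunzip_files_sketch(file_paths):
    errors = []
    for file_path in file_paths:
        # Strip the trailing '.gz' extension to get the destination path
        destination_path = file_path[:file_path.rfind('.')]
        try:
            with gzip.open(file_path, 'rb') as compressed_file, \
                    open(destination_path, 'wb') as uncompressed_file:
                shutil.copyfileobj(compressed_file, uncompressed_file)
        except OSError as e:
            errors.append("Error uncompressing '{}': {}".format(file_path, e))
    return errors
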
def __init__(self):
    super().__init__()
    # The default destination folder for exporting the trackhub is located within the current
    # session working directory
    self.track_hub_destination_folder = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        'track_hub')

def __init__(self, species_data):
    self.__logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    # Store the original species data, then offer two different indexed views on it
    self.__ensembl_species_data_raw = species_data
    self.__ensembl_species_data_dao = None
    self.__index_by_taxonomy_id = None
    self.__index_by_assembly = None

def get_file_path_binary_bed_to_bigbed_conversion_tool(self):
    """
    Get the absolute path to the binary tool that converts files from 'bed' to 'bigBed' format
    :return: absolute path to the 'bed to bigBed' conversion tool
    """
    return os.path.join(
        config_manager.get_app_config_manager().get_folder_bin(),
        self._CONFIG_UCSC_TOOLSUITE_SUBFOLDER_NAME,
        self._CONFIG_UCSC_TOOLSUITE_BEDTOBIGBED_BINARY_FILE_NAME)

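# A hedged usage sketch (not part of the original class) of how the path above could drive a
# conversion. The UCSC tool's command line is 'bedToBigBed in.bed chrom.sizes out.bb'; the method
# name, its file path parameters, and the reuse of this project's CommandLineRunnerFactory are
# assumptions for illustration only.
def _example_convert_bed_to_bigbed(self, bed_file_path, chrom_sizes_file_path, bigbed_file_path):
    conversion_command = "{} {} {} {}".format(
        self.get_file_path_binary_bed_to_bigbed_conversion_tool(),
        bed_file_path,
        chrom_sizes_file_path,
        bigbed_file_path)
    # Run the conversion synchronously, as other parts of this codebase do with command runners
    runner = CommandLineRunnerFactory.get_command_line_runner()
    runner.command = conversion_command
    runner.start()
    runner.wait()
    return runner.command_success
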
def get_pogo_binary_file_path(self):
    """
    Again, at this iteration of the software lifecycle, this duplicates the method from the
    application-wide configuration manager, but it belongs here, and this extra level of
    abstraction will help us in the process of refactoring the parameter.
    :return: absolute file path to the PoGo binary
    """
    return config_manager.get_app_config_manager().get_pogo_binary_file_path()

def __init__(self):
    super().__init__()
    # The default destination folder for exporting the trackhub is located within the current
    # session working directory
    self.track_hub_destination_folder = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        'track_hub')
    # By default, we're working with an empty export summary
    self.export_summary = TrackHubExportSummary()

def get_local_path_folder_ensembl_repo(self):
    """
    Get the absolute path to the local folder where we are going to store all the data from the
    different Ensembl releases
    :return: absolute path of the local repository for Ensembl release data
    """
    return os.path.abspath(
        os.path.join(
            config_manager.get_app_config_manager().get_folder_resources(),
            self.__local_folder_ensembl_repo))

def get_pogo_run_timeout(self):
    """
    At this stage and iteration of the application's development, this method duplicates the one
    in the application-wide configuration manager, with the idea of refactoring it out of that
    manager in the future: even if it becomes configurable later on, this parameter falls within
    the responsibility boundaries of the 'pogo' module. At this iteration of the software
    lifecycle it just returns a default value, but it can be exposed as a configurable parameter
    of this software in the future.
    :return: configured timeout (in seconds) for running PoGo
    """
    return config_manager.get_app_config_manager().get_pogo_run_timeout()

def __init__(self):
    self.logger = config_manager.get_app_config_manager().get_logger_for(
        "{}.{}".format(__name__, type(self).__name__))
    # hub.txt URL
    self.url = None
    self.assembly_accession_map = {}
    # Trackhubs are public by default
    self.public = '1'
    # The default type for trackhubs is PROTEOMICS
    self.type = 'PROTEOMICS'

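# A hedged sketch (not in the original module) of how the fields above might map onto a Track Hub
# Registry registration request. It assumes the registry's HTTP API shape: a Basic-Auth login that
# returns an 'auth_token', and a POST carrying 'url', 'type', 'public' and the assembly accession
# map. Treat the endpoint paths and payload keys here as assumptions, to be checked against the
# registry's own API documentation.
import requests

def _example_register_trackhub(registry):
    # Log in to obtain an authentication token (assumed endpoint)
    login_response = requests.get(
        "{}/api/login".format(registry.trackhub_registry_base_url),
        auth=(registry.username, registry.password))
    login_response.raise_for_status()
    auth_token = login_response.json()['auth_token']
    # Register the hub.txt URL together with its metadata (assumed endpoint and payload keys)
    registration_response = requests.post(
        "{}/api/trackhub".format(registry.trackhub_registry_base_url),
        headers={'user': registry.username, 'auth_token': auth_token},
        json={'url': registry.url,
              'type': registry.type,
              'public': registry.public,
              'assemblies': registry.assembly_accession_map})
    registration_response.raise_for_status()
    return registration_response.json()
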
def get_local_path_root_ensembl_repo(self):
    if self.__local_path_ensembl_repo is None:
        # Intermediate variables for improved readability and easier code maintenance later
        resources_folder_path = os.path.abspath(
            config_manager.get_app_config_manager().get_folder_resources())
        root_folder_ensembl_repo = self._get_configuration_manager() \
            .get_local_path_folder_ensembl_repo()
        self.__local_path_ensembl_repo = os.path.join(resources_folder_path,
                                                      root_folder_ensembl_repo)
    return self.__local_path_ensembl_repo

def __init__(self, configuration_object, configuration_file):
    self.__logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    self._get_logger().debug("Using configuration file '{}'".format(configuration_file))
    self.__config_manager = ConfigurationManager(configuration_object, configuration_file)
    # Ensembl Release Number
    self.__release_number = None
    # Ensembl Species Data
    self.__species_data_service = None

def app_bootstrap():
    global __run_test_mode
    global __logger
    global __args
    __args = get_cmdl()
    # Initialize the configuration module
    if __args.config_file:
        config_manager.set_application_config_file(__args.config_file)
    else:
        config_manager.set_application_config_file(__DEFAULT_CONFIG_FILE)
    if __args.testmode:
        __run_test_mode = True
    # Request the main logger
    __logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    if __run_test_mode:
        __logger.info("Session '{}' STARTED, RUNNING UNIT TESTS".format(
            config_manager.get_app_config_manager().get_session_id()))
    else:
        __logger.info("Session '{}' STARTED".format(
            config_manager.get_app_config_manager().get_session_id()))

def get_browser_instance():
    logger = config_manager.get_app_config_manager().get_logger_for(
        "{}.{}".format(__name__, "get_browser_instance"))
    folder_prefix = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        "browser_profile_no")
    profile_folder = "{}{}".format(folder_prefix, uuid.uuid4())
    general_toolbox.check_create_folders([profile_folder])
    logger.debug("Creating Browser instance, profile folder at '{}'".format(profile_folder))
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("user-data-dir={}".format(profile_folder))
    browser = webdriver.Chrome(
        executable_path=config_manager.get_app_config_manager().get_path_chrome_driver(),
        chrome_options=chrome_options)
    browser.implicitly_wait(3)
    return browser

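# A minimal usage sketch (not in the original module). Each call to get_browser_instance() starts
# a fresh headless Chrome with its own profile folder, so callers should quit the browser when
# done to avoid leaking Chrome processes; the helper name and URL parameter are illustrative.
def _example_scrape_page_title(url):
    browser = get_browser_instance()
    try:
        browser.get(url)
        return browser.title
    finally:
        # Always shut the browser down, even if page loading fails
        browser.quit()
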
def app_bootstrap():
    global __run_test_mode
    global __logger
    global __args
    __args = get_cmdl()
    if __args.config_file:
        config_manager.set_application_config_file(__args.config_file)
    else:
        config_manager.set_application_config_file(__DEFAULT_CONFIG_FILE)
    if __args.pipeline_name:
        config_manager.set_pipeline_name(__args.pipeline_name)
        if __args.pipeline_name == 'test':
            __run_test_mode = True
    __logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    if __run_test_mode:
        __logger.info("Session '{}' STARTED, RUNNING UNIT TESTS".format(
            config_manager.get_app_config_manager().get_session_id()))
    else:
        __logger.info("Session '{}' STARTED, pipeline '{}'".format(
            config_manager.get_app_config_manager().get_session_id(),
            __args.pipeline_name))

def get_cluster_file_exporter_jar_path(self):
    """
    Get the path to the cluster-file-exporter jar file for running the software.
    This is computed here just in case I want to make it either a configuration parameter or a
    command line argument in the near future.
    :return: cluster-file-exporter jar file path
    """
    return os.path.join(
        config_manager.get_app_config_manager().get_folder_bin(),
        self._CONFIG_CLUSTER_FILE_EXPORTER_BIN_SUBFOLDER,
        self._CONFIG_CLUSTER_FILE_EXPORTER_JAR_FILE_NAME)

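# A hedged sketch (not in the original class) of how the jar path above might be turned into a
# runnable command. Only 'java -jar' is taken from the JVM's own command line; cluster-file-exporter
# specific options are deliberately left out, as they are not documented in this code, and the
# method name is an assumption for illustration.
def _example_build_cluster_file_exporter_command(self):
    return "java -jar {}".format(self.get_cluster_file_exporter_jar_path())
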
def _before(self):
    # Set the pipeline session working directory
    self.__pipeline_result_object.file_path_pipeline_session = \
        config_manager.get_app_config_manager().get_session_working_dir()
    # Add this pipeline session's log files to the final report
    self.__pipeline_result_object.add_log_files(
        config_manager.get_app_config_manager().get_session_log_files())
    # Add information about the Ensembl release being used
    self.__pipeline_result_object.ensembl_release = str(
        ensembl.service.get_service().get_release_number())
    if self.__config_manager.get_project_data_file_path():
        self._get_logger().info(
            "Reading Project Trackhub Descriptor from file at '{}'".format(
                self.__config_manager.get_project_data_file_path()))
        self.__project_trackhub_descriptor = \
            ProjectTrackhubDescriptor(self.__config_manager.get_project_data_file_path())
        # Check that the destination folder exists
        if not os.path.isdir(self.__project_trackhub_descriptor.get_trackhub_destination_path()):
            error_message = "Trackhub destination path NOT VALID, '{}'" \
                .format(self.__project_trackhub_descriptor.get_trackhub_destination_path())
            self._get_logger().error(error_message)
            self.__pipeline_result_object.add_error_message(error_message)
            self.set_pipeline_status_fail()
            return False
        # Check for valid project tracks
        if not self.__get_valid_project_tracks():
            # It makes no sense to go ahead if this project has no valid tracks
            error_message = "Project Trackhub contains NO VALID TRACKS"
            self._get_logger().error(error_message)
            self.__pipeline_result_object.add_error_message(error_message)
            self.set_pipeline_status_fail()
            return False
        return True
    error_message = "INVALID / MISSING Project Trackhub Descriptor file, '{}'" \
        .format(self.__config_manager.get_project_data_file_path())
    self._get_logger().error(error_message)
    self.__pipeline_result_object.add_error_message(error_message)
    self.set_pipeline_status_fail()
    return False

def get_local_path_root_ensembl_repo(self):
    """
    Get the local root folder where all Ensembl data releases are going to be made locally
    available
    :return: the local folder that will contain all Ensembl release data, e.g. .../resources/ensembl
    """
    if self.__local_path_ensembl_repo is None:
        # Intermediate variables for improved readability and easier code maintenance later
        resources_folder_path = os.path.abspath(
            config_manager.get_app_config_manager().get_folder_resources())
        root_folder_ensembl_repo = self._get_configuration_manager() \
            .get_local_path_folder_ensembl_repo()
        self.__local_path_ensembl_repo = os.path.join(resources_folder_path,
                                                      root_folder_ensembl_repo)
    return self.__local_path_ensembl_repo

def test_success_on_sample_files_download(self):
    urls = ['http://ipv4.download.thinkbroadband.com/5MB.zip',
            'http://ipv4.download.thinkbroadband.com/10MB.zip',
            'http://ipv4.download.thinkbroadband.com/20MB.zip',
            'http://ipv4.download.thinkbroadband.com/50MB.zip']
    destination_folder = config_manager.get_app_config_manager().get_session_working_dir()
    # Log the test environment
    self.__logger.info("Sample file URLs to download: {}".format(",".join(urls)))
    self.__logger.info("Destination folder for the downloads, '{}'".format(destination_folder))
    # Get the download manager and start the downloads
    download_manager = DownloadManager(urls, destination_folder, self.__logger)
    download_manager.start_downloads()
    download_manager.wait_all()
    self.assertTrue(download_manager.is_success(), "Files downloaded successfully")

def get_cluster_file_exporter_destination_folder(self):
    """
    Get the destination folder for the cluster-file-exporter result files; it will typically be a
    subfolder of the current running session working directory.
    This is computed here just in case I want to make it either a configuration parameter or a
    command line argument in the near future.
    :return: destination folder for the PRIDE cluster-file-exporter result files
    """
    destination_folder = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        self._CONFIG_CLUSTER_FILE_EXPORTER_WORKING_SUBDIR)
    # Make sure the folder is there
    general_toolbox.check_create_folders([destination_folder])
    return destination_folder

def __init__(self, pogo_runner):
    """
    Just the constructor. I had this implemented with more syntactic sugar, but I was wrong, so
    now it simply wraps the given PoGo runner.
    :param pogo_runner: the PoGo runner whose run results this object models
    """
    # Logging
    self.__logger = main_app_config_manager.get_app_config_manager() \
        .get_logger_for("{}.{}".format(__name__, type(self).__name__))
    # Map<pogo_result_file_extension, pogo_result_file_path>
    self.__pogo_result_file_paths = {}
    self.pogo_runner = pogo_runner

class TestEnsemblDataDownloader(unittest.TestCase):
    __logger = config_manager.get_app_config_manager().get_logger_for(__name__)

    def test_get_protein_sequences_for_human(self):
        human_ncbi_tax_id = '9606'
        ensembl_downloader_service = ensembl.data_downloader.get_data_download_service()
        ensembl_downloader_service.get_protein_sequences_for_species(human_ncbi_tax_id)

    def test_get_gtf_for_human(self):
        human_ncbi_tax_id = '9606'
        ensembl_downloader_service = ensembl.data_downloader.get_data_download_service()
        ensembl_downloader_service.get_genome_reference_for_species(human_ncbi_tax_id)

def __init__(self, configuration_object, configuration_file):
    self.__logger = config_manager.get_app_config_manager().get_logger_for(__name__)
    self._get_logger().debug("Using configuration file '{}'".format(configuration_file))
    self.__config_manager = ConfigurationManager(configuration_object, configuration_file)
    self.__local_path_ensembl_repo = None
    self.__local_path_ensembl_release = None
    self.__remote_path_ensembl_release = None
    # Name for the current release
    self.__ensembl_release_name = None
    # Name for the subfolder that contains per-species fasta files
    self.__folder_name_fasta = None
    # Name for the subfolder of the species folder that contains protein sequence files
    self.__folder_name_protein_sequences = None

def __sync_filesystem(self, trackhub_exporter):
    if self._get_configuration_manager().is_do_sync():
        # Sync script parameters
        sync_script_launcher = self._get_configuration_manager() \
            .get_path_script_filesystem_sync()
        app_root_dir = config_manager.get_app_config_manager().get_application_root_folder()
        source_trackhub_container_folder = os.path.dirname(
            trackhub_exporter.track_hub_destination_folder)
        source_trackhub_folder = trackhub_exporter.track_hub_destination_folder
        # Build the synchronization command
        sync_command = "{} {} {} {}".format(sync_script_launcher,
                                            app_root_dir,
                                            source_trackhub_container_folder,
                                            source_trackhub_folder)
        self._get_logger().info("Filesystem synchronization command '{}'".format(sync_command))
        sync_subprocess = subprocess.Popen(sync_command, shell=True)
        stdout = ''
        stderr = ''
        try:
            stdout, stderr = sync_subprocess.communicate(
                timeout=self._get_configuration_manager().get_filesystem_sync_run_timeout())
        except subprocess.TimeoutExpired as e:
            exception_message = "TIMEOUT ERROR while running Filesystem synchronization script '{}'," \
                                " Command: '{}'\n" \
                                "STDOUT: '{}'\n" \
                                "STDERR: '{}'" \
                .format(self._get_configuration_manager().get_path_script_filesystem_sync(),
                        sync_command, stdout, stderr)
            self._get_logger().error(exception_message)
            sync_subprocess.kill()
            stdout, stderr = sync_subprocess.communicate()
            raise pipeline_exceptions.PipelineDirectorException(exception_message) from e
        # communicate() has returned, so returncode is set; fail on any non-zero exit status
        if sync_subprocess.returncode != 0:
            error_msg = "ERROR while running Filesystem synchronization script '{}'," \
                        " Command: '{}'\n" \
                        "STDOUT: '{}'\n" \
                        "STDERR: '{}'" \
                .format(self._get_configuration_manager().get_path_script_filesystem_sync(),
                        sync_command, stdout, stderr)
            self._get_logger().error(error_msg)
            raise pipeline_exceptions.PipelineDirectorException(error_msg)