def __prepare_local_ensembl_repository(self):
    """
    This helper makes sure that the local folder structure for receiving
    Ensembl data is available.
    :return: no return value
    """
    self._get_logger().debug(
        "Preparing local Ensembl repository, root folder - '{}'".format(
            self.get_local_path_root_ensembl_repo()))
    general.check_create_folders([self.get_local_path_root_ensembl_repo()])
    self._get_logger().debug(
        "Local path for Ensembl Release - '{}'".format(
            self.get_local_path_ensembl_release()))
    if self._get_configuration_manager().is_rewrite_local_path_ensembl_repo():
        self._get_logger().debug(
            "Creating folder in 'OVERWRITE' mode - '{}'".format(
                self.get_local_path_ensembl_release()))
        general.check_create_folders_overwrite(
            [self.get_local_path_ensembl_release()])
    else:
        self._get_logger().debug(
            "Creating folder if it doesn't exist - '{}'".format(
                self.get_local_path_ensembl_release()))
        general.check_create_folders(
            [self.get_local_path_ensembl_release()])
def _prepare_trackhub_destination_folder(self, trackhub_exporter):
    if not self.__trackhub_destination_folder:
        # Check if anything was specified
        if self._get_configuration_manager().get_folder_pride_cluster_trackhubs():
            # The destination folder will be a subfolder there
            self.__trackhub_destination_folder = os.path.join(
                self._get_configuration_manager().get_folder_pride_cluster_trackhubs(),
                self._get_configuration_manager().get_cluster_file_exporter_version_parameter())
        else:
            # Fall back to the exporter's default trackhub destination folder
            self.__trackhub_destination_folder = os.path.join(
                trackhub_exporter.track_hub_destination_folder,
                self._get_configuration_manager().get_cluster_file_exporter_version_parameter())
        # Make sure the destination folder is there
        general_toolbox.check_create_folders([self.__trackhub_destination_folder])
        general_toolbox.create_latest_symlink_overwrite(self.__trackhub_destination_folder)
        # Set the destination folder for the trackhub exporter
        trackhub_exporter.track_hub_destination_folder = self.__trackhub_destination_folder
        self._get_logger().info(
            "PRIDE Cluster trackhub destination folder at '{}'".format(
                self.__trackhub_destination_folder))
    return self.__trackhub_destination_folder
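# The 'latest' symlink logic above lives in the project's own general_toolbox
# module. As a purely hypothetical sketch (assumed behavior, not the actual
# implementation), such a helper could look like this:
def _sketch_create_latest_symlink_overwrite(destination_folder):
    """Point a sibling 'latest' symlink at destination_folder, replacing any previous one."""
    import os
    symlink_path = os.path.join(os.path.dirname(destination_folder), 'latest')
    # Remove a pre-existing symlink first so os.symlink does not fail
    if os.path.islink(symlink_path):
        os.remove(symlink_path)
    os.symlink(destination_folder, symlink_path)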
def __init__(self, configuration_object, configuration_file):
    super(AppConfigManager, self).__init__(configuration_object, configuration_file)
    global _log_level
    global _logger_formatters
    # Session ID
    hpc_service = None
    try:
        hpc_service = HpcServiceFactory.get_hpc_service()
    except HpcServiceFactoryException:
        pass
    lsf_jobid = ''
    if hpc_service:
        try:
            lsf_jobid = "-{}-".format(hpc_service.get_current_job_id())
        except HpcServiceException:
            lsf_jobid = "-NO_JOB_ID-"
    self.__session_id = time.strftime('%Y.%m.%d_%H.%M') \
        + lsf_jobid + "-" \
        + str(uuid.uuid4()) \
        + "-" \
        + get_pipeline_name()
    # TODO config, folder_run, etc.
    self.__session_working_dir = os.path.abspath(
        os.path.join(self.get_folder_run(), self.get_session_id()))
    # TODO check and create folders (if needed)
    folders_to_check = [
        self.get_folder_bin(),
        self.get_folder_logs(),
        self.get_folder_resources(),
        self.get_folder_run(),
        self.get_session_working_dir(),
    ]
    general.check_create_folders(folders_to_check)
    # Prepare the logging subsystem
    if "loglevel" in configuration_object["logger"]:
        _log_level = configuration_object["logger"]["loglevel"]
    if "formatters" in configuration_object["logger"]:
        _logger_formatters = configuration_object["logger"]["formatters"]
    self.__log_handlers = []
    log_handlers_prefix = self.get_session_id() + '-'
    log_handlers_extension = '.log'
    self.__logger = logging.getLogger(__name__)
    self.__logger.setLevel(getattr(logging, _log_level))
    # TODO fix this code
    self.__log_files = []
    for llevel, lformat in _logger_formatters.items():
        logfile = os.path.join(
            self.get_folder_logs(),
            log_handlers_prefix + llevel.lower() + log_handlers_extension)
        lformatter = logging.Formatter(lformat)
        lhandler = logging.FileHandler(logfile, mode='w')
        lhandler.setLevel(getattr(logging, llevel))
        lhandler.setFormatter(lformatter)
        self.__log_handlers.append(lhandler)
        # Add the handler to my own logger
        self.__logger.addHandler(lhandler)
        # Keep the path to the log file
        self.__log_files.append(logfile)
    self._get_logger().debug("Logging system initialized")
def __init__(self, configuration_object, configuration_file):
    super().__init__(configuration_object, configuration_file)
    global _log_level
    global _logger_formatters
    # Session ID
    self.__session_id = time.strftime('%Y.%m.%d_%H.%M') \
        + "-{}".format(str(uuid.uuid4())) \
        + "-session"
    # TODO config, folder_run, etc.
    self.__session_working_dir = os.path.abspath(
        os.path.join(self.get_folder_run(), self.get_session_id()))
    # TODO check and create folders (if needed)
    folders_to_check = [
        self.get_folder_bin(),
        self.get_folder_logs(),
        self.get_folder_resources(),
        self.get_folder_run(),
        self.get_session_working_dir(),
    ]
    general.check_create_folders(folders_to_check)
    # Prepare the logging subsystem
    if "loglevel" in configuration_object["logger"]:
        _log_level = configuration_object["logger"]["loglevel"]
    if "formatters" in configuration_object["logger"]:
        _logger_formatters = configuration_object["logger"]["formatters"]
    self.__log_handlers = []
    log_handlers_prefix = self.get_session_id() + '-'
    log_handlers_extension = '.log'
    self._logger = logging.getLogger("{}.{}".format(
        __name__, type(self).__name__))
    self._logger.setLevel(getattr(logging, _log_level))
    # TODO fix this code
    # for llevel, lformat in _logger_formatters.items():
    #     logfile = os.path.join(
    #         self.get_folder_logs(),
    #         log_handlers_prefix + llevel.lower() + log_handlers_extension)
    #     lformatter = logging.Formatter(lformat)
    #     lhandler = logging.FileHandler(logfile, mode='w')
    #     lhandler.setLevel(getattr(logging, llevel))
    #     lhandler.setFormatter(lformatter)
    #     self.__log_handlers.append(lhandler)
    #     # Add the handler to my own logger
    #     self._logger.addHandler(lhandler)
    if is_show_logs_on_console():
        lhandler = logging.StreamHandler(stream=sys.stdout)
        lhandler.setLevel(getattr(logging, _console_log_level))
        lformatter = logging.Formatter(
            _logger_formatters[_console_log_level])
        lhandler.setFormatter(lformatter)
        self.__log_handlers.append(lhandler)
        # Add the handler to my own logger
        self._logger.addHandler(lhandler)
    self._logger.debug("Logging system initialized")
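# A minimal sketch of the "logger" section this constructor expects in the
# configuration object. The exact layout is an assumption; only the "loglevel"
# and "formatters" keys are actually read by the code above, with "formatters"
# mapping a log level name to a logging.Formatter format string:
#
#     configuration_object = {
#         "logger": {
#             "loglevel": "DEBUG",
#             "formatters": {
#                 "DEBUG": "%(asctime)s [%(levelname)7s][%(name)s] %(message)s",
#                 "INFO": "%(asctime)s [%(levelname)7s] %(message)s",
#             },
#         },
#     }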
def get_cluster_file_exporter_destination_folder(self):
    """
    Get the destination folder for the cluster file exporter result files.
    It will typically be a subfolder of the current running session's working
    directory. This is computed here just in case I want to make it either a
    configuration parameter or a command line argument in the near future.
    :return: destination folder for PRIDE cluster-file-exporter result files
    """
    destination_folder = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        self._CONFIG_CLUSTER_FILE_EXPORTER_WORKING_SUBDIR)
    # Make sure the folder is there
    general_toolbox.check_create_folders([destination_folder])
    return destination_folder
def get_browser_instance():
    logger = config_manager.get_app_config_manager().get_logger_for(
        "{}.{}".format(__name__, "get_browser_instance"))
    folder_prefix = os.path.join(
        config_manager.get_app_config_manager().get_session_working_dir(),
        "browser_profile_no")
    profile_folder = "{}{}".format(folder_prefix, uuid.uuid4())
    general_toolbox.check_create_folders([profile_folder])
    logger.debug("Creating Browser instance, profile folder at '{}'".format(
        profile_folder))
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("user-data-dir={}".format(profile_folder))
    browser = webdriver.Chrome(
        executable_path=config_manager.get_app_config_manager().get_path_chrome_driver(),
        chrome_options=chrome_options)
    browser.implicitly_wait(3)
    return browser
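# A minimal usage sketch (an assumption, not part of the original module):
# callers own the returned browser, so pairing get_browser_instance() with
# browser.quit() in a try/finally avoids leaking headless Chrome processes.
def _example_fetch_page_source(url):
    """Hypothetical helper showing the intended acquire/use/quit lifecycle."""
    browser = get_browser_instance()
    try:
        browser.get(url)
        return browser.page_source
    finally:
        browser.quit()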
def __prepare_local_ensembl_repository(self):
    self._get_logger().debug(
        "Preparing local Ensembl repository, root folder - '{}'".format(
            self.get_local_path_root_ensembl_repo()))
    general.check_create_folders([self.get_local_path_root_ensembl_repo()])
    self._get_logger().debug(
        "Local path for Ensembl Release - '{}'".format(
            self.get_local_path_ensembl_release()))
    if self._get_configuration_manager().is_rewrite_local_path_ensembl_repo():
        self._get_logger().debug(
            "Creating folder in 'OVERWRITE' mode - '{}'".format(
                self.get_local_path_ensembl_release()))
        general.check_create_folders_overwrite(
            [self.get_local_path_ensembl_release()])
    else:
        self._get_logger().debug(
            "Creating folder if it doesn't exist - '{}'".format(
                self.get_local_path_ensembl_release()))
        general.check_create_folders(
            [self.get_local_path_ensembl_release()])
    general.create_latest_symlink_overwrite(
        self.get_local_path_ensembl_release())
def export_simple_trackhub(self, trackhub_builder):
    file_trackhub_descriptor = os.path.join(
        self.track_hub_destination_folder, 'hub.txt')
    self.export_summary.track_hub_root_folder = self.track_hub_destination_folder
    self.export_summary.track_hub_descriptor_file_path = file_trackhub_descriptor
    # TODO - Tell clients when you're not exporting anything
    if os.path.isfile(file_trackhub_descriptor):
        error_message = "Trackhub Export to '{}' ABORTED, there already is a trackhub there" \
            .format(self.track_hub_destination_folder)
        self.logger.warning(error_message)
        self.export_summary.errors.append(error_message)
    else:
        self.logger.info("Export Simple TrackHub to '{}'".format(
            self.track_hub_destination_folder))
        # Check / Create destination folder
        general.check_create_folders([self.track_hub_destination_folder])
        # Create hub.txt file
        with open(file_trackhub_descriptor, 'w') as wf:
            wf.write("{}\n".format(str(trackhub_builder.track_hub)))
        self.logger.info("TrackHub descriptor file at '{}'".format(
            file_trackhub_descriptor))
        # Per assembly
        # TODO - I should also have an assembly collector and refactor TrackHubGenomeAssembly accordingly, but I'm
        # TODO - cutting some corners here to get the first iteration up and running as soon as possible. Supporting
        # TODO - more complex genomes.txt files is not as critical as getting the 'tracks' the right way
        assembly_mapping = {}
        for assembly in trackhub_builder.assemblies:
            tracks_with_non_empty_bed_files = \
                self.__get_tracks_with_non_empty_bed_files(
                    assembly,
                    trackhub_builder.assemblies[assembly].track_collector)
            if not tracks_with_non_empty_bed_files:
                self.logger.warning(
                    "Assembly '{}' contains ALL EMPTY BIG DATA FILE TRACKS -- SKIPPED --"
                    .format(assembly))
                continue
            assembly_folder = os.path.join(
                self.track_hub_destination_folder, assembly)
            # Create the folder for the assembly
            general.check_create_folders([assembly_folder])
            self.logger.info(
                "For Assembly '{}', trackhub folder created at '{}'".format(
                    assembly, assembly_folder))
            # Per track in its track collector, we'll process only those tracks with non-empty big data files
            for track in tracks_with_non_empty_bed_files:
                # Copy track file to assembly folder
                # TODO - source of this
                big_data_file_name = os.path.basename(track.get_big_data_url())
                destination_file_path = os.path.join(
                    assembly_folder, big_data_file_name)
                shutil.copy(track.get_big_data_url(), destination_file_path)
                # Modify the track (irreversible) to point to the big data file
                # relative to the trackDb.txt file path
                new_big_data_url = big_data_file_name
                track.set_big_data_url(new_big_data_url)
                self.logger.info(
                    "Assembly '{}' ---> Data for track '{}' prepared, track information updated"
                    .format(assembly, track.get_track()))
            # Export trackDb.txt with the current set of 'valid' tracks
            trackdb_file_path = os.path.join(assembly_folder, 'trackDb.txt')
            track_collector_exporter = TrackCollectorFileExporter(trackdb_file_path)
            track_collector_exporter.export_from_track_collection(
                tracks_with_non_empty_bed_files)
            # Add assembly entry to the genomes.txt file within the trackhub root folder
            assembly_mapping[assembly] = os.path.join(
                os.path.basename(os.path.dirname(trackdb_file_path)),
                os.path.basename(trackdb_file_path))
        self.logger.info("Assembly data collected and exported to its corresponding subfolders")
        # Export data to genomes.txt file
        genomes_file_path = os.path.join(
            self.track_hub_destination_folder, 'genomes.txt')
        with open(genomes_file_path, 'w') as wf:
            for assembly in assembly_mapping:
                wf.write("genome {}\n"
                         "trackDb {}\n"
                         .format(assembly, assembly_mapping[assembly]))
        self.logger.info("Genomes file with per-assembly data exported to '{}'".format(
            genomes_file_path))
        # Prepare summary object
        self.export_summary.export_performed = True
    self.logger.info("Trackhub export summary prepared")
    return self.export_summary
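# For reference, the genomes.txt written above pairs each assembly with its
# trackDb.txt path relative to the hub root folder. Assuming an assembly
# folder named 'GRCh38' (a hypothetical example), the resulting stanza would
# look like:
#
#     genome GRCh38
#     trackDb GRCh38/trackDb.txt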
def get_genome_reference_for_species(self, taxonomy_id):
    """
    This method makes sure the GTF files are available locally before
    returning the list that contains the file names and their local paths.
    :param taxonomy_id: taxonomy ID for which we want the GTF files
    :return: the list of GTF file names with their local paths, or None in
    case the taxonomy has not been found on Ensembl
    """
    # Work out the file names for the data to retrieve from Ensembl
    file_names = self._get_genome_reference_ensembl_file_name_for_species(taxonomy_id)
    if not file_names:
        return None
    self._get_logger().debug(
        "Working with Ensembl GTF file names for taxonomy ID '{}' - '{}'".format(
            taxonomy_id, str(file_names)))
    # Work out their path in the local repository
    gtf_files_local_path = self._get_genome_reference_file_path_local(
        taxonomy_id, file_names)
    self._get_logger().debug(
        "Local Ensembl Repo GTF paths for taxonomy ID '{}', file paths '{}'".format(
            taxonomy_id, str(gtf_files_local_path)))
    # Check if they already exist locally
    missing_files = [
        (missing_file_name, missing_file_path)
        for missing_file_name, missing_file_path in gtf_files_local_path
        if not os.path.exists(missing_file_path)
    ]
    if missing_files:
        # If not, work out their remote path on the Ensembl FTP
        self._get_logger().debug(
            "There are {} GTF files missing from the local repository for taxonomy ID '{}': {}".format(
                len(missing_files),
                taxonomy_id,
                "[{}]".format(",".join(
                    ["'{} -> {}'".format(missing_file_name, missing_file_path)
                     for missing_file_name, missing_file_path in missing_files]))))
        # Retrieve the files
        download_information = self._get_genome_reference_file_path_remote(
            [file_entry[0] for file_entry in missing_files], taxonomy_id)
        destination_folder = self._get_genome_reference_file_destination_path_local(taxonomy_id)
        # Make sure that the destination folder exists
        general.check_create_folders([destination_folder])
        download_urls = [url for file_name, url in download_information]
        self._get_logger().info(
            "GTF files to download to '{}': '{}'".format(
                destination_folder, ",".join(download_urls)))
        download_manager = DownloadManager(download_urls, destination_folder, self._get_logger())
        download_manager.start_downloads()
        download_manager.wait_all()
        if not download_manager.is_success():
            self._get_logger().error("ERROR Downloading files from Ensembl !!!")
            # TODO - Should I raise an exception here? See how the code goes and take a decision later
        # Once the files have been downloaded, we know they come compressed from Ensembl, with a .gz
        # extension, so uncompress them. Their local paths are in the second component of the pairs in
        # 'missing_files'; I just need to add the '.gz' extension, as they come gzipped from Ensembl
        errors = general.gunzip_files([
            "{}.gz".format(file_local_path)
            for file_name, file_local_path in missing_files
        ])
        # Deal with possible errors
        if errors:
            msg = "An ERROR occurred while obtaining the following GTF files for taxonomy ID '{}' -> '{}'" \
                .format(taxonomy_id,
                        "\n".join(["File '{}', ERROR '{}'".format(file, error)
                                   for file, error in errors]))
            self._get_logger().error(msg)
            # I just found out that Ensembl does not have files for all the taxonomies using all the suffixes in a
            # uniform way; thus, if some of the files were not found, I WILL NOT raise an exception, I will do the
            # "Windows" here by keeping it quiet ^_^
            # raise EnsemblDownloadManagerException(msg)
    # Return the .gtf file names and their local paths for the given NCBI taxonomy ID. Return only those
    # files that were successfully downloaded.
    # TODO - I need to review this return value in the future
    return [(file_name, file_path)
            for file_name, file_path in gtf_files_local_path
            if os.path.isfile(file_path)]
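# A minimal usage sketch (an assumption, not part of the original module):
# the method returns (file_name, local_path) pairs, so a caller can iterate
# the GTF files directly. Taxonomy ID 9606 (human) and the 'data_service'
# instance are hypothetical placeholders.
#
#     gtf_files = data_service.get_genome_reference_for_species(9606)
#     if gtf_files is None:
#         print("Taxonomy not found on Ensembl")
#     else:
#         for file_name, file_path in gtf_files:
#             print("GTF '{}' available at '{}'".format(file_name, file_path))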
def export_simple_trackhub(self, trackhub_builder):
    """
    When exporting a simple trackhub from a (simple) trackhub builder, those
    tracks with empty .bed files will be skipped.
    :param trackhub_builder: a TrackhubBuilder that holds all the trackhub parts together
    :return: a report of the export process as a TrackHubExportSummary
    """
    file_trackhub_descriptor = os.path.join(
        self.track_hub_destination_folder, 'hub.txt')
    self.export_summary.track_hub_root_folder = self.track_hub_destination_folder
    self.export_summary.track_hub_descriptor_file_path = file_trackhub_descriptor
    # TODO - Tell clients when you're not exporting anything
    if os.path.isfile(file_trackhub_descriptor):
        error_message = "Trackhub Export to '{}' ABORTED, there already is a trackhub there" \
            .format(self.track_hub_destination_folder)
        self.logger.warning(error_message)
        self.export_summary.errors.append(error_message)
    else:
        self.logger.info("Export Simple TrackHub to '{}'".format(
            self.track_hub_destination_folder))
        # Check / Create destination folder
        general.check_create_folders([self.track_hub_destination_folder])
        # Create hub.txt file
        with open(file_trackhub_descriptor, 'w') as wf:
            wf.write("{}\n".format(str(trackhub_builder.track_hub)))
        self.logger.info("TrackHub descriptor file at '{}'".format(
            file_trackhub_descriptor))
        # Per assembly
        # TODO - I should also have an assembly collector and refactor TrackHubGenomeAssembly accordingly, but I'm
        # TODO - cutting some corners here to get the first iteration up and running as soon as possible. Supporting
        # TODO - more complex genomes.txt files is not as critical as getting the 'tracks' the right way
        assembly_mapping = {}
        assembly_mapping_service = AssemblyMappingServiceFactory.get_assembly_mapping_service()
        ensembl_species_service = ensembl.service.get_service().get_species_data_service()
        # Iterate over a copy, as assemblies may be invalidated while looping
        for assembly in dict(trackhub_builder.assemblies):
            try:
                ucsc_assembly = assembly_mapping_service \
                    .get_ucsc_assembly_for_ensembl_assembly_accession(
                        ensembl_species_service
                        .get_species_entry_for_assembly(assembly)
                        .get_assembly_accession())
            except AssemblyMappingServiceException as e:
                message = "ERROR while mapping Ensembl Assembly '{}' - SKIPPING THIS ASSEMBLY - xxx> '{}'" \
                    .format(assembly, e.value)
                self.export_summary.warnings.append(message)
                self.logger.error(message)
                trackhub_builder.invalidate_assembly(assembly)
                continue
            self.logger.info(
                "Ensembl Assembly '{}' --- mapped_to ---> UCSC Assembly '{}'".format(
                    assembly, ucsc_assembly))
            tracks_with_non_empty_bed_files = \
                self.__get_tracks_with_non_empty_bed_files(
                    assembly,
                    trackhub_builder.assemblies[assembly].track_collector)
            if not tracks_with_non_empty_bed_files:
                message = "Assembly '{} ({})' contains ALL EMPTY BIG DATA FILE TRACKS -- SKIPPED --" \
                    .format(assembly, ucsc_assembly)
                self.export_summary.warnings.append(message)
                self.logger.warning(message)
                trackhub_builder.invalidate_assembly(assembly)
                continue
            assembly_folder = os.path.join(
                self.track_hub_destination_folder, ucsc_assembly)
            # Create the folder for the assembly
            general.check_create_folders([assembly_folder])
            self.logger.info(
                "For Assembly '{} ({})', trackhub folder created at '{}'".format(
                    assembly, ucsc_assembly, assembly_folder))
            # Per track in its track collector, we'll process only those tracks with non-empty big data files.
            # The following map will contain the tracks that are ready for being added to the collector. If a track
            # has no converter associated, then it can be added with no problem, but if it has a converter, we need
            # to wait for it to finish the conversion process before we can add the track to the collector
            # TODO - Apparently, there's usually just a couple of tracks per assembly (without PTMs and with PTMs),
            # TODO - this part can be parallelized even further by making a map <assembly, <track, converter>> that
            # TODO - will contain all the processed tracks for all the assemblies, so all the conversions happen in
            # TODO - parallel. Then, iterating over this would produce the final genomes.txt file
            track_converter_map = {}
            for track in tracks_with_non_empty_bed_files:
                # Copy track file to assembly folder
                # TODO - source of this
                # Instead of copying the file, if it is a BED file, perform conversion -
                # Get the original big data url
                big_data_file_name = os.path.basename(track.get_big_data_url())
                # Default destination for the big data file is just copying it
                destination_file_path = os.path.join(assembly_folder, big_data_file_name)
                # If the track type is BED, work out the destination file as bigBed and do not copy the data, convert it
                converter = None
                if (track.get_type() == BaseTrack.TRACK_TYPE_BED) \
                        and track.taxonomy_id:
                    destination_file_path = os.path.join(
                        assembly_folder,
                        "{}.bb".format(big_data_file_name[:big_data_file_name.rfind(".")]))
                    # The new name for the big data file
                    big_data_file_name = os.path.basename(destination_file_path)
                    # Convert the file
                    converter = DataFormatConverterFactory.get_bed_to_bigbed_converter(
                        track.taxonomy_id,
                        track.get_big_data_url(),
                        destination_file_path)
                    # We start the converter
                    converter.start()
                else:
                    shutil.copy(track.get_big_data_url(), destination_file_path)
                # Update the big data url with either the copied file or the newly built .bb (bigBed) file.
                # Modify the track (irreversible) to point to the big data file relative to the trackDb.txt
                # file path
                new_big_data_url = big_data_file_name
                track.set_big_data_url(new_big_data_url)
                self.logger.info(
                    "Assembly '{} ({})' ---> Data for track '{}' prepared, track information updated".format(
                        assembly, ucsc_assembly, track.get_track()))
                # Apparently, 'blank spaces' are not allowed in the track names (UCSC)
                track.set_track(track.get_track().replace(' ', '_'))
                # Add the track to the map
                track_converter_map[track] = converter
            # Export trackDb.txt with the current set of 'valid' tracks
            trackdb_file_path = os.path.join(assembly_folder, 'trackDb.txt')
            track_collector_exporter = TrackCollectorFileExporter(trackdb_file_path)
            # Add successful tracks
            successful_tracks = []
            for track, converter in track_converter_map.items():
                if converter:
                    converter.wait()
                    if not converter.is_conversion_ok():
                        message = "SKIP TRACK for Assembly '{} ({})' " \
                                  "---> Track '{}' Big Data File FAILED conversion process " \
                                  "- STDOUT '{}', STDERR '{}'".format(
                                      assembly, ucsc_assembly, track.get_track(),
                                      converter.get_conversion_output(),
                                      converter.get_conversion_output_error())
                        self.export_summary.warnings.append(message)
                        self.logger.error(message)
                        # Skip this track
                        continue
                # Add the track to the successful tracks list
                successful_tracks.append(track)
            track_collector_exporter.export_from_track_collection(successful_tracks)
            # Add assembly entry to the genomes.txt file within the trackhub root folder
            assembly_mapping[ucsc_assembly] = os.path.join(
                os.path.basename(os.path.dirname(trackdb_file_path)),
                os.path.basename(trackdb_file_path))
        if not assembly_mapping:
            message = "ALL Assemblies in this project are INVALID"
            self.export_summary.errors.append(message)
            self.logger.error(message)
        self.logger.info("Assembly data collected and exported to its corresponding subfolders")
        # Export data to genomes.txt file
        genomes_file_path = os.path.join(self.track_hub_destination_folder, 'genomes.txt')
        with open(genomes_file_path, 'w') as wf:
            for assembly in assembly_mapping:
                wf.write("genome {}\n"
                         "trackDb {}\n\n".format(assembly, assembly_mapping[assembly]))
        self.logger.info("Genomes file with per-assembly data exported to '{}'".format(
            genomes_file_path))
        # Prepare summary object
        self.export_summary.export_performed = True
    self.logger.info("Trackhub export summary prepared")
    return self.export_summary
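# A hypothetical driver sketch (the 'exporter' and 'builder' instances are
# assumptions): the returned TrackHubExportSummary carries the outcome, so a
# caller would typically inspect it rather than rely on exceptions.
#
#     summary = exporter.export_simple_trackhub(builder)
#     if not summary.export_performed:
#         print("Export aborted: {}".format("; ".join(summary.errors)))
#     for warning in summary.warnings:
#         print("WARNING: {}".format(warning))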