def run(self):
    Logger.info('Launching demultiplex pipeline "%s" with version "%s"' % (self.pipeline_name, self.version),
                task_name=self.task)
    pipeline = self.api.find_pipeline(self.pipeline_name)
    # Parameters passed to the child demultiplex run
    pipeline_params = {
        'MACHINE_RUN_FOLDER': {
            'value': os.environ['MACHINE_RUN_FOLDER'],
            'type': 'input'
        },
        'SAMPLE_SHEET': {
            'value': os.environ['SAMPLE_SHEET_ORIGINAL'],
            'type': 'input'
        },
        'ANALYSIS_FOLDER': {
            'value': os.environ['ANALYSIS_FOLDER'],
            'type': 'output'
        }
    }
    run = self.api.launch_pipeline(pipeline['id'], self.version, pipeline_params,
                                   instance=self.instance_type, disk=self.instance_disk,
                                   parent_run_id=os.environ['RUN_ID'])
    demultiplex_run_id = run['id']
    Logger.info('Launched demultiplex run %d.' % demultiplex_run_id, task_name=self.task)
    Logger.info('Waiting for run %d to complete.' % demultiplex_run_id, task_name=self.task)
    final_status = self.__wait_run_completion(demultiplex_run_id)
    if final_status != 'SUCCESS':
        Logger.fail('Demultiplex processing did not complete successfully. '
                    'Check run %d logs for more information.' % demultiplex_run_id,
                    task_name=self.task)
        sys.exit(1)
    Logger.success('Demultiplex processing completed successfully.', task_name=self.task)
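The `__wait_run_completion` helper referenced above is not shown here. Below is a minimal sketch of the polling pattern it likely implements, written against a generic `status_loader` callable (a hypothetical stand-in for whatever API call returns the child run's status) rather than the real Cloud Pipeline client:

import time

def wait_run_completion(run_id, status_loader, poll_interval=60):
    # 'status_loader' is a hypothetical callable returning the child run's current
    # status string, e.g. lambda rid: api.load_run(rid)['status']; the real helper
    # is private to the pipeline class and not shown in this snippet.
    status = status_loader(run_id)
    while status == 'RUNNING':
        time.sleep(poll_interval)
        status = status_loader(run_id)
    return status

The caller above then treats any terminal status other than 'SUCCESS' as a failure and exits with a non-zero code.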
def await_node_start(self, parameters, task_name, run_id):
    try:
        Logger.info('Waiting for node with parameters = {}, task: {}'.format(','.join(parameters), task_name),
                    task_name=self.task_name)
        # approximately 10 minutes: 60 attempts with a 10-second pause between them
        attempts = 60
        master = self.get_node_info(parameters, task_name, run_id)
        while not master and attempts > 0:
            master = self.get_node_info(parameters, task_name, run_id)
            attempts -= 1
            Logger.info('Waiting for node ...', task_name=self.task_name)
            time.sleep(10)
        if not master:
            raise RuntimeError('Failed to attach to master node')
        Logger.success('Attached to node (run id {})'.format(master.name), task_name=self.task_name)
        return master
    except Exception as e:
        self.fail_task(e.message)
def run(self, mount_root, tmp_dir):
    try:
        Logger.info('Starting mounting remote data storages.', task_name=self.task_name)
        Logger.info('Fetching list of allowed storages...', task_name=self.task_name)
        available_storages = self.api.load_available_storages()
        if not available_storages:
            Logger.success('No remote storages are available', task_name=self.task_name)
            return
        Logger.info('Found {} available storage(s). Checking mount options.'.format(len(available_storages)),
                    task_name=self.task_name)
        fuse_tmp = os.path.join(tmp_dir, "s3fuse")
        if not self.create_directory(fuse_tmp):
            fuse_tmp = '/tmp'
        fuse_available = self.check_or_install_fuse()
        aws_default_region = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')
        aws_region = os.getenv('AWS_REGION', aws_default_region)
        limited_storages = os.getenv('CP_CAP_LIMIT_MOUNTS')
        if limited_storages:
            try:
                limited_storages_list = [int(x.strip()) for x in limited_storages.split(',')]
                available_storages = [x for x in available_storages if x.id in limited_storages_list]
                Logger.info('Run is launched with mount limits ({}). Only {} storage(s) will be mounted.'.format(
                    limited_storages, len(available_storages)), task_name=self.task_name)
            except Exception as limited_storages_ex:
                Logger.warn('Unable to parse CP_CAP_LIMIT_MOUNTS value ({}) with error: {}.'.format(
                    limited_storages, str(limited_storages_ex.message)), task_name=self.task_name)
        nfs_count = len(filter((lambda ds: ds.storage_type == 'NFS' and ds.region_name == aws_region),
                               available_storages))
        nfs_available = nfs_count > 0 and self.check_or_install_nfs()
        if not fuse_available and not nfs_available:
            Logger.success('Mounting of remote storages is not available for this image',
                           task_name=self.task_name)
            return
        for storage in available_storages:
            if not PermissionHelper.is_storage_readable(storage):
                continue
            mounter = self.get_mount_manager(storage, nfs_available, fuse_available, fuse_tmp)
            if mounter is not None:
                self.mount(mounter, mount_root)
            elif storage.storage_type != NFS_TYPE and storage.storage_type != S3_TYPE:
                Logger.warn('Unsupported storage type {}.'.format(storage.storage_type),
                            task_name=self.task_name)
        Logger.success('Finished data storage mounting', task_name=self.task_name)
    except Exception as e:
        Logger.fail('Unhandled error during mount task: {}.'.format(str(e.message)), task_name=self.task_name)
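The `CP_CAP_LIMIT_MOUNTS` handling above can be exercised in isolation. A minimal sketch, assuming storages are simple objects with an `id` attribute (the real objects come from `self.api.load_available_storages()` and carry more fields):

import os
from collections import namedtuple

Storage = namedtuple('Storage', ['id', 'storage_type', 'region_name'])

def filter_by_mount_limits(available_storages):
    # Keep only the storages whose ids are listed in CP_CAP_LIMIT_MOUNTS
    # (a comma-separated list of storage ids); on a missing variable or a
    # parse error the full list is returned, mirroring the warn-and-continue
    # behaviour of the mount task above.
    limited_storages = os.getenv('CP_CAP_LIMIT_MOUNTS')
    if not limited_storages:
        return available_storages
    try:
        allowed_ids = [int(x.strip()) for x in limited_storages.split(',')]
        return [s for s in available_storages if s.id in allowed_ids]
    except Exception:
        return available_storages

storages = [Storage(1, 'S3', 'us-east-1'), Storage(2, 'NFS', 'us-east-1')]
print(filter_by_mount_limits(storages))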
def find_files(self, recursive=True):
    Logger.info("Starting parsing input directory: {}.".format(self.folder), task_name=self.TASK_NAME)
    all_files = bucket.ls_s3(self.folder, self.MAX_ATTEMPTS, recursive=recursive)
    patterns_files = {}
    for file in all_files:
        # the recursive version of s3 ls returns paths from the bucket root,
        # while the non-recursive one returns paths relative to the requested folder
        if recursive:
            file_name = file[len(self.get_path_without_bucket()) - 1:]
        else:
            file_name = file
        for pattern_name, glob in self.patterns.iteritems():
            Logger.info("Matching file {} against patterns {}.".format(file_name, str(glob)),
                        task_name=self.TASK_NAME)
            if self.match_patterns(file_name, glob):
                if pattern_name in self.exclude_patterns:
                    exclude = self.exclude_patterns[pattern_name]
                    if self.match_patterns(file_name, exclude):
                        Logger.info("Skipping filename '{}' since it matches exclude patterns '{}'."
                                    .format(file_name, str(exclude)), task_name=self.TASK_NAME)
                        continue
                if pattern_name not in patterns_files:
                    patterns_files[pattern_name] = []
                patterns_files[pattern_name].append(os.path.join(self.folder, file_name))
    if len(patterns_files) == 0:
        self.fail_task("Failed to find files matching any of the patterns.")
    samples_number = None
    for pattern, files in patterns_files.iteritems():
        current_length = len(files)
        if current_length == 0:
            self.fail_task("Failed to find files matching patterns: {}.".format(str(pattern)))
        if samples_number is None:
            samples_number = current_length
        elif samples_number != current_length:
            self.fail_task("Number of found files differs between patterns. Please check the input data.")
        # sort every pattern's file list so that files are paired by sample across patterns
        files.sort()
    Logger.info("Found files: {}".format(str(patterns_files)), task_name=self.TASK_NAME)
    result = [[] for x in xrange(samples_number)]
    for pattern, files in patterns_files.iteritems():
        index = 0
        for file in files:
            result[index].append(file)
            index = index + 1
    for file_set in result:
        Logger.info('Collected run parameters: {}.'.format(str(file_set)), task_name=self.TASK_NAME)
    Logger.success('Successfully collected batch files.', task_name=self.TASK_NAME)
    return result
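`match_patterns` is not shown in this snippet. A plausible implementation, assuming the configured patterns are shell-style globs (an assumption based on the `glob` variable name), is a thin wrapper over the standard `fnmatch` module:

import fnmatch

def match_patterns(file_name, patterns):
    # Return True if the file name matches at least one of the glob patterns,
    # e.g. match_patterns('sample_R1.fastq.gz', ['*_R1*.fastq.gz']) -> True
    return any(fnmatch.fnmatch(file_name, pattern) for pattern in patterns)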
metadata_columns = []
for column in metadata_column_names:
    column_name = column.strip()
    if len(column_name) > 0:
        metadata_columns.append(column_name)
        metadata_columns_values[column_name] = []

Logger.info('Input parameters checked', task_name=INPUT_CHECK_TASK_NAME)
Logger.info('Destination: {}'.format(destination), task_name=INPUT_CHECK_TASK_NAME)
Logger.info('Metadata ID: {}'.format(metadata_id), task_name=INPUT_CHECK_TASK_NAME)
Logger.info('Metadata Class: {}'.format(metadata_class), task_name=INPUT_CHECK_TASK_NAME)
Logger.info('Metadata columns: {}'.format(', '.join(metadata_columns)), task_name=INPUT_CHECK_TASK_NAME)
Logger.success("Done", task_name=INPUT_CHECK_TASK_NAME)

Logger.info('Extracting metadata values (#{}, {}) for columns {}...'.format(
    metadata_id, metadata_class, ', '.join(metadata_columns)), task_name=METADATA_TASK_NAME)
api = EntitiesAPI(api_path, api_token)
for el in api.load_all(metadata_id, metadata_class):
    if len(metadata_entities) > 0 and str(el.id) not in metadata_entities:
        continue
    if el.data is not None:
        for column in metadata_columns:
            if column in el.data and 'value' in el.data[column]:
                value = el.data[column]['value'].encode("utf-8")
                if not value.lower().startswith(
def success(message, crucial=True, *args, **kwargs):
    # Always write to the local log; duplicate to the Cloud Pipeline task log
    # only when not in cmd mode and the message is crucial or verbose logging is on.
    logging.info(message, *args, **kwargs)
    if not Logger.cmd and (crucial or Logger.verbose):
        CloudPipelineLogger.success(message, task_name=Logger.task)
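A usage sketch, assuming `success` is exposed as a class-level (static) method on the `Logger` wrapper, as the call sites in the scripts above suggest, and that `Logger.cmd`, `Logger.verbose` and `Logger.task` have already been configured elsewhere:

# Logged locally and forwarded to the Cloud Pipeline task log,
# since crucial defaults to True and cmd mode is off.
Logger.success('Finished data storage mounting')

# Logged locally; forwarded to the task log only when Logger.verbose is enabled.
Logger.success('Intermediate step finished', crucial=False)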