Example #1
 def run(self):
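     # Launch the demultiplex pipeline as a child run of the current run and wait for it to finish.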
     Logger.info('Launching demultiplex pipeline "%s" with version "%s"' %
                 (self.pipeline_name, self.version),
                 task_name=self.task)
     pipeline = self.api.find_pipeline(self.pipeline_name)
     pipeline_params = {
         'MACHINE_RUN_FOLDER': {
             'value': os.environ['MACHINE_RUN_FOLDER'],
             'type': 'input'
         },
         'SAMPLE_SHEET': {
             'value': os.environ['SAMPLE_SHEET_ORIGINAL'],
             'type': 'input'
         },
         'ANALYSIS_FOLDER': {
             'value': os.environ['ANALYSIS_FOLDER'],
             'type': 'output'
         }
     }
     run = self.api.launch_pipeline(pipeline['id'],
                                    self.version,
                                    pipeline_params,
                                    instance=self.instance_type,
                                    disk=self.instance_disk,
                                    parent_run_id=os.environ['RUN_ID'])
     demultiplex_run_id = run['id']
     Logger.info('Launched demultiplex run %d.' % demultiplex_run_id,
                 task_name=self.task)
     Logger.info('Waiting till run %d completion.' % demultiplex_run_id,
                 task_name=self.task)
     final_status = self.__wait_run_completion(demultiplex_run_id)
     if final_status != 'SUCCESS':
         Logger.fail(
             'Demultiplex processing did not complete successfully. '
             'Check run %d logs for more information.' % demultiplex_run_id,
             task_name=self.task)
         sys.exit(1)
     Logger.success('Demultiplex processing completed successfully.',
                    task_name=self.task)
    def await_node_start(self, parameters, task_name, run_id):
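        # Poll for the cluster node matching the given parameters, retrying every 10 seconds for roughly 10 minutes.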
        try:
            Logger.info(
                'Waiting for node with parameters = {}, task: {}'.format(
                    ','.join(parameters), task_name),
                task_name=self.task_name)
            # approximately 10 minutes
            attempts = 60
            master = self.get_node_info(parameters, task_name, run_id)
            while not master and attempts > 0:
                master = self.get_node_info(parameters, task_name, run_id)
                attempts -= 1
                Logger.info('Waiting for node ...', task_name=self.task_name)
                time.sleep(10)
            if not master:
                raise RuntimeError('Failed to attach to master node')

            Logger.success('Attached to node (run id {})'.format(master.name),
                           task_name=self.task_name)
            return master
        except Exception as e:
            self.fail_task(e.message)
    def run(self, mount_root, tmp_dir):
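        # Mount every readable remote data storage (object storage via FUSE, NFS shares) under mount_root.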
        try:
            Logger.info('Starting mounting remote data storages.',
                        task_name=self.task_name)

            Logger.info('Fetching list of allowed storages...',
                        task_name=self.task_name)
            available_storages = self.api.load_available_storages()
            if not available_storages:
                Logger.success('No remote storages are available',
                               task_name=self.task_name)
                return
            Logger.info(
                'Found {} available storage(s). Checking mount options.'.
                format(len(available_storages)),
                task_name=self.task_name)

            fuse_tmp = os.path.join(tmp_dir, "s3fuse")
            if not self.create_directory(fuse_tmp):
                fuse_tmp = '/tmp'

            fuse_available = self.check_or_install_fuse()

            aws_default_region = os.getenv('AWS_DEFAULT_REGION', 'us-east-1')
            aws_region = os.getenv('AWS_REGION', aws_default_region)
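            # CP_CAP_LIMIT_MOUNTS, if set, restricts mounting to a comma-separated list of storage IDs.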
            limited_storages = os.getenv('CP_CAP_LIMIT_MOUNTS')
            if limited_storages:
                try:
                    limited_storages_list = [
                        int(x.strip()) for x in limited_storages.split(',')
                    ]
                    available_storages = [
                        x for x in available_storages
                        if x.id in limited_storages_list
                    ]
                    Logger.info(
                        'Run is launched with mount limits ({}). Only {} storages will be mounted.'
                        .format(limited_storages, len(available_storages)),
                        task_name=self.task_name)
                except Exception as limited_storages_ex:
                    Logger.warn(
                        'Unable to parse CP_CAP_LIMIT_MOUNTS value ({}) with error: {}.'
                        .format(limited_storages,
                                str(limited_storages_ex.message)),
                        task_name=self.task_name)

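            # NFS client tooling is only needed if at least one NFS storage exists in the current region.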
            nfs_count = len(
                filter((lambda ds: ds.storage_type == 'NFS' and ds.region_name
                        == aws_region), available_storages))
            nfs_available = nfs_count > 0 and self.check_or_install_nfs()
            if not fuse_available and not nfs_available:
                Logger.success(
                    'Mounting of remote storages is not available for this image',
                    task_name=self.task_name)
                return
            for storage in available_storages:
                if not PermissionHelper.is_storage_readable(storage):
                    continue
                mounter = self.get_mount_manager(storage, nfs_available,
                                                 fuse_available, fuse_tmp)
                if mounter is not None:
                    self.mount(mounter, mount_root)
                elif storage.storage_type != NFS_TYPE and storage.storage_type != S3_TYPE:
                    Logger.warn('Unsupported storage type {}.'.format(
                        storage.storage_type),
                                task_name=self.task_name)
            Logger.success('Finished data storage mounting',
                           task_name=self.task_name)
        except Exception as e:
            Logger.fail('Unhandled error during mount task: {}.'.format(
                str(e.message)),
                        task_name=self.task_name)
 def find_files(self, recursive=True):
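     # List the input folder and group the discovered files by the configured glob patterns.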
     Logger.info("Starting parsing input directory: {}.".format(
         self.folder),
                 task_name=self.TASK_NAME)
     all_files = bucket.ls_s3(self.folder,
                              self.MAX_ATTEMPTS,
                              recursive=recursive)
     patterns_files = {}
     for file in all_files:
         # recursive version of s3 ls returns path from bucket root
         # non-recursive ls returns path relative to the requested folder
         if recursive:
             file_name = file[len(self.get_path_without_bucket()) - 1:]
         else:
             file_name = file
         for pattern_name, glob in self.patterns.iteritems():
             Logger.info("Matching file {} against patterns {}.".format(
                 file_name, str(glob)),
                         task_name=self.TASK_NAME)
             if self.match_patterns(file_name, glob):
                 if pattern_name in self.exclude_patterns:
                     exclude = self.exclude_patterns[pattern_name]
                     if self.match_patterns(file_name, exclude):
                         Logger.info(
                             "Skipping filename '{}' since it matches exclude patterns '{}'."
                             .format(file_name, str(exclude)))
                         continue
                 if pattern_name not in patterns_files:
                     patterns_files[pattern_name] = []
                 patterns_files[pattern_name].append(
                     os.path.join(self.folder, file_name))
     if len(patterns_files) == 0:
         self.fail_task("Failed to find files matching any of the patterns.")
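     # Every pattern must match the same number of files: one file per sample per pattern.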
     samples_number = None
     for pattern, files in patterns_files.iteritems():
         current_length = len(files)
         if current_length == 0:
             self.fail_task(
                 "Failed to find files matching patterns: {}.".format(
                     str(pattern)))
         if samples_number is None:
             samples_number = current_length
         elif samples_number != current_length:
             self.fail_task(
                 "Number of found files differs between patterns. Please check the input data."
             )
         # Sort each pattern's file list so that index i refers to the same sample in every pattern.
         files.sort()
     Logger.info("Found files: {}".format(str(patterns_files)),
                 task_name=self.TASK_NAME)
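     # Build one parameter set per sample by taking the i-th file from each pattern's file list.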
     result = [[] for x in xrange(samples_number)]
     for pattern, files in patterns_files.iteritems():
         index = 0
         for file in files:
             result[index].append(file)
             index = index + 1
     for file_set in result:
         Logger.info('Collected run parameters: {}.'.format(str(file_set)),
                     task_name=self.TASK_NAME)
     Logger.success('Successfully collected batch files.',
                    task_name=self.TASK_NAME)
     return result
    metadata_columns = []
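    # Keep only non-empty column names and prepare a value list for each of them.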
    for column in metadata_column_names:
        column_name = column.strip()
        if len(column_name) > 0:
            metadata_columns.append(column_name)
            metadata_columns_values[column_name] = []
    Logger.info('Input parameters checked', task_name=INPUT_CHECK_TASK_NAME)
    Logger.info('Destination: {}'.format(destination),
                task_name=INPUT_CHECK_TASK_NAME)
    Logger.info('Metadata ID: {}'.format(metadata_id),
                task_name=INPUT_CHECK_TASK_NAME)
    Logger.info('Metadata Class: {}'.format(metadata_class),
                task_name=INPUT_CHECK_TASK_NAME)
    Logger.info('Metadata columns: {}'.format(', '.join(metadata_columns)),
                task_name=INPUT_CHECK_TASK_NAME)
    Logger.success("Done", task_name=INPUT_CHECK_TASK_NAME)

    Logger.info(
        'Extracting metadata values (#{}, {}) for columns {}...'.format(
            metadata_id, metadata_class, ', '.join(metadata_columns)),
        task_name=METADATA_TASK_NAME)

    api = EntitiesAPI(api_path, api_token)
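    # Iterate over all entities of the requested metadata class, skipping those not in the requested ID list.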
    for el in api.load_all(metadata_id, metadata_class):
        if len(metadata_entities) > 0 and str(el.id) not in metadata_entities:
            continue
        if el.data is not None:
            for column in metadata_columns:
                if column in el.data and 'value' in el.data[column]:
                    value = el.data[column]['value'].encode("utf-8")
                    if not value.lower().startswith(
 def success(message, crucial=True, *args, **kwargs):
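     # Log locally and, unless running in command mode, also report success to the Cloud Pipeline API for crucial or verbose messages.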
     logging.info(message, *args, **kwargs)
     if not Logger.cmd and (crucial or Logger.verbose):
         CloudPipelineLogger.success(message, task_name=Logger.task)