def __init__(self):
    Task.__init__(self)
    self.task_name = 'WaitForNode'
    self.pipe_api = PipelineAPI(os.environ['API'], 'logs')

class RunAnalyticalPipelinesTask(object):
    def __init__(self, task, pipeline, version, instance_type, instance_disk):
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.task = task
        self.pipeline = self.api.find_pipeline(pipeline)
        self.version = version
        self.instance_type = instance_type
        self.instance_disk = instance_disk

    def run(self):
        analysis_folder = os.environ['ANALYSIS_FOLDER']
        machine_run_folder = os.environ['MACHINE_RUN_FOLDER']
        sample_sheet = os.environ['SAMPLE_SHEET']
        Logger.info('Starting analytical processing for sample sheet %s' %
                    sample_sheet,
                    task_name=self.task)
        samples = SampleSheetParser(
            sample_sheet,
            [SAMPLE_ID, SAMPLE_NAME, SAMPLE_PROJECT]).parse_sample_sheet()
        launched_runs = {}
        for sample in samples:
            Logger.info('Starting "%s" sample processing.' %
                        sample[SAMPLE_NAME],
                        task_name=self.task)
            launched_runs[sample[SAMPLE_NAME]] = self.__run_sample(
                sample[SAMPLE_NAME], analysis_folder, machine_run_folder)
        failed_runs = self.__wait_runs_completion(launched_runs)
        if failed_runs:
            for sample, run_id in failed_runs.iteritems():
                Logger.fail(
                    'Processing failed for sample "%s". Check run %d logs for more information.'
                    % (sample, run_id),
                    task_name=self.task)
            sys.exit(1)
        Logger.success("All samples processed successfully.",
                       task_name=self.task)

    def __run_sample(self, sample, analysis_folder, machine_run_folder):
        Logger.info(
            'Launching analytical pipeline "%s" with version "%s" for sample %s.'
            % (self.pipeline['name'], self.version, sample),
            task_name=self.task)
        read1, read2 = self.__fetch_reads(sample, analysis_folder,
                                          machine_run_folder)
        pipeline_params = {
            'SAMPLE': {
                'value': sample
            },
            'READ1': {
                'value': read1,
                'type': 'input'
            },
            'READ2': {
                'value': read2,
                'type': 'input'
            },
            'OUTPUT_FOLDER': {
                'value': analysis_folder,
                'type': 'output'
            }
        }
        run = self.api.launch_pipeline(self.pipeline['id'],
                                       self.version,
                                       pipeline_params,
                                       instance=self.instance_type,
                                       disk=self.instance_disk,
                                       parent_run_id=os.environ['RUN_ID'])
        return run['id']

    def __fetch_reads(self, sample, analysis_folder, machine_run_folder):
        run_folder_name = urlparse.urlparse(machine_run_folder).path
        read_folder = self.__get_path_without_trailing_slash(analysis_folder) + \
                      self.__get_path_without_trailing_slash(run_folder_name) + \
                      '/PipelineInputData/FASTQ/'
        return os.path.join(read_folder,
                            sample + '_R1.fastq.gz'), os.path.join(
                                read_folder, sample + '_R2.fastq.gz')

    def __get_path_without_trailing_slash(self, path):
        return path[:-1] if path.endswith('/') else path

    def __wait_runs_completion(self, launched_runs):
        finished = {}
        failed = {}
        while True:
            for sample, run_id in launched_runs.iteritems():
                current_status = self.api.load_run(run_id)['status']
                Logger.info('Processing sample: %s. Run %d status is %s.' %
                            (sample, run_id, current_status),
                            task_name=self.task)
                if current_status != 'RUNNING':
                    finished[sample] = run_id
                    if current_status != 'SUCCESS':
                        failed[sample] = run_id
            if len(finished) == len(launched_runs):
                Logger.info("Processing for all samples completed.",
                            task_name=self.task)
                return failed
            time.sleep(60)
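
# A hedged usage sketch, not part of the original example: one way the task
# above could be wired to a CLI entry point. The flag names and the
# 'AnalyticalProcessing' task label are illustrative assumptions; note that
# ANALYSIS_FOLDER, MACHINE_RUN_FOLDER and SAMPLE_SHEET are still read from the
# environment inside run().
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline', required=True)
    parser.add_argument('--version', required=True)
    parser.add_argument('--instance-type', required=True)
    parser.add_argument('--instance-disk', required=True)
    args = parser.parse_args()
    RunAnalyticalPipelinesTask('AnalyticalProcessing', args.pipeline,
                               args.version, args.instance_type,
                               args.instance_disk).run()
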
class WaitForNode(Task):
    def __init__(self):
        Task.__init__(self)
        self.task_name = 'WaitForNode'
        self.pipe_api = PipelineAPI(os.environ['API'], 'logs')

    def await_node_start(self, parameters, task_name, run_id):
        try:
            Logger.info(
                'Waiting for node with parameters = {}, task: {}'.format(
                    ','.join(parameters), task_name),
                task_name=self.task_name)
            # approximately 10 minutes
            attempts = 60
            master = self.get_node_info(parameters, task_name, run_id)
            while not master and attempts > 0:
                master = self.get_node_info(parameters, task_name, run_id)
                attempts -= 1
                Logger.info('Waiting for node ...', task_name=self.task_name)
                time.sleep(10)
            if not master:
                raise RuntimeError('Failed to attach to master node')

            Logger.success('Attached to node (run id {})'.format(master.name),
                           task_name=self.task_name)
            return master
        except Exception as e:
            self.fail_task(e.message)

    def get_node_info(self, parameters, task_name, run_id):
        params = self.parse_parameters(parameters)
        runs = self.pipe_api.search_runs(params,
                                         status='RUNNING',
                                         run_id=run_id)
        if len(runs) == 0:
            params.append(('parent-id', str(run_id)))
            runs = self.pipe_api.search_runs(params, status='RUNNING')
        for run in runs:
            if self.check_run(run, params):
                node = Node(run)
                task_logs = self.pipe_api.load_task(node.run_id, task_name)
                if not task_logs:
                    return None
                task_status = task_logs[-1]['status']
                if task_status == 'SUCCESS':
                    return node
                elif task_status != 'RUNNING':
                    raise RuntimeError(
                        'Node failed to start as it cannot attach to a node (run id {})'
                        .format(node.run_id))
        return None

    def parse_parameters(self, parameters):
        result = []
        for param in parameters:
            if '=' not in param:
                raise RuntimeError(
                    "Illegal parameter format. Key=Value is expected.")
            result.append(param.split("=", 1))
        return result

    def check_run(self, run, params):
        run_params = {}
        for run_param in run['pipelineRunParameters']:
            value = run_param['value'] if 'value' in run_param else None
            run_params[run_param['name']] = value
        for param in params:
            if param[0] not in run_params or run_params[param[0]] != param[1]:
                return False
        return True
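
# A hedged usage sketch, not part of the original snippet: await_node_start
# polls for a RUNNING run whose parameters match the given "key=value" pairs
# and whose readiness task has reached SUCCESS. The 'cluster_role=master'
# parameter and the 'InitializeNode' task name are illustrative assumptions;
# RUN_ID is the environment variable already used elsewhere in these examples.
if __name__ == '__main__':
    master = WaitForNode().await_node_start(
        parameters=['cluster_role=master'],
        task_name='InitializeNode',
        run_id=os.environ['RUN_ID'])
    Logger.success('Found master run {} ({})'.format(master.run_id, master.name),
                   task_name='WaitForNode')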

Example #4

def get_image_name_and_tag(image_name_with_tag):
    image_name, image_tag = parse_image(image_name_with_tag)
    if image_tag is None:
        image_tag = 'latest'
    return image_name, image_tag


def add_settings(new_tool_id, new_version, initial_tool_id, initial_version):
    settings = get_tool_version_settings(initial_tool_id, initial_version)
    create_settings_for_tool_version(new_tool_id, new_version, settings)


if __name__ == '__main__':
    api = PipelineAPI(os.environ['API'], 'logs')
    command = sys.argv[1]
    run_id = sys.argv[2]

    if command == "ups":
        status_to_update = None
        new_status = sys.argv[3]
        if new_status == "FAILURE":
            status_to_update = pipeline.api.CommmitStatus.FAILURE
        elif new_status == "SUCCESS":
            status_to_update = pipeline.api.CommmitStatus.SUCCESS
        else:
            raise RuntimeError(
                "Wrong argument for update_commit_status: {}".format(
                    new_status))
        update_commit_status(api, run_id, status_to_update)
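
# Invocation sketch (illustrative; the script name below is an assumption): the
# entry point above expects the command, the run id and, for "ups" (update
# commit status), the new status as positional arguments, e.g.:
#
#     python update_commit_status.py ups 12345 SUCCESS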

Example #5

class AbstractPipelineLauncher(AbstractTask):
    TASK_NAME = "LaunchSampleProcessing"
    # important! cmd template should be single-quoted to prevent parameter expansion
    CMD_TEMPLATE = "pipe run --yes --quiet --pipeline {pipe_id}@{version} --instance-disk {instance_disk} " \
                   "--instance-type {instance_type} --docker-image {docker_image} --cmd-template '{cmd}' " \
                   "--parent-id {parent}"
    SAMPLE_TEMPLATE = " --sample_name {sample_name} --sample_id {sample_id}"
    POLL_TIMEOUT = 30
    RETRY_COUNT = 10
    SAMPLE_ID = "Sample_ID"
    SAMPLE_NAME = "Sample_Name"

    def __init__(self, run_files, param_names, run_id, pipe_id, version,
                 pipe_params, param_types):
        AbstractTask.__init__(self, self.TASK_NAME)
        self.samples_number = len(run_files)
        self.run_id = run_id
        self.run_files = run_files
        self.param_names = param_names
        self.pipe_id = pipe_id
        self.version = version
        self.api = PipelineAPI(os.environ['API'], 'logs')
        self.pipe_params = pipe_params
        self.child_id = None
        self.param_types = param_types

    def launch_pipeline(self,
                        params,
                        param_names,
                        instance_size,
                        instance_disk,
                        docker_image,
                        cmd,
                        sample=None):
        if not self.child_run_active():
            self.launch_child_run(params,
                                  param_names,
                                  cmd,
                                  instance_size,
                                  instance_disk,
                                  docker_image,
                                  sample=sample)
            return

        command = self.CMD_TEMPLATE.format(pipe_id=self.pipe_id,
                                           version=self.version,
                                           instance_disk=instance_disk,
                                           instance_type=instance_size,
                                           docker_image=docker_image,
                                           cmd=cmd,
                                           parent=self.run_id)
        if sample:
            command = command + self.SAMPLE_TEMPLATE.format(
                sample_name=sample[self.SAMPLE_NAME],
                sample_id=sample[self.SAMPLE_ID])
        # add all pattern params
        index = 0
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            command += " --{} input?{}".format(name, value)
            index = index + 1
        # add all other params
        for param, value in self.pipe_params.iteritems():
            if param.startswith('i_'):
                command += " --{} input?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('c_'):
                command += " --{} common?{}".format(
                    self.change_parameter_name(param), value)
            elif param.startswith('o_'):
                command += " --{} output?{}".format(
                    self.change_parameter_name(param), value)
            else:
                command += " --{} {}".format(param, value)

        Logger.info('Starting pipeline with command: "{}".'.format(command),
                    task_name=self.TASK_NAME)
        try:
            LoggedCommand(command, None, self.TASK_NAME).execute()
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with command: '{}'. Error: '{}'."
                .format(command, e.message),
                task_name=self.TASK_NAME)

    def launch_child_run(self,
                         params,
                         param_names,
                         cmd,
                         instance_size,
                         instance_disk,
                         docker_image,
                         sample=None):
        run_params = {'parent-id': self.run_id}
        if sample:
            run_params['sample_name'] = sample[self.SAMPLE_NAME]
            run_params['sample_id'] = sample[self.SAMPLE_ID]
        index = 0
        # add all pattern params
        for name in param_names:
            if sample:
                value = ','.join(params[name])
            else:
                value = params[index]
            run_params[name] = {'value': value, 'type': 'input'}
            index = index + 1

        # add all other params
        for param, value in self.pipe_params.iteritems():
            param_type = None
            param_name = param
            real_value = self.normalize_value(value)
            if param.startswith('i_'):
                param_type = 'input'
                param_name = self.change_parameter_name(param)
            elif param.startswith('c_'):
                param_type = 'common'
                param_name = self.change_parameter_name(param)
            elif param.startswith('o_'):
                param_type = 'output'
                param_name = self.change_parameter_name(param)
            run_params[param_name] = {'value': real_value}
            if param_type is not None:
                run_params[param_name]['type'] = param_type
            else:
                run_params[param_name]['type'] = self.get_type_from_env(
                    param_name)
        Logger.info(
            "Starting child pipeline run on a parent node with parameters: '{}'."
            .format(str(run_params)),
            task_name=self.TASK_NAME)
        try:
            run = self.api.launch_pipeline(self.pipe_id,
                                           self.version,
                                           run_params,
                                           parent_node_id=self.run_id,
                                           cmd=cmd,
                                           instance=instance_size,
                                           disk=instance_disk,
                                           docker=docker_image)
            self.child_id = run['id']
        except Exception as e:
            Logger.warn(
                "Failed to launch sample processing with parameters: '{}'. Error: '{}'."
                .format(str(run_params), e.message),
                task_name=self.TASK_NAME)
            self.child_id = None

    # hook that allows changing how new parameters are named in the batched pipeline
    @staticmethod
    def change_parameter_name(param):
        return param[2:]

    def get_running_samples(self):
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                child_runs = self.api.load_child_pipelines(self.run_id)
                count = 0
                for run in child_runs:
                    if run['status'] == 'RUNNING':
                        count = count + 1
                return count
            except Exception as e:
                Logger.warn("Failed to fetch running samples: {}.".format(
                    e.message),
                            task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch running samples.")
        raise RuntimeError(
            "Exceeded maximum attempts to fetch running samples.")

    def child_run_active(self):
        if self.child_id is None:
            return False
        attempts = 0
        while attempts < self.RETRY_COUNT:
            try:
                run = self.api.load_run(self.child_id)
                return run['status'] == 'RUNNING'
            except Exception as e:
                Logger.warn(
                    "Failed to fetch child run ID '' status: {}.".format(
                        str(self.child_id), e.message),
                    task_name=self.TASK_NAME)
                attempts = attempts + 1
                time.sleep(self.POLL_TIMEOUT)
        Logger.fail("Exceeded maximum attempts to fetch child run status.")
        raise RuntimeError(
            "Exceeded maximum attempts to fetch child run status.")

    def wait_all_samples_finish(self):
        running = self.get_running_samples()
        while running != 0:
            time.sleep(self.POLL_TIMEOUT)
            running = self.get_running_samples()

    def get_type_from_env(self, param_name):
        if param_name not in self.param_types or not self.param_types[
                param_name]:
            return 'string'
        else:
            return self.param_types[param_name]

    # unescape environment variable references ("\$" -> "$")
    def normalize_value(self, value):
        return value.replace("\\$", "$")
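
# A hedged sketch, not part of the original example: a concrete launcher could
# drive the class above by starting one child run per sample and then blocking
# until all of them finish. It assumes that self.run_files[i] maps each pattern
# parameter name to the list of files resolved for sample i.
class SampleBatchLauncher(AbstractPipelineLauncher):
    def launch_all(self, samples, instance_size, instance_disk, docker_image, cmd):
        for index, sample in enumerate(samples):
            self.launch_pipeline(self.run_files[index],
                                 self.param_names,
                                 instance_size,
                                 instance_disk,
                                 docker_image,
                                 cmd,
                                 sample=sample)
        self.wait_all_samples_finish()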

Example #6

    max_additional_hosts = int(os.environ['CP_CAP_SGE_AUTOSCALE_WORKERS']) \
        if 'CP_CAP_SGE_AUTOSCALE_WORKERS' in os.environ else 3
    log_verbose = os.environ['CP_CAP_SGE_AUTOSCALE_VERBOSE'].strip().lower() == "true" \
        if 'CP_CAP_SGE_AUTOSCALE_VERBOSE' in os.environ else False

    Logger.init(cmd=args.debug,
                log_file='/common/workdir/.autoscaler.log',
                task='GridEngineAutoscaling',
                verbose=log_verbose)

    cmd_executor = CmdExecutor()
    grid_engine = GridEngine(cmd_executor=cmd_executor)
    host_storage = FileSystemHostStorage(
        cmd_executor=cmd_executor,
        storage_file='/common/workdir/.autoscaler.storage')
    pipe = PipelineAPI(api_url=pipeline_api,
                       log_dir='/common/workdir/.pipe.log')
    scale_up_timeout = int(
        _retrieve_preference(pipe,
                             'ge.autoscaling.scale.up.timeout',
                             default_value=30))
    scale_down_timeout = int(
        _retrieve_preference(pipe,
                             'ge.autoscaling.scale.down.timeout',
                             default_value=30))
    scale_up_handler = GridEngineScaleUpHandler(
        cmd_executor=cmd_executor,
        pipe=pipe,
        grid_engine=grid_engine,
        host_storage=host_storage,
        parent_run_id=master_run_id,
        default_hostfile=default_hostfile,
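
    # The code above calls _retrieve_preference(), which is not shown in this
    # excerpt. A minimal sketch of what it could look like, assuming (not
    # confirmed here) that PipelineAPI exposes get_preference(name) returning
    # a dict with a 'value' key:
    #
    #     def _retrieve_preference(pipe, preference, default_value):
    #         try:
    #             return pipe.get_preference(preference)['value']
    #         except BaseException as e:
    #             Logger.warn('Failed to load %s preference: %s. Using default: %s.'
    #                         % (preference, str(e), str(default_value)))
    #             return default_value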

Example #7

class InputDataTask:
    def __init__(self, input_dir, common_dir, analysis_dir, task_name, bucket,
                 report_file, rules):
        self.input_dir = input_dir
        self.common_dir = common_dir
        self.analysis_dir = get_path_with_trailing_delimiter(analysis_dir)
        self.task_name = task_name
        self.bucket = bucket
        self.report_file = report_file
        self.rules = rules
        api_url = os.environ['API']
        if 'API_EXTERNAL' in os.environ and os.environ['API_EXTERNAL']:
            api_url = os.environ['API_EXTERNAL']
        self.api_url = api_url
        self.token = os.environ['API_TOKEN']
        self.api = PipelineAPI(os.environ['API'], 'logs')

    def run(self, upload):
        Logger.info('Starting localization of remote data...',
                    task_name=self.task_name)
        try:
            dts_registry = self.fetch_dts_registry()
            parameter_types = {ParameterType.INPUT_PARAMETER, ParameterType.COMMON_PARAMETER} if upload else \
                {ParameterType.OUTPUT_PARAMETER}
            remote_locations = self.find_remote_locations(
                dts_registry, parameter_types)
            if len(remote_locations) == 0:
                Logger.info('No remote sources found',
                            task_name=self.task_name)
            else:
                dts_locations = [
                    path for location in remote_locations
                    for path in location.paths if path.type == PathType.DTS
                ]
                if upload:
                    self.transfer_dts(dts_locations, dts_registry, upload)
                    self.localize_data(remote_locations, upload)
                    if self.report_file:
                        with open(self.report_file, 'w') as report:
                            for location in remote_locations:
                                env_name = location.env_name
                                original_value = location.original_value
                                localized_value = location.delimiter.join([
                                    path.local_path for path in location.paths
                                ])
                                report.write('export {}="{}"\n'.format(
                                    env_name, localized_value))
                                report.write('export {}="{}"\n'.format(
                                    env_name + '_ORIGINAL', original_value))
                else:
                    rule_patterns = DataStorageRule.read_from_file(self.rules)
                    rules = []
                    for rule in rule_patterns:
                        if rule.move_to_sts:
                            rules.append(rule.file_mask)
                    self.localize_data(remote_locations, upload, rules=rules)
                    self.transfer_dts(dts_locations,
                                      dts_registry,
                                      upload,
                                      rules=rules)
            Logger.success('Finished localization of remote data',
                           task_name=self.task_name)
        except BaseException as e:
            Logger.fail(
                'Localization of remote data failed due to exception: %s' %
                e.message,
                task_name=self.task_name)
            exit(1)

    def fetch_dts_registry(self):
        result = {}
        try:
            dts_data = self.api.load_dts_registry()
        except BaseException as e:
            Logger.info("DTS is not available: %s" % e.message,
                        task_name=self.task_name)
            return result
        for registry in dts_data:
            for prefix in registry['prefixes']:
                result[prefix] = registry['url']
        return result

    def find_remote_locations(self, dts_registry, parameter_types):
        remote_locations = []
        for env in os.environ:
            param_type_name = env + '_PARAM_TYPE'
            if os.environ[env] and param_type_name in os.environ:
                param_type = os.environ[param_type_name]
                if param_type in parameter_types:
                    value = os.environ[env].strip()
                    Logger.info('Found remote parameter %s with type %s' %
                                (value, param_type),
                                task_name=self.task_name)
                    original_paths = [value]
                    delimiter = ''
                    for supported_delimiter in VALUE_DELIMITERS:
                        if value.find(supported_delimiter) != -1:
                            original_paths = re.split(supported_delimiter,
                                                      value)
                            delimiter = supported_delimiter
                            break
                    paths = []
                    for path in original_paths:
                        resolved_path = replace_all_system_variables_in_path(
                            path).strip()
                        if self.match_dts_path(resolved_path, dts_registry):
                            paths.append(
                                self.build_dts_path(resolved_path,
                                                    dts_registry, param_type))
                        elif self.match_s3_path(resolved_path):
                            paths.append(
                                self.build_s3_path(resolved_path, param_type))
                        elif self.match_ftp_or_http_path(resolved_path):
                            paths.append(
                                self.build_ftp_or_http_path(
                                    resolved_path, param_type))
                    if len(paths) != 0:
                        remote_locations.append(
                            RemoteLocation(env, value, param_type, paths,
                                           delimiter))

        return remote_locations

    @staticmethod
    def match_ftp_or_http_path(path):
        return any(path.startswith(scheme) for scheme in HTTP_FTP_SCHEMES)

    @staticmethod
    def match_s3_path(path):
        return path.startswith('s3://') or path.startswith('cp://')

    @staticmethod
    def match_dts_path(path, dts_registry):
        for prefix in dts_registry:
            if path.startswith(prefix):
                return True
        return False

    def build_dts_path(self, path, dts_registry, input_type):
        for prefix in dts_registry:
            if path.startswith(prefix):
                if not self.bucket:
                    raise RuntimeError(
                        'Transfer bucket shall be set for DTS locations')
                relative_path = path.replace(prefix, '')
                s3_path = self.join_paths(self.bucket, relative_path)

                if input_type == ParameterType.OUTPUT_PARAMETER:
                    local_path = self.analysis_dir
                else:
                    local_dir = self.get_local_dir(input_type)
                    local_path = self.join_paths(local_dir, relative_path)
                Logger.info(
                    'Found remote {} path {} matching DTS prefix {}. '
                    'It will be uploaded to bucket path {} and localized {} {}.'
                    .format(
                        input_type, path, prefix, s3_path, 'from' if input_type
                        == ParameterType.OUTPUT_PARAMETER else 'to',
                        local_path),
                    task_name=self.task_name)
                return LocalizedPath(path,
                                     s3_path,
                                     local_path,
                                     PathType.DTS,
                                     prefix=prefix)
        raise RuntimeError(
            'Remote path %s does not match any of DTS prefixes.' % path)

    def build_s3_path(self, path, input_type):
        return self._build_remote_path(path, input_type, PathType.S3)

    def build_ftp_or_http_path(self, path, input_type):
        return self._build_remote_path(path, input_type, PathType.HTTP_OR_FTP)

    def _build_remote_path(self, path, input_type, path_type):
        if input_type == ParameterType.OUTPUT_PARAMETER:
            local_path = self.analysis_dir
        else:
            remote = urlparse.urlparse(path)
            relative_path = path.replace(
                '%s://%s' % (remote.scheme, remote.netloc), '')
            local_dir = self.get_local_dir(input_type)
            local_path = self.join_paths(local_dir, relative_path)
        Logger.info('Found %s %s path %s. It will be localized to %s.' %
                    (path_type.lower(), input_type, path, local_path),
                    task_name=self.task_name)
        return LocalizedPath(path, path, local_path, path_type)

    def get_local_dir(self, type):
        return self.input_dir if type == ParameterType.INPUT_PARAMETER else self.common_dir

    def join_paths(self, prefix, suffix):
        trimmed_prefix = get_path_with_trailing_delimiter(prefix)
        trimmed_suffix = suffix[1:] if suffix.startswith('/') else suffix
        return trimmed_prefix + trimmed_suffix

    def transfer_dts(self, dts_locations, dts_registry, upload, rules=None):
        grouped_paths = {}
        for path in dts_locations:
            if path.prefix not in grouped_paths:
                grouped_paths[path.prefix] = [path]
            else:
                grouped_paths[path.prefix].append(path)

        for prefix, paths in grouped_paths.iteritems():
            dts_url = dts_registry[prefix]
            Logger.info(
                'Uploading {} paths using DTS service {}'.format(
                    len(paths), dts_url), self.task_name)
            dts_client = DataTransferServiceClient(dts_url, self.token,
                                                   self.api_url, self.token,
                                                   10)
            dts_client.transfer_data(
                [self.create_dts_path(path, upload, rules) for path in paths],
                self.task_name)

    def create_dts_path(self, path, upload, rules):
        return LocalToS3(path.path, path.s3_path,
                         rules) if upload else S3ToLocal(
                             path.s3_path, path.path, rules)

    def localize_data(self, remote_locations, upload, rules=None):
        cluster = Cluster.build_cluster()
        for location in remote_locations:
            for path in location.paths:
                source, destination = self.get_local_paths(path, upload)
                self.perform_transfer(path,
                                      source,
                                      destination,
                                      cluster,
                                      upload,
                                      rules=rules)

    def perform_transfer(self,
                         path,
                         source,
                         destination,
                         cluster,
                         upload,
                         rules=None):
        Logger.info(
            'Uploading files from {} to {}'.format(source, destination),
            self.task_name)
        if path.type == PathType.HTTP_OR_FTP or cluster is None or self.is_file(
                source):
            if upload or self.rules is None:
                S3Bucket().pipe_copy(source, destination, TRANSFER_ATTEMPTS)
            else:
                S3Bucket().pipe_copy_with_rules(source, destination,
                                                TRANSFER_ATTEMPTS, self.rules)
        else:
            common_folder = os.path.join(os.environ['SHARED_WORK_FOLDER'],
                                         'transfer')
            applied_rules = None if upload else rules
            chunks = self.split_source_into_chunks(cluster, source,
                                                   destination, common_folder,
                                                   applied_rules)
            transfer_pool = Pool(len(chunks))
            transfer_pool.map(transfer_async, chunks)
            shutil.rmtree(common_folder, ignore_errors=True)

    def is_file(self, source):
        if source.endswith('/'):
            return False
        if self.match_s3_path(source):
            source_path = urlparse.urlparse(source)
            # case when whole bucket is selected
            if not source_path.path or source_path.path == '/':
                return True
            # urlparse returns path as /folder/inner
            # convert it to s3 listing representation folder/inner/
            folder = get_path_with_trailing_delimiter(
                get_path_without_first_delimiter(source_path.path))
            s3_paths = S3Bucket().pipe_ls(
                get_path_without_trailing_delimiter(source),
                TRANSFER_ATTEMPTS,
                recursive=False,
                all=False,
                show_info=True)
            for path in s3_paths:
                if path[0] == 'Folder' and path[1] == folder:
                    return False
            return True

        else:
            return os.path.isfile(source)

    def split_source_into_chunks(self, cluster, source, destination,
                                 common_folder, rules):
        if not os.path.exists(common_folder):
            os.makedirs(common_folder)
        source_files = self.fetch_source_files(source)
        chunks = []
        for node in cluster.nodes:
            for slot in range(0, cluster.slots_per_node):
                chunks.append(
                    TransferChunk(node.hostname, [], source, destination,
                                  common_folder, self.task_name, rules))
        for i in range(0, len(source_files)):
            file = source_files[i]
            chunk_index = i % len(chunks)
            chunks[chunk_index].files.append(file)
        return chunks

    def fetch_source_files(self, source):
        """
        :return: list of files sorted by size DESC
        """
        if self.match_s3_path(source):
            s3_paths = S3Bucket().pipe_ls(
                get_path_with_trailing_delimiter(source),
                TRANSFER_ATTEMPTS,
                recursive=True,
                all=True,
                show_info=True)
            s3_paths = filter(lambda x: x[0] == 'File', s3_paths)
            files = [
                File(self.get_path_without_folder(source, path[1]),
                     int(path[2])) for path in s3_paths
            ]
        else:
            files = []
            for root, d_names, f_names in os.walk(source):
                for f in f_names:
                    path = os.path.join(root, f)
                    files.append(
                        File(os.path.relpath(path, start=source),
                             os.path.getsize(path)))
        return sorted(files, key=lambda x: x.size, reverse=True)

    def get_path_without_folder(self, source, path):
        prefix = urlparse.urlparse(source).path
        if prefix.startswith('/'):
            prefix = prefix[1:]
        if not prefix.endswith('/'):
            prefix += '/'
        if len(prefix) == 0 or prefix == '/':
            return path
        return path.replace(prefix, '', 1)

    @staticmethod
    def get_local_paths(path, upload):
        if upload:
            source = path.s3_path if path.type == PathType.DTS else path.path
            destination = path.local_path
        else:
            source = path.local_path
            destination = path.path if path.type == PathType.HTTP_OR_FTP else path.s3_path
        return source, destination
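
# A hedged usage sketch, not part of the original example: localize inputs and
# common data before the analytical step, then push outputs back afterwards.
# All paths, the bucket and the rules file below are illustrative assumptions;
# in a real run they would come from run parameters or the environment.
if __name__ == '__main__':
    task = InputDataTask(input_dir='/common/input/',
                         common_dir='/common/common/',
                         analysis_dir='/common/analysis/',
                         task_name='InputData',
                         bucket='s3://transfer-bucket/runs/',
                         report_file='/common/input_localization.sh',
                         rules='/common/storage_rules.json')
    task.run(upload=True)    # download INPUT/COMMON parameters to local dirs
    # ... analytical processing would happen here ...
    task.run(upload=False)   # upload OUTPUT parameters to remote storage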

Example #8

def __init__(self):
    self.api = PipelineAPI(os.environ.get('API'), "logs")

Example #9

class CloudPipelineApiProvider(object):
    def __init__(self):
        self.api = PipelineAPI(os.environ.get('API'), "logs")

    def search(self, query, type):
        return self.api.search(query, [type])

    def create_pipeline(self, name, description):
        data = {
            "name": name,
            "description": description,
        }
        return self.api.create_pipeline(data)

    def delete_pipeline(self, id):
        self.api.delete_pipeline(id)

    def create_folder(self, name, parent=None):
        return self.api.create_folder(name, parent)

    def delete_folder(self, id):
        self.api.delete_folder(id)

    def create_s3_data_storage(self,
                               name,
                               description,
                               parent_folder_id=None,
                               region_id=2,
                               storage_policy=None):
        if not storage_policy:
            storage_policy = {"versioningEnabled": True}
        data = {
            "name": name,
            "path": name,
            "description": description,
            "type": 'S3',
            "shared": False,
            "parentFolderId": parent_folder_id,
            "regionId": region_id,
            "storagePolicy": storage_policy
        }
        return self.api.datastorage_create(data)

    def delete_data_storage(self, id):
        self.api.delete_datastorage(id)

    def create_issue(self, name, text, entity_id, entity_class):
        return self.api.create_issue(name, text, entity_id, entity_class)

    def delete_issue(self, id):
        self.api.delete_folder(id)

    def create_comment(self, issue_id, text):
        return self.api.create_comment(issue_id, text)
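
# A short usage sketch, not part of the original example: create a folder with
# a versioned S3 storage inside it and then clean both up. The names below are
# assumptions, as is the shape of the returned objects (dicts with an 'id').
if __name__ == '__main__':
    provider = CloudPipelineApiProvider()
    folder = provider.create_folder('demo-folder')
    storage = provider.create_s3_data_storage('demo-storage',
                                              'Scratch storage for the demo',
                                              parent_folder_id=folder['id'])
    provider.delete_data_storage(storage['id'])
    provider.delete_folder(folder['id'])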

Example #10

def __init__(self):
    Task.__init__(self)
    self.task_name = 'WaitForMasterNode'
    self.kube = Kubernetes()
    self.pipe_api = PipelineAPI(os.environ['API'], 'logs')