Example 1
 def validate(self, job_directory, string):
     regex = "^%s$" % self._expression_to_regex(job_directory, self.xml_el)
     match = compile(regex).match(string)
     validated = match is not None
     if validated:
         for group in match.groups():
             if not in_directory(group, join(job_directory.path, "inputs")) and \
                not in_directory(group, join(job_directory.path, "outputs")):
                 validated = False
                 break
     return validated
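Note: every snippet in this listing relies on Galaxy's in_directory helper (also referenced as util.in_directory), whose implementation is not reproduced here. A minimal, self-contained sketch of such a containment check, assuming a realpath-based prefix comparison that may differ from the actual Galaxy helper, is:

import os.path


def in_directory(file, directory, local_path_module=os.path):
    # Sketch only: resolve symlinks and relative segments, then accept `file`
    # if it equals `directory` or sits anywhere below it.
    directory = local_path_module.realpath(directory)
    file = local_path_module.realpath(file)
    return file == directory or file.startswith(directory + local_path_module.sep)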
Example 2
 def validate(self, job_directory, string):
     regex = "^%s$" % self._expression_to_regex(job_directory, self.xml_el)
     match = compile(regex).match(string)
     validated = match is not None
     if validated:
         for group in match.groups():
             if not in_directory(group, join(job_directory.path, "inputs")) and \
                not in_directory(group, join(job_directory.path, "outputs")):
                 validated = False
                 break
     return validated
Example 3
def discovered_file_for_element(dataset,
                                job_working_directory,
                                parent_identifiers=[],
                                collector=None):
    target_directory = discover_target_directory(
        getattr(collector, "directory", None), job_working_directory)
    filename = dataset.get("filename")
    error_message = dataset.get("error_message")
    if error_message is None:
        # handle link_data_only here, verify filename is in directory if not linking...
        if not dataset.get("link_data_only"):
            path = os.path.join(target_directory, filename)
            if not util.in_directory(path, target_directory):
                raise Exception(
                    "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                )
        else:
            path = filename
        return DiscoveredFile(
            path, collector,
            JsonCollectedDatasetMatch(dataset,
                                      collector,
                                      filename,
                                      path=path,
                                      parent_identifiers=parent_identifiers))
    else:
        assert "error_message" in dataset
        return DiscoveredFileError(
            dataset['error_message'], collector,
            JsonCollectedDatasetMatch(dataset,
                                      collector,
                                      None,
                                      parent_identifiers=parent_identifiers))
Example 4
    def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
        """
        Returns list of pairs (source_file, destination) describing path
        to work_dir output file and ultimate destination.
        """
        if not job_working_directory:
            job_working_directory = os.path.abspath( job_wrapper.working_directory )

        # Set up dict of dataset id --> output path; output path can be real or
        # false depending on outputs_to_working_directory
        output_paths = {}
        for dataset_path in job_wrapper.get_output_fnames():
            path = dataset_path.real_path
            if self.app.config.outputs_to_working_directory:
                path = dataset_path.false_path
            output_paths[ dataset_path.dataset_id ] = path

        output_pairs = []
        # Walk job's output associations to find and use from_work_dir attributes.
        job = job_wrapper.get_job()
        job_tool = job_wrapper.tool
        for (joda, dataset) in self._walk_dataset_outputs( job ):
            if joda and job_tool:
                hda_tool_output = job_tool.find_output_def( joda.name )
                if hda_tool_output and hda_tool_output.from_work_dir:
                    # Copy from working dir to HDA.
                    # TODO: move instead of copy to save time?
                    source_file = os.path.join( job_working_directory, hda_tool_output.from_work_dir )
                    destination = job_wrapper.get_output_destination( output_paths[ dataset.dataset_id ] )
                    if in_directory( source_file, job_working_directory ):
                        output_pairs.append( ( source_file, destination ) )
                    else:
                        # Security violation.
                        log.exception( "from_work_dir specified a location not in the working directory: %s, %s" % ( source_file, job_wrapper.working_directory ) )
        return output_pairs
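As a point of reference, a hypothetical consumer of the (source_file, destination) pairs returned above might simply copy each validated file to its destination; this is a sketch, not Galaxy code:

import shutil


def copy_work_dir_outputs(output_pairs):
    # Each pair has already passed the in_directory check in
    # get_work_dir_outputs, so every source lives inside the job
    # working directory.
    for source_file, destination in output_pairs:
        shutil.copy(source_file, destination)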
Example 5
 def __in_working_directory(self, job, path, app):
     working_directory = app.object_store.get_filename(job,
                                                       base_dir='job_work',
                                                       dir_only=True,
                                                       extra_dir=str(
                                                           job.id))
     return util.in_directory(path, working_directory)
Example 6
    def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
        """
        Returns list of pairs (source_file, destination) describing path
        to work_dir output file and ultimate destination.
        """
        if not job_working_directory:
            job_working_directory = os.path.abspath( job_wrapper.working_directory )

        # Set up dict of dataset id --> output path; output path can be real or
        # false depending on outputs_to_working_directory
        output_paths = {}
        for dataset_path in job_wrapper.get_output_fnames():
            path = dataset_path.real_path
            if self.app.config.outputs_to_working_directory:
                path = dataset_path.false_path
            output_paths[ dataset_path.dataset_id ] = path

        output_pairs = []
        # Walk job's output associations to find and use from_work_dir attributes.
        job = job_wrapper.get_job()
        job_tool = job_wrapper.tool
        for (joda, dataset) in self._walk_dataset_outputs( job ):
            if joda and job_tool:
                hda_tool_output = job_tool.find_output_def( joda.name )
                if hda_tool_output and hda_tool_output.from_work_dir:
                    # Copy from working dir to HDA.
                    # TODO: move instead of copy to save time?
                    source_file = os.path.join( job_working_directory, hda_tool_output.from_work_dir )
                    destination = job_wrapper.get_output_destination( output_paths[ dataset.dataset_id ] )
                    if in_directory( source_file, job_working_directory ):
                        output_pairs.append( ( source_file, destination ) )
                    else:
                        # Security violation.
                        log.exception( "from_work_dir specified a location not in the working directory: %s, %s" % ( source_file, job_wrapper.working_directory ) )
        return output_pairs
Example 7
 def metadata_path_on_compute(path):
     compute_path = path
     log.info(compute_tmp_dir)
     if compute_tmp_dir and tmp_dir and in_directory(path, tmp_dir):
         path_relative = os.path.relpath(path, tmp_dir)
         compute_path = os.path.join(compute_tmp_dir, path_relative)
     return compute_path
Example 8
def discover_target_directory(extra_file_collector, job_working_directory):
    directory = job_working_directory
    if extra_file_collector.directory:
        directory = os.path.join(directory, extra_file_collector.directory)
        if not util.in_directory(directory, job_working_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
    return directory
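To illustrate the guard above with invented paths (a self-contained sketch, not Galaxy code):

import posixpath

job_working_directory = "/jobs/42/working"  # invented POSIX path for illustration
inside = posixpath.normpath(posixpath.join(job_working_directory, "outputs"))
outside = posixpath.normpath(posixpath.join(job_working_directory, "../elsewhere"))

# A realpath/prefix-based containment check (such as the in_directory sketch
# after Example 1) accepts the first path and rejects the second; the
# rejection is what raises the "Problem with tool configuration" exception.
print(inside.startswith(job_working_directory + "/"))   # prints True
print(outside.startswith(job_working_directory + "/"))  # prints False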
Example 9
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro" which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    ['/a/b:rw']
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:/a:ro,/a/b/c:/a/b:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:/a:ro', '/a/b/c:/a/b:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:ro", SINGULARITY_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:ro']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    ['/a/b', '/a/b/c']
    """

    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 3:
            raise Exception("Unparsable volumes string in configuration [%s]" % volumes_raw_str)
        if len(volume_parts) == 3:
            volume_parts = ["{}:{}".format(volume_parts[0], volume_parts[1]), volume_parts[2]]
        if len(volume_parts) == 2 and volume_parts[1] not in ("rw", "ro", "default_ro"):
            volume_parts = ["{}:{}".format(volume_parts[0], volume_parts[1]), "rw"]
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

        # for a while singularity did not allow specifying the bind type rw
        # (which is the default), so we omit this default
        # see https://github.com/hpcng/singularity/pull/5487
        if container_type == SINGULARITY_CONTAINER_TYPE and volume[1] == 'rw':
            del volume[1]

    return [":".join(v) for v in volumes]
Example 10
def discover_target_directory(dir_name, job_working_directory):
    if dir_name:
        directory = os.path.join(job_working_directory, dir_name)
        if not util.in_directory(directory, job_working_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
        return directory
    else:
        return job_working_directory
Example 11
def discover_target_directory(dir_name, job_working_directory):
    if dir_name:
        directory = os.path.join(job_working_directory, dir_name)
        if not util.in_directory(directory, job_working_directory):
            raise Exception(
                "Problem with tool configuration, attempting to pull in datasets from outside working directory."
            )
        return directory
    else:
        return job_working_directory
Example 12
def discovered_file_for_unnamed_output(dataset, job_working_directory, parent_identifiers=[]):
    extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR
    target_directory = discover_target_directory(extra_file_collector.directory, job_working_directory)
    filename = dataset["filename"]
    # handle link_data_only here, verify filename is in directory if not linking...
    if not dataset.get("link_data_only"):
        path = os.path.join(target_directory, filename)
        if not util.in_directory(path, target_directory):
            raise Exception("Problem with tool configuration, attempting to pull in datasets from outside working directory.")
    else:
        path = filename
    return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path, parent_identifiers=parent_identifiers))
Example 13
    def get_work_dir_outputs(self, job_wrapper, job_working_directory=None):
        """
        Returns list of pairs (source_file, destination) describing path
        to work_dir output file and ultimate destination.
        """
        if not job_working_directory:
            job_working_directory = os.path.abspath(
                job_wrapper.working_directory)

        # Set up dict of dataset id --> output path; output path can be real or
        # false depending on outputs_to_working_directory
        output_paths = {}
        for dataset_path in job_wrapper.get_output_fnames():
            path = dataset_path.real_path
            if self.app.config.outputs_to_working_directory:
                path = dataset_path.false_path
            output_paths[dataset_path.dataset_id] = path

        output_pairs = []
        # Walk job's output associations to find and use from_work_dir attributes.
        job = job_wrapper.get_job()
        job_tool = job_wrapper.tool
        for dataset_assoc in job.output_datasets + job.output_library_datasets:
            for dataset in dataset_assoc.dataset.dataset.history_associations + dataset_assoc.dataset.dataset.library_associations:
                if isinstance(dataset,
                              self.app.model.HistoryDatasetAssociation):
                    joda = self.sa_session.query(
                        self.app.model.JobToOutputDatasetAssociation
                    ).filter_by(job=job, dataset=dataset).first()
                    if joda and job_tool:
                        hda_tool_output = job_tool.outputs.get(joda.name, None)
                        if hda_tool_output and hda_tool_output.from_work_dir:
                            # Copy from working dir to HDA.
                            # TODO: move instead of copy to save time?
                            source_file = os.path.join(
                                job_working_directory,
                                hda_tool_output.from_work_dir)
                            destination = job_wrapper.get_output_destination(
                                output_paths[dataset.dataset_id])
                            if in_directory(source_file,
                                            job_working_directory):
                                output_pairs.append((source_file, destination))
                            else:
                                # Security violation.
                                log.exception(
                                    "from_work_dir specified a location not in the working directory: %s, %s"
                                    % (source_file,
                                       job_wrapper.working_directory))
        return output_pairs
Example 14
 def __is_output_dataset_path(self, job, path):
     """ Check if is an output path for this job or a file in the an
     output's extra files path.
     """
     da_lists = [job.output_datasets, job.output_library_datasets]
     for da_list in da_lists:
         for job_dataset_association in da_list:
             dataset = job_dataset_association.dataset
             if not dataset:
                 continue
             if os.path.abspath(dataset.file_name) == os.path.abspath(path):
                 return True
             elif util.in_directory(path, dataset.extra_files_path):
                 return True
     return False
Example 15
def walk_over_extra_files( extra_file_collectors, job_working_directory, matchable ):
    for extra_file_collector in extra_file_collectors:
        directory = job_working_directory
        if extra_file_collector.directory:
            directory = os.path.join( directory, extra_file_collector.directory )
            if not util.in_directory( directory, job_working_directory ):
                raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." )
        if not os.path.isdir( directory ):
            continue
        for filename in sorted( os.listdir( directory ) ):
            path = os.path.join( directory, filename )
            if not os.path.isfile( path ):
                continue
            if extra_file_collector.match( matchable, filename ):
                yield path, extra_file_collector
Example 16
 def __is_output_dataset_path( self, job, path ):
     """ Check if is an output path for this job or a file in the an
     output's extra files path.
     """
     da_lists = [ job.output_datasets, job.output_library_datasets ]
     for da_list in da_lists:
         for job_dataset_association in da_list:
             dataset = job_dataset_association.dataset
             if not dataset:
                 continue
             if os.path.abspath( dataset.file_name ) == os.path.abspath( path ):
                 return True
             elif util.in_directory( path, dataset.extra_files_path ):
                 return True
     return False
Example 17
    def ensure_in_working_directory(self, path, purge_source, in_place):
        if in_directory(path, self.__workdir):
            return path

        new_path = self.__new_dataset_path()
        if purge_source:
            try:
                shutil.move(path, new_path)
            except OSError as e:
                # We may not have permission to remove converted_path
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(path, new_path)

        return new_path
Example 18
    def ensure_in_working_directory(self, path, purge_source, in_place):
        if in_directory(path, self.__workdir):
            return path

        new_path = self.__new_dataset_path()
        if purge_source:
            try:
                shutil.move(path, new_path)
            except OSError as e:
                # We may not have permission to remove converted_path
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(path, new_path)

        return new_path
Example 19
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro" which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    ['/a/b:rw']
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    ['/a/b:ro', '/a/b/c:rw']
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    ['/a/b:rw', '/a/b/c:rw']
    """

    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 2:
            raise Exception("Unparsable volumes string in configuration [%s]" %
                            volumes_raw_str)
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

    return [":".join(v) for v in volumes]
Example 20
def preprocess_volumes(volumes_raw_str, container_type):
    """Process Galaxy volume specification string to either Docker or Singularity specification.

    Galaxy allows the mount type "default_ro" which translates to ro for Docker and
    ro for Singularity iff no subdirectories are rw (Singularity does not allow ro
    parent directories with rw subdirectories).

    >>> preprocess_volumes("/a/b", DOCKER_CONTAINER_TYPE)
    '/a/b:rw'
    >>> preprocess_volumes("/a/b:ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    '/a/b:ro,/a/b/c:rw'
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", DOCKER_CONTAINER_TYPE)
    '/a/b:ro,/a/b/c:rw'
    >>> preprocess_volumes("/a/b:default_ro,/a/b/c:rw", SINGULARITY_CONTAINER_TYPE)
    '/a/b:rw,/a/b/c:rw'
    """

    volumes_raw_strs = [v.strip() for v in volumes_raw_str.split(",")]
    volumes = []
    rw_paths = []

    for volume_raw_str in volumes_raw_strs:
        volume_parts = volume_raw_str.split(":")
        if len(volume_parts) > 2:
            raise Exception("Unparsable volumes string in configuration [%s]" % volumes_raw_str)
        if len(volume_parts) == 1:
            volume_parts.append("rw")
        volumes.append(volume_parts)
        if volume_parts[1] == "rw":
            rw_paths.append(volume_parts[0])

    for volume in volumes:
        path = volume[0]
        how = volume[1]

        if how == "default_ro":
            how = "ro"
            if container_type == SINGULARITY_CONTAINER_TYPE:
                for rw_path in rw_paths:
                    if in_directory(rw_path, path):
                        how = "rw"

        volume[1] = how

    return ",".join([":".join(v) for v in volumes])
Example 21
def walk_over_extra_files(extra_file_collectors, job_working_directory,
                          matchable):
    for extra_file_collector in extra_file_collectors:
        directory = job_working_directory
        if extra_file_collector.directory:
            directory = os.path.join(directory, extra_file_collector.directory)
            if not util.in_directory(directory, job_working_directory):
                raise Exception(
                    "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                )
        if not os.path.isdir(directory):
            continue
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            if not os.path.isfile(path):
                continue
            if extra_file_collector.match(matchable, filename):
                yield path, extra_file_collector
Example 22
    def ensure_in_working_directory(self, path, purge_source, in_place):
        if in_directory(path, self.__workdir):
            return path

        new_path = self.__new_dataset_path()
        if purge_source:
            try:
                shutil.move(path, new_path)
                # Drop .info file if it exists
                try:
                    os.remove(f"{path}.info")
                except FileNotFoundError:
                    pass
            except OSError as e:
                # We may not have permission to remove converted_path
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(path, new_path)

        return new_path
Example 23
    def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
        """
        Returns list of pairs (source_file, destination) describing path
        to work_dir output file and ultimate destination.
        """
        if not job_working_directory:
            job_working_directory = os.path.abspath( job_wrapper.working_directory )

        # Set up dict of dataset id --> output path; output path can be real or
        # false depending on outputs_to_working_directory
        output_paths = {}
        for dataset_path in job_wrapper.get_output_fnames():
            path = dataset_path.real_path
            if self.app.config.outputs_to_working_directory:
                path = dataset_path.false_path
            output_paths[ dataset_path.dataset_id ] = path

        output_pairs = []
        # Walk job's output associations to find and use from_work_dir attributes.
        job = job_wrapper.get_job()
        job_tool = self.app.toolbox.tools_by_id.get( job.tool_id, None )
        for dataset_assoc in job.output_datasets + job.output_library_datasets:
            for dataset in dataset_assoc.dataset.dataset.history_associations + dataset_assoc.dataset.dataset.library_associations:
                if isinstance( dataset, self.app.model.HistoryDatasetAssociation ):
                    joda = self.sa_session.query( self.app.model.JobToOutputDatasetAssociation ).filter_by( job=job, dataset=dataset ).first()
                    if joda and job_tool:
                        hda_tool_output = job_tool.outputs.get( joda.name, None )
                        if hda_tool_output and hda_tool_output.from_work_dir:
                            # Copy from working dir to HDA.
                            # TODO: move instead of copy to save time?
                            source_file = os.path.join( job_working_directory, hda_tool_output.from_work_dir )
                            destination = job_wrapper.get_output_destination( output_paths[ dataset.dataset_id ] )
                            if in_directory( source_file, job_working_directory ):
                                output_pairs.append( ( source_file, destination ) )
                                log.debug( "Copying %s to %s as directed by from_work_dir" % ( source_file, destination ) )
                            else:
                                # Security violation.
                                log.exception( "from_work_dir specified a location not in the working directory: %s, %s" % ( source_file, job_wrapper.working_directory ) )
        return output_pairs
Example 24
def discovered_file_for_unnamed_output(dataset,
                                       job_working_directory,
                                       parent_identifiers=[]):
    extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR
    target_directory = discover_target_directory(
        extra_file_collector.directory, job_working_directory)
    filename = dataset["filename"]
    # handle link_data_only here, verify filename is in directory if not linking...
    if not dataset.get("link_data_only"):
        path = os.path.join(target_directory, filename)
        if not util.in_directory(path, target_directory):
            raise Exception(
                "Problem with tool configuration, attempting to pull in datasets from outside working directory."
            )
    else:
        path = filename
    return DiscoveredFile(
        path, extra_file_collector,
        JsonCollectedDatasetMatch(dataset,
                                  extra_file_collector,
                                  filename,
                                  path=path,
                                  parent_identifiers=parent_identifiers))
Example 25
 def remote_output_path_rewrite(self, local_path):
     output_type = path_type.OUTPUT
     if in_directory(local_path, self.local_working_directory):
         output_type = path_type.OUTPUT_WORKDIR
     remote_path = self.__remote_path_rewrite(local_path, output_type)
     return remote_path
Example 26
 def exists(self, filename):
     path = os.path.abspath(self.path(filename))
     return os.path.exists(path) and in_directory(path, self.file_dir)
Example 27
 def exists(self, filename):
     path = os.path.abspath(self.path(filename))
     return os.path.exists(path) and in_directory(path, self.file_dir)
Example 28
 def __is_allowed_temp_dir_file(self, app, job, path):
     # grrr.. need to get away from new_file_path - these should be written
     # to job working directory like metadata files.
     in_temp_dir = util.in_directory(path, app.config.new_file_path)
     return in_temp_dir and os.path.split(path)[-1].startswith(
         "GALAXY_VERSION_")
Example 29
 def metadata_path_on_compute(path):
     compute_path = path
     if compute_tmp_dir and tmp_dir and in_directory(path, tmp_dir):
         path_relative = os.path.relpath(path, tmp_dir)
         compute_path = os.path.join(compute_tmp_dir, path_relative)
     return compute_path
Example 30
 def __in_working_directory( self, job, path, app ):
     working_directory = app.object_store.get_filename(job, base_dir='job_work', dir_only=True, extra_dir=str(job.id))
     return util.in_directory( path, working_directory )
Example 31
 def __is_allowed_temp_dir_file( self, app, job, path ):
     # grrr.. need to get away from new_file_path - these should be written
     # to job working directory like metadata files.
     in_temp_dir = util.in_directory( path, app.config.new_file_path )
     return in_temp_dir and os.path.split( path )[ -1 ].startswith( "GALAXY_VERSION_")
Example 32
def verify_is_in_directory(path, directory, local_path_module=os.path):
    if not in_directory(path, directory, local_path_module):
        msg = "Attempt to read or write file outside an authorized directory."
        log.warn("%s Attempted path: %s, valid directory: %s" %
                 (msg, path, directory))
        raise Exception(msg)
Example 33
 def remote_output_path_rewrite(self, local_path):
     output_type = path_type.OUTPUT
     if in_directory(local_path, self.local_working_directory):
         output_type = path_type.OUTPUT_WORKDIR
     remote_path = self.__remote_path_rewrite(local_path, output_type)
     return remote_path
Example 34
def verify_is_in_directory(path, directory, local_path_module=os.path):
    if not in_directory(path, directory, local_path_module):
        msg = "Attempt to read or write file outside an authorized directory."
        log.warn("%s Attempted path: %s, valid directory: %s" % (msg, path, directory))
        raise Exception(msg)
Example 35
def collect_primary_datasets( tool, output, job_working_directory, input_ext ):
    app = tool.app
    sa_session = tool.sa_session
    new_primary_datasets = {}
    try:
        json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' )
        for line in json_file:
            line = json.loads( line )
            if line.get( 'type' ) == 'new_primary_dataset':
                new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line
    except Exception:
        # This should not be considered an error or warning condition; this file is optional
        pass
    # Loop through output file names, looking for generated primary
    # datasets in form of:
    #     'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)'
    primary_datasets = {}
    for name, outdata in output.items():
        dataset_collectors = tool.outputs[ name ].dataset_collectors if name in tool.outputs else [ DEFAULT_DATASET_COLLECTOR ]
        filenames = odict.odict()
        if 'new_file_path' in app.config.collect_outputs_from:
            if DEFAULT_DATASET_COLLECTOR in dataset_collectors:
                # 'new_file_path' collection should be considered deprecated,
                # only use old-style matching (glob instead of regex and only
                # using default collector - if enabled).
                for filename in glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id) ):
                    filenames[ filename ] = DEFAULT_DATASET_COLLECTOR
        if 'job_working_directory' in app.config.collect_outputs_from:
            for extra_file_collector in dataset_collectors:
                directory = job_working_directory
                if extra_file_collector.directory:
                    directory = os.path.join( directory, extra_file_collector.directory )
                    if not util.in_directory( directory, job_working_directory ):
                        raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." )
                if not os.path.isdir( directory ):
                    continue
                for filename in os.listdir( directory ):
                    path = os.path.join( directory, filename )
                    if not os.path.isfile( path ):
                        continue
                    if extra_file_collector.match( outdata, filename ):
                        filenames[ path ] = extra_file_collector
        for filename, extra_file_collector in filenames.iteritems():
            if not name in primary_datasets:
                primary_datasets[name] = {}
            fields_match = extra_file_collector.match( outdata, os.path.basename( filename ) )
            if not fields_match:
                # Before I guess pop() would just have thrown an IndexError
                raise Exception( "Problem parsing metadata fields for file %s" % filename )
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            if ext == "input":
                ext = input_ext
            dbkey = fields_match.dbkey
            # Create new primary dataset
            primary_data = app.model.HistoryDatasetAssociation( extension=ext,
                                                                designation=designation,
                                                                visible=visible,
                                                                dbkey=dbkey,
                                                                create_dataset=True,
                                                                sa_session=sa_session )
            app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset )
            sa_session.add( primary_data )
            sa_session.flush()
            # Move data from temp location to dataset location
            app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True)
            primary_data.set_size()
            # If match specified a name, use it; otherwise generate one from
            # the designation.
            primary_data.name = fields_match.name or "%s (%s)" % ( outdata.name, designation )
            primary_data.info = outdata.info
            primary_data.init_meta( copy_from=outdata )
            primary_data.dbkey = dbkey
            # Associate new dataset with job
            job = None
            for assoc in outdata.creating_job_associations:
                job = assoc.job
                break
            if job:
                assoc = app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data )
                assoc.job = job
                sa_session.add( assoc )
                sa_session.flush()
            primary_data.state = outdata.state
            #add tool/metadata provided information
            new_primary_datasets_attributes = new_primary_datasets.get( os.path.split( filename )[-1], {} )
            if new_primary_datasets_attributes:
                dataset_att_by_name = dict( ext='extension' )
                for att_set in [ 'name', 'info', 'ext', 'dbkey' ]:
                    dataset_att_name = dataset_att_by_name.get( att_set, att_set )
                    setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) )
                extra_files_path = new_primary_datasets_attributes.get( 'extra_files', None )
                if extra_files_path:
                    extra_files_path_joined = os.path.join( job_working_directory, extra_files_path )
                    for root, dirs, files in os.walk( extra_files_path_joined ):
                        extra_dir = os.path.join( primary_data.extra_files_path, root.replace( extra_files_path_joined, '', 1 ).lstrip( os.path.sep ) )
                        for f in files:
                            app.object_store.update_from_file( primary_data.dataset,
                                extra_dir=extra_dir,
                                alt_name=f,
                                file_name=os.path.join( root, f ),
                                create=True,
                                dir_only=True,
                                preserve_symlinks=True
                            )
                    # FIXME: 
                    # since these are placed into the job working dir, let the standard
                    # Galaxy cleanup methods handle this (for now?)
                    # there was an extra_files_path dir, attempt to remove it
                    #shutil.rmtree( extra_files_path_joined )
            metadata_dict = new_primary_datasets_attributes.get( 'metadata', None )
            if metadata_dict:
                primary_data.metadata.from_JSON_dict( json_dict=metadata_dict )
            else:
                primary_data.set_meta()
            primary_data.set_peek()
            sa_session.add( primary_data )
            sa_session.flush()
            outdata.history.add_dataset( primary_data )
            # Add dataset to return dict
            primary_datasets[name][designation] = primary_data
            # Need to update all associated output hdas, i.e. history was
            # shared with job running
            for dataset in outdata.dataset.history_associations:
                if outdata == dataset:
                    continue
                new_data = primary_data.copy()
                dataset.history.add_dataset( new_data )
                sa_session.add( new_data )
                sa_session.flush()
    return primary_datasets
Example 36
def collect_primary_datasets(tool, output, job_working_directory, input_ext):
    app = tool.app
    sa_session = tool.sa_session
    new_primary_datasets = {}
    try:
        json_file = open(
            os.path.join(job_working_directory,
                         jobs.TOOL_PROVIDED_JOB_METADATA_FILE), 'r')
        for line in json_file:
            line = json.loads(line)
            if line.get('type') == 'new_primary_dataset':
                new_primary_datasets[os.path.split(
                    line.get('filename'))[-1]] = line
    except Exception:
        # This should not be considered an error or warning condition; this file is optional
        pass
    # Loop through output file names, looking for generated primary
    # datasets in form of:
    #     'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)'
    primary_datasets = {}
    for name, outdata in output.items():
        dataset_collectors = tool.outputs[
            name].dataset_collectors if name in tool.outputs else [
                DEFAULT_DATASET_COLLECTOR
            ]
        filenames = odict.odict()
        if 'new_file_path' in app.config.collect_outputs_from:
            if DEFAULT_DATASET_COLLECTOR in dataset_collectors:
                # 'new_file_path' collection should be considered deprecated,
                # only use old-style matching (glob instead of regex and only
                # using default collector - if enabled).
                for filename in glob.glob(
                        os.path.join(app.config.new_file_path,
                                     "primary_%i_*" % outdata.id)):
                    filenames[filename] = DEFAULT_DATASET_COLLECTOR
        if 'job_working_directory' in app.config.collect_outputs_from:
            for extra_file_collector in dataset_collectors:
                directory = job_working_directory
                if extra_file_collector.directory:
                    directory = os.path.join(directory,
                                             extra_file_collector.directory)
                    if not util.in_directory(directory, job_working_directory):
                        raise Exception(
                            "Problem with tool configuration, attempting to pull in datasets from outside working directory."
                        )
                if not os.path.isdir(directory):
                    continue
                for filename in os.listdir(directory):
                    path = os.path.join(directory, filename)
                    if not os.path.isfile(path):
                        continue
                    if extra_file_collector.match(outdata, filename):
                        filenames[path] = extra_file_collector
        for filename, extra_file_collector in filenames.iteritems():
            if not name in primary_datasets:
                primary_datasets[name] = {}
            fields_match = extra_file_collector.match(
                outdata, os.path.basename(filename))
            if not fields_match:
                # Before I guess pop() would just have thrown an IndexError
                raise Exception("Problem parsing metadata fields for file %s" %
                                filename)
            designation = fields_match.designation
            visible = fields_match.visible
            ext = fields_match.ext
            if ext == "input":
                ext = input_ext
            dbkey = fields_match.dbkey
            # Create new primary dataset
            primary_data = app.model.HistoryDatasetAssociation(
                extension=ext,
                designation=designation,
                visible=visible,
                dbkey=dbkey,
                create_dataset=True,
                sa_session=sa_session)
            app.security_agent.copy_dataset_permissions(
                outdata.dataset, primary_data.dataset)
            sa_session.add(primary_data)
            sa_session.flush()
            # Move data from temp location to dataset location
            app.object_store.update_from_file(primary_data.dataset,
                                              file_name=filename,
                                              create=True)
            primary_data.set_size()
            # If match specified a name, use it; otherwise generate one from
            # the designation.
            primary_data.name = fields_match.name or "%s (%s)" % (outdata.name,
                                                                  designation)
            primary_data.info = outdata.info
            primary_data.init_meta(copy_from=outdata)
            primary_data.dbkey = dbkey
            # Associate new dataset with job
            job = None
            for assoc in outdata.creating_job_associations:
                job = assoc.job
                break
            if job:
                assoc = app.model.JobToOutputDatasetAssociation(
                    '__new_primary_file_%s|%s__' % (name, designation),
                    primary_data)
                assoc.job = job
                sa_session.add(assoc)
                sa_session.flush()
            primary_data.state = outdata.state
            #add tool/metadata provided information
            new_primary_datasets_attributes = new_primary_datasets.get(
                os.path.split(filename)[-1], {})
            if new_primary_datasets_attributes:
                dataset_att_by_name = dict(ext='extension')
                for att_set in ['name', 'info', 'ext', 'dbkey']:
                    dataset_att_name = dataset_att_by_name.get(
                        att_set, att_set)
                    setattr(
                        primary_data, dataset_att_name,
                        new_primary_datasets_attributes.get(
                            att_set, getattr(primary_data, dataset_att_name)))
                extra_files_path = new_primary_datasets_attributes.get(
                    'extra_files', None)
                if extra_files_path:
                    extra_files_path_joined = os.path.join(
                        job_working_directory, extra_files_path)
                    for root, dirs, files in os.walk(extra_files_path_joined):
                        extra_dir = os.path.join(
                            primary_data.extra_files_path,
                            root.replace(extra_files_path_joined, '',
                                         1).lstrip(os.path.sep))
                        for f in files:
                            app.object_store.update_from_file(
                                primary_data.dataset,
                                extra_dir=extra_dir,
                                alt_name=f,
                                file_name=os.path.join(root, f),
                                create=True,
                                dir_only=True,
                                preserve_symlinks=True)
                    # FIXME:
                    # since these are placed into the job working dir, let the standard
                    # Galaxy cleanup methods handle this (for now?)
                    # there was an extra_files_path dir, attempt to remove it
                    #shutil.rmtree( extra_files_path_joined )
            metadata_dict = new_primary_datasets_attributes.get(
                'metadata', None)
            if metadata_dict:
                primary_data.metadata.from_JSON_dict(json_dict=metadata_dict)
            else:
                primary_data.set_meta()
            primary_data.set_peek()
            sa_session.add(primary_data)
            sa_session.flush()
            outdata.history.add_dataset(primary_data)
            # Add dataset to return dict
            primary_datasets[name][designation] = primary_data
            # Need to update all associated output hdas, i.e. history was
            # shared with job running
            for dataset in outdata.dataset.history_associations:
                if outdata == dataset:
                    continue
                new_data = primary_data.copy()
                dataset.history.add_dataset(new_data)
                sa_session.add(new_data)
                sa_session.flush()
    return primary_datasets