Example #1
0
def looks_like_xml(path, regex=TOOL_REGEX):
    """Quick check whether the file at ``path`` looks like an XML file matching ``regex``.

    :param path: path to the candidate file.
    :param regex: compiled regular expression searched against the start of
        the file (defaults to ``TOOL_REGEX``).
    :returns: True if the file has an ``.xml`` extension, is non-empty,
        is not binary/image/compressed, decodes as UTF-8 and matches
        ``regex`` within its first 5 KiB; False otherwise.
    """
    full_path = os.path.abspath(path)

    # Cheap extension check before touching file contents.
    if not full_path.endswith(".xml"):
        return False

    # An empty file cannot contain the expected XML markup.
    if not os.path.getsize(full_path):
        return False

    # Binary, image and compressed files are not readable XML.
    if (checkers.check_binary(full_path) or
            checkers.check_image(full_path) or
            checkers.is_gzip(full_path) or
            checkers.is_bz2(full_path) or
            checkers.is_zip(full_path)):
        return False

    # Only scan the first 5 KiB: the identifying markup appears near the top.
    # Open full_path (previously this opened the raw `path` while every check
    # above used full_path) and treat undecodable content as "not XML".
    with open(full_path, encoding='utf-8') as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
        if regex.search(start_contents):
            return True

    return False
Example #2
0
def get_repository_file_contents(app,
                                 file_path,
                                 repository_id,
                                 is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser.

    :param app: application object passed through to is_path_browsable.
    :param file_path: absolute path of the file to render.
    :param repository_id: encoded id of the owning repository.
    :param is_admin: whether the requesting user is an administrator.
    :returns: an HTML-escaped string of the file contents (possibly
        truncated/shrunk), a short placeholder for compressed/binary files,
        or 'Invalid file path' when the path is outside the repository.
    """
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning(
            'Request tries to access a file outside of the repository location. File path: %s',
            file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        return 'link to: ' + basic_util.to_html_string(os.readlink(file_path))
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.is_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        safe_str = ''
        # Use a context manager so the handle is closed; the original leaked
        # the file object opened directly in the loop header.
        with open(file_path) as fh:
            for line in fh:
                safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
                # Stop reading after string is larger than MAX_CONTENT_SIZE.
                if len(safe_str) > MAX_CONTENT_SIZE:
                    large_str = \
                        '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                        util.nice_size(MAX_CONTENT_SIZE)
                    safe_str = '%s%s' % (safe_str, large_str)
                    break

        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size(basic_util.MAX_DISPLAY_SIZE)
            safe_str = util.shrink_string_by_size(safe_str,
                                                  basic_util.MAX_DISPLAY_SIZE,
                                                  join_by=join_by_str,
                                                  left_larger=True,
                                                  beginning_on_size_error=True)
        return safe_str
def get_repository_file_contents(app, file_path, repository_id, is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser.

    :param app: application object passed through to is_path_browsable.
    :param file_path: absolute path of the file to render.
    :param repository_id: encoded id of the owning repository.
    :param is_admin: whether the requesting user is an administrator.
    :returns: an HTML-escaped string of the file contents (possibly
        truncated/shrunk), a short placeholder for compressed/binary files,
        or 'Invalid file path' when the path is outside the repository.
    """
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning('Request tries to access a file outside of the repository location. File path: %s', file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        return 'link to: ' + basic_util.to_html_string(os.readlink(file_path))
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.is_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        safe_str = ''
        # Close the file deterministically; the previous version leaked the
        # handle opened directly in the for-loop header.
        with open(file_path) as fh:
            for line in fh:
                safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
                # Stop reading after string is larger than MAX_CONTENT_SIZE.
                if len(safe_str) > MAX_CONTENT_SIZE:
                    large_str = \
                        '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                        util.nice_size(MAX_CONTENT_SIZE)
                    safe_str = '%s%s' % (safe_str, large_str)
                    break

        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size(basic_util.MAX_DISPLAY_SIZE)
            safe_str = util.shrink_string_by_size(safe_str,
                                                  basic_util.MAX_DISPLAY_SIZE,
                                                  join_by=join_by_str,
                                                  left_larger=True,
                                                  beginning_on_size_error=True)
        return safe_str
Example #4
0
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file.

    :param path: path to the candidate file.
    :returns: True if the file has an ``.xml`` extension, is non-empty, is
        not binary/image/compressed, decodes as UTF-8 and matches
        ``TOOL_REGEX`` within its first 5 KiB; False otherwise.
    """
    full_path = os.path.abspath(path)

    # Cheap extension check before touching file contents.
    if not full_path.endswith(".xml"):
        return False

    # An empty file cannot be a tool definition.
    if not os.path.getsize(full_path):
        return False

    # Binary, image and compressed files are not readable XML.
    if (checkers.check_binary(full_path) or checkers.check_image(full_path)
            or checkers.is_gzip(full_path) or checkers.is_bz2(full_path)
            or checkers.is_zip(full_path)):
        return False

    # Only scan the first 5 KiB: the identifying markup appears near the top.
    # Read explicitly as UTF-8 and treat undecodable content as "not a tool"
    # instead of raising UnicodeDecodeError (the binary check above can miss
    # non-UTF-8 text files).
    with open(full_path, encoding="utf-8") as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
        if TOOL_REGEX.search(start_contents):
            return True

    return False
Example #5
0
def looks_like_xml(path, regex=TOOL_REGEX):
    """Quick check whether the file at ``path`` looks like an XML file matching ``regex``.

    :param path: path to the candidate file.
    :param regex: compiled regular expression searched against the start of
        the file (defaults to ``TOOL_REGEX``).
    :returns: True if the file has an ``.xml`` extension, is non-empty, is
        not binary/image/compressed, decodes as UTF-8 and matches ``regex``
        within its first 5 KiB; False otherwise.
    """
    full_path = os.path.abspath(path)

    # Cheap extension check before touching file contents.
    if not full_path.endswith(".xml"):
        return False

    # An empty file cannot contain the expected XML markup.
    if not os.path.getsize(full_path):
        return False

    # Binary, image and compressed files are not readable XML.
    if (checkers.check_binary(full_path) or
            checkers.check_image(full_path) or
            checkers.is_gzip(full_path) or
            checkers.is_bz2(full_path) or
            checkers.is_zip(full_path)):
        return False

    # Only scan the first 5 KiB: the identifying markup appears near the top.
    # Read explicitly as UTF-8 and treat undecodable content as "not XML"
    # instead of raising UnicodeDecodeError.
    with open(full_path, encoding="utf-8") as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
        if regex.search(start_contents):
            return True

    return False
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file.

    :param path: path to the candidate file.
    :returns: True if the file has an ``.xml`` extension, is non-empty, is
        not binary/image/compressed, decodes as UTF-8 and matches
        ``TOOL_REGEX`` within its first 5 KiB; False otherwise.
    """
    full_path = os.path.abspath(path)

    # Cheap extension check before touching file contents.
    if not full_path.endswith(".xml"):
        return False

    # An empty file cannot be a tool definition.
    if not os.path.getsize(full_path):
        return False

    # Binary, image and compressed files are not readable XML.
    if (checkers.check_binary(full_path) or
            checkers.check_image(full_path) or
            checkers.is_gzip(full_path) or
            checkers.is_bz2(full_path) or
            checkers.is_zip(full_path)):
        return False

    # Only scan the first 5 KiB: the identifying markup appears near the top.
    # Read explicitly as UTF-8 and treat undecodable content as "not a tool"
    # instead of raising UnicodeDecodeError.
    with open(full_path, encoding="utf-8") as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
        if TOOL_REGEX.search(start_contents):
            return True

    return False
Example #7
0
def is_data_index_sample_file(file_path):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Tabular files make up most data index files, so test that first and
    # assume a tabular file is safe to copy.
    if is_column_based(file_path):
        return True
    # HTML, image and binary content should never be copied.
    if checkers.check_html(file_path) or checkers.check_image(file_path):
        return False
    if checkers.check_binary(name=file_path):
        return False
    # Compressed archives are also disqualified.
    if any(detect(file_path) for detect in (checkers.is_bz2,
                                            checkers.is_gzip,
                                            checkers.is_zip)):
        return False
    # Nothing disqualified the file, so default to copying it.
    return True
Example #8
0
def is_data_index_sample_file(file_path):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Most data index files are tabular; a tabular file is assumed copyable.
    if is_column_based(file_path):
        return True
    # Run each disqualifying content check in order; the first hit rejects
    # the file.  check_binary is called with its keyword argument, matching
    # its signature.
    disqualifiers = (
        checkers.check_html,
        checkers.check_image,
        lambda p: checkers.check_binary(name=p),
        checkers.is_bz2,
        checkers.is_gzip,
        checkers.is_zip,
    )
    for detect in disqualifiers:
        if detect(file_path):
            return False
    # No check objected, so default to copying the file.
    return True
Example #9
0
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    """Decompress (if applicable), sniff and validate one uploaded file.

    :param registry: datatypes registry used for sniffing and extension lookup.
    :param path: path of the uploaded temporary file.
    :param requested_ext: datatype extension requested by the user ('auto' to sniff).
    :param name: original name of the uploaded file.
    :param tmp_prefix: prefix for temporary files created during conversion.
    :param tmp_dir: directory for temporary files created during conversion.
    :param check_content: whether content-based security/validation checks run.
    :param link_data_only: True when the file is linked rather than copied into Galaxy.
    :param in_place: whether conversions may modify the file in place.
    :param auto_decompress: whether compressed uploads may be decompressed before sniffing.
    :param convert_to_posix_lines: whether line endings are normalized.
    :param convert_spaces_to_tabs: whether runs of spaces become tabs.
    :returns: tuple ``(stdout, ext, datatype, is_binary, converted_path)``.
    :raises UploadProblemException: when the content is deemed inappropriate.
    """
    stdout = None
    converted_path = None
    # Record the multi-file-ZIP condition here and emit the warning only at
    # the end: previously the warning was assigned before sniffing, so later
    # warnings silently overwrote it, and it fired even when the final
    # datatype is itself a compressed type.
    multi_file_zip = False

    # Does the first 1K contain a null?
    is_binary = check_binary(path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            multi_file_zip = True
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip(
                    '.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype,
                                     'sniff') and not datatype.sniff(path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=requested_ext))

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)
    # Only warn about multi-file ZIPs when the resolved datatype is not a
    # compressed type that keeps the archive intact.
    if multi_file_zip and not getattr(datatype, 'compressed', False):
        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'

    return stdout, ext, datatype, is_binary, converted_path
Example #10
0
def add_file(dataset, registry, output_path):
    """Process one uploaded dataset and place its content at ``output_path``.

    :param dataset: upload request bunch (path, file_type, name, flags, ...).
    :param registry: datatypes registry used for sniffing and extension lookup.
    :param output_path: destination path for the dataset's final content.
    :returns: dict describing the resulting dataset (type, id, ext, stdout, ...).
    :raises UploadProblemException: for misconfigured requests, missing or
        empty files, failed URL fetches, inappropriate content and linked
        data that would require grooming.
    """
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException(
            "Invalid setting '%s' for option link_data_only - upload request misconfigured"
            % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get(
        "in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get(
        'purge_source',
        True) and not run_as_real_user and dataset.type not in ('server_dir',
                                                                'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir',
                                                             'path_paste',
                                                             'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException(
            'Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' %
                                         (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException(
            'Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(
                    dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled.  Pop the env var in a finally block so a raising sniffer
        # does not leave validate mode enabled process-wide.
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        try:
            ext = sniff.guess_ext(dataset.path,
                                  registry.sniff_order,
                                  is_binary=is_binary)
        finally:
            os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')
    else:
        # Fix: when linking data with an explicitly chosen file type, `ext`
        # was previously never assigned (remained None), breaking the
        # datatype lookup below.  Mirror handle_upload's behavior.
        ext = dataset.file_type

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        try:
            if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
                stdout = (
                    "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                    " type".format(ext=dataset.file_type))
        finally:
            os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(
            datatype, 'compressed',
            False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
    return info
Example #11
0
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    """Decompress (if applicable), sniff and validate one uploaded file.

    :param registry: datatypes registry used for sniffing and extension lookup.
    :param path: path of the uploaded temporary file.
    :param requested_ext: datatype extension requested by the user ('auto' to sniff).
    :param name: original name of the uploaded file.
    :param tmp_prefix: prefix for temporary files created during conversion.
    :param tmp_dir: directory for temporary files created during conversion.
    :param check_content: whether content-based validation checks run.
    :param link_data_only: True when the file is linked rather than copied into Galaxy.
    :param in_place: whether conversions may modify the file in place.
    :param auto_decompress: whether compressed uploads may be decompressed before sniffing.
    :param convert_to_posix_lines: whether line endings are normalized.
    :param convert_spaces_to_tabs: whether runs of spaces become tabs.
    :returns: tuple ``(stdout, ext, datatype, is_binary, converted_path)``;
        ``stdout`` is None or a warning string, ``converted_path`` is None
        when no separate converted file was produced.
    :raises UploadProblemException: when the content is deemed inappropriate.
    """
    stdout = None
    converted_path = None
    # Deferred warning flag: the message is emitted only after the final
    # datatype is known (see the end of this function).
    multi_file_zip = False

    # Does the first 1K contain a null?
    is_binary = check_binary(path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            multi_file_zip = True
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip('.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        # Linked data with auto-detection: sniff the type without conversion.
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        # Linked data with an explicit type: trust the user's selection.
        ext = requested_ext

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=requested_ext))

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                     " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                     " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")

    datatype = registry.get_datatype_by_extension(ext)
    # Emit the multi-file-ZIP warning last so it is not clobbered by the
    # warnings above, and skip it when the datatype keeps the archive intact.
    if multi_file_zip and not getattr(datatype, 'compressed', False):
        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'

    return stdout, ext, datatype, is_binary, converted_path