Exemple #1
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)

    try:
        ext = dataset.file_type
    except AttributeError:
        file_err(
            'Unable to process uploaded file, missing file_type parameter.',
            dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urllib.urlopen(
                dataset.path)  #page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(
                page,
                prefix='url_paste',
                source_encoding=util.get_charset_from_http_headers(
                    page.headers))
        except Exception, e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)),
                     dataset, json_file)
            return
        dataset.path = temp_name
Exemple #2
0
def stream_url_to_file(path):
    page = urlopen(path)  # page will be .close()ed in stream_to_file
    temp_name = stream_to_file(
        page,
        prefix='url_paste',
        source_encoding=util.get_charset_from_http_headers(page.headers))
    return temp_name
Exemple #3
0
def stream_url_to_file(path, file_sources=None):
    prefix = "url_paste"
    if file_sources and file_sources.looks_like_uri(path):
        file_source_path = file_sources.get_file_source_path(path)
        _, temp_name = tempfile.mkstemp(prefix=prefix)
        os.close(_)
        file_source_path.file_source.realize_to(file_source_path.path, temp_name)
        return temp_name
    else:
        page = urllib.request.urlopen(path)  # page will be .close()ed in stream_to_file
        temp_name = stream_to_file(page, prefix=prefix, source_encoding=util.get_charset_from_http_headers(page.headers))
        return temp_name
Exemple #4
0
def stream_url_to_file(path, file_sources=None):
    prefix = "url_paste"
    if file_sources and file_sources.looks_like_uri(path):
        file_source_path = file_sources.get_file_source_path(path)
        with tempfile.NamedTemporaryFile(prefix=prefix, delete=False) as temp:
            temp_name = temp.name
        file_source_path.file_source.realize_to(file_source_path.path, temp_name)
        return temp_name
    else:
        page = urllib.request.urlopen(path, timeout=util.DEFAULT_SOCKET_TIMEOUT)  # page will be .close()ed in stream_to_file
        temp_name = stream_to_file(page, prefix=prefix, source_encoding=util.get_charset_from_http_headers(page.headers))
        return temp_name
Exemple #5
0
def url_upload(dataset):
    """
    @summary: Upload an URL file and update dataset information.
    @param dataset: [Bunch] The dataset for the uploaded file.
    """
    url = dataset.path
    page = urllib.urlopen(url)
    temp_name, dataset.is_multi_byte = sniff.stream_to_file(
        page,
        prefix='url_paste',
        source_encoding=util.get_charset_from_http_headers(page.headers))
    dataset.path = temp_name
    dataset.name = url.split('/')[-1]
Exemple #6
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get("link_data_only", "copy_files")
    in_place = dataset.get("in_place", True)

    try:
        ext = dataset.file_type
    except AttributeError:
        file_err("Unable to process uploaded file, missing file_type parameter.", dataset, json_file)
        return

    if dataset.type == "url":
        try:
            page = urllib.urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(
                page, prefix="url_paste", source_encoding=util.get_charset_from_http_headers(page.headers)
            )
        except Exception, e:
            file_err("Unable to fetch %s\n%s" % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
Exemple #7
0
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int( sys.argv[2] )
    except:
        max_file_size = 0

    job_params, params = load_input_parameters( filename )
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict( param_dict=params )
        job_params[ 'output_data' ] = [ dict( out_data_name='output',
                                              ext='data',
                                              file_name=filename,
                                              extra_files_path=None ) ]
        job_params[ 'job_config' ] = dict( GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE )
    else:
        enhanced_handling = True
        json_file = open( job_params[ 'job_config' ][ 'TOOL_PROVIDED_JOB_METADATA_FILE' ], 'w' )  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes( root_dir=job_params[ 'job_config' ][ 'GALAXY_ROOT_DIR' ], config=job_params[ 'job_config' ][ 'GALAXY_DATATYPES_CONF_FILE' ] )

    URL = params.get( 'URL', None )  # using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout( 600 )

    for data_dict in job_params[ 'output_data' ]:
        cur_filename = data_dict.get( 'file_name', filename )
        cur_URL = params.get( '%s|%s|URL' % ( GALAXY_PARAM_PREFIX, data_dict[ 'out_data_name' ] ), URL )
        if not cur_URL:
            open( cur_filename, 'w' ).write( "" )
            stop_err( 'The remote data source application has not sent back a URL parameter in the request.' )

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen( cur_URL )
            elif URL_method == 'post':
                page = urlopen( cur_URL, urlencode( params ) )
        except Exception as e:
            stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
        if max_file_size:
            file_size = int( page.info().get( 'Content-Length', 0 ) )
            if file_size > max_file_size:
                stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, max_file_size ) )
        # do sniff stream for multi_byte
        try:
            cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( cur_filename, os.O_WRONLY | os.O_CREAT ), cur_filename, source_encoding=get_charset_from_http_headers( page.headers ) )
        except Exception as e:
            stop_err( 'Unable to fetch %s:\n%s' % ( cur_URL, e ) )

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file( filename, datatypes_registry, ext=data_dict[ 'ext' ], is_multi_byte=is_multi_byte )
            except Exception as e:
                stop_err( str( e ) )
            info = dict( type='dataset',
                         dataset_id=data_dict[ 'dataset_id' ],
                         ext=ext)

            json_file.write( "%s\n" % dumps( info ) )
Exemple #8
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    run_as_real_user = in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    # in_place is True if there is no external chmod in place,
    # however there are other instances where modifications should not occur in_place:
    # when a file is added from a directory on the local file system (ftp import folder or any other path).
    if dataset.type in ('server_dir', 'path_paste', 'ftp_import'):
        in_place = False
    check_content = dataset.get('check_content' , True)
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid and auto_decompress:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid and auto_decompress:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if check_content and not Binary.is_ext_unsniffable(ext):
                            file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
                    return
            if data_type != 'binary':
                if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'):
                    # Convert universal line endings to Posix line endings if to_posix_lines is True
                    # and the data is not binary or gzip-, bz2- or zip-compressed.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif link_data_only == 'copy_files':
        if purge_source and not run_as_real_user:
            # if the upload tool runs as a real user the real user
            # can't move dataset.path as this path is owned by galaxy.
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemple #9
0
def stream_url_to_file(path):
    page = urlopen(path)  # page will be .close()ed in stream_to_file
    temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
    return temp_name
Exemple #10
0
def __main__():
    filename = sys.argv[1]
    try:
        max_file_size = int(sys.argv[2])
    except Exception:
        max_file_size = 0

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict(param_dict=params)
        job_params['output_data'] = [dict(out_data_name='output',
                                          ext='data',
                                          file_name=filename,
                                          extra_files_path=None)]
        job_params['job_config'] = dict(GALAXY_ROOT_DIR=GALAXY_ROOT_DIR, GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE, TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        enhanced_handling = True
        json_file = open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w')  # specially named file for output junk to pass onto set metadata

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(root_dir=job_params['job_config']['GALAXY_ROOT_DIR'], config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    URL = params.get('URL', None)  # using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get('URL_method', None)

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout(600)

    for data_dict in job_params['output_data']:
        cur_filename = data_dict.get('file_name', filename)
        cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
        if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'):
            open(cur_filename, 'w').write("")
            stop_err('The remote data source application has not sent back a URL parameter in the request.')

        # The following calls to urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urlopen(cur_URL)
            elif URL_method == 'post':
                page = urlopen(cur_URL, urlencode(params).encode("utf-8"))
        except Exception as e:
            stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))
        if max_file_size:
            file_size = int(page.info().get('Content-Length', 0))
            if file_size > max_file_size:
                stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))
        try:
            cur_filename = sniff.stream_to_open_named_file(page, os.open(cur_filename, os.O_WRONLY | os.O_CREAT), cur_filename, source_encoding=get_charset_from_http_headers(page.headers))
        except Exception as e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

        # here import checks that upload tool performs
        if enhanced_handling:
            try:
                ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'])
            except Exception as e:
                stop_err(str(e))
            info = dict(type='dataset',
                        dataset_id=data_dict['dataset_id'],
                        ext=ext)

            json_file.write("%s\n" % dumps(info))
Exemple #11
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content' , True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)
    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')
    # Is dataset content supported sniffable binary?
    is_binary = check_binary(dataset.path)
    if is_binary:
        # Sniff the data type
        guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        # Set data_type only if guessed_ext is a binary datatype
        datatype = registry.get_datatype_by_extension(guessed_ext)
        if isinstance(datatype, Binary):
            data_type = guessed_ext
            ext = guessed_ext
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
            elif is_gzipped and is_valid and auto_decompress:
                if not link_data_only:
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            raise UploadProblemException('Problem decompressing gzipped data')
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
                elif is_bzipped and is_valid and auto_decompress:
                    if not link_data_only:
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                raise UploadProblemException('Problem decompressing bz2 compressed data')
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if not link_data_only:
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                        if check_content and not is_ext_unsniffable_binary:
                            raise UploadProblemException('The uploaded binary file contains inappropriate content')
                        elif is_ext_unsniffable_binary and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                            raise UploadProblemException(err_msg)
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    raise UploadProblemException('The uploaded file contains inappropriate HTML content')
            if data_type != 'binary':
                if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
                    # Convert universal line endings to Posix line endings if to_posix_lines is True
                    # and the data is not binary or gzip-, bz2- or zip-compressed.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemple #12
0
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err(
            'Unable to process uploaded file, missing file_type parameter.',
            dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(
                dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(
                page,
                prefix='url_paste',
                source_encoding=util.get_charset_from_http_headers(
                    page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)),
                     dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path,
                 dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(
                codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path)
            if is_gzipped and not is_valid:
                file_err(
                    'The gzipped uploaded file contains inappropriate content',
                    dataset, json_file)
                return
            elif is_gzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2**20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(
                        prefix='data_id_%s_upload_gunzip_' %
                        dataset.dataset_id,
                        dir=os.path.dirname(output_path),
                        text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data',
                                     dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ('server_dir',
                                        'path_paste') or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type and bz2 is not None:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path)
                if is_bzipped and not is_valid:
                    file_err(
                        'The gzipped uploaded file contains inappropriate content',
                        dataset, json_file)
                    return
                elif is_bzipped and is_valid:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2**20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(
                            prefix='data_id_%s_upload_bunzip2_' %
                            dataset.dataset_id,
                            dir=os.path.dirname(output_path),
                            text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err(
                                    'Problem decompressing bz2 compressed data',
                                    dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if dataset.type in ('server_dir',
                                            'path_paste') or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2**20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(
                                prefix='data_id_%s_upload_zip_' %
                                dataset.dataset_id,
                                dir=os.path.dirname(output_path),
                                text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err(
                                            'Problem decompressing zipped data',
                                            dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err(
                                        'Problem decompressing zipped data',
                                        dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if dataset.type in ('server_dir',
                                                'path_paste') or not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(
                        dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if not Binary.is_ext_unsniffable(ext):
                            file_err(
                                'The uploaded binary file contains inappropriate content',
                                dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(
                                ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (
                                ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_html(dataset.path):
                    file_err(
                        'The uploaded file contains inappropriate HTML content',
                        dataset, json_file)
                    return
            if data_type != 'binary':
                if link_data_only == 'copy_files':
                    if dataset.type in ('server_dir',
                                        'path_paste') and data_type not in [
                                            'gzip', 'bz2', 'zip'
                                        ]:
                        in_place = False
                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                    # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
                    # corrupting the content of those files.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir',
                        'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and dataset.type in (
            'server_dir',
            'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy(converted_path, output_path)
            try:
                os.remove(converted_path)
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy(dataset.path, output_path)
    elif link_data_only == 'copy_files':
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")

    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
Exemple #13
0
def url_upload(dataset):
    """
    @summary: Upload an URL file and update dataset information.
    @param dataset: [Bunch] The dataset for the uploaded file.
    """
    url = dataset.path
    page = urllib.urlopen( url )
    temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers ) )
    dataset.path = temp_name
    dataset.name = url.split('/')[-1]