def test_check_html():
    html_text = '<p>\n<a href="url">Link</a>\n</p>\n'
    assert check_html(html_text, file_path=False)
    # Test a non-HTML binary string
    assert not check_html(b'No HTML here\nSecond line\n', file_path=False)
    with tempfile.NamedTemporaryFile(mode='w') as tmp:
        tmp.write(html_text)
        tmp.flush()
        assert check_html(tmp.name)
    # Test a non-UTF8 binary file
    with tempfile.NamedTemporaryFile(mode='wb') as tmpb:
        tmpb.write(b'\x1f\x8b')
        tmpb.flush()
        assert not check_html(tmpb.name)
def check_file_content_for_html_and_images(file_path):
    message = ''
    if checkers.check_html(file_path):
        message = 'The file "%s" contains HTML content.\n' % str(file_path)
    elif checkers.check_image(file_path):
        message = 'The file "%s" contains image content.\n' % str(file_path)
    return message
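# Hedged usage sketch (not from the Galaxy source): a hypothetical caller that turns the
# message returned by check_file_content_for_html_and_images() into a hard failure.
# The function above returns an empty string when the file contains neither HTML nor image content.
def assert_sample_file_is_safe(file_path):
    message = check_file_content_for_html_and_images(file_path)
    if message:
        raise ValueError(message)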
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')
        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
        if ext in AUTO_DETECT_EXTENSIONS:
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
def handle_uploaded_dataset_file(filename, datatypes_registry, ext='auto', is_multi_byte=False):
    is_valid, ext = handle_compressed_file(filename, datatypes_registry, ext=ext)
    if not is_valid:
        raise InappropriateDatasetContentError('The compressed uploaded file contains inappropriate content.')
    if ext in AUTO_DETECT_EXTENSIONS:
        ext = guess_ext(filename, sniff_order=datatypes_registry.sniff_order, is_multi_byte=is_multi_byte)
    if check_binary(filename):
        if not Binary.is_ext_unsniffable(ext) and not datatypes_registry.get_datatype_by_extension(ext).sniff(filename):
            raise InappropriateDatasetContentError('The binary uploaded file contains inappropriate content.')
    elif check_html(filename):
        raise InappropriateDatasetContentError('The uploaded file contains inappropriate HTML content.')
    return ext
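# Minimal sketch of how handle_uploaded_dataset_file() might be driven by an upload step,
# assuming a configured datatypes registry; the upload_dataset() wrapper and its error
# handling are illustrative assumptions, not part of the Galaxy source.
def upload_dataset(path, registry):
    try:
        # Returns the resolved extension (e.g. 'tabular' or 'bam') after sniffing and content checks.
        return handle_uploaded_dataset_file(path, registry, ext='auto')
    except InappropriateDatasetContentError as e:
        # Raised for disallowed compressed, binary, or HTML content.
        return 'Upload rejected: %s' % e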
def is_data_index_sample_file( file_path ):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Currently most data index files are tabular, so check that first.  We'll assume that
    # if the file is tabular, it's ok to copy.
    if is_column_based( file_path ):
        return True
    # If the file is any of the following, don't copy it.
    if checkers.check_html( file_path ):
        return False
    if checkers.check_image( file_path ):
        return False
    if checkers.check_binary( name=file_path ):
        return False
    if checkers.is_bz2( file_path ):
        return False
    if checkers.is_gzip( file_path ):
        return False
    if checkers.check_zip( file_path ):
        return False
    # Default to copying the file if none of the above are true.
    return True
def is_data_index_sample_file(file_path):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Currently most data index files are tabular, so check that first.  We'll assume that
    # if the file is tabular, it's ok to copy.
    if is_column_based(file_path):
        return True
    # If the file is any of the following, don't copy it.
    if checkers.check_html(file_path):
        return False
    if checkers.check_image(file_path):
        return False
    if checkers.check_binary(name=file_path):
        return False
    if checkers.is_bz2(file_path):
        return False
    if checkers.is_gzip(file_path):
        return False
    if checkers.is_zip(file_path):
        return False
    # Default to copying the file if none of the above are true.
    return True
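# Hedged example of how is_data_index_sample_file() might gate the copying of .sample files
# into a tool-data directory during repository installation; the helper below and the
# destination handling are assumptions for illustration only.
import os
import shutil

def copy_data_index_sample_files(sample_paths, tool_data_path):
    copied = []
    for sample_path in sample_paths:
        # Skip HTML, images, binary content, and compressed archives.
        if is_data_index_sample_file(sample_path):
            shutil.copy(sample_path, tool_data_path)
            copied.append(os.path.basename(sample_path))
    return copied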
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    run_as_real_user = in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    # in_place is True if there is no external chmod in place,
    # however there are other instances where modifications should not occur in_place:
    # when a file is added from a directory on the local file system (ftp import folder or any other path).
    if dataset.type in ('server_dir', 'path_paste', 'ftp_import'):
        in_place = False
    check_content = dataset.get('check_content', True)
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return
    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid and auto_decompress:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    file_err('The bz2 compressed uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid and auto_decompress:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
    if not data_type:
        # TODO refactor this logic.  check_binary isn't guaranteed to be
        # correct since it only looks at whether the first 100 chars are
        # printable or not.  If someone specifies a known unsniffable
        # binary datatype and check_binary fails, the file gets mangled.
        if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
            # We have a binary dataset, but it is not Bam, Sff or Pdf
            data_type = 'binary'
            # binary_ok = False
            parts = dataset.name.split(".")
            if len(parts) > 1:
                ext = parts[-1].strip().lower()
                if check_content and not Binary.is_ext_unsniffable(ext):
                    file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                    return
                elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                    file_err(err_msg, dataset, json_file)
                    return
    if not data_type:
        # We must have a text file
        if check_content and check_html(dataset.path):
            file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
            return
    if data_type != 'binary':
        if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'):
            # Convert universal line endings to Posix line endings if to_posix_lines is True
            # and the data is not binary or gzip-, bz2- or zip-compressed.
            if dataset.to_posix_lines:
                tmpdir = output_adjacent_tmpdir(output_path)
                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                else:
                    line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif link_data_only == 'copy_files':
        if purge_source and not run_as_real_user:
            # if the upload tool runs as a real user the real user
            # can't move dataset.path as this path is owned by galaxy.
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
def _resolve_src(item):
    converted_path = None
    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)
    # Follow upload.py logic but without the auto-decompress logic.
    registry = upload_config.registry
    check_content = upload_config.check_content
    data_type, ext = None, requested_ext
    is_binary = check_binary(path)
    if is_binary:
        data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry)
    if data_type is None:
        root_datatype = registry.get_datatype_by_extension(ext)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = ext
        elif is_binary:
            data_type, ext = handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry)
    if not data_type and check_content and check_html(path):
        raise UploadProblemException('The uploaded file contains inappropriate HTML content')
    if data_type != 'binary':
        if not link_data_only:
            if to_posix_lines:
                if space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".")
                else:
                    line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".")
            else:
                if space_to_tab:
                    line_count, converted_path = sniff.sep2tabs(path, in_place=in_place, tmp_dir=".")
        if requested_ext == 'auto':
            ext = sniff.guess_ext(converted_path or path, registry.sniff_order)
        else:
            ext = requested_ext
        data_type = ext
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and requested_ext:
        ext = requested_ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(path, purge_source, in_place)
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)
    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only}
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))
        dataset.path = temp_name

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)
    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Is dataset content supported sniffable binary?
    is_binary = check_binary(dataset.path)
    if is_binary:
        # Sniff the data type
        guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        # Set data_type only if guessed_ext is a binary datatype
        datatype = registry.get_datatype_by_extension(guessed_ext)
        if isinstance(datatype, Binary):
            data_type = guessed_ext
            ext = guessed_ext
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
            elif is_gzipped and is_valid and auto_decompress:
                if not link_data_only:
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            raise UploadProblemException('Problem decompressing gzipped data')
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    raise UploadProblemException('The bz2 compressed uploaded file contains inappropriate content')
                elif is_bzipped and is_valid and auto_decompress:
                    if not link_data_only:
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                raise UploadProblemException('Problem decompressing bz2 compressed data')
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if not link_data_only:
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
    if not data_type:
        if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
            # We have a binary dataset, but it is not Bam, Sff or Pdf
            data_type = 'binary'
            parts = dataset.name.split(".")
            if len(parts) > 1:
                ext = parts[-1].strip().lower()
                is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                if check_content and not is_ext_unsniffable_binary:
                    raise UploadProblemException('The uploaded binary file contains inappropriate content')
                elif is_ext_unsniffable_binary and dataset.file_type != ext:
                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                    raise UploadProblemException(err_msg)
    if not data_type:
        # We must have a text file
        if check_content and check_html(dataset.path):
            raise UploadProblemException('The uploaded file contains inappropriate HTML content')
    if data_type != 'binary':
        if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
            # Convert universal line endings to Posix line endings if to_posix_lines is True
            # and the data is not binary or gzip-, bz2- or zip-compressed.
            if dataset.to_posix_lines:
                tmpdir = output_adjacent_tmpdir(output_path)
                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                else:
                    line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
def handle_uploaded_dataset_file_internal(
    filename: str,
    datatypes_registry,
    ext: str = 'auto',
    tmp_prefix: Optional[str] = 'sniff_upload_',
    tmp_dir: Optional[str] = None,
    in_place: bool = False,
    check_content: bool = True,
    is_binary: Optional[bool] = None,
    auto_decompress: bool = True,
    uploaded_file_ext: Optional[str] = None,
    convert_to_posix_lines: Optional[bool] = None,
    convert_spaces_to_tabs: Optional[bool] = None,
) -> HandleUploadedDatasetFileInternalResponse:
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    converted_newlines = False
    converted_spaces = False
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')
        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        guessed_datatype = datatypes_registry.get_datatype_by_extension(guessed_ext)
        if not is_binary and guessed_datatype.is_binary:
            # It's possible to have a datatype that is binary but not within the first 1024 bytes,
            # so check_binary might return a false negative. This is for instance true for PDF files
            is_binary = True
        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            convert_fxn = convert_function(convert_to_posix_lines, convert_spaces_to_tabs)
            line_count, _converted_path, converted_newlines, converted_spaces = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                assert _converted_path
                converted_path = _converted_path
            if ext in AUTO_DETECT_EXTENSIONS:
                ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext
        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return HandleUploadedDatasetFileInternalResponse(ext, converted_path, compressed_type, converted_newlines, converted_spaces)
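# Minimal sketch (assuming a configured Galaxy datatypes registry) of calling the typed
# handle_uploaded_dataset_file_internal() above and unpacking its response; the
# sniff_uploaded_file() wrapper is a hypothetical name used only for this example.
def sniff_uploaded_file(path, registry):
    try:
        response = handle_uploaded_dataset_file_internal(
            path,
            registry,
            ext='auto',
            check_content=True,
            convert_to_posix_lines=True,
            convert_spaces_to_tabs=False,
        )
    except InappropriateDatasetContentError as e:
        # Unsupported TAR uploads, invalid compressed content, or HTML content end up here.
        return None, str(e)
    # The response bundles: ext, converted_path, compressed_type, converted_newlines, converted_spaces.
    return response, None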
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return
    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2**20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ('server_dir', 'path_paste') or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type and bz2 is not None:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path)
                if is_bzipped and not is_valid:
                    file_err('The bz2 compressed uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2**20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if dataset.type in ('server_dir', 'path_paste') or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2**20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if dataset.type in ('server_dir', 'path_paste') or not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
    if not data_type:
        # TODO refactor this logic.  check_binary isn't guaranteed to be
        # correct since it only looks at whether the first 100 chars are
        # printable or not.  If someone specifies a known unsniffable
        # binary datatype and check_binary fails, the file gets mangled.
        if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
            # We have a binary dataset, but it is not Bam, Sff or Pdf
            data_type = 'binary'
            # binary_ok = False
            parts = dataset.name.split(".")
            if len(parts) > 1:
                ext = parts[-1].strip().lower()
                if not Binary.is_ext_unsniffable(ext):
                    file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                    return
                elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                    file_err(err_msg, dataset, json_file)
                    return
    if not data_type:
        # We must have a text file
        if check_html(dataset.path):
            file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
            return
    if data_type != 'binary':
        if link_data_only == 'copy_files':
            if dataset.type in ('server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
                in_place = False
            # Convert universal line endings to Posix line endings, but allow the user to turn it off,
            # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
            # corrupting the content of those files.
            if dataset.to_posix_lines:
                tmpdir = output_adjacent_tmpdir(output_path)
                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                else:
                    line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and dataset.type in ('server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy(converted_path, output_path)
            try:
                os.remove(converted_path)
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy(dataset.path, output_path)
    elif link_data_only == 'copy_files':
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)