def looks_like_xml(path, regex=TOOL_REGEX):
    """Return True if ``path`` names a non-empty, plain-text ``.xml`` file whose
    first 5 KB matches ``regex`` (defaults to the tool-XML pattern).
    """
    abs_path = os.path.abspath(path)
    if not abs_path.endswith(".xml") or not os.path.getsize(abs_path):
        return False
    # Anything binary, image-like, or compressed cannot be a usable XML file.
    rejectors = (
        checkers.check_binary,
        checkers.check_image,
        checkers.is_gzip,
        checkers.is_bz2,
        checkers.is_zip,
    )
    if any(reject(abs_path) for reject in rejectors):
        return False
    try:
        with open(path, encoding='utf-8') as handle:
            head = handle.read(5 * 1024)
    except UnicodeDecodeError:
        # Undecodable bytes -> not text we can treat as XML.
        return False
    return bool(regex.search(head))
def handle_uploaded_dataset_file(filename, datatypes_registry, ext='auto', is_multi_byte=False):
    """Validate an uploaded dataset file and resolve its extension.

    Decompresses via ``handle_compressed_file``, auto-detects the extension
    when requested, and rejects inappropriate binary or HTML content.

    Returns the resolved extension; raises InappropriateDatasetContentError
    when the content is rejected.
    """
    is_valid, ext = handle_compressed_file(filename, datatypes_registry, ext=ext)
    if not is_valid:
        raise InappropriateDatasetContentError('The compressed uploaded file contains inappropriate content.')
    if ext in AUTO_DETECT_EXTENSIONS:
        ext = guess_ext(filename, sniff_order=datatypes_registry.sniff_order, is_multi_byte=is_multi_byte)
    if check_binary(filename):
        # Unsniffable binary extensions are accepted as-is; everything else
        # must be confirmed by the datatype's own sniffer.
        if not Binary.is_ext_unsniffable(ext):
            if not datatypes_registry.get_datatype_by_extension(ext).sniff(filename):
                raise InappropriateDatasetContentError('The binary uploaded file contains inappropriate content.')
    elif check_html(filename):
        raise InappropriateDatasetContentError('The uploaded file contains inappropriate HTML content.')
    return ext
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    """Decompress, convert and sniff an uploaded dataset file.

    Returns ``(ext, converted_path, compressed_type)``. On any failure the
    working copy at ``converted_path`` is unlinked (when it is not the
    original ``filename``) and the exception is re-raised.
    Raises InappropriateDatasetContentError for rejected content.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError('TAR file uploads are not supported')
            raise InappropriateDatasetContentError('The uploaded compressed file contains invalid content')
        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                # Discard the intermediate decompressed copy; the converted
                # copy becomes the new working path.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
        if ext in AUTO_DETECT_EXTENSIONS:
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up the temporary working copy before propagating.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
def sniff_and_handle_data_type(file_path, datatypes_registry):
    """
    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and if it's a compressed archive for
    a non compressed datatype such as fasta, it will be unpacked.
    """
    ext = sniff.handle_uploaded_dataset_file(file_path, datatypes_registry)
    if ext and ext != "data":
        return ext
    # Sniffing was inconclusive; fall back to guessing from content.
    return sniff.guess_ext(
        file_path,
        datatypes_registry.sniff_order,
        is_binary=check_binary(file_path),
    )
def handle_uploaded_dataset_file(filename, datatypes_registry, ext='auto', is_multi_byte=False):
    """Resolve the extension of an uploaded file, rejecting bad content.

    Raises InappropriateDatasetContentError when compressed, binary or HTML
    checks fail; otherwise returns the resolved extension.
    """
    is_valid, ext = handle_compressed_file(filename, datatypes_registry, ext=ext)
    if not is_valid:
        raise InappropriateDatasetContentError('The compressed uploaded file contains inappropriate content.')
    if ext in AUTO_DETECT_EXTENSIONS:
        ext = guess_ext(filename, sniff_order=datatypes_registry.sniff_order, is_multi_byte=is_multi_byte)
    if not check_binary(filename):
        # Text upload: only HTML content is rejected.
        if check_html(filename):
            raise InappropriateDatasetContentError('The uploaded file contains inappropriate HTML content.')
        return ext
    # Binary upload: accept unsniffable extensions, otherwise require the
    # datatype's sniffer to confirm.
    if not Binary.is_ext_unsniffable(ext) and not datatypes_registry.get_datatype_by_extension(ext).sniff(filename):
        raise InappropriateDatasetContentError('The binary uploaded file contains inappropriate content.')
    return ext
def detect_datatype(self, trans, dataset_assoc):
    """Sniff and assign the datatype to a given dataset association (ldda or hda).

    Raises ItemAccessibilityException while jobs still use the dataset, and
    InsufficientPermissionsException when the current datatype forbids change.
    """
    # Re-fetch the association through this manager's model class.
    data = trans.sa_session.query(self.model_class).get(dataset_assoc.id)
    if data.datatype.is_datatype_change_allowed():
        if not data.ok_to_edit_metadata():
            raise exceptions.ItemAccessibilityException('This dataset is currently being used as input or output. You cannot change datatype until the jobs have completed or you have canceled them.')
        else:
            path = data.dataset.file_name
            is_binary = check_binary(path)
            # guess_ext returns an extension string despite the 'datatype' name.
            datatype = sniff.guess_ext(path, trans.app.datatypes_registry.sniff_order, is_binary=is_binary)
            trans.app.datatypes_registry.change_datatype(data, datatype)
            trans.sa_session.flush()
            # Regenerate metadata for the newly assigned datatype.
            self.set_metadata(trans, dataset_assoc)
    else:
        raise exceptions.InsufficientPermissionsException(f'Changing datatype "{data.extension}" is not allowed.')
def get_repository_file_contents(app, file_path, repository_id, is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser.

    Symlinks, compressed files and binaries are summarized rather than read;
    text content is HTML-escaped and truncated to the configured view limits.
    """
    safe_str = ''
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning(
            'Request tries to access a file outside of the repository location. File path: %s', file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        safe_str = 'link to: ' + basic_util.to_html_string(
            os.readlink(file_path))
        return safe_str
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.check_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        # Fix: the original iterated over a bare open() whose handle was never
        # closed (released only by GC); use a context manager instead.
        with open(file_path) as fh:
            for line in fh:
                safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
                # Stop reading after string is larger than MAX_CONTENT_SIZE.
                if len(safe_str) > MAX_CONTENT_SIZE:
                    large_str = \
                        '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                        util.nice_size( MAX_CONTENT_SIZE )
                    safe_str = '%s%s' % (safe_str, large_str)
                    break
        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size( basic_util.MAX_DISPLAY_SIZE )
            safe_str = util.shrink_string_by_size(safe_str, basic_util.MAX_DISPLAY_SIZE, join_by=join_by_str, left_larger=True, beginning_on_size_error=True)
        return safe_str
def get_repository_file_contents(app, file_path, repository_id, is_admin=False):
    """Return the display-safe contents of a repository file for display in a browser.

    Compressed and binary files get a short marker string; text files are
    HTML-escaped and truncated to the configured viewing limits.
    """
    safe_str = ''
    if not is_path_browsable(app, file_path, repository_id, is_admin):
        log.warning('Request tries to access a file outside of the repository location. File path: %s', file_path)
        return 'Invalid file path'
    # Symlink targets are checked by is_path_browsable
    if os.path.islink(file_path):
        safe_str = 'link to: ' + basic_util.to_html_string(os.readlink(file_path))
        return safe_str
    elif checkers.is_gzip(file_path):
        return '<br/>gzip compressed file<br/>'
    elif checkers.is_bz2(file_path):
        return '<br/>bz2 compressed file<br/>'
    elif checkers.is_zip(file_path):
        return '<br/>zip compressed file<br/>'
    elif checkers.check_binary(file_path):
        return '<br/>Binary file<br/>'
    else:
        # Fix: the original left the open() handle unclosed (only reclaimed by
        # GC); a with-block guarantees it is closed even if escaping raises.
        with open(file_path) as fh:
            for line in fh:
                safe_str = '%s%s' % (safe_str, basic_util.to_html_string(line))
                # Stop reading after string is larger than MAX_CONTENT_SIZE.
                if len(safe_str) > MAX_CONTENT_SIZE:
                    large_str = \
                        '<br/>File contents truncated because file size is larger than maximum viewing size of %s<br/>' % \
                        util.nice_size(MAX_CONTENT_SIZE)
                    safe_str = '%s%s' % (safe_str, large_str)
                    break
        if len(safe_str) > basic_util.MAX_DISPLAY_SIZE:
            # Eliminate the middle of the file to display a file no larger than basic_util.MAX_DISPLAY_SIZE.
            # This may not be ideal if the file is larger than MAX_CONTENT_SIZE.
            join_by_str = \
                "<br/><br/>...some text eliminated here because file size is larger than maximum viewing size of %s...<br/><br/>" % \
                util.nice_size(basic_util.MAX_DISPLAY_SIZE)
            safe_str = util.shrink_string_by_size(safe_str, basic_util.MAX_DISPLAY_SIZE, join_by=join_by_str, left_larger=True, beginning_on_size_error=True)
        return safe_str
def looks_like_a_tool_xml(path):
    """Return True if ``path`` is a non-empty .xml file matching TOOL_REGEX."""
    full_path = os.path.abspath(path)
    if not full_path.endswith(".xml"):
        return False
    if not os.path.getsize(full_path):
        return False
    if (checkers.check_binary(full_path)
            or checkers.check_image(full_path)
            or checkers.check_gzip(full_path)[0]
            or checkers.check_bz2(full_path)[0]
            or checkers.check_zip(full_path)):
        return False
    # Fix: read explicitly as UTF-8 and treat undecodable bytes as "not a
    # tool file" instead of letting UnicodeDecodeError escape from a
    # boolean predicate (consistent with the looks_like_xml variant).
    with open(path, encoding="utf-8") as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
    if TOOL_REGEX.search(start_contents):
        return True
    return False
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file."""
    full_path = os.path.abspath(path)
    # Must be a non-empty file with a .xml suffix.
    if not full_path.endswith(".xml") or not os.path.getsize(full_path):
        return False
    # Binary, image or compressed content disqualifies the file outright.
    probes = (
        checkers.check_binary,
        checkers.check_image,
        checkers.is_gzip,
        checkers.is_bz2,
        checkers.is_zip,
    )
    if any(probe(full_path) for probe in probes):
        return False
    with open(path, "r") as f:
        head = f.read(5 * 1024)
    return TOOL_REGEX.search(head) is not None
def looks_like_xml(path, regex=TOOL_REGEX):
    """Return True if ``path`` is a non-empty .xml text file whose first 5 KB
    matches ``regex`` (defaults to the tool-XML pattern).
    """
    full_path = os.path.abspath(path)
    if not full_path.endswith(".xml"):
        return False
    if not os.path.getsize(full_path):
        return False
    if(checkers.check_binary(full_path)
            or checkers.check_image(full_path)
            or checkers.is_gzip(full_path)
            or checkers.is_bz2(full_path)
            or checkers.is_zip(full_path)):
        return False
    # Fix: the default-encoding open() could raise UnicodeDecodeError on
    # files with non-UTF-8 bytes and crash this predicate; read as UTF-8
    # explicitly and report such files as "not XML" (matches the sibling
    # variant of this function that already guards the read).
    with open(path, encoding="utf-8") as f:
        try:
            start_contents = f.read(5 * 1024)
        except UnicodeDecodeError:
            return False
    if regex.search(start_contents):
        return True
    return False
def looks_like_a_tool_xml(path):
    """Quick check to see if a file looks like it may be a Galaxy XML tool file."""
    full_path = os.path.abspath(path)
    if not full_path.endswith(".xml"):
        return False
    if not os.path.getsize(full_path):
        return False
    # Short-circuits left to right, just like the original compound condition.
    disqualified = (
        checkers.check_binary(full_path)
        or checkers.check_image(full_path)
        or checkers.check_gzip(full_path)[0]
        or checkers.check_bz2(full_path)[0]
        or checkers.check_zip(full_path)
    )
    if disqualified:
        return False
    with open(path, "r") as handle:
        head = handle.read(5 * 1024)
    return bool(TOOL_REGEX.search(head))
def is_data_index_sample_file( file_path ):
    """
    Attempt to determine if a .sample file is appropriate for copying to ~/tool-data when
    a tool shed repository is being installed into a Galaxy instance.
    """
    # Most data index files are tabular; a tabular file is assumed copyable.
    if is_column_based( file_path ):
        return True
    # Each check is deferred so evaluation stops at the first match, exactly
    # like the original early-return chain.
    non_copyable_checks = (
        lambda: checkers.check_html( file_path ),
        lambda: checkers.check_image( file_path ),
        lambda: checkers.check_binary( name=file_path ),
        lambda: checkers.is_bz2( file_path ),
        lambda: checkers.is_gzip( file_path ),
        lambda: checkers.check_zip( file_path ),
    )
    if any( check() for check in non_copyable_checks ):
        return False
    # Default to copying the file if none of the above are true.
    return True
def is_data_index_sample_file(file_path):
    """
    Attempt to determine whether a .sample file can safely be copied to
    ~/tool-data while a tool shed repository is installed into Galaxy.
    """
    # Most data index files are tabular; tabular files are assumed copyable.
    if is_column_based(file_path):
        return True
    # HTML, images, binaries and compressed archives must not be copied.
    # 'or' short-circuits in the same order as the original if-chain.
    unsafe = (checkers.check_html(file_path)
              or checkers.check_image(file_path)
              or checkers.check_binary(name=file_path)
              or checkers.is_bz2(file_path)
              or checkers.is_gzip(file_path)
              or checkers.is_zip(file_path))
    return not unsafe
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    """Sniff, validate and (optionally) decompress/convert an uploaded file.

    Returns ``(stdout, ext, datatype, is_binary, converted_path)`` where
    ``stdout`` is a warning message or None, ``ext`` the resolved extension,
    ``datatype`` the registry datatype for ``ext``, ``is_binary`` the initial
    binary-content check, and ``converted_path`` a converted working copy of
    the upload (None when no separate converted file exists).

    Raises UploadProblemException when content validation fails.
    """
    stdout = None
    converted_path = None
    multi_file_zip = False
    # Does the first 1K contain a null?
    is_binary = check_binary(path)
    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            # Remember this; the warning is only emitted later if the final
            # datatype is not itself a compressed type.
            multi_file_zip = True
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip('.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext
    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path
    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path):
            stdout = ("Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                      " type".format(ext=requested_ext))
    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = ("Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                      " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                      " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = ("The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                      " manually")
    datatype = registry.get_datatype_by_extension(ext)
    if multi_file_zip and not getattr(datatype, 'compressed', False):
        stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
    return stdout, ext, datatype, is_binary, converted_path
def add_file(dataset, registry, json_file, output_path):
    """Legacy upload path: validate, optionally decompress/convert an uploaded
    dataset, move/copy it to ``output_path``, and write a JSON result record
    (or an error record via ``file_err``) to ``json_file``.
    """
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    run_as_real_user = in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    # in_place is True if there is no external chmod in place,
    # however there are other instances where modifications should not occur in_place:
    # when a file is added from a directory on the local file system (ftp import folder or any other path).
    if dataset.type in ('server_dir', 'path_paste', 'ftp_import'):
        in_place = False
    check_content = dataset.get('check_content' , True)
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return
    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid and auto_decompress:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                # NOTE(review): rstrip('.gz') strips any trailing '.', 'g', 'z'
                # characters, not just the literal suffix — preserved as-is.
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid and auto_decompress:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                                os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
    if not data_type:
        # TODO refactor this logic. check_binary isn't guaranteed to be
        # correct since it only looks at whether the first 100 chars are
        # printable or not. If someone specifies a known unsniffable
        # binary datatype and check_binary fails, the file gets mangled.
        if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
            # We have a binary dataset, but it is not Bam, Sff or Pdf
            data_type = 'binary'
            # binary_ok = False
            parts = dataset.name.split(".")
            if len(parts) > 1:
                ext = parts[-1].strip().lower()
                if check_content and not Binary.is_ext_unsniffable(ext):
                    file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                    return
                elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                    file_err(err_msg, dataset, json_file)
                    return
    if not data_type:
        # We must have a text file
        if check_content and check_html(dataset.path):
            file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
            return
    if data_type != 'binary':
        if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'):
            # Convert universal line endings to Posix line endings if to_posix_lines is True
            # and the data is not binary or gzip-, bz2- or zip-compressed.
            if dataset.to_posix_lines:
                tmpdir = output_adjacent_tmpdir(output_path)
                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                else:
                    line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif link_data_only == 'copy_files':
        if purge_source and not run_as_real_user:
            # if the upload tool runs as a real user the real user
            # can't move dataset.path as this path is owned by galaxy.
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if dataset.type in ( 'server_dir', 'path_paste' ) or not in_place: dataset.path = uncompressed else: shutil.move( uncompressed, dataset.path ) os.chmod(dataset.path, 0644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary( dataset.path ) or Binary.is_ext_unsniffable(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split( "." ) if len( parts ) > 1: ext = parts[-1].strip().lower() if not Binary.is_ext_unsniffable(ext): file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file ) return elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext ) file_err( err_msg, dataset, json_file ) return if not data_type: # We must have a text file
def handle_upload(
    registry,
    path,  # dataset.path
    requested_ext,  # dataset.file_type
    name,  # dataset.name,
    tmp_prefix,
    tmp_dir,
    check_content,
    link_data_only,
    in_place,
    auto_decompress,
    convert_to_posix_lines,
    convert_spaces_to_tabs,
):
    """Sniff, validate and (optionally) decompress/convert an uploaded file.

    Returns ``(stdout, ext, datatype, is_binary, converted_path)`` where
    ``stdout`` is a warning message or None, ``ext`` the resolved extension,
    ``datatype`` the registry datatype for ``ext``, ``is_binary`` the initial
    binary check, and ``converted_path`` a converted working copy of the
    upload (None when no separate converted file exists).

    Raises UploadProblemException when content validation fails.
    """
    stdout = None
    converted_path = None
    # Does the first 1K contain a null?
    is_binary = check_binary(path)
    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if auto_decompress and is_zip(path) and not is_single_file_zip(path):
            # NOTE(review): this warning may be overwritten by later stdout
            # assignments below — confirm against the multi_file_zip variant.
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file_internal(
                path,
                registry,
                ext=requested_ext,
                tmp_prefix=tmp_prefix,
                tmp_dir=tmp_dir,
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(name)[1].lower().lstrip(
                    '.'),
                convert_to_posix_lines=convert_to_posix_lines,
                convert_spaces_to_tabs=convert_spaces_to_tabs,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif requested_ext == 'auto':
        ext = sniff.guess_ext(path, registry.sniff_order, is_binary=is_binary)
    else:
        ext = requested_ext
    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == path else converted_path
    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if requested_ext != 'auto':
        datatype = registry.get_datatype_by_extension(requested_ext)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        if check_content and hasattr(datatype, 'sniff') and not datatype.sniff(path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=requested_ext))
    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        upload_ext = os.path.splitext(name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")
    datatype = registry.get_datatype_by_extension(ext)
    return stdout, ext, datatype, is_binary, converted_path
# Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable( dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if not Binary.is_ext_unsniffable(ext): file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable( ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (
def handle_uploaded_dataset_file_internal(
    filename,
    datatypes_registry,
    ext='auto',
    tmp_prefix='sniff_upload_',
    tmp_dir=None,
    in_place=False,
    check_content=True,
    is_binary=None,
    auto_decompress=True,
    uploaded_file_ext=None,
    convert_to_posix_lines=None,
    convert_spaces_to_tabs=None,
):
    """Decompress, convert and sniff an uploaded dataset file.

    Returns ``(ext, converted_path, compressed_type)``. On any failure the
    working copy at ``converted_path`` is unlinked (when it is not the
    original ``filename``) and the exception is re-raised.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    try:
        if not is_valid:
            if is_tar(converted_path):
                raise InappropriateDatasetContentError(
                    'TAR file uploads are not supported')
            raise InappropriateDatasetContentError(
                'The uploaded compressed file contains invalid content')
        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        guessed_ext = ext
        if ext in AUTO_DETECT_EXTENSIONS:
            guessed_ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        guessed_datatype = datatypes_registry.get_datatype_by_extension(
            guessed_ext)
        if not is_binary and guessed_datatype.is_binary:
            # It's possible to have a datatype that is binary but not within the first 1024 bytes,
            # so check_binary might return a false negative. This is for instance true for PDF files
            is_binary = True
        if not is_binary and (convert_to_posix_lines or convert_spaces_to_tabs):
            # Convert universal line endings to Posix line endings, spaces to tabs (if desired)
            if convert_spaces_to_tabs:
                convert_fxn = convert_newlines_sep2tabs
            else:
                convert_fxn = convert_newlines
            line_count, _converted_path = convert_fxn(converted_path, in_place=in_place, tmp_dir=tmp_dir, tmp_prefix=tmp_prefix)
            if not in_place:
                # Drop the intermediate decompressed copy; the converted copy
                # becomes the new working path.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                converted_path = _converted_path
        if ext in AUTO_DETECT_EXTENSIONS:
            # Re-sniff after conversion since the working file may have changed.
            ext = guess_ext(converted_path, sniff_order=datatypes_registry.sniff_order, is_binary=is_binary)
        else:
            ext = guessed_ext
        # AMP Commenting this out. Does not seem necessary. check_html returns true for valid HTML, but the error
        # thrown is "invalid html". Similarly, if we change it to not check_html, then it throws an error with JSON
        # or other non-binary data types
        #if not is_binary and check_content and check_html(converted_path):
            #raise InappropriateDatasetContentError('The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up the temporary working copy before propagating.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return ext, converted_path, compressed_type
def add_file(dataset, registry, output_path):
    """Validate, convert, and stage one uploaded dataset at ``output_path``.

    Returns an info dict (``type``, ``dataset_id``, ``ext``, ``stdout``,
    ``name``, ``line_count``, and optionally ``uuid``) describing the upload.

    Raises ``UploadProblemException`` for misconfigured requests, missing or
    empty files, failed URL fetches, and inappropriate content detected by
    ``sniff.handle_uploaded_dataset_file``.

    NOTE(review): ``dataset`` looks like a bunch-style object (attribute and
    ``.get`` access) produced by the upload job — confirm against caller.
    """
    ext = None
    compression_type = None
    line_count = None
    converted_path = None
    stdout = None

    link_data_only_str = dataset.get('link_data_only', 'copy_files')
    if link_data_only_str not in ['link_to_files', 'copy_files']:
        raise UploadProblemException(
            "Invalid setting '%s' for option link_data_only - upload request misconfigured"
            % link_data_only_str)
    link_data_only = link_data_only_str == 'link_to_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get(
        "in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get(
        'purge_source',
        True) and not run_as_real_user and dataset.type not in ('server_dir',
                                                                'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir',
                                                             'path_paste',
                                                             'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        dataset.file_type
    except AttributeError:
        raise UploadProblemException(
            'Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            dataset.path = sniff.stream_url_to_file(dataset.path)
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' %
                                         (dataset.path, str(e)))

    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException(
            'Uploaded temporary file (%s) does not exist.' % dataset.path)

    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Does the first 1K contain a null?
    is_binary = check_binary(dataset.path)

    # Decompress if needed/desired and determine/validate filetype. If a keep-compressed datatype is explicitly selected
    # or if autodetection is selected and the file sniffs as a keep-compressed datatype, it will not be decompressed.
    if not link_data_only:
        if is_zip(dataset.path) and not is_single_file_zip(dataset.path):
            stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
        try:
            ext, converted_path, compression_type = sniff.handle_uploaded_dataset_file(
                dataset.path,
                registry,
                ext=dataset.file_type,
                tmp_prefix='data_id_%s_upload_' % dataset.dataset_id,
                tmp_dir=output_adjacent_tmpdir(output_path),
                in_place=in_place,
                check_content=check_content,
                is_binary=is_binary,
                auto_decompress=auto_decompress,
                uploaded_file_ext=os.path.splitext(
                    dataset.name)[1].lower().lstrip('.'),
                convert_to_posix_lines=dataset.to_posix_lines,
                convert_spaces_to_tabs=dataset.space_to_tab,
            )
        except sniff.InappropriateDatasetContentError as exc:
            raise UploadProblemException(str(exc))
    elif dataset.file_type == 'auto':
        # Link mode can't decompress anyway, so enable sniffing for keep-compressed datatypes even when auto_decompress
        # is enabled
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_binary=is_binary)
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # The converted path will be the same as the input path if no conversion was done (or in-place conversion is used)
    converted_path = None if converted_path == dataset.path else converted_path

    # Validate datasets where the filetype was explicitly set using the filetype's sniffer (if any)
    if dataset.file_type != 'auto':
        datatype = registry.get_datatype_by_extension(dataset.file_type)
        # Enable sniffer "validate mode" (prevents certain sniffers from disabling themselves)
        os.environ['GALAXY_SNIFFER_VALIDATE_MODE'] = '1'
        if hasattr(datatype, 'sniff') and not datatype.sniff(dataset.path):
            stdout = (
                "Warning: The file 'Type' was set to '{ext}' but the file does not appear to be of that"
                " type".format(ext=dataset.file_type))
        os.environ.pop('GALAXY_SNIFFER_VALIDATE_MODE')

    # Handle unsniffable binaries
    if is_binary and ext == 'binary':
        # Fall back on the filename extension for unsniffable binary content.
        upload_ext = os.path.splitext(dataset.name)[1].lower().lstrip('.')
        if registry.is_extension_unsniffable_binary(upload_ext):
            stdout = (
                "Warning: The file's datatype cannot be determined from its contents and was guessed based on"
                " its extension, to avoid this warning, manually set the file 'Type' to '{ext}' when uploading"
                " this type of file".format(ext=upload_ext))
            ext = upload_ext
        else:
            stdout = (
                "The uploaded binary file format cannot be determined automatically, please set the file 'Type'"
                " manually")

    datatype = registry.get_datatype_by_extension(ext)

    # Strip compression extension from name
    if compression_type and not getattr(
            datatype, 'compressed',
            False) and dataset.name.endswith('.' + compression_type):
        dataset.name = dataset.name[:-len('.' + compression_type)]

    # Move dataset
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only:
        # Move the dataset to its "real" path. converted_path is a tempfile so we move it even if purge_source is False.
        if purge_source or converted_path:
            try:
                shutil.move(converted_path or dataset.path, output_path)
            except OSError as e:
                # We may not have permission to remove the input
                if e.errno != errno.EACCES:
                    raise
        else:
            shutil.copy(dataset.path, output_path)

    # Write the job info
    stdout = stdout or 'uploaded %s file' % ext
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    # FIXME: does this belong here? also not output-adjacent-tmpdir aware =/
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
    return info
def add_file(dataset, registry, json_file, output_path):
    """Legacy upload handler: validate/decompress one dataset and write job info to ``json_file``.

    Detects images, multi-byte text, sniffable binaries, and gzip/bz2/zip
    archives (decompressing archives when ``link_data_only == 'copy_files'``),
    performs optional newline/space conversion, then writes an info dict as a
    JSON line to ``json_file``.  Errors are reported via ``file_err`` followed
    by ``return`` rather than raised.

    NOTE(review): this variant predates the exception-based handler above and
    still supports Python < 2.6 zip reading.
    """
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err(
            'Unable to process uploaded file, missing file_type parameter.',
            dataset, json_file)
        return
    if dataset.type == 'url':
        try:
            page = urlopen(
                dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(
                page,
                prefix='url_paste',
                source_encoding=util.get_charset_from_http_headers(
                    page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)),
                     dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path,
                 dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(
                codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError as e:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path,
                              registry.sniff_order,
                              is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            # Keep-compressed datatype was explicitly selected; don't decompress.
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path)
            if is_gzipped and not is_valid:
                file_err(
                    'The gzipped uploaded file contains inappropriate content',
                    dataset, json_file)
                return
            elif is_gzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2**20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(
                        prefix='data_id_%s_upload_gunzip_' %
                        dataset.dataset_id,
                        dir=os.path.dirname(output_path),
                        text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data',
                                     dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ('server_dir',
                                        'path_paste') or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                # NOTE(review): rstrip strips a trailing *character set*, not a
                # suffix — e.g. 'genes.gz'.rstrip('.gz') does more than drop
                # '.gz'. Suffix removal is likely intended.
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type and bz2 is not None:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path)
                if is_bzipped and not is_valid:
                    # NOTE(review): message says 'gzipped' but this is the bz2
                    # branch — looks like a copy-paste error in user-facing text.
                    file_err(
                        'The gzipped uploaded file contains inappropriate content',
                        dataset, json_file)
                    return
                elif is_bzipped and is_valid:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2**20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(
                            prefix='data_id_%s_upload_bunzip2_' %
                            dataset.dataset_id,
                            dir=os.path.dirname(output_path),
                            text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err(
                                    'Problem decompressing bz2 compressed data',
                                    dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if dataset.type in ('server_dir',
                                            'path_paste') or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    # NOTE(review): same rstrip character-set pitfall as the gzip branch.
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2**20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                # Skip directory entries.
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(
                                prefix='data_id_%s_upload_zip_' %
                                dataset.dataset_id,
                                dir=os.path.dirname(output_path),
                                text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err(
                                            'Problem decompressing zipped data',
                                            dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err(
                                        'Problem decompressing zipped data',
                                        dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if dataset.type in ('server_dir',
                                                'path_paste') or not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                # TODO refactor this logic. check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not. If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(
                        dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if not Binary.is_ext_unsniffable(ext):
                            file_err(
                                'The uploaded binary file contains inappropriate content',
                                dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(
                                ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (
                                ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_html(dataset.path):
                    file_err(
                        'The uploaded file contains inappropriate HTML content',
                        dataset, json_file)
                    return
            if data_type != 'binary':
                if link_data_only == 'copy_files':
                    if dataset.type in ('server_dir',
                                        'path_paste') and data_type not in [
                                            'gzip', 'bz2', 'zip'
                                        ]:
                        in_place = False
                    # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                    # so that is becomes possible to upload gzip, bz2 or zip files with binary data without
                    # corrupting the content of those files.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(
                                dataset.path,
                                in_place=in_place,
                                tmp_dir=tmpdir,
                                tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    ext = sniff.guess_ext(dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir',
                        'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and dataset.type in (
            'server_dir',
            'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy(converted_path, output_path)
            try:
                os.remove(converted_path)
            # NOTE(review): bare except silently swallows all errors here,
            # including KeyboardInterrupt — best-effort cleanup, but too broad.
            except:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy(dataset.path, output_path)
    elif link_data_only == 'copy_files':
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming(
            output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
def handle_uploaded_dataset_file_internal(
    filename: str,
    datatypes_registry,
    ext: str = 'auto',
    tmp_prefix: Optional[str] = 'sniff_upload_',
    tmp_dir: Optional[str] = None,
    in_place: bool = False,
    check_content: bool = True,
    is_binary: Optional[bool] = None,
    auto_decompress: bool = True,
    uploaded_file_ext: Optional[str] = None,
    convert_to_posix_lines: Optional[bool] = None,
    convert_spaces_to_tabs: Optional[bool] = None,
) -> HandleUploadedDatasetFileInternalResponse:
    """Decompress (when allowed), sniff, and optionally convert an uploaded file.

    The heavy lifting of decompression and validation is done by
    ``handle_compressed_file``; afterwards this routine rejects TAR/invalid
    content, re-checks binariness on the decompressed data, sniffs the
    extension when ``ext`` is auto-detectable, and — for non-binary data —
    runs the requested newline/space-to-tab conversion before re-sniffing.

    Returns a ``HandleUploadedDatasetFileInternalResponse`` with the final
    extension, the (possibly converted) path, the detected compression type,
    and flags recording whether newline or space conversion actually ran.

    Raises ``InappropriateDatasetContentError`` for TAR uploads, invalid
    compressed content, or (when ``check_content``) HTML content.  On any
    exception the intermediate temp file is unlinked before re-raising.
    """
    is_valid, ext, converted_path, compressed_type = handle_compressed_file(
        filename,
        datatypes_registry,
        ext=ext,
        tmp_prefix=tmp_prefix,
        tmp_dir=tmp_dir,
        in_place=in_place,
        check_content=check_content,
        auto_decompress=auto_decompress,
    )
    converted_newlines = False
    converted_spaces = False
    try:
        if not is_valid:
            # Invalid payload: TAR archives get their own, more specific message.
            message = ('TAR file uploads are not supported'
                       if is_tar(converted_path) else
                       'The uploaded compressed file contains invalid content')
            raise InappropriateDatasetContentError(message)
        # This needs to be checked again after decompression
        is_binary = check_binary(converted_path)
        if ext in AUTO_DETECT_EXTENSIONS:
            sniffed_ext = guess_ext(converted_path,
                                    sniff_order=datatypes_registry.sniff_order,
                                    is_binary=is_binary)
        else:
            sniffed_ext = ext
        sniffed_datatype = datatypes_registry.get_datatype_by_extension(sniffed_ext)
        if not is_binary and sniffed_datatype.is_binary:
            # A datatype can be binary without binary bytes in the first 1024
            # bytes (e.g. PDF), so check_binary can report a false negative —
            # trust the sniffed datatype instead.
            is_binary = True
        wants_conversion = bool(convert_to_posix_lines or convert_spaces_to_tabs)
        if is_binary or not wants_conversion:
            ext = sniffed_ext
        else:
            # Convert universal line endings to Posix line endings and/or
            # spaces to tabs, then re-sniff the converted content.
            convert_fxn = convert_function(convert_to_posix_lines,
                                           convert_spaces_to_tabs)
            _line_count, post_convert_path, converted_newlines, converted_spaces = convert_fxn(
                converted_path,
                in_place=in_place,
                tmp_dir=tmp_dir,
                tmp_prefix=tmp_prefix)
            if not in_place:
                # Drop the pre-conversion temp file before adopting the new one.
                if converted_path and filename != converted_path:
                    os.unlink(converted_path)
                assert post_convert_path
                converted_path = post_convert_path
            if ext in AUTO_DETECT_EXTENSIONS:
                ext = guess_ext(converted_path,
                                sniff_order=datatypes_registry.sniff_order,
                                is_binary=is_binary)
        if not is_binary and check_content and check_html(converted_path):
            raise InappropriateDatasetContentError(
                'The uploaded file contains invalid HTML content')
    except Exception:
        # Clean up the intermediate temp file before propagating the error.
        if filename != converted_path:
            os.unlink(converted_path)
        raise
    return HandleUploadedDatasetFileInternalResponse(ext, converted_path,
                                                     compressed_type,
                                                     converted_newlines,
                                                     converted_spaces)
def add_file(dataset, registry, json_file, output_path):
    """Validate/decompress one uploaded dataset and append its job info to ``json_file``.

    Exception-based successor to the legacy ``file_err`` variant: sniffs
    binary datatypes, decompresses gzip/bz2/zip when ``auto_decompress`` is
    set and the data is being copied (not linked), performs optional
    newline/space conversion, stages the result at ``output_path``, and
    writes an info dict as a JSON line to ``json_file``.

    Raises ``UploadProblemException`` on any validation or content problem.
    """
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files'

    # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed)
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as real user because the uploaded file will
    # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't
    # modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Base on the check_upload_content Galaxy config option and on by default, this enables some
    # security related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content' , True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)
    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')

    # Is dataset content supported sniffable binary?
    is_binary = check_binary(dataset.path)
    if is_binary:
        # Sniff the data type
        guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        # Set data_type only if guessed_ext is a binary datatype
        datatype = registry.get_datatype_by_extension(guessed_ext)
        if isinstance(datatype, Binary):
            data_type = guessed_ext
            ext = guessed_ext
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            # Keep-compressed datatype explicitly selected; skip decompression.
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
            elif is_gzipped and is_valid and auto_decompress:
                if not link_data_only:
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id,
                                                        dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            raise UploadProblemException('Problem decompressing gzipped data')
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                    os.chmod(dataset.path, 0o644)
                # NOTE(review): rstrip strips a trailing *character set*, not a
                # suffix — e.g. names ending in 'g' or 'z' lose extra characters.
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    # NOTE(review): message says 'gzipped' but this is the bz2
                    # branch — looks like a copy-paste error in user-facing text.
                    raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
                elif is_bzipped and is_valid and auto_decompress:
                    if not link_data_only:
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id,
                                                            dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                raise UploadProblemException('Problem decompressing bz2 compressed data')
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                    # NOTE(review): same rstrip character-set pitfall as the gzip branch.
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if not link_data_only:
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                # Skip directory entries.
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id,
                                                                dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.5 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                    data_type = 'zip'
            if not data_type:
                if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                        if check_content and not is_ext_unsniffable_binary:
                            raise UploadProblemException('The uploaded binary file contains inappropriate content')
                        elif is_ext_unsniffable_binary and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                            raise UploadProblemException(err_msg)
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    raise UploadProblemException('The uploaded file contains inappropriate HTML content')
            if data_type != 'binary':
                if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
                    # Convert universal line endings to Posix line endings if to_posix_lines is True
                    # and the data is not binary or gzip-, bz2- or zip-compressed.
                    if dataset.to_posix_lines:
                        tmpdir = output_adjacent_tmpdir(output_path)
                        tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                        if dataset.space_to_tab:
                            line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                        else:
                            line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                if dataset.file_type == 'auto':
                    # Sniff the converted file when one exists, else the original.
                    ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
                else:
                    ext = dataset.file_type
                data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset',
                dataset_id=dataset.dataset_id,
                ext=ext,
                stdout=stdout,
                name=dataset.name,
                line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
def _resolve_src(item):
    """Resolve one upload request item into a dataset description dict.

    Mirrors the upload.py flow (binary sniffing, optional newline/space
    conversion, grooming checks) but without auto-decompression, using the
    enclosing scope's ``upload_config`` for defaults and staging.

    Returns a dict with ``name``, ``filename``, ``dbkey``, ``ext``,
    ``link_data_only`` and optionally ``info`` / ``object_id``.

    Raises ``UploadProblemException`` on inappropriate content or when linked
    data would need grooming.
    """
    converted_path = None
    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)

    # Follow upload.py logic but without the auto-decompress logic.
    registry = upload_config.registry
    check_content = upload_config.check_content
    data_type, ext = None, requested_ext

    is_binary = check_binary(path)
    if is_binary:
        data_type, ext = handle_sniffable_binary_check(
            data_type, ext, path, registry)
    if data_type is None:
        root_datatype = registry.get_datatype_by_extension(ext)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            # NOTE(review): self-assignment is a no-op (kept from the
            # upload.py original, where this line assigned dataset.file_type).
            ext = ext
        elif is_binary:
            data_type, ext = handle_unsniffable_binary_check(
                data_type, ext, path, name, is_binary, requested_ext,
                check_content, registry)
    if not data_type and check_content and check_html(path):
        raise UploadProblemException(
            'The uploaded file contains inappropriate HTML content')

    if data_type != 'binary':
        if not link_data_only:
            if to_posix_lines:
                if space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(
                        path, in_place=in_place, tmp_dir=".")
                else:
                    line_count, converted_path = sniff.convert_newlines(
                        path, in_place=in_place, tmp_dir=".")
            else:
                # Without posix-line conversion only the space-to-tab pass runs.
                if space_to_tab:
                    line_count, converted_path = sniff.sep2tabs(
                        path, in_place=in_place, tmp_dir=".")
        if requested_ext == 'auto':
            # Sniff the converted file when one exists, else the original.
            ext = sniff.guess_ext(converted_path or path, registry.sniff_order)
        else:
            ext = requested_ext

        data_type = ext

    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and requested_ext:
        ext = requested_ext
    if ext == 'auto':
        ext = 'data'

    datatype = registry.get_datatype_by_extension(ext)

    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)

    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(
            converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(
            path, purge_source, in_place)

    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(
            path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)

    rval = {
        "name": name,
        "filename": path,
        "dbkey": dbkey,
        "ext": ext,
        "link_data_only": link_data_only
    }
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval