def looks_like_a_tool_xml(path):
    full_path = os.path.abspath(path)
    if not full_path.endswith(".xml"):
        return False
    if not os.path.getsize(full_path):
        return False
    if (checkers.check_binary(full_path) or
            checkers.check_image(full_path) or
            checkers.check_gzip(full_path)[0] or
            checkers.check_bz2(full_path)[0] or
            checkers.check_zip(full_path)):
        return False
    with open(path, "r") as f:
        start_contents = f.read(5 * 1024)
    if TOOL_REGEX.search(start_contents):
        return True
    return False
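# Hedged usage sketch (not part of the original module): shows one way
# looks_like_a_tool_xml() might be called over a directory tree.  The helper name
# and directory argument below are hypothetical; looks_like_a_tool_xml() itself
# relies on module-level names assumed to be defined elsewhere in this file
# (os, checkers, TOOL_REGEX).
def _example_scan_for_tool_xml(directory):
    """Return paths under ``directory`` that look like Galaxy tool XML files."""
    candidates = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            if looks_like_a_tool_xml(path):
                candidates.append(path)
    return candidates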
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    run_as_real_user = in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    # in_place is True if there is no external chmod in place;
    # however, there are other instances where modifications should not occur in place:
    # when a file is added from a directory on the local file system (ftp import folder or any other path).
    if dataset.type in ('server_dir', 'path_paste', 'ftp_import'):
        in_place = False
    check_content = dataset.get('check_content', True)
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid and auto_decompress:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    file_err('The bz2 compressed uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid and auto_decompress:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.6 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                                os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                            data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if check_content and not Binary.is_ext_unsniffable(ext):
                            file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
                    return
        if data_type != 'binary':
            if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'):
                # Convert universal line endings to Posix line endings if to_posix_lines is True
                # and the data is not binary or gzip-, bz2- or zip-compressed.
                if dataset.to_posix_lines:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext(dataset.path, registry.sniff_order)
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif link_data_only == 'copy_files':
        if purge_source and not run_as_real_user:
            # If the upload tool runs as the real user, that user can't move
            # dataset.path because the path is owned by galaxy.
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
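# Hedged sketch (assumption, not the original module's code): file_err() is called
# throughout add_file() above but is not defined in this excerpt.  Consistent with
# how it is used -- reporting a failure for a dataset into the JSON results file and
# then returning -- a minimal implementation could look like this:
def file_err(msg, dataset, json_file):
    # Record the error so the framework can mark the dataset as failed.
    json_file.write(dumps(dict(type='dataset', ext='data', dataset_id=dataset.dataset_id, stderr=msg)) + "\n")
    # Best-effort cleanup of the temporary upload; never remove files that were
    # linked or pasted in from server-side paths.
    if dataset.type not in ('server_dir', 'path_paste'):
        try:
            os.remove(dataset.path)
        except Exception:
            pass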
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files'

    # run_as_real_user is estimated from the Galaxy config (an external chmod of the
    # inputs indicates the upload is executed as the real user).
    # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their
    # paths during data conversions since this user already owns that path.
    # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206
    run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False)

    # purge_source defaults to True unless this is an FTP import and
    # ftp_upload_purge has been overridden to False in Galaxy's config.
    # We set purge_source to False if:
    # - the job does not have write access to the file, e.g. when running as the
    #   real user
    # - the files are uploaded from external paths.
    purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste')

    # in_place is True unless we are running as a real user or importing external paths (i.e.
    # this is a real upload and not a path paste or ftp import).
    # in_place should always be False if running as a real user because the uploaded file will
    # be owned by Galaxy and not the user, and it should be False for external paths so Galaxy
    # doesn't modify files not controlled by Galaxy.
    in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import')

    # Based on the check_upload_content Galaxy config option (on by default), this enables some
    # security-related checks on the uploaded content, but can prevent uploads from working in some cases.
    check_content = dataset.get('check_content', True)

    # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically
    # decompressing archive files before sniffing.
    auto_decompress = dataset.get('auto_decompress', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.')

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e)))
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path)
    if not os.path.getsize(dataset.path) > 0:
        raise UploadProblemException('The uploaded file is empty')
    # Is dataset content supported sniffable binary?
    is_binary = check_binary(dataset.path)
    if is_binary:
        # Sniff the data type
        guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order)
        # Set data_type only if guessed_ext is a binary datatype
        datatype = registry.get_datatype_by_extension(guessed_ext)
        if isinstance(datatype, Binary):
            data_type = guessed_ext
            ext = guessed_ext
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content)
            if is_gzipped and not is_valid:
                raise UploadProblemException('The gzipped uploaded file contains inappropriate content')
            elif is_gzipped and is_valid and auto_decompress:
                if not link_data_only:
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            raise UploadProblemException('Problem decompressing gzipped data')
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path, check_content)
                if is_bzipped and not is_valid:
                    raise UploadProblemException('The bz2 compressed uploaded file contains inappropriate content')
                elif is_bzipped and is_valid and auto_decompress:
                    if not link_data_only:
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                raise UploadProblemException('Problem decompressing bz2 compressed data')
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped and auto_decompress:
                    if not link_data_only:
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.6 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                                os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                            data_type = 'zip'
            if not data_type:
                if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                        if check_content and not is_ext_unsniffable_binary:
                            raise UploadProblemException('The uploaded binary file contains inappropriate content')
                        elif is_ext_unsniffable_binary and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                            raise UploadProblemException(err_msg)
            if not data_type:
                # We must have a text file
                if check_content and check_html(dataset.path):
                    raise UploadProblemException('The uploaded file contains inappropriate HTML content')
        if data_type != 'binary':
            if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
                # Convert universal line endings to Posix line endings if to_posix_lines is True
                # and the data is not binary or gzip-, bz2- or zip-compressed.
                if dataset.to_posix_lines:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
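# Hedged sketch (assumption, not the original module's code): UploadProblemException
# is raised in the version of add_file() above but is not defined in this excerpt.
# The surrounding code only requires an exception type carrying a user-facing
# message, so a minimal definition could look like this:
class UploadProblemException(Exception):

    def __init__(self, message):
        super(UploadProblemException, self).__init__(message)
        self.message = message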
def add_file(dataset, registry, json_file, output_path):
    data_type = None
    line_count = None
    converted_path = None
    stdout = None
    link_data_only = dataset.get('link_data_only', 'copy_files')
    in_place = dataset.get('in_place', True)
    purge_source = dataset.get('purge_source', True)
    try:
        ext = dataset.file_type
    except AttributeError:
        file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file)
        return

    if dataset.type == 'url':
        try:
            page = urlopen(dataset.path)  # page will be .close()ed by sniff methods
            temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
        except Exception as e:
            file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file)
            return
        dataset.path = temp_name
    # See if we have an empty file
    if not os.path.exists(dataset.path):
        file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file)
        return
    if not os.path.getsize(dataset.path) > 0:
        file_err('The uploaded file is empty', dataset, json_file)
        return
    if not dataset.type == 'url':
        # Already set is_multi_byte above if type == 'url'
        try:
            dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100))
        except UnicodeDecodeError:
            dataset.is_multi_byte = False
    # Is dataset an image?
    i_ext = get_image_ext(dataset.path)
    if i_ext:
        ext = i_ext
        data_type = ext
    # Is dataset content multi-byte?
    elif dataset.is_multi_byte:
        data_type = 'multi-byte char'
        ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True)
    # Is dataset content supported sniffable binary?
    else:
        # FIXME: This ignores the declared sniff order in datatype_conf.xml
        # resulting in improper behavior
        type_info = Binary.is_sniffable_binary(dataset.path)
        if type_info:
            data_type = type_info[0]
            ext = type_info[1]
    if not data_type:
        root_datatype = registry.get_datatype_by_extension(dataset.file_type)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
            ext = dataset.file_type
        else:
            # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
            is_gzipped, is_valid = check_gzip(dataset.path)
            if is_gzipped and not is_valid:
                file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file)
                return
            elif is_gzipped and is_valid:
                if link_data_only == 'copy_files':
                    # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
                    CHUNK_SIZE = 2 ** 20  # 1Mb
                    fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                    gzipped_file = gzip.GzipFile(dataset.path, 'rb')
                    while 1:
                        try:
                            chunk = gzipped_file.read(CHUNK_SIZE)
                        except IOError:
                            os.close(fd)
                            os.remove(uncompressed)
                            file_err('Problem decompressing gzipped data', dataset, json_file)
                            return
                        if not chunk:
                            break
                        os.write(fd, chunk)
                    os.close(fd)
                    gzipped_file.close()
                    # Replace the gzipped file with the decompressed file if it's safe to do so
                    if dataset.type in ('server_dir', 'path_paste') or not in_place:
                        dataset.path = uncompressed
                    else:
                        shutil.move(uncompressed, dataset.path)
                        os.chmod(dataset.path, 0o644)
                dataset.name = dataset.name.rstrip('.gz')
                data_type = 'gzip'
            if not data_type and bz2 is not None:
                # See if we have a bz2 file, much like gzip
                is_bzipped, is_valid = check_bz2(dataset.path)
                if is_bzipped and not is_valid:
                    file_err('The bz2 compressed uploaded file contains inappropriate content', dataset, json_file)
                    return
                elif is_bzipped and is_valid:
                    if link_data_only == 'copy_files':
                        # We need to uncompress the temp_name file
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                        bzipped_file = bz2.BZ2File(dataset.path, 'rb')
                        while 1:
                            try:
                                chunk = bzipped_file.read(CHUNK_SIZE)
                            except IOError:
                                os.close(fd)
                                os.remove(uncompressed)
                                file_err('Problem decompressing bz2 compressed data', dataset, json_file)
                                return
                            if not chunk:
                                break
                            os.write(fd, chunk)
                        os.close(fd)
                        bzipped_file.close()
                        # Replace the bzipped file with the decompressed file if it's safe to do so
                        if dataset.type in ('server_dir', 'path_paste') or not in_place:
                            dataset.path = uncompressed
                        else:
                            shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                    dataset.name = dataset.name.rstrip('.bz2')
                    data_type = 'bz2'
            if not data_type:
                # See if we have a zip archive
                is_zipped = check_zip(dataset.path)
                if is_zipped:
                    if link_data_only == 'copy_files':
                        CHUNK_SIZE = 2 ** 20  # 1Mb
                        uncompressed = None
                        uncompressed_name = None
                        unzipped = False
                        z = zipfile.ZipFile(dataset.path)
                        for name in z.namelist():
                            if name.endswith('/'):
                                continue
                            if unzipped:
                                stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.'
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        file_err('Problem decompressing zipped data', dataset, json_file)
                                        return
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.6 doesn't have a way to read members in chunks(!)
                                try:
                                    outfile = open(uncompressed, 'wb')
                                    outfile.write(z.read(name))
                                    outfile.close()
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    file_err('Problem decompressing zipped data', dataset, json_file)
                                    return
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if dataset.type in ('server_dir', 'path_paste') or not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                                os.chmod(dataset.path, 0o644)
                            dataset.name = uncompressed_name
                            data_type = 'zip'
            if not data_type:
                # TODO refactor this logic.  check_binary isn't guaranteed to be
                # correct since it only looks at whether the first 100 chars are
                # printable or not.  If someone specifies a known unsniffable
                # binary datatype and check_binary fails, the file gets mangled.
                if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type):
                    # We have a binary dataset, but it is not Bam, Sff or Pdf
                    data_type = 'binary'
                    # binary_ok = False
                    parts = dataset.name.split(".")
                    if len(parts) > 1:
                        ext = parts[-1].strip().lower()
                        if not Binary.is_ext_unsniffable(ext):
                            file_err('The uploaded binary file contains inappropriate content', dataset, json_file)
                            return
                        elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext:
                            err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext)
                            file_err(err_msg, dataset, json_file)
                            return
            if not data_type:
                # We must have a text file
                if check_html(dataset.path):
                    file_err('The uploaded file contains inappropriate HTML content', dataset, json_file)
                    return
        if data_type != 'binary':
            if link_data_only == 'copy_files':
                if dataset.type in ('server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
                    in_place = False
                # Convert universal line endings to Posix line endings, but allow the user to turn it off,
                # so that it becomes possible to upload gzip, bz2 or zip files with binary data without
                # corrupting the content of those files.
                if dataset.to_posix_lines:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.space_to_tab:
                        line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
            if dataset.file_type == 'auto':
                ext = sniff.guess_ext(dataset.path, registry.sniff_order)
            else:
                ext = dataset.file_type
            data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            file_err(err_msg, dataset, json_file)
            return
    if link_data_only == 'copy_files' and dataset.type in ('server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']:
        # Move the dataset to its "real" path
        if converted_path is not None:
            shutil.copy(converted_path, output_path)
            try:
                os.remove(converted_path)
            except Exception:
                pass
        else:
            # This should not happen, but it's here just in case
            shutil.copy(dataset.path, output_path)
    elif link_data_only == 'copy_files':
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
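# Hedged sketch (assumption, not the original module's code): output_adjacent_tmpdir()
# is used above to choose a scratch directory for newline conversion but is not
# defined in this excerpt.  Consistent with its name and usage, it only needs to
# return a directory on the same filesystem as the job output so the converted file
# can be moved into place cheaply:
def output_adjacent_tmpdir(output_path):
    """Return a temporary directory adjacent to the job's output path."""
    return os.path.dirname(output_path)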