def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.items():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                raise UploadProblemException('A required composite data file was not provided (%s)' % name)
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name = sniff.stream_to_file(urlopen(dp), prefix='url_paste')
                    except Exception as e:
                        raise UploadProblemException('Unable to fetch %s\n%s' % (dp, str(e)))
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                shutil.move(dp, os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type='dataset', dataset_id=dataset.dataset_id, stdout='uploaded %s file' % dataset.file_type)
    json_file.write(dumps(info) + "\n")
def stage_file(name, composite_file_path, is_binary=False):
    dp = composite_file_path['path']
    path, is_url = to_path(dp)
    if is_url:
        dataset.path = path
        dp = path
    auto_decompress = composite_file_path.get('auto_decompress', True)
    if auto_decompress and not datatype.composite_type and CompressedFile.can_decompress(dp):
        # It isn't an explicitly composite datatype, so these are just extra files to attach
        # as composite data. It'd be better if Galaxy was communicating this to the tool
        # a little more explicitly so we didn't need to dispatch on the datatype and so we
        # could attach arbitrary extra composite data to an existing composite datatype
        # if need be? Perhaps that would be a mistake though.
        CompressedFile(dp).extract(files_path)
    else:
        if not is_binary:
            tmpdir = output_adjacent_tmpdir(output_path)
            tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
            if composite_file_path.get('space_to_tab'):
                sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
            else:
                sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        file_output_path = os.path.join(files_path, name)
        shutil.move(dp, file_output_path)
        # groom the dataset file content if required by the corresponding datatype definition
        if datatype.dataset_content_needs_grooming(file_output_path):
            datatype.groom_dataset_content(file_output_path)
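# Hedged sketch (not taken verbatim from any snippet here): stage_file above is
# presumably driven by a loop over the dataset's declared composite files, along
# the lines of the add_composite_file variants elsewhere in this listing. Names
# such as `dataset`, `util`, `files_path` and UploadProblemException are
# borrowed from those surrounding snippets.
#
#     for name, value in dataset.composite_files.items():
#         value = util.bunch.Bunch(**value)
#         composite_file_path = dataset.composite_file_paths[value.name]
#         if composite_file_path is None and not value.optional:
#             raise UploadProblemException('A required composite data file was not provided (%s)' % name)
#         elif composite_file_path is not None:
#             stage_file(name, composite_file_path, value.is_binary)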
def add_composite_file(dataset, registry, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file(urllib.urlopen(dp), prefix='url_paste')
                    except Exception, e:
                        file_err('Unable to fetch %s\n%s' % (dp, str(e)), dataset, json_file)
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp)
                    else:
                        sniff.convert_newlines(dp)
                shutil.move(dp, os.path.join(files_path, name))
def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.items():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file(urlopen(dp), prefix='url_paste')
                    except Exception as e:
                        file_err('Unable to fetch %s\n%s' % (dp, str(e)), dataset, json_file)
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                shutil.move(dp, os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type='dataset', dataset_id=dataset.dataset_id, stdout='uploaded %s file' % dataset.file_type)
    json_file.write(dumps(info) + "\n")
def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                if not value.is_binary:
                    if uploaded_dataset.composite_files[value.name].space_to_tab:
                        sniff.convert_newlines_sep2tabs(dataset.composite_file_paths[value.name]['path'])
                    else:
                        sniff.convert_newlines(dataset.composite_file_paths[value.name]['path'])
                shutil.move(dataset.composite_file_paths[value.name]['path'], os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type='dataset', dataset_id=dataset.dataset_id, stdout='uploaded %s file' % dataset.file_type)
    json_file.write(to_json_string(info) + "\n")
def add_composite_file(dataset, registry, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                dp = dataset.composite_file_paths[value.name]['path']
                isurl = dp.find('://') != -1  # todo fixme
                if isurl:
                    try:
                        temp_name, dataset.is_multi_byte = sniff.stream_to_file(urllib.urlopen(dp), prefix='url_paste')
                    except Exception, e:
                        file_err('Unable to fetch %s\n%s' % (dp, str(e)), dataset, json_file)
                        return
                    dataset.path = temp_name
                    dp = temp_name
                if not value.is_binary:
                    tmpdir = output_adjacent_tmpdir(output_path)
                    tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                    if dataset.composite_file_paths[value.name].get('space_to_tab', value.space_to_tab):
                        sniff.convert_newlines_sep2tabs(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                    else:
                        sniff.convert_newlines(dp, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                move_copy(dp, os.path.join(files_path, name))
def assert_converts_to_1234_convert(content, block_size=1024):
    with tempfile.NamedTemporaryFile(delete=False, mode='w') as tf:
        tf.write(content)
    rval = convert_newlines(tf.name, tmp_prefix="gxtest", tmp_dir=tempfile.gettempdir(), block_size=block_size)
    actual_contents = open(tf.name).read()
    assert '1 2\n3 4\n' == actual_contents, actual_contents
    assert rval[0:2] == (2, None), f"rval != {rval} for {content}"
def test_convert_newlines_non_utf():
    fname = get_test_fname("dosimzml")
    rval = convert_newlines(fname, tmp_prefix="gxtest", tmp_dir=tempfile.gettempdir(), in_place=False)
    new_file = rval[1]
    assert open(new_file, "rb").read() == open(get_test_fname("1imzml"), "rb").read()
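# A minimal, self-contained sketch (an illustrative stand-in, not Galaxy's
# sniff.convert_newlines) of the behaviour the two tests above exercise:
# normalize '\r\n' and '\r' to '\n' and return a (line_count, converted_path)
# tuple, where converted_path is None when the file is rewritten in place.
import re
import tempfile


def convert_newlines_sketch(path):
    with open(path, 'rb') as fh:
        data = fh.read()
    normalized = re.sub(rb'\r\n|\r', b'\n', data)
    with open(path, 'wb') as fh:
        fh.write(normalized)
    return normalized.count(b'\n'), None


if __name__ == '__main__':
    # Mirrors the '1 2\n3 4\n' expectation asserted by the tests above.
    with tempfile.NamedTemporaryFile(delete=False, mode='wb') as tf:
        tf.write(b'1 2\r\n3 4\r')
    assert convert_newlines_sketch(tf.name) == (2, None)
    assert open(tf.name, 'rb').read() == b'1 2\n3 4\n'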
def add_composite_file(dataset, json_file):
    if dataset.composite_files:
        os.mkdir(dataset.extra_files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err('A required composite data file was not provided (%s)' % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                if not value.is_binary:
                    if uploaded_dataset.composite_files[value.name].space_to_tab:
                        sniff.convert_newlines_sep2tabs(dataset.composite_file_paths[value.name]['path'])
                    else:
                        sniff.convert_newlines(dataset.composite_file_paths[value.name]['path'])
                shutil.move(dataset.composite_file_paths[value.name]['path'], os.path.join(dataset.extra_files_path, name))
    info = dict(type='dataset', dataset_id=dataset.dataset_id, path=dataset.primary_file)
    json_file.write(to_json_string(info) + "\n")
def assert_converts_to_1234_convert(content, block_size=1024):
    fname = get_test_fname('temp2.txt')
    with open(fname, 'w') as fh:
        fh.write(content)
    rval = convert_newlines(fname, tmp_prefix="gxtest", tmp_dir=tempfile.gettempdir(), block_size=block_size)
    actual_contents = open(fname).read()
    assert '1 2\n3 4\n' == actual_contents, actual_contents
    assert rval == (2, None), "rval != %s for %s" % (rval, content)
def add_composite_file(dataset, json_file, output_path, files_path):
    if dataset.composite_files:
        os.mkdir(files_path)
        for name, value in dataset.composite_files.iteritems():
            value = util.bunch.Bunch(**value)
            if dataset.composite_file_paths[value.name] is None and not value.optional:
                file_err("A required composite data file was not provided (%s)" % name, dataset, json_file)
                break
            elif dataset.composite_file_paths[value.name] is not None:
                if not value.is_binary:
                    if uploaded_dataset.composite_files[value.name].space_to_tab:
                        sniff.convert_newlines_sep2tabs(dataset.composite_file_paths[value.name]["path"])
                    else:
                        sniff.convert_newlines(dataset.composite_file_paths[value.name]["path"])
                shutil.move(dataset.composite_file_paths[value.name]["path"], os.path.join(files_path, name))
    # Move the dataset to its "real" path
    shutil.move(dataset.primary_file, output_path)
    # Write the job info
    info = dict(type="dataset", dataset_id=dataset.dataset_id, stdout="uploaded %s file" % dataset.file_type)
    json_file.write(to_json_string(info) + "\n")
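# For reference, the json_file.write(...) calls in the add_composite_file
# variants above emit one JSON object per line describing the upload job; with
# illustrative values (the dataset_id and file_type are made up) a record
# looks roughly like:
#
#     {"type": "dataset", "dataset_id": 42, "stdout": "uploaded fastqsanger file"}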
def add_file(self, trans, file_obj, file_name, file_type, dbkey, info):
    temp_name = sniff.stream_to_file(file_obj)
    sniff.convert_newlines(temp_name)
    if file_type == 'auto':
        ext = sniff.guess_ext(temp_name)
    else:
        ext = file_type
    data = trans.app.model.Dataset()
    data.name = file_name
    data.extension = ext
    data.dbkey = dbkey
    data.info = info
    data.flush()
    shutil.move(temp_name, data.file_name)
    data.state = data.states.OK
    data.init_meta()
    data.set_peek()
    if isinstance(data.datatype, datatypes.interval.Interval):
        if data.missing_meta():
            data.extension = 'tabular'
    trans.history.add_dataset(data)
    trans.app.model.flush()
    return data
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') run_as_real_user = in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) # in_place is True if there is no external chmod in place, # however there are other instances where modifications should not occur in_place: # when a file is added from a directory on the local file system (ftp import folder or any other path). if dataset.type in ('server_dir', 'path_paste', 'ftp_import'): in_place = False check_content = dataset.get('check_content' , True) auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: file_err('Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte(codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? 
else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: file_err('The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped and is_valid and auto_decompress: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if link_data_only == 'copy_files': CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
break fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable(dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if check_content and not Binary.is_ext_unsniffable(ext): file_err('The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable(ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_content and check_html(dataset.path): file_err('The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files' and data_type not in ('gzip', 'bz2', 'zip'): # Convert universal line endings to Posix line endings if to_posix_lines is True # and the data is not binary or gzip-, bz2- or zip-compressed. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and data_type == 'binary': ext = 'data' if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. 
if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and converted_path: # Move the dataset to its "real" path try: shutil.move(converted_path, output_path) except OSError as e: # We may not have permission to remove converted_path if e.errno != errno.EACCES: raise elif link_data_only == 'copy_files': if purge_source and not run_as_real_user: # if the upload tool runs as a real user the real user # can't move dataset.path as this path is owned by galaxy. shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype and datatype.dataset_content_needs_grooming(output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
def add_file( self, trans, temp_name, file_name, file_type, is_multi_byte, dbkey, info=None, space_to_tab=False, precreated_dataset=None ): def dataset_no_data_error( data, message = 'there was an error uploading your file' ): data.info = "No data: %s." % message data.state = data.states.ERROR if data.extension is None: data.extension = 'data' return data data_type = None if precreated_dataset is not None: data = precreated_dataset else: data = trans.app.model.HistoryDatasetAssociation( history = trans.history, create_dataset = True ) trans.app.security_agent.set_all_dataset_permissions( data.dataset, trans.app.security_agent.history_get_default_permissions( trans.history ) ) # See if we have an empty file if not os.path.getsize( temp_name ) > 0: return dataset_no_data_error( data, message = 'you attempted to upload an empty file' ) #raise BadFileException( "you attempted to upload an empty file." ) if is_multi_byte: ext = sniff.guess_ext( temp_name, is_multi_byte=True ) else: if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, # we'll decompress on the fly. is_gzipped, is_valid = self.check_gzip( temp_name ) if is_gzipped and not is_valid: return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) #raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( temp_name ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) return dataset_no_data_error( data, message = 'problem decompressing gzipped data' ) #raise BadFileException( 'problem decompressing gzipped data.' ) if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() # Replace the gzipped file with the decompressed file shutil.move( uncompressed, temp_name ) file_name = file_name.rstrip( '.gz' ) data_type = 'gzip' ext = '' if not data_type: # See if we have a zip archive is_zipped, is_valid, test_ext = self.check_zip( temp_name ) if is_zipped and not is_valid: return dataset_no_data_error( data, message = 'you attempted to upload an inappropriate file' ) #raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_zipped and is_valid: # Currently, we force specific tools to handle this case. We also require the user # to manually set the incoming file_type if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_type != 'binseq.zip': return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'" ) #raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) elif test_ext == 'txt' and file_type != 'txtseq.zip': return dataset_no_data_error( data, message = "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'" ) #raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) if not ( file_type == 'binseq.zip' or file_type == 'txtseq.zip' ): return dataset_no_data_error( data, message = "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files" ) #raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." 
) data_type = 'zip' ext = file_type if not data_type: if self.check_binary( temp_name ): parts = file_name.split( "." ) if len( parts ) > 1: ext = parts[1].strip().lower() if not( ext == 'ab1' or ext == 'scf' ): return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) #raise BadFileException( "you attempted to upload an inappropriate file." ) if ext == 'ab1' and file_type != 'ab1': return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files" ) #raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) elif ext == 'scf' and file_type != 'scf': return dataset_no_data_error( data, message = "you must manually set the 'File Format' to 'Scf' when uploading scf files" ) #raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) data_type = 'binary' if not data_type: # We must have a text file if trans.app.datatypes_registry.get_datatype_by_extension( file_type ).composite_type != 'auto_primary_file' and self.check_html( temp_name ): return dataset_no_data_error( data, message = "you attempted to upload an inappropriate file" ) #raise BadFileException( "you attempted to upload an inappropriate file." ) if data_type != 'binary' and data_type != 'zip': if space_to_tab: self.line_count = sniff.convert_newlines_sep2tabs( temp_name ) else: self.line_count = sniff.convert_newlines( temp_name ) if file_type == 'auto': ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: ext = file_type data_type = ext if info is None: info = 'uploaded %s file' %data_type data.extension = ext data.name = file_name data.dbkey = dbkey data.info = info data.flush() shutil.move( temp_name, data.file_name ) data.state = data.states.OK data.set_size() data.init_meta() if self.line_count is not None: try: if is_multi_byte: data.set_multi_byte_peek( line_count=self.line_count ) else: data.set_peek( line_count=self.line_count ) except: if is_multi_byte: data.set_multi_byte_peek() else: data.set_peek() else: if is_multi_byte: data.set_multi_byte_peek() else: data.set_peek() # validate incomming data # Commented by greg on 3/14/07 # for error in data.datatype.validate( data ): # data.add_validation_error( # model.ValidationError( message=str( error ), err_type=error.__class__.__name__, attributes=util.object_to_string( error.__dict__ ) ) ) if data.missing_meta(): data.datatype.set_meta( data ) dbkey_to_store = dbkey if type( dbkey_to_store ) == type( [] ): dbkey_to_store = dbkey[0] if precreated_dataset is not None: trans.history.genome_build = dbkey_to_store else: trans.history.add_dataset( data, genome_build=dbkey_to_store ) trans.app.model.flush() trans.log_event( "Added dataset %d to history %d" %( data.id, trans.history.id ), tool_id="upload" ) return data
file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file ) return if data_type != 'binary': if link_data_only == 'copy_files': if dataset.type in ( 'server_dir', 'path_paste' ) and data_type not in [ 'gzip', 'bz2', 'zip' ]: in_place = False # Convert universal line endings to Posix line endings, but allow the user to turn it off, # so that is becomes possible to upload gzip, bz2 or zip files with binary data without # corrupting the content of those files. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir( output_path ) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) else: line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix ) if dataset.file_type == 'auto': ext = sniff.guess_ext( dataset.path, registry.sniff_order ) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension( ext ) if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming( dataset.path ): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
def execute( self, tool, trans, incoming={}, set_output_hid = True ): dataset_upload_inputs = [] for input_name, input in tool.inputs.iteritems(): if input.type == "upload_dataset": dataset_upload_inputs.append( input ) assert dataset_upload_inputs, Exception( "No dataset upload groups were found." ) # Get any precreated datasets (when using asynchronous uploads) async_datasets = [] self.precreated_datasets = [] if incoming.get( 'async_datasets', None ) not in ["None", "", None]: async_datasets = incoming['async_datasets'].split(',') for id in async_datasets: try: data = trans.app.model.HistoryDatasetAssociation.get( int( id ) ) except: log.exception( 'Unable to load precreated dataset (%s) sent in upload form' % id ) continue if trans.user is None and trans.galaxy_session.current_history != data.history: log.error( 'Got a precreated dataset (%s) but it does not belong to anonymous user\'s current session (%s)' % ( data.id, trans.galaxy_session.id ) ) elif data.history.user != trans.user: log.error( 'Got a precreated dataset (%s) but it does not belong to current user (%s)' % ( data.id, trans.user.id ) ) else: self.precreated_datasets.append( data ) data_list = [] for dataset_upload_input in dataset_upload_inputs: uploaded_datasets = dataset_upload_input.get_uploaded_datasets( trans, incoming ) for uploaded_dataset in uploaded_datasets: precreated_dataset = self.get_precreated_dataset( uploaded_dataset.precreated_name ) dataset = self.add_file( trans, uploaded_dataset.primary_file, uploaded_dataset.name, uploaded_dataset.file_type, uploaded_dataset.is_multi_byte, uploaded_dataset.dbkey, space_to_tab = uploaded_dataset.space_to_tab, info = uploaded_dataset.info, precreated_dataset = precreated_dataset ) if uploaded_dataset.composite_files: os.mkdir( dataset.extra_files_path ) #make extra files path for name, value in uploaded_dataset.composite_files.iteritems(): #what about binary files here, need to skip converting newlines if value is None and not dataset.datatype.writable_files[ name ].optional: dataset.info = "A required composite data file was not provided (%s)" % name dataset.state = dataset.states.ERROR break elif value is not None: if value.space_to_tab: sniff.convert_newlines_sep2tabs( value.filename ) else: sniff.convert_newlines( value.filename ) shutil.move( value.filename, os.path.join( dataset.extra_files_path, name ) ) data_list.append( dataset ) #clean up extra temp names uploaded_dataset.clean_up_temp_files() #cleanup unclaimed precreated datasets: for data in self.precreated_datasets: log.info( 'Cleaned up unclaimed precreated dataset (%s).' % ( data.id ) ) data.state = data.states.ERROR data.info = 'No file contents were available.' if data_list: trans.app.model.flush() # Create the job object job = trans.app.model.Job() job.session_id = trans.get_galaxy_session().id job.history_id = trans.history.id job.tool_id = tool.id try: # For backward compatibility, some tools may not have versions yet. job.tool_version = tool.version except: job.tool_version = "1.0.1" job.state = trans.app.model.Job.states.UPLOAD job.flush() log.info( 'tool %s created job id %d' % ( tool.id, job.id ) ) trans.log_event( 'created job id %d' % job.id, tool_id=tool.id ) #if we could make a 'real' job here, then metadata could be set before job.finish() is called hda = data_list[0] #only our first hda is being added as output for the job, why? 
job.state = trans.app.model.Job.states.OK file_size_str = datatypes.data.nice_size( hda.dataset.file_size ) job.info = "%s, size: %s" % ( hda.info, file_size_str ) job.add_output_dataset( hda.name, hda ) job.flush() log.info( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ) ) trans.log_event( 'job id %d ended ok, file size: %s' % ( job.id, file_size_str ), tool_id=tool.id ) return dict( output=hda )
file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files': if dataset.type in ('server_dir', 'path_paste') and data_type not in [ 'gzip', 'bz2', 'zip' ]: in_place = False if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place) else: line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path):
def add_file( self, trans, folder_id, file_obj, name, file_format, dbkey, roles, info='no info', space_to_tab=False, replace_dataset=None, library_item_info_template=None, template_elements={}, message=None ): folder = trans.app.model.LibraryFolder.get( folder_id ) data_type = None line_count = 0 temp_name, is_multi_byte = sniff.stream_to_file( file_obj ) # See if we have an empty file if not os.path.getsize( temp_name ) > 0: raise BadFileException( "you attempted to upload an empty file." ) if is_multi_byte: ext = sniff.guess_ext( temp_name, is_multi_byte=True ) else: if not data_type: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress on the fly. is_gzipped, is_valid = self.check_gzip( temp_name ) if is_gzipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_gzipped and is_valid: # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp() gzipped_file = gzip.GzipFile( temp_name ) while 1: try: chunk = gzipped_file.read( CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) raise BadFileException( 'problem uncompressing gzipped data.' ) if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() # Replace the gzipped file with the decompressed file shutil.move( uncompressed, temp_name ) name = name.rstrip( '.gz' ) data_type = 'gzip' ext = '' if not data_type: # See if we have a zip archive is_zipped, is_valid, test_ext = self.check_zip( temp_name ) if is_zipped and not is_valid: raise BadFileException( "you attempted to upload an inappropriate file." ) elif is_zipped and is_valid: # Currently, we force specific tools to handle this case. We also require the user # to manually set the incoming file_format if ( test_ext == 'ab1' or test_ext == 'scf' ) and file_format != 'binseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'." ) elif test_ext == 'txt' and file_format != 'txtseq.zip': raise BadFileException( "Invalid 'File Format' for archive consisting of text files - use 'Txtseq.zip'." ) if not ( file_format == 'binseq.zip' or file_format == 'txtseq.zip' ): raise BadFileException( "you must manually set the 'File Format' to either 'Binseq.zip' or 'Txtseq.zip' when uploading zip files." ) data_type = 'zip' ext = file_format if not data_type: if self.check_binary( temp_name ): try: ext = name.split( "." )[1].strip().lower() except: ext = '' if not( ext == 'ab1' or ext == 'scf' ): raise BadFileException( "you attempted to upload an inappropriate file." ) if ext == 'ab1' and file_format != 'ab1': raise BadFileException( "you must manually set the 'File Format' to 'Ab1' when uploading ab1 files." ) elif ext == 'scf' and file_format != 'scf': raise BadFileException( "you must manually set the 'File Format' to 'Scf' when uploading scf files." ) data_type = 'binary' if not data_type: # We must have a text file if self.check_html( temp_name ): raise BadFileException( "you attempted to upload an inappropriate file." 
) if data_type != 'binary' and data_type != 'zip': if space_to_tab: line_count = sniff.convert_newlines_sep2tabs( temp_name ) elif os.stat( temp_name ).st_size < 262144000: # 250MB line_count = sniff.convert_newlines( temp_name ) else: if sniff.check_newlines( temp_name ): line_count = sniff.convert_newlines( temp_name ) else: line_count = None if file_format == 'auto': ext = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: ext = file_format data_type = ext if info is None: info = 'uploaded %s file' % data_type if file_format == 'auto': data_type = sniff.guess_ext( temp_name, sniff_order=trans.app.datatypes_registry.sniff_order ) else: data_type = file_format if replace_dataset: # The replace_dataset param ( when not None ) refers to a LibraryDataset that is being replaced with a new version. library_dataset = replace_dataset else: # If replace_dataset is None, the Library level permissions will be taken from the folder and applied to the new # LibraryDataset, and the current user's DefaultUserPermissions will be applied to the associated Dataset. library_dataset = trans.app.model.LibraryDataset( folder=folder, name=name, info=info ) library_dataset.flush() trans.app.security_agent.copy_library_permissions( folder, library_dataset ) ldda = trans.app.model.LibraryDatasetDatasetAssociation( name=name, info=info, extension=data_type, dbkey=dbkey, library_dataset=library_dataset, user=trans.get_user(), create_dataset=True ) ldda.message = message ldda.flush() # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset trans.app.security_agent.copy_library_permissions( library_dataset, ldda ) if replace_dataset: # Copy the Dataset level permissions from replace_dataset to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.copy_dataset_permissions( replace_dataset.library_dataset_dataset_association.dataset, ldda.dataset ) else: # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset trans.app.security_agent.set_all_dataset_permissions( ldda.dataset, trans.app.security_agent.user_get_default_permissions( trans.get_user() ) ) folder.add_library_dataset( library_dataset, genome_build=dbkey ) folder.flush() library_dataset.library_dataset_dataset_association_id = ldda.id library_dataset.flush() # Handle any templates included in the upload form if library_item_info_template: user = trans.get_user() library_item_info = trans.app.model.LibraryItemInfo( user=user ) library_item_info.library_item_info_template = library_item_info_template library_item_info.flush() trans.app.security_agent.copy_library_permissions( library_item_info_template, library_item_info ) for template_element in library_item_info_template.elements: info_element_value = template_elements.get( "info_element_%s_%s" % ( library_item_info_template.id, template_element.id ), None ) info_element = trans.app.model.LibraryItemInfoElement() info_element.contents = info_element_value info_element.library_item_info_template_element = template_element info_element.library_item_info = library_item_info info_element.flush() library_item_info_association = trans.app.model.LibraryDatasetDatasetInfoAssociation( user=user ) library_item_info_association.set_library_item( ldda ) library_item_info_association.library_item_info = library_item_info library_item_info_association.flush() # If roles were selected upon upload, restrict access to the Dataset to those roles if roles: for role in roles: dp = 
trans.app.model.DatasetPermissions( RBACAgent.permitted_actions.DATASET_ACCESS.action, ldda.dataset, role ) dp.flush() shutil.move( temp_name, ldda.dataset.file_name ) ldda.state = ldda.states.OK ldda.init_meta() if line_count: try: if is_multi_byte: ldda.set_multi_byte_peek( line_count=line_count ) else: ldda.set_peek( line_count=line_count ) except: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() else: if is_multi_byte: ldda.set_multi_byte_peek() else: ldda.set_peek() ldda.set_size() if ldda.missing_meta(): ldda.datatype.set_meta( ldda ) ldda.flush() return ldda
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') in_place = dataset.get('in_place', True) purge_source = dataset.get('purge_source', True) try: ext = dataset.file_type except AttributeError: file_err( 'Unable to process uploaded file, missing file_type parameter.', dataset, json_file) return if dataset.type == 'url': try: page = urlopen( dataset.path) # page will be .close()ed by sniff methods temp_name, dataset.is_multi_byte = sniff.stream_to_file( page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers( page.headers)) except Exception as e: file_err('Unable to fetch %s\n%s' % (dataset.path, str(e)), dataset, json_file) return dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): file_err('Uploaded temporary file (%s) does not exist.' % dataset.path, dataset, json_file) return if not os.path.getsize(dataset.path) > 0: file_err('The uploaded file is empty', dataset, json_file) return if not dataset.type == 'url': # Already set is_multi_byte above if type == 'url' try: dataset.is_multi_byte = multi_byte.is_multi_byte( codecs.open(dataset.path, 'r', 'utf-8').read(100)) except UnicodeDecodeError as e: dataset.is_multi_byte = False # Is dataset an image? i_ext = get_image_ext(dataset.path) if i_ext: ext = i_ext data_type = ext # Is dataset content multi-byte? elif dataset.is_multi_byte: data_type = 'multi-byte char' ext = sniff.guess_ext(dataset.path, registry.sniff_order, is_multi_byte=True) # Is dataset content supported sniffable binary? else: # FIXME: This ignores the declared sniff order in datatype_conf.xml # resulting in improper behavior type_info = Binary.is_sniffable_binary(dataset.path) if type_info: data_type = type_info[0] ext = type_info[1] if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path) if is_gzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_gzipped and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err('Problem decompressing gzipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type and bz2 is not None: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path) if is_bzipped and not is_valid: file_err( 'The gzipped uploaded file contains inappropriate content', dataset, json_file) return elif is_bzipped 
and is_valid: if link_data_only == 'copy_files': # We need to uncompress the temp_name file CHUNK_SIZE = 2**20 # 1Mb fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing bz2 compressed data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped: if link_data_only == 'copy_files': CHUNK_SIZE = 2**20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' break fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) if sys.version_info[:2] >= (2, 6): zipped_file = z.open(name) while 1: try: chunk = zipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return if not chunk: break os.write(fd, chunk) os.close(fd) zipped_file.close() uncompressed_name = name unzipped = True else: # python < 2.5 doesn't have a way to read members in chunks(!) try: outfile = open(uncompressed, 'wb') outfile.write(z.read(name)) outfile.close() uncompressed_name = name unzipped = True except IOError: os.close(fd) os.remove(uncompressed) file_err( 'Problem decompressing zipped data', dataset, json_file) return z.close() # Replace the zipped file with the decompressed file if it's safe to do so if uncompressed is not None: if dataset.type in ('server_dir', 'path_paste') or not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = uncompressed_name data_type = 'zip' if not data_type: # TODO refactor this logic. check_binary isn't guaranteed to be # correct since it only looks at whether the first 100 chars are # printable or not. If someone specifies a known unsniffable # binary datatype and check_binary fails, the file gets mangled. if check_binary(dataset.path) or Binary.is_ext_unsniffable( dataset.file_type): # We have a binary dataset, but it is not Bam, Sff or Pdf data_type = 'binary' # binary_ok = False parts = dataset.name.split(".") if len(parts) > 1: ext = parts[-1].strip().lower() if not Binary.is_ext_unsniffable(ext): file_err( 'The uploaded binary file contains inappropriate content', dataset, json_file) return elif Binary.is_ext_unsniffable( ext) and dataset.file_type != ext: err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." 
% ( ext.capitalize(), ext) file_err(err_msg, dataset, json_file) return if not data_type: # We must have a text file if check_html(dataset.path): file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file) return if data_type != 'binary': if link_data_only == 'copy_files': if dataset.type in ('server_dir', 'path_paste') and data_type not in [ 'gzip', 'bz2', 'zip' ]: in_place = False # Convert universal line endings to Posix line endings, but allow the user to turn it off, # so that is becomes possible to upload gzip, bz2 or zip files with binary data without # corrupting the content of those files. if dataset.to_posix_lines: tmpdir = output_adjacent_tmpdir(output_path) tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id if dataset.space_to_tab: line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) else: line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix) if dataset.file_type == 'auto': ext = sniff.guess_ext(dataset.path, registry.sniff_order) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' datatype = registry.get_datatype_by_extension(ext) if dataset.type in ('server_dir', 'path_paste') and link_data_only == 'link_to_files': # Never alter a file that will not be copied to Galaxy's local file store. if datatype.dataset_content_needs_grooming(dataset.path): err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \ '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.' file_err(err_msg, dataset, json_file) return if link_data_only == 'copy_files' and dataset.type in ( 'server_dir', 'path_paste') and data_type not in ['gzip', 'bz2', 'zip']: # Move the dataset to its "real" path if converted_path is not None: shutil.copy(converted_path, output_path) try: os.remove(converted_path) except: pass else: # This should not happen, but it's here just in case shutil.copy(dataset.path, output_path) elif link_data_only == 'copy_files': if purge_source: shutil.move(dataset.path, output_path) else: shutil.copy(dataset.path, output_path) # Write the job info stdout = stdout or 'uploaded %s file' % data_type info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count) if dataset.get('uuid', None) is not None: info['uuid'] = dataset.get('uuid') json_file.write(dumps(info) + "\n") if link_data_only == 'copy_files' and datatype.dataset_content_needs_grooming( output_path): # Groom the dataset content if necessary datatype.groom_dataset_content(output_path)
elif ext == 'scf' and dataset.file_type != 'scf': file_err( "You must manually set the 'File Format' to 'Scf' when uploading scf files.", dataset, json_file ) return else: ext = 'binary' data_type = 'binary' if not data_type: # We must have a text file if check_html( dataset.path ): file_err( 'The uploaded file contains inappropriate content', dataset, json_file ) return if data_type != 'binary' and data_type != 'zip': if dataset.space_to_tab: line_count = sniff.convert_newlines_sep2tabs( dataset.path ) else: line_count = sniff.convert_newlines( dataset.path ) if dataset.file_type == 'auto': ext = sniff.guess_ext( dataset.path ) else: ext = dataset.file_type data_type = ext # Save job info for the framework if ext == 'auto' and dataset.ext: ext = dataset.ext if ext == 'auto': ext = 'data' info = dict( type = 'dataset', dataset_id = dataset.dataset_id, path = dataset.path, ext = ext, name = dataset.name,
def add_file(dataset, registry, json_file, output_path): data_type = None line_count = None converted_path = None stdout = None link_data_only = dataset.get('link_data_only', 'copy_files') != 'copy_files' # run_as_real_user is estimated from galaxy config (external chmod indicated of inputs executed) # If this is True we always purge supplied upload inputs so they are cleaned up and we reuse their # paths during data conversions since this user already owns that path. # Older in_place check for upload jobs created before 18.01, TODO remove in 19.XX. xref #5206 run_as_real_user = dataset.get('run_as_real_user', False) or dataset.get("in_place", False) # purge_source defaults to True unless this is an FTP import and # ftp_upload_purge has been overridden to False in Galaxy's config. # We set purge_source to False if: # - the job does not have write access to the file, e.g. when running as the # real user # - the files are uploaded from external paths. purge_source = dataset.get('purge_source', True) and not run_as_real_user and dataset.type not in ('server_dir', 'path_paste') # in_place is True unless we are running as a real user or importing external paths (i.e. # this is a real upload and not a path paste or ftp import). # in_place should always be False if running as real user because the uploaded file will # be owned by Galaxy and not the user and it should be False for external paths so Galaxy doesn't # modify files not controlled by Galaxy. in_place = not run_as_real_user and dataset.type not in ('server_dir', 'path_paste', 'ftp_import') # Base on the check_upload_content Galaxy config option and on by default, this enables some # security related checks on the uploaded content, but can prevent uploads from working in some cases. check_content = dataset.get('check_content' , True) # auto_decompress is a request flag that can be swapped off to prevent Galaxy from automatically # decompressing archive files before sniffing. auto_decompress = dataset.get('auto_decompress', True) try: ext = dataset.file_type except AttributeError: raise UploadProblemException('Unable to process uploaded file, missing file_type parameter.') if dataset.type == 'url': try: page = urlopen(dataset.path) # page will be .close()ed by sniff methods temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) dataset.path = temp_name # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') # Is dataset content supported sniffable binary? 
is_binary = check_binary(dataset.path) if is_binary: # Sniff the data type guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) # Set data_type only if guessed_ext is a binary datatype datatype = registry.get_datatype_by_extension(guessed_ext) if isinstance(datatype, Binary): data_type = guessed_ext ext = guessed_ext if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): data_type = 'compressed archive' ext = dataset.file_type else: # See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress is_gzipped, is_valid = check_gzip(dataset.path, check_content=check_content) if is_gzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_gzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) gzipped_file = gzip.GzipFile(dataset.path, 'rb') while 1: try: chunk = gzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing gzipped data') if not chunk: break os.write(fd, chunk) os.close(fd) gzipped_file.close() # Replace the gzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.gz') data_type = 'gzip' if not data_type: # See if we have a bz2 file, much like gzip is_bzipped, is_valid = check_bz2(dataset.path, check_content) if is_bzipped and not is_valid: raise UploadProblemException('The gzipped uploaded file contains inappropriate content') elif is_bzipped and is_valid and auto_decompress: if not link_data_only: # We need to uncompress the temp_name file CHUNK_SIZE = 2 ** 20 # 1Mb fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_bunzip2_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False) bzipped_file = bz2.BZ2File(dataset.path, 'rb') while 1: try: chunk = bzipped_file.read(CHUNK_SIZE) except IOError: os.close(fd) os.remove(uncompressed) raise UploadProblemException('Problem decompressing bz2 compressed data') if not chunk: break os.write(fd, chunk) os.close(fd) bzipped_file.close() # Replace the bzipped file with the decompressed file if it's safe to do so if not in_place: dataset.path = uncompressed else: shutil.move(uncompressed, dataset.path) os.chmod(dataset.path, 0o644) dataset.name = dataset.name.rstrip('.bz2') data_type = 'bz2' if not data_type: # See if we have a zip archive is_zipped = check_zip(dataset.path) if is_zipped and auto_decompress: if not link_data_only: CHUNK_SIZE = 2 ** 20 # 1Mb uncompressed = None uncompressed_name = None unzipped = False z = zipfile.ZipFile(dataset.path) for name in z.namelist(): if name.endswith('/'): continue if unzipped: stdout = 'ZIP file contained more than one file, only the first file was added to Galaxy.' 
                                break
                            fd, uncompressed = tempfile.mkstemp(prefix='data_id_%s_upload_zip_' % dataset.dataset_id, dir=os.path.dirname(output_path), text=False)
                            if sys.version_info[:2] >= (2, 6):
                                zipped_file = z.open(name)
                                while 1:
                                    try:
                                        chunk = zipped_file.read(CHUNK_SIZE)
                                    except IOError:
                                        os.close(fd)
                                        os.remove(uncompressed)
                                        raise UploadProblemException('Problem decompressing zipped data')
                                    if not chunk:
                                        break
                                    os.write(fd, chunk)
                                os.close(fd)
                                zipped_file.close()
                                uncompressed_name = name
                                unzipped = True
                            else:
                                # python < 2.6 doesn't have a way to read members in chunks(!)
                                try:
                                    with open(uncompressed, 'wb') as outfile:
                                        outfile.write(z.read(name))
                                    uncompressed_name = name
                                    unzipped = True
                                except IOError:
                                    os.close(fd)
                                    os.remove(uncompressed)
                                    raise UploadProblemException('Problem decompressing zipped data')
                        z.close()
                        # Replace the zipped file with the decompressed file if it's safe to do so
                        if uncompressed is not None:
                            if not in_place:
                                dataset.path = uncompressed
                            else:
                                shutil.move(uncompressed, dataset.path)
                            os.chmod(dataset.path, 0o644)
                        dataset.name = uncompressed_name
                        data_type = 'zip'
    if not data_type:
        if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type):
            # We have a binary dataset, but it is not Bam, Sff or Pdf
            data_type = 'binary'
            parts = dataset.name.split(".")
            if len(parts) > 1:
                ext = parts[-1].strip().lower()
                is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
                if check_content and not is_ext_unsniffable_binary:
                    raise UploadProblemException('The uploaded binary file contains inappropriate content')
                elif is_ext_unsniffable_binary and dataset.file_type != ext:
                    err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
                    raise UploadProblemException(err_msg)
    if not data_type:
        # We must have a text file
        if check_content and check_html(dataset.path):
            raise UploadProblemException('The uploaded file contains inappropriate HTML content')
    if data_type != 'binary':
        if not link_data_only and data_type not in ('gzip', 'bz2', 'zip'):
            # Convert universal line endings to Posix line endings if to_posix_lines is True
            # and the data is not binary or gzip-, bz2- or zip-compressed.
            if dataset.to_posix_lines:
                tmpdir = output_adjacent_tmpdir(output_path)
                tmp_prefix = 'data_id_%s_convert_' % dataset.dataset_id
                if dataset.space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
                else:
                    line_count, converted_path = sniff.convert_newlines(dataset.path, in_place=in_place, tmp_dir=tmpdir, tmp_prefix=tmp_prefix)
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext(converted_path or dataset.path, registry.sniff_order)
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension(ext)
    if dataset.type in ('server_dir', 'path_paste') and link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(dataset.path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)
    if not link_data_only and converted_path:
        # Move the dataset to its "real" path
        try:
            shutil.move(converted_path, output_path)
        except OSError as e:
            # We may not have permission to remove converted_path
            if e.errno != errno.EACCES:
                raise
    elif not link_data_only:
        if purge_source:
            shutil.move(dataset.path, output_path)
        else:
            shutil.copy(dataset.path, output_path)
    # Write the job info
    stdout = stdout or 'uploaded %s file' % data_type
    info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout=stdout, name=dataset.name, line_count=line_count)
    if dataset.get('uuid', None) is not None:
        info['uuid'] = dataset.get('uuid')
    json_file.write(dumps(info) + "\n")
    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(output_path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(output_path)
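The gzip and bz2 branches above both stream the archive into a temporary file created next to the upload output, reading a fixed-size chunk at a time so arbitrarily large uploads never have to fit in memory, and only then swap the decompressed copy into place. A minimal, self-contained sketch of that streaming pattern follows; the helper name and prefix are illustrative and not part of upload.py.

import gzip
import os
import shutil
import tempfile


def gunzip_to_tempfile(gzipped_path, work_dir, chunk_size=2 ** 20):
    # Stream-decompress gzipped_path into a temp file inside work_dir,
    # copying chunk_size bytes (1 MB by default) at a time, and return
    # the path of the decompressed copy; the temp file is removed on error.
    fd, uncompressed = tempfile.mkstemp(prefix='upload_gunzip_', dir=work_dir, text=False)
    try:
        with os.fdopen(fd, 'wb') as dst, gzip.open(gzipped_path, 'rb') as src:
            shutil.copyfileobj(src, dst, length=chunk_size)
    except OSError:
        os.remove(uncompressed)
        raise
    return uncompressed

upload.py additionally chmods the result and, depending on in_place, either re-points dataset.path at the temporary file or moves it over the original upload.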
def _resolve_src(item):
    converted_path = None
    name, path = _has_src_to_path(item)
    dbkey = item.get("dbkey", "?")
    requested_ext = item.get("ext", "auto")
    info = item.get("info", None)
    object_id = item.get("object_id", None)
    link_data_only = upload_config.link_data_only
    if "link_data_only" in item:
        # Allow overriding this on a per file basis.
        link_data_only = _link_data_only(item)
    to_posix_lines = upload_config.get_option(item, "to_posix_lines")
    space_to_tab = upload_config.get_option(item, "space_to_tab")
    in_place = item.get("in_place", False)
    purge_source = item.get("purge_source", True)

    # Follow upload.py logic but without the auto-decompress logic.
    registry = upload_config.registry
    check_content = upload_config.check_content
    data_type, ext = None, requested_ext
    is_binary = check_binary(path)
    if is_binary:
        data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry)
    if data_type is None:
        root_datatype = registry.get_datatype_by_extension(ext)
        if getattr(root_datatype, 'compressed', False):
            data_type = 'compressed archive'
        elif is_binary:
            data_type, ext = handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry)
    if not data_type and check_content and check_html(path):
        raise UploadProblemException('The uploaded file contains inappropriate HTML content')

    if data_type != 'binary':
        if not link_data_only:
            if to_posix_lines:
                if space_to_tab:
                    line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".")
                else:
                    line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".")
            else:
                if space_to_tab:
                    line_count, converted_path = sniff.sep2tabs(path, in_place=in_place, tmp_dir=".")
        if requested_ext == 'auto':
            ext = sniff.guess_ext(converted_path or path, registry.sniff_order)
        else:
            ext = requested_ext
        data_type = ext

    if ext == 'auto' and data_type == 'binary':
        ext = 'data'
    if ext == 'auto' and requested_ext:
        ext = requested_ext
    if ext == 'auto':
        ext = 'data'

    datatype = registry.get_datatype_by_extension(ext)
    if link_data_only:
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming(path):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \
                      '<b>Copy files into Galaxy</b> instead of <b>Link to files without copying into Galaxy</b> so grooming can be performed.'
            raise UploadProblemException(err_msg)

    # If this file is not in the workdir make sure it gets there.
    if not link_data_only and converted_path:
        path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place)
    elif not link_data_only:
        path = upload_config.ensure_in_working_directory(path, purge_source, in_place)

    if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path):
        # Groom the dataset content if necessary
        datatype.groom_dataset_content(path)

    rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only}
    if info is not None:
        rval["info"] = info
    if object_id is not None:
        rval["object_id"] = object_id
    return rval
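_resolve_src returns a plain dict describing the staged file, with info and object_id added only when supplied. Purely for illustration (the values below are made up, not produced by a real run), a pasted tab-separated file uploaded with default options might resolve to something like:

# Illustrative only -- not taken from an actual upload.
example_rval = {
    "name": "sample.tsv",        # display name derived from the source item
    "filename": "sample.tsv",    # path inside the job working directory
    "dbkey": "?",                # genome build; "?" when unspecified
    "ext": "tabular",            # sniffed or explicitly requested extension
    "link_data_only": False,     # file was copied into Galaxy, not linked
}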
                    file_err( err_msg, dataset, json_file )
                    return
    #if not data_type:
        # We must have a text file
        #if check_html( dataset.path ):
            #file_err( 'The uploaded file contains inappropriate HTML content', dataset, json_file )
            #return
    if data_type != 'binary':
        if link_data_only == 'copy_files':
            in_place = True
            if dataset.type in ( 'server_dir', 'path_paste' ):
                in_place = False
            if dataset.space_to_tab:
                line_count, converted_path = sniff.convert_newlines_sep2tabs( dataset.path, in_place=in_place )
            else:
                line_count, converted_path = sniff.convert_newlines( dataset.path, in_place=in_place )
        if dataset.file_type == 'auto':
            ext = sniff.guess_ext( dataset.path, registry.sniff_order )
        else:
            ext = dataset.file_type
        data_type = ext
    # Save job info for the framework
    if ext == 'auto' and dataset.ext:
        ext = dataset.ext
    if ext == 'auto':
        ext = 'data'
    datatype = registry.get_datatype_by_extension( ext )
    if dataset.type in ( 'server_dir', 'path_paste' ) and link_data_only == 'link_to_files':
        # Never alter a file that will not be copied to Galaxy's local file store.
        if datatype.dataset_content_needs_grooming( output_path ):
            err_msg = 'The uploaded files need grooming, so change your <b>Copy data into Galaxy?</b> selection to be ' + \