def __main__():
    """Entry point of the upload script.

    Expected command line::

        upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...

    Every line of the JSON paramfile describes one dataset.  Each dataset is
    handed to ``add_composite_file`` or ``add_file`` together with the output
    path(s) looked up from the parsed output specs, and job information is
    written to ``galaxy.json`` in the current directory.  The paramfile is
    removed afterwards on a best-effort basis.

    Exits with status 1 on a usage error or when no output path was given
    for a dataset.
    """
    if len(sys.argv) < 4:
        # sys.stderr.write behaves the same on Python 2 and 3, unlike the
        # ``print >>`` statement.
        sys.stderr.write('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...\n')
        sys.exit(1)

    output_paths = parse_outputs(sys.argv[4:])

    json_file = open('galaxy.json', 'w')
    try:
        registry = Registry()
        registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2])

        # The paramfile is closed before the cleanup step below removes it.
        with open(sys.argv[3], 'r') as paramfile:
            for line in paramfile:
                dataset = from_json_string(line)
                dataset = util.bunch.Bunch(**safe_dict(dataset))
                try:
                    output_path = output_paths[int(dataset.dataset_id)][0]
                except Exception:
                    sys.stderr.write('Output path for dataset %s not found on command line\n' % dataset.dataset_id)
                    sys.exit(1)
                if dataset.type == 'composite':
                    files_path = output_paths[int(dataset.dataset_id)][1]
                    add_composite_file(dataset, registry, json_file, output_path, files_path)
                else:
                    add_file(dataset, registry, json_file, output_path)
    finally:
        # Make sure the job info is flushed even when we bail out early.
        json_file.close()

    # clean up paramfile
    # TODO: this will not work when running as the actual user unless the
    # parent directory is writable by the user.
    try:
        os.remove(sys.argv[3])
    except OSError:
        # Deliberately best-effort: a leftover paramfile is not fatal.
        pass
def __init__(self, datatypes_registry=None, ext='data', dbkey='?'):
    """Set up extension, dbkey, datatype and metadata for this object.

    :param datatypes_registry: registry used to resolve ``ext``; a fresh
        ``Registry()`` is created when omitted.
    :param ext: datatype extension, stored as both ``ext`` and ``extension``.
    :param dbkey: stored unchanged on the instance.
    """
    self.ext = ext
    self.extension = ext
    self.dbkey = dbkey
    registry = Registry() if datatypes_registry is None else datatypes_registry
    self.datatype = registry.get_datatype_by_extension(ext)
    # _metadata is reset before the MetadataCollection is attached.
    self._metadata = None
    self.metadata = MetadataCollection(self)
def __main__():
    """Entry point of the upload script.

    Reads the dataset descriptions from the paramfile (``sys.argv[3]``),
    processes every dataset with ``add_file`` / ``add_composite_file`` and
    finally writes the collected per-dataset results via
    ``__write_job_metadata``.  An ``UploadProblemException`` for a dataset is
    recorded through ``file_err`` instead of aborting the run.

    Exits with status 1 on a usage error or when a dataset has no output
    path on the command line.
    """
    if len(sys.argv) < 4:
        print('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...', file=sys.stderr)
        sys.exit(1)

    output_paths = parse_outputs(sys.argv[4:])

    registry = Registry()
    registry.load_datatypes(root_dir=sys.argv[1], config=sys.argv[2])

    paramfile_path = sys.argv[3]
    try:
        datasets = __read_paramfile(paramfile_path)
    except (ValueError, AssertionError):
        # Fall back to the older paramfile reader.
        datasets = __read_old_paramfile(paramfile_path)

    metadata = []
    for raw_dataset in datasets:
        dataset = bunch.Bunch(**safe_dict(raw_dataset))
        try:
            output_path = output_paths[int(dataset.dataset_id)][0]
        except Exception:
            print('Output path for dataset %s not found on command line' % dataset.dataset_id, file=sys.stderr)
            sys.exit(1)

        try:
            if dataset.type != 'composite':
                entry = add_file(dataset, registry, output_path)
            else:
                files_path = output_paths[int(dataset.dataset_id)][1]
                entry = add_composite_file(dataset, registry, output_path, files_path)
        except UploadProblemException as e:
            entry = file_err(unicodify(e), dataset)
        metadata.append(entry)

    __write_job_metadata(metadata)
def __main__():
    """Entry point of the upload script.

    Expected command line::

        upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...

    Every line of the JSON paramfile describes one dataset, which is passed
    to ``add_composite_file`` or ``add_file`` along with its output path(s).
    Job information goes to ``galaxy.json`` in the current directory.  The
    paramfile is removed afterwards on a best-effort basis.

    Exits with status 1 on a usage error or when no output path was given
    for a dataset.
    """
    if len(sys.argv) < 4:
        sys.stderr.write('usage: upload.py <root> <datatypes_conf> <json paramfile> <output spec> ...\n')
        sys.exit(1)

    output_paths = parse_outputs(sys.argv[4:])

    json_file = open('galaxy.json', 'w')
    try:
        registry = Registry(sys.argv[1], sys.argv[2])

        # The paramfile is closed before the cleanup step below removes it.
        with open(sys.argv[3], 'r') as paramfile:
            for line in paramfile:
                dataset = from_json_string(line)
                dataset = util.bunch.Bunch(**safe_dict(dataset))
                try:
                    output_path = output_paths[int(dataset.dataset_id)][0]
                except Exception:
                    sys.stderr.write('Output path for dataset %s not found on command line\n' % dataset.dataset_id)
                    sys.exit(1)
                if dataset.type == 'composite':
                    files_path = output_paths[int(dataset.dataset_id)][1]
                    add_composite_file(dataset, registry, json_file, output_path, files_path)
                else:
                    add_file(dataset, registry, json_file, output_path)
    finally:
        json_file.close()

    # clean up paramfile
    # The paramfile is argv[3]; argv[1] is the <root> argument, so removing
    # that always failed silently and left the paramfile behind.
    try:
        os.remove(sys.argv[3])
    except OSError:
        # Deliberately best-effort: a leftover paramfile is not fatal.
        pass
def __init__(self, app_name, security, model, tool_data_path, shed_tool_data_path, tool_data_tables=None, registry=None, hgweb_config_manager=None):
    """Build a minimal application object with a throw-away config.

    A temporary directory (prefix ``tool_validation_``) is created and the
    tool data table / len file configuration entries point inside it.

    :param registry: datatypes registry to use; a fresh ``Registry()`` is
        created when a falsy value is passed.
    """
    self.name = app_name
    self.security = security
    self.model = model

    self.config = Bunch()
    self.config.tool_data_path = tool_data_path
    self.config.shed_tool_data_path = shed_tool_data_path

    self.temporary_path = tempfile.mkdtemp(prefix='tool_validation_')

    def in_temporary_path(file_name):
        # Path of ``file_name`` inside the temporary directory.
        return os.path.join(self.temporary_path, file_name)

    self.config.tool_data_table_config = in_temporary_path('tool_data_table_conf.xml')
    self.config.shed_tool_data_table_config = in_temporary_path('shed_tool_data_table_conf.xml')
    self.tool_data_tables = tool_data_tables
    self.datatypes_registry = registry if registry else Registry()
    self.hgweb_config_manager = hgweb_config_manager
    self.config.len_file_path = in_temporary_path('chromlen.txt')
    # If the builds file path is set to None, tools/__init__.py will load the default.
    # Otherwise it will attempt to load a nonexistent file and log an error.  This does
    # not appear to be an issue with the len_file_path config option.
    self.config.builds_file_path = None
    self.genome_builds = GenomeBuilds(self)
def __init__(self, config):
    """Create the object store and model from ``config`` and install a
    freshly loaded datatypes registry on ``galaxy.model``."""
    object_store = build_object_store_from_config(config)
    self.object_store = object_store

    # Setup the database engine and ORM
    self.model = galaxy.config.init_models_from_config(config, object_store=object_store)

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes()
    galaxy.model.set_datatypes_registry(datatypes_registry)
def __main__():
    """Fetch the data requested by a remote data source.

    ``sys.argv[1]`` is the parameter file (also the default output path) and
    the optional ``sys.argv[2]`` is the maximum accepted file size in bytes
    (0 or missing/invalid means "no limit").  For every entry of
    ``output_data`` the matching URL is fetched with GET or POST and streamed
    to the output file.  Problems are reported through ``stop_err``.
    """
    filename = sys.argv[1]
    try:
        max_file_size = int(sys.argv[2])
    except (IndexError, ValueError):
        max_file_size = 0

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        job_params = dict(param_dict=params)
        job_params['output_data'] = [dict(out_data_name='output',
                                          ext='data',
                                          file_name=filename,
                                          extra_files_path=None)]
        job_params['job_config'] = dict(GALAXY_ROOT_DIR=GALAXY_ROOT_DIR,
                                        GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE,
                                        TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        # Specially named file for output junk to pass onto set metadata.
        # Nothing is written to it in this function, so it is only
        # created/truncated and closed again right away.
        open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w').close()

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(root_dir=job_params['job_config']['GALAXY_ROOT_DIR'],
                                      config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    URL = params.get('URL', None)  # using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get('URL_method', None)

    # The Python support for fetching resources from the web is layered. urllib uses the httplib
    # library, which in turn uses the socket library.  As of Python 2.3 you can specify how long
    # a socket should wait for a response before timing out. By default the socket module has no
    # timeout and can hang. Currently, the socket timeout is not exposed at the httplib or urllib2
    # levels. However, you can set the default timeout ( in seconds ) globally for all sockets by
    # doing the following.
    socket.setdefaulttimeout(600)

    for data_dict in job_params['output_data']:
        cur_filename = data_dict.get('file_name', filename)
        cur_URL = params.get('%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
        if not cur_URL:
            # Leave an empty output file behind before reporting the error.
            open(cur_filename, 'w').close()
            stop_err('The remote data source application has not sent back a URL parameter in the request.')

        if URL_method and URL_method not in ('get', 'post'):
            # Previously this fell through with ``page`` unbound.
            stop_err('Unknown URL_method specified: %s' % URL_method)

        # The following calls to urllib.urlopen() will use the above default timeout
        try:
            if not URL_method or URL_method == 'get':
                page = urllib.urlopen(cur_URL)
            elif URL_method == 'post':
                page = urllib.urlopen(cur_URL, urllib.urlencode(params))
        except Exception as e:
            stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))

        if max_file_size:
            file_size = int(page.info().get('Content-Length', 0))
            if file_size > max_file_size:
                stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))

        # do sniff stream for multi_byte
        try:
            cur_filename, _is_multi_byte = sniff.stream_to_open_named_file(
                page,
                os.open(cur_filename, os.O_WRONLY | os.O_CREAT),
                cur_filename,
                source_encoding=get_charset_from_http_headers(page.headers))
        except Exception as e:
            stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))
def __init__(self, config):
    """Create the object store and model mapping from ``config`` and install
    a freshly loaded datatypes registry on ``galaxy.model``.

    When ``config.database_connection`` is ``False`` it is replaced (on the
    passed-in config object) by a sqlite URL built from ``config.database``.
    """
    if config.database_connection is False:
        sqlite_url_template = "sqlite:///%s?isolation_level=IMMEDIATE"
        config.database_connection = sqlite_url_template % config.database

    self.object_store = build_object_store_from_config(config)

    # Setup the database engine and ORM
    self.model = galaxy.model.mapping.init(
        config.file_path,
        config.database_connection,
        engine_options={},
        create_tables=False,
        object_store=self.object_store,
    )

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes()
    galaxy.model.set_datatypes_registry(datatypes_registry)
def collect_test_data():
    """Describe every file found in ``TEST_FILE_DIR``.

    :returns: dict mapping a file's base name to its ``TEST_DATA`` record,
        built from (path, datatype, uploadable), where ``uploadable`` tells
        whether the datatype's ``file_ext`` is in the registry's
        ``upload_file_formats``.
    """
    registry = Registry()
    registry.load_datatypes(root_dir=GALAXY_ROOT, config=DATATYPES_CONFIG)

    file_names = os.listdir(TEST_FILE_DIR)
    paths = [os.path.join(TEST_FILE_DIR, file_name) for file_name in file_names]
    datatypes = [find_datatype(registry, file_name) for file_name in file_names]
    uploadable_flags = [dt.file_ext in registry.upload_file_formats for dt in datatypes]

    descriptions = [
        TEST_DATA(path, dt, can_upload)
        for path, dt, can_upload in zip(paths, datatypes, uploadable_flags)
    ]

    by_base_name = {}
    for description in descriptions:
        by_base_name[os.path.basename(description.path)] = description
    return by_base_name
def __init__(self, datatypes_registry=None, ext='data', dbkey='?'):
    """Set up extension, dbkey, datatype and metadata for this object.

    :param datatypes_registry: registry used to resolve ``ext``.  When
        omitted, a new ``Registry`` is created and ``load_datatypes()`` is
        called on it (default value required for unit tests).
    :param ext: datatype extension, stored as both ``ext`` and ``extension``.
    :param dbkey: stored unchanged on the instance.
    """
    self.ext = ext
    self.extension = ext
    self.dbkey = dbkey

    registry = datatypes_registry
    if registry is None:
        # Default Value Required for unit tests
        registry = Registry()
        registry.load_datatypes()

    self.datatype = registry.get_datatype_by_extension(ext)
    # _metadata is reset before the MetadataCollection is attached.
    self._metadata = None
    self.metadata = MetadataCollection(self)
def sniff_and_handle_data_type(json_params, output_file):
    """
    Sniff the file type of ``output_file``.

    The sniff.handle_uploaded_dataset_file() method in Galaxy performs dual
    functions: it sniffs the filetype and if it's a compressed archive for a
    non compressed datatype such as fasta, it will be unpacked.

    :returns: the value returned by ``sniff.handle_uploaded_dataset_file``,
        or ``None`` when anything in the process raises.
    """
    try:
        registry = Registry()
        job_config = json_params['job_config']
        registry.load_datatypes(
            root_dir=job_config['GALAXY_ROOT_DIR'],
            config=job_config['GALAXY_DATATYPES_CONF_FILE'],
        )
        return sniff.handle_uploaded_dataset_file(output_file, registry)
    except Exception:
        # Best effort: callers get None when sniffing is not possible.
        return None
def __main__():
    """Write the ``galaxyData`` request parameter to the ``output`` file.

    ``sys.argv[1]`` is the parameter file read by ``load_input_parameters``.
    The datatypes registry is loaded from the job configuration, then the
    value of the ``galaxyData`` parameter is written to the path given by
    the ``output`` parameter.
    """
    filename = sys.argv[1]

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        job_params = dict(param_dict=params)
        job_params["output_data"] = [
            dict(out_data_name="output", ext="data", file_name=filename, files_path=None)
        ]
        job_params["job_config"] = dict(
            GALAXY_ROOT_DIR=GALAXY_ROOT_DIR,
            GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE,
            TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE,
        )

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=job_params["job_config"]["GALAXY_ROOT_DIR"],
        config=job_params["job_config"]["GALAXY_DATATYPES_CONF_FILE"],
    )

    simpleD = params.get("galaxyData")

    # By default the socket module has no timeout and can hang; set the
    # default timeout ( in seconds ) globally for all sockets.
    socket.setdefaulttimeout(600)

    cur_filename = params.get("output")
    # Use a context manager so the data is flushed and the handle closed
    # deterministically.
    with open(cur_filename, "w") as output_file:
        output_file.write(simpleD)
def __main__():
    """Write the ``galaxyData`` request parameter to the ``output`` file.

    ``sys.argv[1]`` is the parameter file read by ``load_input_parameters``.
    When job parameters are available, the tool-provided job metadata file
    is created/truncated.  The datatypes registry is loaded from the job
    configuration, then the value of the ``galaxyData`` parameter is written
    to the path given by the ``output`` parameter.
    """
    filename = sys.argv[1]

    job_params, params = load_input_parameters(filename)
    if job_params is None:  # using an older tabular file
        job_params = dict(param_dict=params)
        job_params['output_data'] = [
            dict(out_data_name='output', ext='data', file_name=filename, extra_files_path=None)
        ]
        job_params['job_config'] = dict(
            GALAXY_ROOT_DIR=GALAXY_ROOT_DIR,
            GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE,
            TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        # Specially named file for output junk to pass onto set metadata.
        # Nothing is written to it here, so it is only created/truncated and
        # closed again instead of being left open.
        open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w').close()

    datatypes_registry = Registry()
    datatypes_registry.load_datatypes(
        root_dir=job_params['job_config']['GALAXY_ROOT_DIR'],
        config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

    simpleD = params.get('galaxyData')

    # By default the socket module has no timeout and can hang; set the
    # default timeout ( in seconds ) globally for all sockets.
    socket.setdefaulttimeout(600)

    cur_filename = params.get('output')
    # Use a context manager so the data is flushed and the handle closed
    # deterministically.
    with open(cur_filename, 'w') as output_file:
        output_file.write(simpleD)
def main(argv=None):
    """Turn an upload request file into ``galaxy.json``.

    :param argv: command line arguments; defaults to ``sys.argv[1:]``.

    The request JSON named on the command line is loaded, converted with
    ``_request_to_galaxy_json`` and dumped to ``galaxy.json`` in the current
    directory.  Raises ``AssertionError`` when the request file is missing.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    args = _arg_parser().parse_args(cli_args)

    registry = Registry()
    registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry)

    request_path = args.request
    assert os.path.exists(request_path)
    with open(request_path) as request_file:
        request = json.load(request_file)

    upload_config = UploadConfig(request, registry)
    galaxy_json = _request_to_galaxy_json(upload_config, request)

    with open("galaxy.json", "w") as galaxy_json_file:
        json.dump(galaxy_json, galaxy_json_file)
def main(argv=None):
    """Turn an upload request file into ``galaxy.json``.

    :param argv: command line arguments; defaults to ``sys.argv[1:]``.

    The request JSON named on the command line is loaded, converted with
    ``_request_to_galaxy_json`` and dumped to ``galaxy.json`` inside the
    working directory (the ``working_directory`` argument, or the current
    directory when that is not set).  Raises ``AssertionError`` when the
    request file is missing.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    args = _arg_parser().parse_args(cli_args)

    registry = Registry()
    registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry)

    request_path = args.request
    assert os.path.exists(request_path)
    with open(request_path) as request_file:
        request = json.load(request_file)

    working_directory = args.working_directory or os.getcwd()
    allow_failed_collections = request.get("allow_failed_collections", False)
    upload_config = UploadConfig(
        request,
        registry,
        working_directory,
        allow_failed_collections,
    )
    galaxy_json = _request_to_galaxy_json(upload_config, request)

    target_path = os.path.join(working_directory, "galaxy.json")
    with open(target_path, "w") as galaxy_json_file:
        json.dump(galaxy_json, galaxy_json_file)
def _configure_datatypes_registry(self, installed_repository_manager=None):
    """Create ``self.datatypes_registry`` and populate it.

    :param installed_repository_manager: when given and
        ``config.load_tool_shed_datatypes`` is set, its proprietary datatypes
        are loaded before the datatypes of the Galaxy distribution.
    """
    config = self.config

    # Create an empty datatypes registry.
    self.datatypes_registry = Registry(config)

    if installed_repository_manager and config.load_tool_shed_datatypes:
        # Load proprietary datatypes defined in datatypes_conf.xml files in all installed tool shed
        # repositories.  Proprietary datatypes are loaded before datatypes in the distribution because
        # Galaxy's default sniffers include some generic sniffers (eg text,xml) which catch anything,
        # so it would otherwise be impossible for proprietary sniffers to be used.  However, if there is
        # a conflict (2 datatypes with the same extension) between a proprietary datatype and a datatype
        # in the Galaxy distribution, the datatype in the Galaxy distribution will take precedence.  If
        # there is a conflict between 2 proprietary datatypes, the datatype from the repository that was
        # installed earliest will take precedence.
        installed_repository_manager.load_proprietary_datatypes()

    # Load the data types in the Galaxy distribution, which are defined in config.datatypes_config.
    for datatypes_config in listify(config.datatypes_config):
        # Setting override=False would make earlier files take precedence -
        # but then they wouldn't override tool shed datatypes.
        self.datatypes_registry.load_datatypes(config.root, datatypes_config, override=True)
from sqlalchemy.sql import label # noqa sys.path.insert( 1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'lib'))) from galaxy.datatypes.registry import Registry from galaxy.model import * # noqa from galaxy.model import set_datatypes_registry # More explicit than `*` import from galaxy.model.mapping import init from galaxy.model.orm.scripts import get_config if sys.version_info > (3, ): long = int registry = Registry() registry.load_datatypes() set_datatypes_registry(registry) db_url = get_config(sys.argv)['db_url'] sa_session = init('/tmp/', db_url).context # Helper function for debugging sqlalchemy queries... # http://stackoverflow.com/questions/5631078/sqlalchemy-print-the-actual-query def printquery(statement, bind=None): """ Print a query, with values filled in for debugging purposes *only* for security, you should always separate queries from their values please also note that this function is quite slow """
def download_from_genomespace_importer(username, token, json_parameter_file, genomespace_site, gs_toolname):
    """Download the files named in the ``URL`` parameter from GenomeSpace.

    The comma-separated ``URL`` entry of the job's ``param_dict`` is walked;
    every file is downloaded, its Galaxy extension is determined and one
    JSON line per dataset is written to the tool-provided job metadata file.

    The extension is determined, in order, from: the ``dataformat`` query
    parameter of the URL, the GenomeSpace file metadata, sniffing the
    downloaded file, the extension found in the URL path, and finally
    ``DEFAULT_GALAXY_EXT``.

    The first download goes to the ``output_file1`` path (when given) and is
    reported as ``type='dataset'``; downloads without a predefined output
    path go through a temporary file, are moved to
    ``primary_<hda_id>_<name>_visible_<ext>`` in the current directory and
    are reported as ``type='new_primary_dataset'``.

    :raises AssertionError: when ``username`` or ``token`` is ``None``.
    :returns: ``True``
    """
    with open(json_parameter_file, 'r') as parameter_file:
        json_params = json.load(parameter_file)
    datasource_params = json_params.get('param_dict')
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output_file1", None)
    dataset_id = base_dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token, gs_toolname=gs_toolname)
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    dm_server = genomespace_site_dict['dmServer']
    set_genomespace_format_identifiers(url_opener, dm_server)

    file_url_name = "URL"
    metadata_parameter_file = open(json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    try:
        # setup datatypes registry for sniffing
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

        url_param = datasource_params.get(file_url_name, None)
        used_filenames = []
        for download_url in url_param.split(','):
            using_temp_file = False
            parsed_url = urlparse.urlparse(download_url)
            query_params = urlparse.parse_qs(parsed_url[4])

            # write file to disk
            new_file_request = urllib2.Request(download_url)
            new_file_request.get_method = lambda: 'GET'
            target_download_url = url_opener.open(new_file_request)

            filename = None
            response_info = target_download_url.info()
            if 'Content-Disposition' in response_info:
                # maxsplit=1 keeps values containing '=' in one piece;
                # otherwise dict() would fail on a 3-element item.
                content_disposition = dict(
                    part.strip().split('=', 1) if '=' in part else (part.strip(), '')
                    for part in response_info['Content-Disposition'].split(';'))
                if 'filename' in content_disposition:
                    filename = content_disposition['filename'].strip("\"'")
            if not filename:
                filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
            if not filename:
                filename = download_url

            if output_filename is None:
                # need to use a temp file here, because we do not know the ext yet
                using_temp_file = True
                # mkstemp creates the file atomically; taking only the name
                # of a NamedTemporaryFile lets the file vanish again before
                # it is reopened below.
                temp_fd, output_filename = tempfile.mkstemp(prefix='tmp-genomespace-importer-')
                os.close(temp_fd)
            with open(output_filename, 'wb') as output_file:
                chunk_write(target_download_url, output_file)

            # determine file format
            file_type = None
            if 'dataformat' in query_params:  # this is a converted dataset
                file_type = query_params['dataformat'][0]
                file_type = get_galaxy_ext_from_genomespace_format_url(url_opener, file_type)
            else:
                try:
                    # get and use GSMetadata object
                    # FIXME: This is a very bad way to get the path for determining metadata.
                    # There needs to be a way to query API using download URL to get to the
                    # metadata object
                    download_file_path = download_url.split("%s/file/" % dm_server, 1)[-1]
                    metadata_request = urllib2.Request(
                        "%s/%s/filemetadata/%s" % (dm_server, GENOMESPACE_API_VERSION_STRING, download_file_path))
                    metadata_request.get_method = lambda: 'GET'
                    metadata_url = url_opener.open(metadata_request)
                    try:
                        file_metadata_dict = json.loads(metadata_url.read())
                    finally:
                        metadata_url.close()
                    # Only assign file_type once a Galaxy ext was resolved, so
                    # a format entry without 'url' (or a failing lookup) leaves
                    # it as None and the fallbacks below still run.
                    data_format = file_metadata_dict.get('dataFormat', None)
                    if data_format and data_format.get('url'):
                        file_type = get_galaxy_ext_from_genomespace_format_url(
                            url_opener, data_format.get('url'), default=None)
                except Exception:
                    # Metadata lookup is best-effort; fall through to sniffing.
                    pass
            if file_type is None:
                # try to sniff datatype
                try:
                    file_type = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
                except Exception:
                    pass  # sniff failed
            if file_type is None and '.' in parsed_url[2]:
                # still no known datatype, fall back to using extension
                file_type = parsed_url[2].rsplit('.', 1)[-1]
                file_type = GENOMESPACE_EXT_TO_GALAXY_EXT.get(file_type, file_type)
            if file_type is None:
                # use default extension (e.g. 'data')
                file_type = DEFAULT_GALAXY_EXT

            # save json info for single primary dataset
            if dataset_id is not None:
                metadata_parameter_file.write("%s\n" % json.dumps(
                    dict(type='dataset',
                         dataset_id=dataset_id,
                         ext=file_type,
                         name="GenomeSpace importer on %s" % (filename))))
            # if using tmp file, move the file to the new file path dir to get scooped up later
            if using_temp_file:
                original_filename = filename
                filename = ''.join(c if c in VALID_CHARS else '-' for c in filename)
                while filename in used_filenames:
                    filename = "-%s" % filename
                used_filenames.append(filename)
                target_output_filename = os.path.join(
                    os.getcwd(),
                    'primary_%i_%s_visible_%s' % (hda_id, filename, file_type))
                shutil.move(output_filename, target_output_filename)
                metadata_parameter_file.write("%s\n" % json.dumps(
                    dict(type='new_primary_dataset',
                         base_dataset_id=base_dataset_id,
                         ext=file_type,
                         filename=target_output_filename,
                         name="GenomeSpace importer on %s" % (original_filename))))
            dataset_id = None  # only one primary dataset available
            output_filename = None  # only have one filename available
    finally:
        # Close the metadata file even when a download fails part-way.
        metadata_parameter_file.close()
    return True
def download_from_genomespace_file_browser(json_parameter_file, genomespace_site):
    """Download the files selected in the GenomeSpace file browser.

    The job's ``param_dict`` carries ``fileUrl<N>`` / ``fileFormat<N>``
    pairs plus the ``gs-username`` / ``gs-token`` credentials.  Files are
    fetched in ascending ``N``; the first one goes to the ``output`` path
    (when given) and is reported as ``type='dataset'``, the others are
    written to ``primary_<hda_id>_<name>_visible_<ext>`` in the current
    directory and reported as ``type='new_primary_dataset'``.  One JSON line
    per reported dataset is written to the tool-provided job metadata file.

    :raises AssertionError: when the username or token is missing.
    :raises Exception: when no ``fileUrl<N>`` parameter is present (the
        ``output`` file, if any, is emptied first).
    :returns: ``True``
    """
    with open(json_parameter_file, 'r') as parameter_file:
        json_params = json.load(parameter_file)
    datasource_params = json_params.get('param_dict')
    username = datasource_params.get("gs-username", None)
    token = datasource_params.get("gs-token", None)
    assert None not in [username, token], "Missing GenomeSpace username or token."
    output_filename = datasource_params.get("output", None)
    dataset_id = json_params['output_data'][0]['dataset_id']
    hda_id = json_params['output_data'][0]['hda_id']
    url_opener = get_cookie_opener(username, token)
    # load and set genomespace format ids to galaxy exts
    genomespace_site_dict = get_genomespace_site_urls()[genomespace_site]
    set_genomespace_format_identifiers(url_opener, genomespace_site_dict['dmServer'])

    file_url_prefix = "fileUrl"
    file_type_prefix = "fileFormat"
    metadata_parameter_file = open(json_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'wb')
    try:
        # setup datatypes registry for sniffing
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=json_params['job_config']['GALAXY_ROOT_DIR'],
            config=json_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

        file_numbers = sorted(
            int(name[len(file_url_prefix):])
            for name in datasource_params
            if name.startswith(file_url_prefix))
        if not file_numbers:
            if output_filename:
                open(output_filename, 'wb').close()  # erase contents of file
            raise Exception("You must select at least one file to import into Galaxy.")

        used_filenames = []
        for file_num in file_numbers:
            url_key = "%s%i" % (file_url_prefix, file_num)
            download_url = datasource_params.get(url_key, None)
            if download_url is None:
                break
            filetype_key = "%s%i" % (file_type_prefix, file_num)
            filetype_url = datasource_params.get(filetype_key, None)
            galaxy_ext = get_galaxy_ext_from_genomespace_format_url(url_opener, filetype_url)
            formated_download_url = "%s?%s" % (
                download_url, urllib.urlencode([('dataformat', filetype_url)]))
            new_file_request = urllib2.Request(formated_download_url)
            new_file_request.get_method = lambda: 'GET'
            target_download_url = url_opener.open(new_file_request)

            filename = None
            response_info = target_download_url.info()
            if 'Content-Disposition' in response_info:
                # If the response has Content-Disposition, try to get filename from it.
                # maxsplit=1 keeps values containing '=' in one piece;
                # otherwise dict() would fail on a 3-element item.
                content_disposition = dict(
                    part.strip().split('=', 1) if '=' in part else (part.strip(), '')
                    for part in response_info['Content-Disposition'].split(';'))
                if 'filename' in content_disposition:
                    filename = content_disposition['filename'].strip("\"'")
            if not filename:
                parsed_url = urlparse.urlparse(download_url)
                filename = urllib.unquote_plus(parsed_url[2].split('/')[-1])
            if not filename:
                filename = download_url

            metadata_dict = None
            original_filename = filename
            if output_filename is None:
                filename = ''.join(c if c in VALID_CHARS else '-' for c in filename)
                while filename in used_filenames:
                    filename = "-%s" % filename
                used_filenames.append(filename)
                output_filename = os.path.join(
                    os.getcwd(),
                    'primary_%i_%s_visible_%s' % (hda_id, filename, galaxy_ext))
                metadata_dict = dict(type='new_primary_dataset',
                                     base_dataset_id=dataset_id,
                                     ext=galaxy_ext,
                                     filename=output_filename,
                                     name="GenomeSpace import on %s" % (original_filename))
            elif dataset_id is not None:
                metadata_dict = dict(type='dataset',
                                     dataset_id=dataset_id,
                                     ext=galaxy_ext,
                                     name="GenomeSpace import on %s" % (filename))

            with open(output_filename, 'wb') as output_file:
                chunk_write(target_download_url, output_file)

            if (galaxy_ext == AUTO_GALAXY_EXT or filetype_url == GENOMESPACE_FORMAT_IDENTIFIER_UNKNOWN) and metadata_dict:
                # try to sniff datatype
                try:
                    galaxy_ext = sniff.handle_uploaded_dataset_file(output_filename, datatypes_registry)
                except Exception:
                    # sniff failed, fall back to the file name's extension
                    galaxy_ext = original_filename.rsplit('.', 1)[-1]
                    if galaxy_ext not in datatypes_registry.datatypes_by_extension:
                        galaxy_ext = DEFAULT_GALAXY_EXT
                metadata_dict['ext'] = galaxy_ext

            output_filename = None  # only have one filename available

            # write out metadata info
            if metadata_dict:
                metadata_parameter_file.write("%s\n" % json.dumps(metadata_dict))
    finally:
        # Close the metadata file even when we raise or a download fails.
        metadata_parameter_file.close()
    return True
def __main__():
    """Fetch the data requested by a remote data source.

    ``sys.argv[1]`` is the parameter file (also the default output path) and
    the optional ``sys.argv[2]`` is the maximum accepted file size in bytes
    (0 or missing/invalid means "no limit").  For every entry of
    ``output_data`` the matching http/https/ftp URL is fetched with GET or
    POST and streamed to the output file.  When job parameters are available
    ("enhanced handling"), the downloaded data is additionally passed through
    ``sniff.handle_uploaded_dataset_file`` and a JSON line with the resulting
    extension is appended to the tool-provided job metadata file.

    Problems are reported through ``stop_err``.
    """
    filename = sys.argv[1]
    try:
        max_file_size = int(sys.argv[2])
    except Exception:
        max_file_size = 0

    job_params, params = load_input_parameters(filename)
    json_file = None
    if job_params is None:  # using an older tabular file
        enhanced_handling = False
        job_params = dict(param_dict=params)
        job_params['output_data'] = [
            dict(out_data_name='output', ext='data', file_name=filename, extra_files_path=None)
        ]
        job_params['job_config'] = dict(
            GALAXY_ROOT_DIR=GALAXY_ROOT_DIR,
            GALAXY_DATATYPES_CONF_FILE=GALAXY_DATATYPES_CONF_FILE,
            TOOL_PROVIDED_JOB_METADATA_FILE=TOOL_PROVIDED_JOB_METADATA_FILE)
    else:
        enhanced_handling = True
        # specially named file for output junk to pass onto set metadata
        json_file = open(job_params['job_config']['TOOL_PROVIDED_JOB_METADATA_FILE'], 'w')

    try:
        datatypes_registry = Registry()
        datatypes_registry.load_datatypes(
            root_dir=job_params['job_config']['GALAXY_ROOT_DIR'],
            config=job_params['job_config']['GALAXY_DATATYPES_CONF_FILE'])

        URL = params.get('URL', None)  # using exactly URL indicates that only one dataset is being downloaded
        URL_method = params.get('URL_method', None)

        for data_dict in job_params['output_data']:
            cur_filename = data_dict.get('file_name', filename)
            cur_URL = params.get(
                '%s|%s|URL' % (GALAXY_PARAM_PREFIX, data_dict['out_data_name']), URL)
            if not cur_URL or urlparse(cur_URL).scheme not in ('http', 'https', 'ftp'):
                # Leave an empty output file behind before reporting the error.
                open(cur_filename, 'w').close()
                stop_err('The remote data source application has not sent back a URL parameter in the request.')

            if URL_method and URL_method not in ('get', 'post'):
                # Previously this fell through with ``page`` unbound.
                stop_err('Unknown URL_method specified: %s' % URL_method)

            # The timeout is passed explicitly to every urlopen() call.
            try:
                if not URL_method or URL_method == 'get':
                    page = urlopen(cur_URL, timeout=DEFAULT_SOCKET_TIMEOUT)
                elif URL_method == 'post':
                    page = urlopen(cur_URL,
                                   urlencode(params).encode("utf-8"),
                                   timeout=DEFAULT_SOCKET_TIMEOUT)
            except Exception as e:
                stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))

            if max_file_size:
                file_size = int(page.info().get('Content-Length', 0))
                if file_size > max_file_size:
                    stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))

            try:
                cur_filename = sniff.stream_to_open_named_file(
                    page,
                    os.open(cur_filename, os.O_WRONLY | os.O_CREAT),
                    cur_filename,
                    source_encoding=get_charset_from_http_headers(page.headers))
            except Exception as e:
                stop_err('Unable to fetch %s:\n%s' % (cur_URL, e))

            # here import checks that upload tool performs
            if enhanced_handling:
                try:
                    # NOTE(review): this inspects ``filename`` (argv[1]) rather
                    # than ``cur_filename``; with several outputs that looks
                    # unintended - confirm before changing.
                    ext = sniff.handle_uploaded_dataset_file(filename, datatypes_registry, ext=data_dict['ext'])
                except Exception as e:
                    stop_err(str(e))
                info = dict(type='dataset', dataset_id=data_dict['dataset_id'], ext=ext)
                json_file.write("%s\n" % dumps(info))
    finally:
        # Flush and close the metadata file on every exit path.
        if json_file is not None:
            json_file.close()
pass else: # this should not happen, but it's here just in case shutil.copy(dataset.path, output_path) else: shutil.move(dataset.path, output_path) # Write the job info info = dict(type='dataset', dataset_id=dataset.dataset_id, ext=ext, stdout='uploaded %s file' % data_type, name=dataset.name, line_count=line_count) json_file.write(to_json_string(info) + "\n") # Groom the dataset content if necessary datatype = Registry().get_datatype_by_extension(ext) datatype.groom_dataset_content(output_path) def add_composite_file(dataset, json_file, output_path, files_path): if dataset.composite_files: os.mkdir(files_path) for name, value in dataset.composite_files.iteritems(): value = util.bunch.Bunch(**value) if dataset.composite_file_paths[ value.name] is None and not value.optional: file_err( 'A required composite data file was not provided (%s)' % name, dataset, json_file) break elif dataset.composite_file_paths[value.name] is not None:
def load_datatypes_registry(job_params):
    """Return a ``Registry`` loaded from the job configuration.

    :param job_params: mapping whose ``'job_config'`` entry provides
        ``GALAXY_ROOT_DIR`` and ``GALAXY_DATATYPES_CONF_FILE``.
    """
    registry = Registry()
    job_config = job_params['job_config']
    registry.load_datatypes(
        root_dir=job_config['GALAXY_ROOT_DIR'],
        config=job_config['GALAXY_DATATYPES_CONF_FILE'],
    )
    return registry