def extract_bag(local_bag_archive_path):
    """Unarchive a local bdbag and return it as a bagit.Bag.

    Places the unarchived bag next to the archived one, minus the
    archived bag's extension."""
    local_bag, _ = os.path.splitext(local_bag_archive_path)
    bdbag_api.extract_bag(local_bag_archive_path, os.path.dirname(local_bag))
    bagit_bag = bagit.Bag(local_bag)
    return bagit_bag

def extract_bag(bdbag_zip_path, output_directory=None, project_name=None):
    """Extract BDBag contents into a named output directory in the original BDBag location."""
    (before, sep, after) = bdbag_zip_path.rpartition('.zip')
    prefix = os.path.basename(before)
    if project_name:
        prefix = project_name
    outdir = os.path.dirname(before)
    if output_directory:
        outdir = output_directory
    outdir = os.path.normpath(outdir)
    bdbag_api.extract_bag(bdbag_zip_path, output_path=outdir)
    return os.path.join(outdir, prefix, "data")

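A minimal usage sketch for the wrapper above. The archive name and project name are placeholders, and note that the wrapper assumes the extracted bag's root directory matches project_name when one is given:

# Hypothetical call; 'my-bag.zip' and 'my-project' are placeholder values.
data_dir = extract_bag("my-bag.zip", project_name="my-project")
print(data_dir)  # e.g. ./my-project/data
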
def test_extract_bag_archive_zip_with_relocate_existing(self):
    logger.info(self.getTestHeader('extract bag zip format, relocate existing'))
    try:
        bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.zip'), temp=False)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
        output = self.stream.getvalue()
        self.assertExpectedMessages(["moving existing directory"], output)
    except Exception as e:
        self.fail(get_typed_exception(e))

def parse(self, bag_archive, output_path="out"):
    """
    Analyze the bag, consuming BagIt-RO metadata into a structure
    downstream code emitters can use.
    """
    manifest = {}

    # Extract the bag.
    bag_path = bdbag_api.extract_bag(bag_archive, output_path=output_path)
    if bdbag_api.is_bag(bag_path):
        logger.debug("Initializing metadata datasets")
        manifest['path'] = bag_path
        manifest['datasets'] = {}
        datasets = manifest['datasets']
        data_path = os.path.join(bag_path, "data")

        # Decompress the gzipped CSV files.
        tar_data_files = glob.glob(os.path.join(data_path, "*.csv.gz"))
        for f in tar_data_files:
            with gzip.open(f, 'rb') as zipped:
                extracted = f.replace(".gz", "")
                with open(extracted, "wb") as stream:
                    file_content = zipped.read()
                    stream.write(file_content)

        # Collect metadata for each file.
        data_files = glob.glob(os.path.join(data_path, "*.csv"))
        csv_filter = CSVFilter()
        for f in data_files:
            csv_filter.filter_data(f)
            logger.debug(f" --collecting metadata for: {f}")
            jsonld_context = self._get_jsonld_context(f)
            datasets[f] = jsonld_context
            context = datasets[f]['@context']
            datasets[f]['columns'] = {
                k: None for k in context if isinstance(context[k], dict)
            }
    return manifest

def validate(ro_path):
    """
    Validates the research object at the path `ro_path`, yielding an error
    for each problem encountered.

    May raise urllib.error.HTTPError, ValidationError, MissingManifestError,
    json.decoder.JSONDecodeError.

    :param ro_path: relative or absolute path to the root directory of the research object
    """
    # Extract bag to temp directory and process the RO as a directory
    ro_path = bdbag_api.extract_bag(ro_path, temp=True)
    ro_path = os.path.abspath(ro_path)

    # Validate BagIt RO bag
    # try:
    #     bdbag_api.validate_bag(ro_path)
    #     bdbag_api.validate_bag_structure(ro_path)
    # except BagValidationError as err:
    #     yield err

    # Get graphs for manifest and main profile
    try:
        manifest_path = find_manifest(ro_path)
    except MissingManifestError as err:
        yield err
        return
    manifest_graph = get_graph(path_to_uri(manifest_path), base=ro_path)
    for err in validate_graph(manifest_graph, base=ro_path):
        yield err

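Since validate is a generator that yields errors rather than raising them, a caller drains it; a hedged sketch with a placeholder path:

# Collect all validation errors from the generator above.
errors = list(validate("my-research-object.zip"))  # placeholder path
if errors:
    for err in errors:
        print("Validation error:", err)
else:
    print("Research object is valid.")
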
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using frictionless.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Raises:
        InvalidInput: If the input cannot be extracted or is not a valid BDBag.
        ValidationException: If the TableSchema cannot be found or fails validation.
    """
    if os.path.isfile(data_path):
        archive_file = data_path
        try:
            data_path = bdbag_api.extract_bag(data_path, temp=True)
        except Exception as e:
            raise InvalidInput("Error extracting %s: %s" % (archive_file, e))
        if not bdbag_api.is_bag(data_path):
            raise InvalidInput(
                "Input %s does not appear to be a valid BDBag. This tool requires a"
                " prepared BDBag archive when invoked on an existing archive file."
                % archive_file)
    # If data_path is a directory, find the TableSchema JSON
    if os.path.isdir(data_path):
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            raise ValidationException("No TableSchema JSON file found")
        elif len(desc_file_list) > 1:
            raise ValidationException("Multiple JSON files found in directory")
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # Read into Package
    try:
        pkg = Package(data_path)
        report = validate(pkg, schema=schema)
    except FrictionlessException as e:
        raise ValidationException("Validation error\n%s" % e.error.message)
    if not report.valid:
        msg = ""
        if report.errors:
            msg = report.errors[0]['message']
        else:
            for task in report['tasks']:
                if not task.valid:
                    msg = task['resource']['path'] + "\n"
                    msg += task['errors'][0]['message']
        raise ValidationException("Validation error in %s" % msg)

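This frictionless-based variant signals failure by raising rather than returning a result dict; a hedged calling sketch, where InvalidInput and ValidationException are the surrounding project's exception classes and the path is a placeholder:

try:
    ts_validate("table-schema-bag.zip")
    print("TableSchema is valid.")
except (InvalidInput, ValidationException) as e:
    print("Validation failed:", e)
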
def test_extract_bag_archive_tar(self):
    logger.info(self.getTestHeader('extract bag tar format'))
    try:
        bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
    except Exception as e:
        self.fail(get_typed_exception(e))

def test_extract_bag_archive_tar(self):
    logger.info(self.getTestHeader('extract bag tar format'))
    try:
        bag_path = bdb.extract_bag(ospj(self.test_archive_dir, 'test-bag.tar'), temp=True)
        self.assertTrue(ospe(bag_path))
        self.assertTrue(bdb.is_bag(bag_path))
        bdb.cleanup_bag(os.path.dirname(bag_path))
    except Exception as e:
        self.fail(bdbag.get_named_exception(e))

def get_gtex_files_by_id():
    lbags = [
        fname for fname in os.listdir(BAG_DIR)
        if fname.endswith('.zip') and fname not in BLACKLISTED_BAGS
    ]
    bag_info = {}
    for local_bag_archive in lbags:
        local_bag_archive = os.path.abspath(BAG_DIR + '/' + local_bag_archive)
        local_bag, _ = os.path.splitext(local_bag_archive)
        if not os.path.exists(local_bag):
            try:
                bdbag_api.extract_bag(local_bag_archive, os.path.dirname(local_bag))
            except RuntimeError:
                continue
        with open(local_bag + '/fetch.txt') as lbfh:
            content = lbfh.read()
        cram_fname = content.split('\t')[0]
        cram_basename = os.path.basename(cram_fname)
        gtex_id = cram_basename.split('.', 1)[0]
        if bag_info.get(gtex_id):
            raise ValueError('Another bag exists with this run info: '
                             '\n1: {}\n2: {}'.format(bag_info.get(gtex_id)['file'],
                                                     local_bag_archive))
        bag_info[gtex_id] = {
            'id': gtex_id,
            'file': local_bag_archive,
            'basename': os.path.basename(local_bag_archive),
            'size': os.stat(local_bag_archive).st_size,
            'md5': md5(local_bag_archive),
            'location': 'https://bags.fair-research.org/{}'.format(
                os.path.basename(local_bag_archive))
        }
    return bag_info

def fetch_bags(minids):
    """Given a list of minid bag models, follow their location and fetch
    the data associated with them, if it doesn't already exist on the
    filesystem. Returns a list of bagit bag objects."""
    bags = _resolve_minids_to_bags(minids)
    bagit_bags = []
    for bag in bags:
        bag_name = os.path.basename(bag.location)
        local_bag_archive = os.path.join(settings.BAG_STAGING_DIR, bag_name)
        if not os.path.exists(local_bag_archive):
            r = requests.get(bag.location, stream=True)
            if r.status_code == 200:
                with open(local_bag_archive, 'wb') as f:
                    for chunk in r.iter_content(HTTP_CHUNK_SIZE):
                        f.write(chunk)
        local_bag, _ = os.path.splitext(local_bag_archive)
        if not os.path.exists(local_bag):
            bdbag_api.extract_bag(local_bag_archive, os.path.dirname(local_bag))
        bagit_bag = bagit.Bag(local_bag)
        bagit_bags.append(bagit_bag)
    return bagit_bags

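A hedged sketch of consuming fetch_bags; the minids below are fake placeholders, and the loop relies only on standard bagit.Bag attributes:

for bag in fetch_bags(["ark:/99999/fk4-example-1", "ark:/99999/fk4-example-2"]):
    # bag is a bagit.Bag; path and payload_files() are part of its public API
    print(bag.path, len(list(bag.payload_files())))
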
def testExportBag(self):
    tale = self._create_water_tale()
    export_path = '/tale/{}/export'.format(str(tale['_id']))
    resp = self.request(path=export_path, method='GET',
                        params={'taleFormat': 'bagit'},
                        isJson=False, user=self.user)
    dirpath = tempfile.mkdtemp()
    bag_file = os.path.join(dirpath, "{}.zip".format(str(tale['_id'])))
    with open(bag_file, 'wb') as fp:
        for content in resp.body:
            fp.write(content)
    temp_path = bdb.extract_bag(bag_file, temp=True)
    try:
        bdb.validate_bag_structure(temp_path)
    except bagit.BagValidationError:
        pass  # TODO: Goes without saying that we should not be doing that...
    shutil.rmtree(dirpath)

    # Test dataSetCitation
    resp = self.request(path='/tale/{_id}'.format(**tale), method='PUT',
                        type='application/json', user=self.user,
                        body=json.dumps({
                            'dataSet': [],
                            'imageId': str(tale['imageId']),
                            'public': tale['public'],
                        }))
    self.assertStatusOk(resp)
    tale = resp.json
    self.assertEqual(tale['dataSetCitation'], [])
    self.model('tale', 'wholetale').remove(tale)
    self.model('collection').remove(self.data_collection)

def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using the Datapackage package.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    # If data_path is a BDBag archive, unarchive to a temp dir
    try:
        data_path = bdbag_api.extract_bag(data_path, temp=True)
    # data_path is not an archive
    except RuntimeError:
        pass
    # If data_path is a dir (incl. if it was unarchived), find the JSON descriptor
    if os.path.isdir(data_path):
        # If a 'data' dir is present, search there instead
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        # Find the .json file (cannot be hidden)
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            return {
                "is_valid": False,
                "raw_errors": [FileNotFoundError("No TableSchema JSON file found.")],
                "error": "No TableSchema JSON file found."
            }
        elif len(desc_file_list) > 1:
            return {
                "is_valid": False,
                "raw_errors": [RuntimeError("Multiple JSON files found in directory.")],
                "error": "Multiple JSON files found in directory."
            }
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # data_path should/must be a file now (the JSON descriptor)
    if not os.path.isfile(data_path):
        return {
            "is_valid": False,
            "raw_errors": [
                ValueError("Path '{}' does not refer to a file".format(data_path))
            ],
            "error": "Path '{}' does not refer to a file".format(data_path)
        }
    # Read into Package (identical to DataPackage), return error on failure
    try:
        pkg = Package(descriptor=data_path, strict=True)
    except Exception as e:
        return {
            "is_valid": False,
            "raw_errors": e.errors,
            "error": "\n".join([str(err) for err in e.errors])
        }
    # Check and return package validity based on non-Exception-throwing Package validation
    if not pkg.valid:
        return {
            "is_valid": pkg.valid,
            "raw_errors": pkg.errors,
            "error": "\n".join([str(err) for err in pkg.errors])
        }
    # Perform manual validation as well
    for resource in pkg.resources:
        try:
            resource.read()
        except CastError as e:
            return {
                "is_valid": False,
                "raw_errors": e.errors,
                "error": "\n".join([str(err) for err in e.errors])
            }
        except Exception as e:
            return {"is_valid": False, "raw_errors": [e], "error": str(e)}
    return {"is_valid": True, "raw_errors": [], "error": None}

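Unlike the exception-raising variant earlier, this version reports everything through its return value; a small sketch of checking it (placeholder path):

result = ts_validate("table-schema-bag.zip")
if not result["is_valid"]:
    print(result["error"])
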
def main():
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)
    archive = None
    temp_path = None
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to
            # validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path,
                True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path,
                              force=True if args.resolve_fetch == 'all' else False,
                              keychain_file=args.keychain_file,
                              config_file=args.config_file,
                              filter_expr=args.fetch_filter)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(temp_path if temp_path else path,
                                 fast=True if args.validate == 'fast' else False,
                                 config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)
    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        if not args.quiet:
            sys.stderr.write('\n')

    return result

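The temp-extract/cleanup pattern the CLI uses above is worth isolating; a minimal sketch assuming only the public bdbag API, with a hypothetical helper name and placeholder path:

from bdbag import bdbag_api as bdb
import os

def validate_archive(archive_path):
    """Hypothetical helper: validate a bag archive via a temporary extraction."""
    temp_path = None
    try:
        temp_path = bdb.extract_bag(archive_path, temp=True)
        bdb.validate_bag(temp_path, fast=False)
    finally:
        if temp_path:
            # extract_bag(temp=True) nests the bag inside a temp dir; remove the parent
            bdb.cleanup_bag(os.path.dirname(temp_path))
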
def restore(self, **kwargs):
    """
    Perform the catalog restore operation. The restore process is broken up into six phases:

    1. Pre-process the input path.
       - If the input path is a file, it is assumed that it is a compressed archive file that can be
         extracted into an input directory via a supported codec: `tar`, `tgz`, `bz2`, or `zip`.
       - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
    2. The catalog schema will be restored first. The schema is restored from an ERMrest JSON schema
       document file. The schema document file must be named `catalog-schema.json` and must appear
       at the root of the input directory. The restore process can be configured to exclude the
       restoration of an enumerated set of both schemas and tables.
    3. The catalog table data will be restored, if present. The table data restoration process is
       resilient to interruption and may be restarted. However, if the catalog schema or data is
       mutated outside of the scope of the restore function in-between such restarts, the restored
       catalog's consistency cannot be guaranteed. The restore process can be configured to exclude
       the restoration of table data for a set of tables.
    4. The catalog foreign keys will be restored.
    5. The catalog assets will be restored, if present.
    6. On success, the restore state marker annotations will be deleted and the catalog history
       will be truncated.

    :param kwargs:
    :return:
    """
    success = True
    start = datetime.datetime.now()

    # pre-process input
    logging.info("Processing input path: %s" % self.input_path)
    is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
    if not (is_file or is_dir or is_uri):
        raise DerivaRestoreError(
            "Invalid input path [%s]. If the specified input path refers to a locally mounted "
            "file or directory, it does not exist or cannot be accessed. If the specified "
            "path is a URI, the scheme component of the URI could not be determined." % self.input_path)
    if is_file or is_dir:
        self.input_path = os.path.abspath(self.input_path)
    if is_file:
        logging.info("The input path [%s] is a file. Assuming input file is a directory archive "
                     "and extracting..." % self.input_path)
        self.input_path = bdb.extract_bag(self.input_path)

    try:
        if not self.no_bag_materialize:
            self.input_path = bdb.materialize(self.input_path)
    except bdb.bdbagit.BagValidationError as e:
        if self.strict_bag_validation:
            raise DerivaRestoreError(format_exception(e))
        else:
            logging.warning("Input bag validation failed and strict validation mode is disabled. %s" %
                            format_exception(e))

    is_bag = bdb.is_bag(self.input_path)
    src_schema_file = os.path.abspath(
        os.path.join(self.input_path, "data" if is_bag else "", "catalog-schema.json"))
    # the src_catalog_stub created below will never be "connected" in any kind of network sense,
    # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
    src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
    src_model = Model.fromfile(src_catalog_stub, src_schema_file)

    # initialize/connect to destination catalog
    if not self.catalog_id:
        self.catalog_id = self.server.create_ermrest_catalog().catalog_id
        self.server_args["catalog_id"] = self.catalog_id
        logging.info("Created new target catalog with ID: %s" % self.catalog_id)
    self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

    # init dcctx cid to a default
    self.dst_catalog.dcctx['cid'] = self.__class__.__name__

    # build up the model content we will copy to destination
    dst_model = self.dst_catalog.getCatalogModel()

    logging.info("Restoring %s to catalog: %s" % (self.input_path, self.dst_catalog.get_server_uri()))
    # set top-level config right away and find fatal usage errors...
    if self.restore_policy:
        logging.info("Restoring top-level catalog ACLs...")
        if not src_model.acls:
            logging.info("Source schema does not contain any ACLs.")
        else:
            src_model.acls.owner.extend(dst_model.acls.owner)
            self.dst_catalog.put('/acl', json=src_model.acls)
    if self.restore_annotations:
        logging.info("Restoring top-level catalog annotations...")
        self.dst_catalog.put('/annotation', json=src_model.annotations)

    # build up the model content we will copy to destination
    dst_model = self.dst_catalog.getCatalogModel()
    new_model = []
    new_columns = []  # ERMrest does not currently allow bulk column creation
    new_keys = []  # ERMrest does not currently allow bulk key creation
    restore_states = {}
    fkeys_deferred = {}
    exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas
    try:
        for sname, schema in src_model.schemas.items():
            if sname in exclude_schemas:
                continue
            if sname not in dst_model.schemas:
                new_model.append(self.copy_sdef(schema))

            for tname, table in schema.tables.items():
                if table.kind != 'table':
                    logging.warning('Skipping restore of %s %s:%s' % (table.kind, sname, tname))
                    continue
                if 'RID' not in table.column_definitions.elements:
                    raise DerivaRestoreError(
                        "Source table %s.%s lacks system-columns and cannot be restored." % (sname, tname))

                # make sure the source table is pruned of any existing restore state markers
                if table.annotations.get(CLONE_STATE_URL) is not None:
                    del table.annotations[CLONE_STATE_URL]
                if table.annotations.get(self.RESTORE_STATE_URL) is not None:
                    del table.annotations[self.RESTORE_STATE_URL]

                if sname not in dst_model.schemas or tname not in dst_model.schemas[sname].tables:
                    new_model.append(self.copy_tdef_core(table))
                    restore_states[(sname, tname)] = 1 if self.restore_data else None
                    fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)
                else:
                    src_columns = {c.name: c for c in table.column_definitions}
                    dst_columns = {c.name: c
                                   for c in dst_model.schemas[sname].tables[tname].column_definitions}

                    for cname in src_columns:
                        if cname not in dst_columns:
                            new_columns.append(self.copy_cdef(src_columns[cname]))
                        else:
                            self.check_column_compatibility(src_columns[cname], dst_columns[cname])

                    for cname in dst_columns:
                        if cname not in src_columns:
                            raise DerivaRestoreError(
                                "Destination column %s.%s.%s does not exist in source catalog." %
                                (sname, tname, cname))

                    src_keys = {tuple(sorted(c.name for c in key.unique_columns)): key
                                for key in table.keys}
                    dst_keys = {tuple(sorted(c.name for c in key.unique_columns)): key
                                for key in dst_model.schemas[sname].tables[tname].keys}

                    for utuple in src_keys:
                        if utuple not in dst_keys:
                            new_keys.append(self.copy_kdef(src_keys[utuple]))

                    for utuple in dst_keys:
                        if utuple not in src_keys:
                            raise DerivaRestoreError(
                                "Destination key %s.%s(%s) does not exist in source catalog." %
                                (sname, tname, ', '.join(utuple)))

                    restore_states[(sname, tname)] = \
                        dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)
                    if dst_model.schemas[sname].tables[tname].foreign_keys:
                        # assume that presence of any destination foreign keys means we already completed
                        if self.restore_assets:
                            self.upload_assets()
                        return
                    else:
                        fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)

        # apply the stage 1 model to the destination in bulk
        logging.info("Restoring catalog schema...")
        if new_model:
            self.dst_catalog.post("/schema", json=new_model).raise_for_status()

        for sname, tname, cdef in new_columns:
            self.dst_catalog.post("/schema/%s/table/%s/column" % (urlquote(sname), urlquote(tname)),
                                  json=cdef).raise_for_status()

        for sname, tname, kdef in new_keys:
            self.dst_catalog.post("/schema/%s/table/%s/key" % (urlquote(sname), urlquote(tname)),
                                  json=kdef).raise_for_status()

        # copy data in stage 2
        if self.restore_data:
            logging.info("Restoring catalog data...")
            for sname, tname in restore_states.keys():
                tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                if restore_states[(sname, tname)] == 1:
                    # determine current position in (partial?) copy
                    row = self.dst_catalog.get("/entity/%s@sort(RID::desc::)?limit=1" % tname_uri).json()
                    if row:
                        last = row[0]['RID']
                        logging.info("Existing data detected in table [%s] -- will attempt partial "
                                     "restore of remaining records following last known RID: %s" %
                                     (tname_uri, last))
                    else:
                        last = None

                    table = self.get_json_recordset(
                        self.open_json_stream_file(self.get_table_path(sname, tname, is_bag)),
                        self.data_chunk_size, after=last)

                    total = 0
                    table_success = True
                    try:
                        for chunk in table:
                            if chunk:
                                self.dst_catalog.post("/entity/%s?nondefaults=RID,RCT,RCB" % tname_uri,
                                                      json=chunk)
                                total += len(chunk)
                            else:
                                break
                    except:
                        table_success = False
                    finally:
                        table.close()
                    if table_success:
                        logging.info("Restoration of table data [%s] successful. %s rows restored." %
                                     (tname_uri, total))
                    else:
                        logging.warning("Restoration of table data [%s] failed. %s rows restored." %
                                        (tname_uri, total))

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname),
                            urlquote(tname),
                            urlquote(self.RESTORE_STATE_URL),
                        ), json=2)
                elif restore_states[(sname, tname)] is None and (sname, tname) in {
                        ('public', 'ERMrest_Client'),
                        ('public', 'ERMrest_Group'),
                }:
                    # special sync behavior for magic ermrest tables
                    # HACK: these are assumed small enough to join via local merge of arrays
                    want = sorted(self.load_json_file(self.get_table_path(sname, tname, is_bag)),
                                  key=lambda r: r['ID'])
                    have = sorted(self.dst_catalog.get("/entity/%s?limit=none" % tname_uri).json(),
                                  key=lambda r: r['ID'])
                    create = []
                    update = []

                    pos_want = 0
                    pos_have = 0
                    while pos_want < len(want):
                        while pos_have < len(have) and have[pos_have]['ID'] < want[pos_want]['ID']:
                            # dst-only rows will be retained as is
                            pos_have += 1
                        if pos_have >= len(have) or have[pos_have]['ID'] > want[pos_want]['ID']:
                            # src-only rows will be inserted
                            create.append(want[pos_want])
                            pos_want += 1
                        else:
                            # overlapping rows will be updated
                            update.append(want[pos_want])
                            pos_want += 1

                    self.dst_catalog.post("/entity/%s?nondefaults=RCT,RCB" % tname_uri, json=create)
                    self.dst_catalog.put(
                        "/attributegroup/%s/ID;%s" % (
                            tname_uri,
                            ",".join([
                                urlquote(c.name)
                                for c in src_model.schemas[sname].tables[tname].column_definitions
                                if c.name not in {'RID', 'RMT', 'RMB', 'ID'}
                            ])),
                        json=update)

                    # record our progress on catalog in case we fail part way through
                    self.dst_catalog.put(
                        "/schema/%s/table/%s/annotation/%s" % (
                            urlquote(sname),
                            urlquote(tname),
                            urlquote(self.RESTORE_STATE_URL),
                        ), json=2)

        # apply stage 2 model in bulk only... we won't get here unless the preceding succeeded
        logging.info("Restoring foreign keys...")
        new_fkeys = []
        for fkeys in fkeys_deferred.values():
            new_fkeys.extend(fkeys)

        # restore fkeys
        if new_fkeys:
            self.dst_catalog.post("/schema", json=new_fkeys)

        # restore assets
        if self.restore_assets:
            self.upload_assets()

        # cleanup
        self.cleanup_restored_catalog()
    except:
        success = False
        raise
    finally:
        elapsed_time = datetime.datetime.now() - start
        total_secs = elapsed_time.total_seconds()
        elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
        logging.info("Restore of catalog %s %s. %s" %
                     (self.dst_catalog.get_server_uri(),
                      "completed successfully" if success else "failed",
                      ("Elapsed time: %s" % elapsed) if (total_secs > 0) else ""))

def download_data(location, local_path):
    """Download data from a remote host to the configured machine.
    (Many sources to one destination)

    Arguments:
        location (str): The location of the data.
        local_path (str): The path to the local storage location.

    Returns:
        str: The local path to the extracted bag.
    """
    filename = None
    # If the local_path is a file and not a directory, use its parent directory
    if ((os.path.exists(local_path) and not os.path.isdir(local_path))
            or (not os.path.exists(local_path) and local_path[-1] != "/")):
        # Save the filename for later
        filename = os.path.basename(local_path)
        local_path = os.path.dirname(local_path) + "/"
    os.makedirs(local_path, exist_ok=True)

    loc_info = urllib.parse.urlparse(location)
    # HTTP(S)
    if loc_info.scheme.startswith("http"):
        # Get default filename and extension
        http_filename = os.path.basename(loc_info.path)
        if not http_filename:
            http_filename = "archive"
        ext = os.path.splitext(http_filename)[1]
        if not ext:
            ext = ".archive"

        # Fetch file
        with requests.get(location, stream=True) as res:
            if res.status_code >= 300:
                logger.error(f"Error {res.status_code} downloading file '{location}': "
                             f"{res.content}")
                raise IOError("File download failed: {}".format(res.content))
            else:
                logger.debug(f"Downloaded file {location} with status code {res.status_code}")
            # Get filename from header if present
            con_disp = res.headers.get("Content-Disposition", "")
            filename_start = con_disp.find("filename=")
            if filename_start >= 0:
                filename_end = con_disp.find(";", filename_start)
                if filename_end < 0:
                    filename_end = None
                http_filename = con_disp[filename_start + len("filename="):filename_end]
                http_filename = http_filename.strip("\"'; ")
            # Create path for file
            archive_path = os.path.join(local_path, filename or http_filename)
            # Download and save file
            with open(archive_path, 'wb') as out:
                shutil.copyfileobj(res.raw, out)
            logger.debug("Saved HTTP file: {}".format(archive_path))
        # Assume data is BDBag, extract
        bag_path = bdbag_api.extract_bag(archive_path, local_path)
    # Not supported
    else:
        # Nothing to do
        raise IOError("Invalid data location: '{}' is not a recognized protocol "
                      "(from {}).".format(loc_info.scheme, str(location)))
    # Return path to bag
    return bag_path

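A hedged sketch of calling download_data; the URL and destination are placeholders, and the function returns the path to the extracted bag:

bag_path = download_data("https://example.org/archives/my-bag.zip", "./staging/")
print("Extracted bag at:", bag_path)
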
def main():
    sys.stderr.write('\n')
    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)
    archive = None
    temp_path = None
    error = None
    result = 0

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to
            # validate or complete an existing bag
            if not ((args.validate or args.validate_profile or args.resolve_fetch) and
                    not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile or args.resolve_fetch):
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return result

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, True if args.resolve_fetch == 'all' else False)

        if args.validate:
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(temp_path if temp_path else path,
                             True if args.validate == 'fast' else False,
                             args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path, profile)

    except Exception as e:
        result = 1
        error = "Error: %s" % bdbag.get_named_exception(e)
    finally:
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)
        sys.stderr.write('\n')

    return result

def submit_job(input_minid, wf_minid, api_key=None):
    #### BASIC ASSUMPTIONS:
    # 1. User has a globus ID and has account in GG
    # 2. User has created an API
    # 3. User does not have the workflow setup on their account
    #### A. Get workflow GA file from the workflow MINID
    #### B. Push GA file to the instance url
    gi = GalaxyInstance(URL, api_key)
    QUERY_BASE = "http://minid.bd2k.org/minid/landingpage/"
    tmp_path = tempfile.mkdtemp()
    wf_mine = None
    try:
        # A.
        BASE_DOWNLOAD_PATH = "/%s" % (tmp_path)
        query = "%s/%s" % (QUERY_BASE, wf_minid)
        # print("Executing query: %s" % query)
        r = requests.get(query, headers={"Accept": "application/json"})
        location = r.json()["locations"][0]['link']
        filename = location.split("/")[-1]
        path = "%s/%s" % (BASE_DOWNLOAD_PATH, filename)
        # print("Downloading result: %s" % location)
        # Save the bag from the minid location
        response = requests.get(location, stream=True)
        with open(path, 'wb') as handle:
            for block in response.iter_content(1024):
                handle.write(block)
        extract_path = ".".join(path.split(".")[0:-1])
        output_path = "%s/%s" % (extract_path, ".".join(filename.split(".")[0:-1]))
        # print("Extracting bag and resolving fetch: %s" % output_path)
        bdbag_api.extract_bag(path, extract_path)
        time.sleep(5)
        # print('resolving fetch')
        bdbag_api.resolve_fetch(output_path, True)
        ga_file = glob.glob("%s/data/*.ga" % (output_path))[0]
        # B.
        ga_dict = None
        with open(ga_file) as handle:
            ga_dict = json.loads(handle.read())
        if ga_dict is not None:
            wf_mine = gi.workflows.import_workflow_dict(ga_dict)
    finally:
        shutil.rmtree(tmp_path)
    # print('finished!')

    # published_workflow_id = "6f1411e6cfea8ef7"
    # workflow_name = "imported: RNA-seq-Gtex-stage1-v2.0-bags_transfer"
    # ## check if workflow exists
    # workflows = gi.workflows.get_workflows(name=workflow_name)
    # wf_mine = None
    # if len(workflows) > 0:
    #     wf_mine = workflows[-1]
    # else:
    #     # workflow does not exist, need to import from published
    #     wf_mine = gi.workflows.import_shared_workflow(published_workflow_id)

    # create a history
    history_name = "topmed_history_%s" % time.strftime(
        "%a_%b_%d_%Y_%-I:%M:%S_%p", time.localtime(time.time()))
    history = gi.histories.create_history(name=history_name)

    wf_data = {}
    wf_data['workflow_id'] = wf_mine['id']
    wf_data['ds_map'] = {}
    parameters = {}
    parameters['0'] = {'minid': input_minid}
    parameters['5'] = {
        'historyid': history['id'],
        'userapi': api_key,
        'url': URL
    }
    wf_data['parameters'] = parameters
    # print('super close to finishing!')
    res = gi.workflows.invoke_workflow(wf_data['workflow_id'],
                                       wf_data['ds_map'],
                                       params=wf_data['parameters'],
                                       history_id=history['id'],
                                       import_inputs_to_history=False)
    return {
        'history_name': history_name,
        'history_id': res['history_id'],
        'res': res
    }

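A hypothetical invocation of submit_job; the minids and API key below are placeholders, not real identifiers:

result = submit_job("ark:/99999/fk4-input-bag",
                    "ark:/99999/fk4-workflow",
                    api_key="YOUR_GALAXY_API_KEY")
print(result['history_name'], result['history_id'])
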
def ts_validate(data_path, schema=None):
    """Validate a given TableSchema using the Datapackage package.

    Arguments:
        data_path (str): Path to the TableSchema JSON or BDBag directory
                or BDBag archive to validate.
        schema (str): The schema to validate against. If not provided,
                the data is only validated against the defined TableSchema.
                Default None.

    Returns:
        dict: The validation results.
            is_valid (bool): Is the TableSchema valid?
            raw_errors (list): The raw Exceptions generated from any validation errors.
            error (str): A formatted error message about any validation errors.
    """
    # If data_path is a BDBag archive, unarchive to a temp dir
    try:
        data_path = bdbag_api.extract_bag(data_path, temp=True)
    # data_path is not an archive
    except RuntimeError:
        pass
    # If data_path is a dir (incl. if it was unarchived), find the JSON descriptor
    if os.path.isdir(data_path):
        # If a 'data' dir is present, search there instead
        if "data" in os.listdir(data_path):
            data_path = os.path.join(data_path, "data")
        # Find the .json file (cannot be hidden)
        desc_file_list = [
            filename for filename in os.listdir(data_path)
            if filename.endswith(".json") and not filename.startswith(".")
        ]
        if len(desc_file_list) < 1:
            return {
                "is_valid": False,
                "raw_errors": [FileNotFoundError("No TableSchema JSON file found.")],
                "error": "No TableSchema JSON file found."
            }
        elif len(desc_file_list) > 1:
            return {
                "is_valid": False,
                "raw_errors": [RuntimeError("Multiple JSON files found in directory.")],
                "error": "Multiple JSON files found in directory."
            }
        else:
            data_path = os.path.join(data_path, desc_file_list[0])
    # data_path should/must be a file now (the JSON descriptor)
    if not os.path.isfile(data_path):
        return {
            "is_valid": False,
            "raw_errors": [
                ValueError("Path '{}' does not refer to a file".format(data_path))
            ],
            "error": "Path '{}' does not refer to a file".format(data_path)
        }
    # Read into Package, return error on failure
    try:
        pkg = Package(descriptor=data_path, strict=True)
    except Exception as e:
        return {
            "is_valid": False,
            "raw_errors": e.errors,
            "error": "\n".join([str(err) for err in e.errors])
        }

    if schema:
        # Download the reference schema
        schema_path = os.path.join(os.path.dirname(data_path), "validation_schema.json")
        try:
            with open(schema_path, "wb") as f:
                f.write(requests.get(schema).content)
        except Exception as e:
            return {
                "is_valid": False,
                "raw_errors": [e],
                "error": "Error while downloading schema: {}".format(str(e))
            }
        # TODO: Validate against downloaded schema
        print("Warning: Currently unable to validate data against existing schema '{}'."
              .format(schema))

    # Actually check and return package validity based on Package validation
    return {
        "is_valid": pkg.valid,
        "raw_errors": pkg.errors,
        "error": "\n".join([str(err) for err in pkg.errors])
    }