Beispiel #1
0
def create_bag_from_metadata_file(metadata_file_path,
                                  remote_file_manifest=None,
                                  working_dir=None,
                                  output_name=None,
                                  output_path=None,
                                  archive_format=None,
                                  creator_name=None,
                                  creator_orcid=None,
                                  create_ro_manifest=False):
    """Create (and optionally archive) a BDBag from a TSV metadata file.

    The metadata file is converted into a remote-file-manifest, copied into
    the bag, and the bag is created with md5/sha256 manifests.

    :param metadata_file_path: path to the source TSV metadata file
    :param remote_file_manifest: path for the generated manifest JSON; when
        omitted one is created in ``working_dir`` (or a private temp dir)
    :param working_dir: directory for the generated manifest; when omitted a
        temp dir is created and always removed afterwards
    :param output_name: forwarded to ``get_target_bag_path``
    :param output_path: forwarded to ``get_target_bag_path``
    :param archive_format: when given, the bag is serialized via
        ``bdb.archive_bag`` and the archive path is returned instead
    :param creator_name: recorded as Contact-Name (and RO creator, if any)
    :param creator_orcid: recorded as Contact-Orcid (and RO creator ORCID)
    :param create_ro_manifest: when True, write a research-object manifest to
        ``<bag>/metadata/manifest.json`` and declare the RO BagIt profile
    :return: path to the bag directory, or to the archive when
        ``archive_format`` is given
    """
    temp_path = None
    if remote_file_manifest is None:
        if working_dir is None:
            working_dir = temp_path = tempfile.mkdtemp(prefix="encode2bag_")
        remote_file_manifest = osp.abspath(osp.join(working_dir, "remote-file-manifest.json"))

    # Guarantee temp-dir cleanup even on failure (the original leaked the
    # temp dir on any exception raised below).
    try:
        ro_manifest = None
        if create_ro_manifest:
            ro_manifest = init_ro_manifest(creator_name=creator_name, creator_orcid=creator_orcid)

        convert_tsv_metadata_to_remote_file_manifest(metadata_file_path, remote_file_manifest, ro_manifest)

        bag_path = get_target_bag_path(output_name=output_name, output_path=output_path)
        ensure_bag_path_exists(bag_path)
        shutil.copy(osp.abspath(metadata_file_path), bag_path)

        bag_metadata = dict()
        if creator_name:
            bag_metadata["Contact-Name"] = creator_name
        if creator_orcid:
            bag_metadata["Contact-Orcid"] = creator_orcid

        bdb.make_bag(bag_path,
                     algs=["md5", "sha256"],
                     metadata=bag_metadata,
                     remote_file_manifest=remote_file_manifest)

        if create_ro_manifest:
            # Use the same "osp" alias as the rest of this function (the
            # original mixed os.path and osp in this branch).
            bag_metadata_dir = osp.abspath(osp.join(bag_path, "metadata"))
            if not osp.exists(bag_metadata_dir):
                os.mkdir(bag_metadata_dir)
            ro_manifest_path = osp.join(bag_metadata_dir, "manifest.json")
            ro.write_ro_manifest(ro_manifest, ro_manifest_path)
            bag_metadata.update({'BagIt-Profile-Identifier':
                                "http://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-ro-profile.json"})
            # Second make_bag pass refreshes bag-info.txt and the tag
            # manifests with the RO profile identifier.
            bdb.make_bag(bag_path, update=True, metadata=bag_metadata)
        if archive_format:
            bag_path = bdb.archive_bag(bag_path, archive_format)
    finally:
        if temp_path:
            shutil.rmtree(temp_path, ignore_errors=True)

    return bag_path
Beispiel #2
0
 def test_create_bag(self):
     """Smoke test: make_bag on a plain data directory yields a bagit.Bag."""
     header = self.getTestHeader('create bag')
     logger.info(header)
     try:
         created = bdb.make_bag(self.test_data_dir)
         self.assertIsInstance(created, bagit.Bag)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #3
0
 def test_update_bag_remove_file(self):
     """Deleting a payload file then updating should drop it from manifests."""
     header = self.getTestHeader('update bag remove file')
     logger.info(header)
     try:
         victim = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
         os.remove(victim)
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertUnexpectedMessages(['test1.txt'], captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #4
0
 def test_update_bag_remove_file(self):
     """Deleting a payload file then updating should drop it from manifests."""
     header = self.getTestHeader('update bag remove file')
     logger.info(header)
     try:
         victim = ospj(self.test_bag_dir, 'data', 'test1', 'test1.txt')
         os.remove(victim)
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bdbagit.BDBag)
         self.assertUnexpectedMessages(['test1.txt'], captured)
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #5
0
 def test_update_bag_change_file(self):
     """Appending to an existing payload file should trigger a manifest update."""
     header = self.getTestHeader('update bag change file')
     logger.info(header)
     try:
         readme_path = ospj(self.test_bag_dir, 'data', 'README.txt')
         with open(readme_path, 'a') as readme:
             readme.writelines('Additional data added via unit test.')
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(['README.txt'], captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #6
0
 def test_update_bag_add_file(self):
     """Adding a new payload file then updating should manifest the new file."""
     header = self.getTestHeader('update bag add file')
     logger.info(header)
     try:
         new_path = ospj(self.test_bag_dir, 'data', 'NEWFILE.txt')
         with open(new_path, 'w') as new_file:
             new_file.write('Additional file added via unit test.')
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bdbagit.BDBag)
         self.assertExpectedMessages(['NEWFILE.txt'], captured)
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #7
0
 def test_update_bag_change_file(self):
     """Appending to an existing payload file should trigger a manifest update."""
     header = self.getTestHeader('update bag change file')
     logger.info(header)
     try:
         readme_path = ospj(self.test_bag_dir, 'data', 'README.txt')
         with open(readme_path, 'a') as readme:
             readme.writelines('Additional data added via unit test.')
         updated = bdb.make_bag(self.test_bag_dir, update=True)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(['README.txt'], captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #8
0
 def test_update_bag_change_metadata(self):
     """Metadata from both a dict and a file should be merged on update."""
     header = self.getTestHeader('update bag change metadata')
     logger.info(header)
     try:
         metadata_file_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                metadata={"Contact-Name": "nobody"},
                                metadata_file=metadata_file_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #9
0
    def test_generate_ro_manifest_update(self):
        """RO-manifest regeneration in update mode must keep prior bundledAs URIs."""
        logger.info(self.getTestHeader('create bag with auto-generation of RO manifest in update mode'))
        try:
            bdb.make_bag(self.test_data_dir, algs=['md5', 'sha1', 'sha256', 'sha512'],
                         remote_file_manifest=ospj(self.test_config_dir, 'test-fetch-manifest.json'))
            bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
            ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
            # Snapshot the aggregates keyed by URI before mutating the manifest.
            old_agg_dict = {agg["uri"]: agg for agg in ro.get("aggregates", [])}
            bdbro.add_file_metadata(ro, local_path="../data/FAKE.txt", bundled_as=bdbro.make_bundled_as())
            bdbro.write_bag_ro_metadata(ro, self.test_data_dir)

            # Regenerate without overwrite: pre-existing entries must be preserved.
            bdb.generate_ro_manifest(self.test_data_dir, overwrite=False)
            ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
            for agg in ro.get("aggregates", []):
                if agg["uri"] in old_agg_dict:
                    self.assertTrue(agg["bundledAs"]["uri"] == old_agg_dict[agg["uri"]]["bundledAs"]["uri"])

        except Exception as exc:
            self.fail(get_typed_exception(exc))
Beispiel #10
0
    def test_generate_ro_manifest_overwrite(self):
        """RO-manifest generation in overwrite mode must reproduce expected aggregates."""
        logger.info(self.getTestHeader('create bag with auto-generation of RO manifest in overwrite mode'))
        try:
            bdb.make_bag(self.test_data_dir, algs=['md5', 'sha1', 'sha256', 'sha512'],
                         remote_file_manifest=ospj(self.test_config_dir, 'test-fetch-manifest.json'))
            bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
            ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
            # Index the generated aggregates by URI for lookup below.
            agg_dict = {agg["uri"]: agg for agg in ro.get("aggregates", [])}
            for expected in self.ro_test_aggregates:
                uri = expected["uri"]
                self.assertTrue(uri in agg_dict)
                actual_bundled = agg_dict[uri].get("bundledAs")
                if not actual_bundled:
                    continue
                # Compare only the bundledAs attributes actually present.
                for attr in ("filename", "folder"):
                    if attr in actual_bundled:
                        self.assertTrue(expected["bundledAs"][attr] == actual_bundled[attr])

        except Exception as exc:
            self.fail(get_typed_exception(exc))
Beispiel #11
0
 def test_update_bag_change_metadata_nested_dict(self):
     """Nested dicts in a metadata file must be rejected for bag-info.txt."""
     header = self.getTestHeader('update bag change metadata with nested dict')
     logger.info(header)
     try:
         ro_metadata_path = ospj(self.test_config_dir, 'test-ro-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                save_manifests=False,
                                metadata_file=ro_metadata_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bdbagit.BDBag)
         self.assertExpectedMessages(['Reading bag metadata from file', 'test-ro-metadata.json'], captured)
         self.assertExpectedMessages(["Nested dictionary content not supported in tag file: [bag-info.txt]"], captured)
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #12
0
 def test_update_bag_prune(self):
     """Pruning with only md5 must delete the other (tag)manifests."""
     header = self.getTestHeader('update bag prune manifests')
     logger.info(header)
     try:
         pruned = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
         self.assertIsInstance(pruned, bagit.Bag)
         for gone in ('manifest-sha1.txt', 'manifest-sha256.txt', 'manifest-sha512.txt',
                      'tagmanifest-sha1.txt', 'tagmanifest-sha256.txt', 'tagmanifest-sha512.txt'):
             self.assertFalse(ospif(ospj(self.test_bag_dir, gone)))
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #13
0
 def test_update_bag_prune(self):
     """Pruning with only md5 must delete the other (tag)manifests."""
     header = self.getTestHeader('update bag prune manifests')
     logger.info(header)
     try:
         pruned = bdb.make_bag(self.test_bag_dir, algs=['md5'], update=True, prune_manifests=True)
         self.assertIsInstance(pruned, bdbagit.BDBag)
         for gone in ('manifest-sha1.txt', 'manifest-sha256.txt', 'manifest-sha512.txt',
                      'tagmanifest-sha1.txt', 'tagmanifest-sha256.txt', 'tagmanifest-sha512.txt'):
             self.assertFalse(ospif(ospj(self.test_bag_dir, gone)))
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #14
0
    def _test_create_or_update_bag_with_metadata(self,
                                                 update=False,
                                                 override_file_metadata=False,
                                                 no_file_metadata=False):
        """Shared driver: create or update a bag with metadata from dicts and/or files.

        Verifies bag-info.txt contents and, when file metadata is used, the
        generated RO manifest under metadata/manifest.json.
        """
        try:
            if override_file_metadata:
                metadata_param = {
                    "Contact-Name": "nobody"
                }
                ro_metadata_param = {
                    "manifest.json": {
                        "@context": ["https://w3id.org/bundle/context"],
                        "@id": "../"
                    }
                }
            else:
                metadata_param = None
                ro_metadata_param = None
            target_dir = self.test_bag_dir if update else self.test_data_dir
            metadata_file_param = None if no_file_metadata else ospj(
                self.test_config_dir, 'test-metadata.json')
            ro_metadata_file_param = None if no_file_metadata else ospj(
                self.test_config_dir, 'test-ro-metadata.json')
            result = bdb.make_bag(
                target_dir,
                update=update,
                metadata=metadata_param,
                metadata_file=metadata_file_param,
                ro_metadata=ro_metadata_param,
                ro_metadata_file=ro_metadata_file_param)
            captured = self.stream.getvalue()
            self.assertIsInstance(result, bdbagit.BDBag)
            bag_info_lines = self.slurp_text_file(ospj(
                target_dir, 'bag-info.txt')).splitlines()
            if override_file_metadata:
                # The explicit dict must win over file-supplied metadata.
                self.assertIn('Contact-Name: nobody', bag_info_lines)
            if not no_file_metadata:
                self.assertExpectedMessages(
                    ['Reading bag metadata from file', 'test-metadata.json'],
                    captured)
                self.assertExpectedMessages([
                    'Reading bag metadata from file', 'test-ro-metadata.json'
                ], captured)
                self.assertIn('External-Description: Simple bdbag test',
                              bag_info_lines)
                ro_manifest_file = ospj(target_dir, 'metadata', 'manifest.json')
                self.assertTrue(os.path.isfile(ro_manifest_file))
                ro_manifest_txt = self.slurp_text_file(ro_manifest_file)
                ro_test_line = '"uri": "../data/test2/test2.txt"'
                # File RO metadata contributes this aggregate; the override
                # dict replaces the file content entirely.
                if override_file_metadata:
                    self.assertNotIn(ro_test_line, ro_manifest_txt)
                else:
                    self.assertIn(ro_test_line, ro_manifest_txt)

        except Exception as exc:
            self.fail(get_typed_exception(exc))
Beispiel #15
0
 def test_update_bag_change_metadata(self):
     """Metadata from both a dict and a file should be merged on update."""
     header = self.getTestHeader('update bag change metadata')
     logger.info(header)
     try:
         metadata_file_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                metadata={"Contact-Name": "nobody"},
                                metadata_file=metadata_file_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(
             ['Reading bag metadata from file', 'test-metadata.json'],
             captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #16
0
def create_bag_archive(metadata,
                       bag_algorithms=('md5', 'sha256'),
                       **bag_metadata):
    """Build and archive a BDBag in the staging directory from *metadata*.

    :param metadata: source records, formatted into a remote-file-manifest
        via ``_format_remote_file_manifest``
    :param bag_algorithms: checksum algorithms for the bag manifests
    :param bag_metadata: arbitrary key/value pairs recorded as bag metadata
    :return: path of the archive file (``<bag_name>.<BAG_ARCHIVE_FORMAT>``)
    """
    bag_name = join(settings.BAG_STAGING_DIR, str(uuid.uuid4()))
    remote_manifest_filename = join(settings.BAG_STAGING_DIR,
                                    str(uuid.uuid4()))

    remote_manifest_formatted = _format_remote_file_manifest(
        metadata, bag_algorithms)
    with open(remote_manifest_filename, 'w') as f:
        # Serialize directly to the file handle instead of dumps + write.
        json.dump(remote_manifest_formatted, f)

    # Always remove the temporary manifest, even when bag creation or
    # archiving raises (the original leaked it on any exception).
    try:
        os.mkdir(bag_name)
        bdbag_api.make_bag(
            bag_name,
            algs=bag_algorithms,
            metadata=dict(bag_metadata),
            remote_file_manifest=remote_manifest_filename,
        )
        bdbag_api.archive_bag(bag_name, settings.BAG_ARCHIVE_FORMAT)
    finally:
        os.remove(remote_manifest_filename)

    return '{}.{}'.format(bag_name, settings.BAG_ARCHIVE_FORMAT)
Beispiel #17
0
 def test_update_bag_change_metadata_only(self):
     """With save_manifests=False, metadata changes must not rewrite manifests."""
     header = self.getTestHeader('update bag change metadata only - do not save manifests')
     logger.info(header)
     try:
         metadata_file_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                save_manifests=False,
                                metadata={"Contact-Name": "nobody"},
                                metadata_file=metadata_file_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], captured)
         not_expected = ['updating manifest-sha1.txt',
                         'updating manifest-sha256.txt',
                         'updating manifest-sha512.txt',
                         'updating manifest-md5.txt']
         self.assertUnexpectedMessages(not_expected, captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #18
0
 def test_create_bag_with_config(self):
     """A config file restricting algorithms must suppress the other manifests."""
     header = self.getTestHeader('create bag with config')
     logger.info(header)
     try:
         config_path = ospj(self.test_config_dir, 'test-config.json')
         created = bdb.make_bag(self.test_data_dir, config_file=config_path)
         self.assertIsInstance(created, bagit.Bag)
         for gone in ('manifest-sha1.txt', 'manifest-sha256.txt', 'manifest-sha512.txt',
                      'tagmanifest-sha1.txt', 'tagmanifest-sha256.txt', 'tagmanifest-sha512.txt'):
             self.assertFalse(ospif(ospj(self.test_data_dir, gone)))
         baginfo = ospj(self.test_data_dir, 'bag-info.txt')
         with open(baginfo) as bi:
             baginfo_txt = bi.read()
         self.assertIn('Contact-Name: bdbag test', baginfo_txt)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #19
0
 def test_create_bag_with_config(self):
     """A config file restricting algorithms must suppress the other manifests."""
     header = self.getTestHeader('create bag with config')
     logger.info(header)
     try:
         config_path = ospj(self.test_config_dir, 'test-config.json')
         created = bdb.make_bag(self.test_data_dir, config_file=config_path)
         self.assertIsInstance(created, bdbagit.BDBag)
         for gone in ('manifest-sha1.txt', 'manifest-sha256.txt', 'manifest-sha512.txt',
                      'tagmanifest-sha1.txt', 'tagmanifest-sha256.txt', 'tagmanifest-sha512.txt'):
             self.assertFalse(ospif(ospj(self.test_data_dir, gone)))
         baginfo = ospj(self.test_data_dir, 'bag-info.txt')
         with open(baginfo) as bi:
             baginfo_txt = bi.read()
         self.assertIn('Contact-Name: bdbag test', baginfo_txt)
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #20
0
 def test_update_bag_remote(self):
     """A remote-file-manifest supplied on update must populate fetch.txt."""
     header = self.getTestHeader('update bag add remote file manifest')
     logger.info(header)
     try:
         manifest_path = ospj(self.test_config_dir, 'test-fetch-manifest.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                remote_file_manifest=manifest_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(['Generating remote file references from', 'test-fetch-manifest.json'], captured)
         fetch_file = ospj(self.test_bag_dir, 'fetch.txt')
         self.assertTrue(ospif(fetch_file))
         with open(fetch_file) as ff:
             fetch_txt = ff.read()
         expected_http_entry = (
             'https://raw.githubusercontent.com/ini-bdds/bdbag/master/profiles/bdbag-profile.json'
             '\t723\tdata/bdbag-profile.json')
         self.assertIn(expected_http_entry, fetch_txt)
         expected_ark_entry = 'ark:/88120/r8059v\t632860\tdata/minid_v0.1_Nov_2015.pdf'
         self.assertIn(expected_ark_entry, fetch_txt)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #21
0
 def test_update_bag_change_metadata_only(self):
     """With save_manifests=False, metadata changes must not rewrite manifests."""
     header = self.getTestHeader(
         'update bag change metadata only - do not save manifests')
     logger.info(header)
     try:
         metadata_file_path = ospj(self.test_config_dir, 'test-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                save_manifests=False,
                                metadata={"Contact-Name": "nobody"},
                                metadata_file=metadata_file_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bagit.Bag)
         self.assertExpectedMessages(
             ['Reading bag metadata from file', 'test-metadata.json'],
             captured)
         not_expected = [
             'updating manifest-sha1.txt', 'updating manifest-sha256.txt',
             'updating manifest-sha512.txt', 'updating manifest-md5.txt'
         ]
         self.assertUnexpectedMessages(not_expected, captured)
     except Exception as exc:
         self.fail(bdbag.get_named_exception(exc))
Beispiel #22
0
 def test_update_bag_change_metadata_only(self):
     """Metadata-only update with RO metadata file: manifests stay untouched."""
     header = self.getTestHeader('update bag change metadata only - do not save manifests')
     logger.info(header)
     try:
         metadata_file_path = ospj(self.test_config_dir, 'test-metadata.json')
         ro_metadata_file_path = ospj(self.test_config_dir, 'test-ro-metadata.json')
         updated = bdb.make_bag(self.test_bag_dir,
                                update=True,
                                save_manifests=False,
                                metadata={"Contact-Name": "nobody"},
                                metadata_file=metadata_file_path,
                                ro_metadata_file=ro_metadata_file_path)
         captured = self.stream.getvalue()
         self.assertIsInstance(updated, bdbagit.BDBag)
         self.assertExpectedMessages(['Reading bag metadata from file', 'test-metadata.json'], captured)
         bag_info_lines = self.slurp_text_file(ospj(self.test_bag_dir, 'bag-info.txt')).splitlines()
         self.assertIn('Contact-Name: nobody', bag_info_lines)
         self.assertIn('External-Description: Simple bdbag test', bag_info_lines)
         self.assertTrue(os.path.isfile(ospj(self.test_bag_dir, 'metadata', 'manifest.json')))
         not_expected = ['updating manifest-sha1.txt',
                         'updating manifest-sha256.txt',
                         'updating manifest-sha512.txt',
                         'updating manifest-md5.txt']
         self.assertUnexpectedMessages(not_expected, captured)
     except Exception as exc:
         self.fail(get_typed_exception(exc))
Beispiel #23
0
def create_bag(output_dir, update):
    """Create/Update and archive a BDBag from the contents of a passed-in directory."""
    bdbag_api.make_bag(output_dir, update=update)
    archive_path = bdbag_api.archive_bag(output_dir, "zip")
    return archive_path
Beispiel #24
0
    def download(self, **kwargs):
        """Run the configured download pipeline and return its outputs.

        Authenticates (unless an identity is passed via kwargs), optionally
        creates a BDBag, runs the configured query/transform/post processors,
        finalizes (and optionally archives) the bag, and returns the outputs
        dict produced by the last processing stage.

        Keyword Args:
            identity: pre-validated client identity; when absent, credentials
                are validated against the catalog host.
            wallet: credential wallet passed through to processors.

        Raises:
            DerivaDownloadConfigurationError: missing/invalid configuration.
            DerivaDownloadAuthenticationError: credential validation failed.
        """

        if not self.config:
            raise DerivaDownloadConfigurationError(
                "No configuration specified!")

        if self.config.get("catalog") is None:
            raise DerivaDownloadConfigurationError(
                "Catalog configuration error!")

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None
        # Unique per-run manifest path so concurrent downloads in the same
        # output dir cannot collide; removed again in the finally block below.
        remote_file_manifest = os.path.abspath(''.join([
            os.path.join(self.output_dir, 'remote-file-manifest_'),
            str(uuid.uuid4()), ".json"
        ]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))
        self.envars.update({"hostname": self.hostname})

        # 1. If we don't have a client identity, we need to authenticate
        identity = kwargs.get("identity")
        if not identity:
            try:
                if not self.credentials:
                    self.set_credentials(get_credential(self.hostname))
                logging.info("Validating credentials for host: %s" %
                             self.hostname)
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except HTTPError as he:
                # 404 simply means no active session; proceed anonymously.
                if he.response.status_code == 404:
                    logging.info(
                        "No existing login session found for host: %s" %
                        self.hostname)
            except Exception as e:
                raise DerivaDownloadAuthenticationError(
                    "Unable to validate credentials: %s" % format_exception(e))
        wallet = kwargs.get("wallet", {})

        # 2. Check for bagging config and initialize bag related variables
        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = True if bag_config else False
        if create_bag:
            # bag_name may contain {placeholders} resolved from envars.
            bag_name = bag_config.get(
                'bag_name', ''.join([
                    "deriva_bag", '_',
                    time.strftime("%Y-%m-%d_%H.%M.%S")
                ])).format(**self.envars)
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get(
                'bag_metadata',
                {"Internal-Sender-Identifier": "deriva@%s" % self.server_url})
            bag_ro = create_bag and stob(bag_config.get('bag_ro', "True"))
            # NOTE(review): this inner "if create_bag" is always true here
            # (we are already inside the same condition above).
            if create_bag:
                bdb.ensure_bag_path_exists(bag_path)
                bag = bdb.make_bag(bag_path,
                                   algs=bag_algorithms,
                                   metadata=bag_metadata)
                if bag_ro:
                    # Derive RO author info from bag metadata, falling back to
                    # the authenticated identity's name/display name/id.
                    ro_author_name = bag.info.get(
                        "Contact-Name", None if not identity else identity.get(
                            'full_name',
                            identity.get('display_name',
                                         identity.get('id', None))))
                    ro_author_orcid = bag.info.get("Contact-Orcid")
                    ro_manifest = ro.init_ro_manifest(
                        author_name=ro_author_name,
                        author_orcid=ro_author_orcid)
                    bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        # 3. Process the set of queries by locating, instantiating, and invoking the specified processor(s)
        outputs = dict()
        base_path = bag_path if bag_path else self.output_dir
        for processor in catalog_config['query_processors']:
            processor_name = processor["processor"]
            processor_type = processor.get('processor_type')
            processor_params = processor.get('processor_params')

            try:
                query_processor = find_query_processor(processor_name,
                                                       processor_type)
                # Each processor receives the previous stage's outputs as its
                # inputs, forming a simple pipeline.
                processor = query_processor(
                    self.envars,
                    inputs=outputs,
                    bag=create_bag,
                    catalog=self.catalog,
                    store=self.store,
                    base_path=base_path,
                    processor_params=processor_params,
                    remote_file_manifest=remote_file_manifest,
                    ro_manifest=ro_manifest,
                    ro_author_name=ro_author_name,
                    ro_author_orcid=ro_author_orcid,
                    identity=identity,
                    wallet=wallet)
                outputs = processor.process()
            except Exception as e:
                logging.error(format_exception(e))
                # A failed query invalidates the partially-built bag.
                if create_bag:
                    bdb.cleanup_bag(bag_path)
                raise

        # 4. Execute anything in the transform processing pipeline, if configured
        transform_processors = self.config.get('transform_processors', [])
        if transform_processors:
            for processor in transform_processors:
                processor_name = processor["processor"]
                processor_type = processor.get('processor_type')
                processor_params = processor.get('processor_params')
                try:
                    transform_processor = find_transform_processor(
                        processor_name, processor_type)
                    processor = transform_processor(
                        self.envars,
                        inputs=outputs,
                        processor_params=processor_params,
                        base_path=base_path,
                        bag=create_bag,
                        ro_manifest=ro_manifest,
                        ro_author_name=ro_author_name,
                        ro_author_orcid=ro_author_orcid,
                        identity=identity,
                        wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        # 5. Create the bag, and archive (serialize) if necessary
        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                # Final update pass incorporates remote file references (only
                # when the manifest exists and is non-empty).
                bdb.make_bag(
                    bag_path,
                    algs=bag_algorithms,
                    remote_file_manifest=remote_file_manifest if
                    (remote_file_manifest
                     and os.path.getsize(remote_file_manifest) > 0) else None,
                    update=True)
            except Exception as e:
                logging.fatal("Exception while updating bag manifests: %s" %
                              format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The manifest is a temp artifact; always remove it.
                if remote_file_manifest and os.path.isfile(
                        remote_file_manifest):
                    os.remove(remote_file_manifest)

            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    # Replace the bag dir with its archive in the outputs.
                    bdb.cleanup_bag(bag_path)
                    outputs = {
                        os.path.basename(archive): {
                            LOCAL_PATH_KEY: archive
                        }
                    }
                except Exception as e:
                    logging.error(
                        "Exception while creating data bag archive: %s" %
                        format_exception(e))
                    raise
            else:
                outputs = {
                    os.path.basename(bag_path): {
                        LOCAL_PATH_KEY: bag_path
                    }
                }

        # 6. Execute anything in the post processing pipeline, if configured
        post_processors = self.config.get('post_processors', [])
        if post_processors:
            for processor in post_processors:
                processor_name = processor["processor"]
                processor_type = processor.get('processor_type')
                processor_params = processor.get('processor_params')
                try:
                    post_processor = find_post_processor(
                        processor_name, processor_type)
                    processor = post_processor(
                        self.envars,
                        inputs=outputs,
                        processor_params=processor_params,
                        identity=identity,
                        wallet=wallet)
                    outputs = processor.process()
                except Exception as e:
                    logging.error(format_exception(e))
                    raise

        return outputs
Beispiel #25
0
def main():
    """Command-line entry point for bag operations.

    Depending on the parsed arguments: creates or updates a bag, extracts a
    bag archive, resolves remote files, validates the bag and/or its profile,
    and optionally archives the result.

    Returns:
        int: 0 on success, 1 on any error (error text is written to stderr).
    """
    sys.stderr.write('\n')

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.bag_path)

    archive_file = None
    extracted_path = None
    error_msg = None
    exit_code = 0

    try:
        validation_requested = args.validate or args.validate_profile or args.resolve_fetch

        if not is_file:
            # Skip bag creation/update when the user only wants to validate,
            # complete, or fetch an existing bag.
            if not (validation_requested and not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # Create or update the bag depending on the input arguments.
                bdb.make_bag(path,
                             args.checksum,
                             args.update,
                             args.skip_manifests,
                             args.prune_manifests,
                             BAG_METADATA if BAG_METADATA else None,
                             args.metadata_file,
                             args.remote_file_manifest,
                             args.config_file)
        elif not validation_requested:
            # Input is an archive file and no conflicting options were given:
            # just extract it and stop.
            bdb.extract_bag(path)
            sys.stderr.write('\n')
            return exit_code

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(path, args.resolve_fetch == 'all')

        if args.validate:
            # Archive inputs must be unpacked to a temp location before checking.
            if is_file:
                extracted_path = bdb.extract_bag(path, temp=True)
            bdb.validate_bag(extracted_path if extracted_path else path,
                             args.validate == 'fast',
                             args.config_file)

        if args.archiver:
            archive_file = bdb.archive_bag(path, args.archiver)

        # A file input already is a serialized bag; use it for profile checks.
        if archive_file is None and is_file:
            archive_file = path

        if args.validate_profile:
            if is_file and not extracted_path:
                extracted_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(extracted_path if extracted_path else path)
            bdb.validate_bag_serialization(archive_file if archive_file else path, profile)

    except Exception as e:
        exit_code = 1
        error_msg = "Error: %s" % bdbag.get_named_exception(e)

    finally:
        # Always remove any temporary extraction directory.
        if extracted_path:
            bdb.cleanup_bag(os.path.dirname(extracted_path))
        if exit_code != 0:
            sys.stderr.write("\n%s" % error_msg)

    sys.stderr.write('\n')

    return exit_code
Beispiel #26
0
def main(argv):
    """Build a "bag of bags": a BDBag whose payload is a set of Minids
    referencing remote content.

    Note: ``argv`` is accepted for call compatibility, but argument parsing
    reads from ``sys.argv`` via argparse, as in the original implementation.
    """
    parser = argparse.ArgumentParser(
        description=
        'Program to create a BDBag containing a set of Minids for remote content'
    )
    parser.add_argument('-m',
                        '--minids',
                        metavar='<minid file>',
                        help='File listing Minids for new bag',
                        required=True)
    parser.add_argument('-b',
                        '--bagname',
                        metavar='<bag name>',
                        help='Name of directory for new bag.',
                        required=True)
    parser.add_argument('-v',
                        '--verify',
                        action='store_true',
                        help='Validate bag after building it.',
                        required=False)
    parser.add_argument('-q',
                        '--quiet',
                        action="store_true",
                        help="Suppress logging output.")
    parser.add_argument('-d',
                        '--debug',
                        action="store_true",
                        help="Enable debug logging output.")
    parser.add_argument(
        '-n',
        '--author-name',
        metavar="<person or entity name>",
        help=
        "Optional name of the person or entity responsible for the creation of this bag, "
        "for inclusion in the bag metadata.")
    parser.add_argument(
        '-o',
        '--author-orcid',
        metavar="<orcid>",
        help=
        "Optional ORCID identifier of the bag creator, for inclusion in the bag metadata."
    )
    args = parser.parse_args()

    bdb.configure_logging(level=logging.ERROR if args.quiet else (
        logging.DEBUG if args.debug else logging.INFO))

    # Create the directory that will hold the new BDBag
    bdb.ensure_bag_path_exists(args.bagname)

    # For each supplied minid, fetch sub-bag to determine its properties
    minid_fields = extract_fields(args.minids)

    # Create 'README' file in the newly created bag directory. (moved to 'data' when bag is created)
    write_readme(args.bagname, minid_fields)

    # Build the remote-file manifest in a scratch directory that is removed
    # when done (the previous mkdtemp() directory was leaked on every run).
    with tempfile.TemporaryDirectory(prefix='encode2bag_') as working_dir:
        remote_file_manifest_file = osp.abspath(
            osp.join(working_dir, 'remote-file-manifest.json'))
        generate_remote_manifest_file(minid_fields, remote_file_manifest_file)

        # Create the new bag based on the supplied remote manifest file
        bdb.make_bag(args.bagname,
                     algs=['md5', 'sha256'],
                     remote_file_manifest=remote_file_manifest_file)

    # Create metadata/manifest.json file with Research Object JSON object
    ro_manifest = ro.init_ro_manifest(
        author_name=args.author_name,
        author_orcid=args.author_orcid,
        creator_name='bagofbags using BDBag version: %s (Bagit version: %s)' %
        (VERSION, BAGIT_VERSION),
        creator_uri='https://github.com/fair-research/bdbag/examples/bagofbags/'
    )
    add_remote_file_manifest_to_ro(ro_manifest, minid_fields)
    ro.write_bag_ro_metadata(ro_manifest, args.bagname, 'manifest.json')

    # Run make_bag again to include manifest.json in the checksums etc.
    bdb.make_bag(args.bagname, update=True)

    if args.verify:
        bdb.resolve_fetch(args.bagname, force=True)
        bdb.validate_bag(args.bagname, fast=False, callback=None)
Beispiel #27
0
def validate_user_submission(data_path,
                             schema,
                             output_dir=None,
                             delete_dir=False,
                             handle_git_repos=True,
                             bdbag_kwargs=None):
    """
    Validate a user submission and package it as an archived BDBag.

    Arguments:
        data_path (str): The path to the data to ingest into DERIVA. The path can be:
                1) A directory to be formatted into a BDBag
                2) A Git repository to be copied into a BDBag
                3) A premade BDBag directory
                4) A premade BDBag in an archive file
        schema (str): The named schema or schema file link to validate data against.
                Default None, to only validate against the declared TableSchema.
        output_dir (str): The path to create an output directory in. The resulting
                BDBag archive will be named after this directory.
                If not set, the directory will be turned into a BDBag in-place.
                For Git repositories, this is automatically set, but can be overridden.
                If data_path is a file, this has no effect.
                This dir MUST NOT be in the `data_path` directory or any subdirectories.
                Default None.
        delete_dir (bool): Should the output_dir be deleted after submission?
                Has no effect if output_dir is not specified.
                For Git repositories, this is always True.
                Default False.
        handle_git_repos (bool): Should Git repositories be detected and handled?
                When this is False, Git repositories are handled as simple directories
                instead of Git repositories.
                Default True.
        bdbag_kwargs (dict): Extra args to pass to bdbag

    Returns:
        str: Path to the archived BDBag file (or `data_path` unchanged when it
                was already an archive file rather than a directory).

    Raises:
        FileNotFoundError: If data_path does not exist.
        ValueError: If output_dir is inside data_path, or BDBag creation fails.
        FileExistsError: If output_dir already exists.
        exc.ValidationException: If TableSchema validation fails.
    """
    bdbag_kwargs = bdbag_kwargs or {}
    data_path = os.path.abspath(data_path)
    if not os.path.exists(data_path):
        raise FileNotFoundError("Path '{}' does not exist".format(data_path))

    if handle_git_repos:
        logger.debug("Checking for a Git repository")
        # If Git repo, set output_dir appropriately
        try:
            repo = git.Repo(data_path, search_parent_directories=True)
        # Not Git repo
        except git.InvalidGitRepositoryError:
            logger.debug("Not a Git repo")
        # Path not found, turn into standard FileNotFoundError
        except git.NoSuchPathError:
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))
        # Is Git repo
        else:
            logger.debug("Git repo found, collecting metadata")
            # Needs to not have slash at end - is known Git repo already, slash
            # interferes with os.path.basename/dirname
            if data_path.endswith("/"):
                data_path = data_path[:-1]
            # Set output_dir to new dir named with HEAD commit hash
            new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                          str(repo.head.commit))
            output_dir = os.path.join(os.path.dirname(data_path), new_dir_name)
            # Delete temp dir after archival
            delete_dir = True

    # If dir and not already BDBag, make BDBag
    if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
        logger.debug("Creating BDBag out of directory '{}'".format(data_path))
        # If output_dir specified, copy data to output dir first
        if output_dir:
            logger.debug("Copying data to '{}' before creating BDBag".format(
                output_dir))
            output_dir = os.path.abspath(output_dir)
            # If shutil.copytree is called when the destination dir is inside the source dir
            # by more than one layer, it will recurse infinitely.
            # (e.g. /source => /source/dir/dest)
            # Exactly one layer is technically okay (e.g. /source => /source/dest),
            # but it's easier to forbid all parent/child dir cases.
            # Check for this error condition by determining if output_dir is a child
            # of data_path.
            if os.path.commonpath([data_path]) == os.path.commonpath(
                [data_path, output_dir]):
                raise ValueError(
                    "The output_dir ('{}') must not be in data_path ('{}')".
                    format(output_dir, data_path))
            try:
                shutil.copytree(data_path, output_dir)
            except FileExistsError:
                raise FileExistsError(
                    ("The output directory must not exist. "
                     "Delete '{}' to submit.\nYou can set delete_dir=True "
                     "to avoid this issue in the future.").format(output_dir))
            # Process new dir instead of old path
            data_path = output_dir
        # If output_dir not specified, never delete data dir
        else:
            delete_dir = False
        # Make bag
        bdbag_api.make_bag(data_path, **bdbag_kwargs)
        if not bdbag_api.is_bag(data_path):
            raise ValueError(
                "Failed to create BDBag from {}".format(data_path))
        logger.debug("BDBag created at '{}'".format(data_path))

    # If dir (must be BDBag at this point), archive
    if os.path.isdir(data_path):
        logger.debug("Archiving BDBag at '{}' using '{}'".format(
            data_path, CONFIG["ARCHIVE_FORMAT"]))
        new_data_path = bdbag_api.archive_bag(data_path,
                                              CONFIG["ARCHIVE_FORMAT"])
        logger.debug("BDBag archived to file '{}'".format(new_data_path))
        # If requested (e.g. Git repo copied dir), delete data dir
        if delete_dir:
            logger.debug("Removing old directory '{}'".format(data_path))
            shutil.rmtree(data_path)
        # Overwrite data_path - don't care about dir for uploading
        data_path = new_data_path

    # Validate TableSchema in BDBag
    logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
    validation_res = ts_validate(data_path, schema=schema)
    if not validation_res["is_valid"]:
        raise exc.ValidationException(
            "TableSchema invalid due to the following errors: "
            "\n{}\n".format(validation_res["error"]))

    logger.debug("Validation successful")
    return data_path
Beispiel #28
0
def main():
    """Command-line entry point for bag operations.

    Depending on the parsed arguments: creates or updates a bag, extracts a
    bag archive, generates an RO manifest, resolves remote files from
    fetch.txt, validates the bag and/or its profile, archives the bag, or
    reverts it.

    Returns:
        int: 0 on success, 1 on any error (error text is written to stderr).
    """

    args, is_bag, is_file = parse_cli()
    path = os.path.abspath(args.path)

    archive = None    # serialized bag file used for profile/serialization checks
    temp_path = None  # temp extraction dir when the input is an archive file
    error = None
    result = 0

    if not args.quiet:
        sys.stderr.write('\n')

    try:
        if not is_file:
            # do not try to create or update the bag if the user just wants to validate or complete an existing bag
            if not (
                (args.validate or args.validate_profile or args.resolve_fetch)
                    and not (args.update and bdb.is_bag(path))):
                if args.checksum and 'all' in args.checksum:
                    args.checksum = ['md5', 'sha1', 'sha256', 'sha512']
                # create or update the bag depending on the input arguments
                bdb.make_bag(path,
                             algs=args.checksum,
                             update=args.update,
                             save_manifests=not args.skip_manifests,
                             prune_manifests=args.prune_manifests,
                             metadata=BAG_METADATA if BAG_METADATA else None,
                             metadata_file=args.metadata_file,
                             remote_file_manifest=args.remote_file_manifest,
                             config_file=args.config_file,
                             ro_metadata_file=args.ro_metadata_file)

        # otherwise just extract the bag if it is an archive and no other conflicting options specified
        elif not (args.validate or args.validate_profile
                  or args.resolve_fetch):
            bdb.extract_bag(path)
            if not args.quiet:
                sys.stderr.write('\n')
            return result

        if args.ro_manifest_generate:
            bdb.generate_ro_manifest(
                path,
                True if args.ro_manifest_generate == "overwrite" else False,
                config_file=args.config_file)

        if args.resolve_fetch:
            if args.validate == 'full':
                sys.stderr.write(ASYNC_TRANSFER_VALIDATION_WARNING)
            bdb.resolve_fetch(
                path,
                force=True if args.resolve_fetch == 'all' else False,
                keychain_file=args.keychain_file,
                config_file=args.config_file,
                filter_expr=args.fetch_filter)

        if args.validate:
            # archive inputs must be unpacked to a temp location before checking
            if is_file:
                temp_path = bdb.extract_bag(path, temp=True)
            if args.validate == 'structure':
                bdb.validate_bag_structure(temp_path if temp_path else path)
            else:
                bdb.validate_bag(
                    temp_path if temp_path else path,
                    fast=True if args.validate == 'fast' else False,
                    config_file=args.config_file)

        if args.archiver:
            archive = bdb.archive_bag(path, args.archiver)

        # a file input already is a serialized bag; use it for profile checks
        if archive is None and is_file:
            archive = path

        if args.validate_profile:
            if is_file:
                if not temp_path:
                    temp_path = bdb.extract_bag(path, temp=True)
            profile = bdb.validate_bag_profile(
                temp_path if temp_path else path)
            bdb.validate_bag_serialization(archive if archive else path,
                                           profile)

        if args.revert:
            bdb.revert_bag(path)

    except Exception as e:
        result = 1
        error = "Error: %s" % get_typed_exception(e)

    finally:
        # always clean up any temporary extraction directory
        if temp_path:
            bdb.cleanup_bag(os.path.dirname(temp_path))
        if result != 0:
            sys.stderr.write("\n%s" % error)

    if not args.quiet:
        sys.stderr.write('\n')

    return result
Beispiel #29
0
    def download(self, identity=None):
        """Run all configured catalog queries and download their results.

        When a 'bag' section is present in the configuration, results are
        packaged into a BDBag (optionally with an RO manifest) and optionally
        serialized into an archive file.

        Arguments:
            identity (dict): Pre-validated client identity attributes. When
                not supplied, credentials are validated against the catalog
                and the identity is taken from the authn session response.

        Returns:
            list: ``[archive_file]`` or ``[bag_path]`` when bagging is
            enabled, otherwise the list of downloaded files.

        Raises:
            RuntimeError: On missing/invalid configuration or credential
                validation failure.
        """
        if not self.config:
            raise RuntimeError("No configuration specified!")

        if self.config.get("catalog") is None:
            raise RuntimeError("Catalog configuration error!")

        if not identity:
            logging.info("Validating credentials")
            try:
                if not self.credentials:
                    self.setCredentials(get_credential(self.hostname))
                attributes = self.catalog.get_authn_session().json()
                identity = attributes["client"]
            except Exception as e:
                raise RuntimeError("Unable to validate credentials: %s" % format_exception(e))

        ro_manifest = None
        ro_author_name = None
        ro_author_orcid = None
        # Unique scratch file where download processors record remote file
        # references; folded into the bag by the final make_bag() call.
        remote_file_manifest = os.path.abspath(
            ''.join([os.path.join(self.output_dir, 'remote-file-manifest_'), str(uuid.uuid4()), ".json"]))

        catalog_config = self.config['catalog']
        self.envars.update(self.config.get('env', dict()))

        bag_path = None
        bag_archiver = None
        bag_algorithms = None
        bag_config = self.config.get('bag')
        create_bag = bool(bag_config)
        if create_bag:
            # (A redundant nested "if create_bag:" check was removed here;
            # this branch already implies it.)
            bag_name = bag_config.get('bag_name', ''.join(["deriva_bag", '_', time.strftime("%Y-%m-%d_%H.%M.%S")]))
            bag_path = os.path.abspath(os.path.join(self.output_dir, bag_name))
            bag_archiver = bag_config.get('bag_archiver')
            bag_algorithms = bag_config.get('bag_algorithms', ['sha256'])
            bag_metadata = bag_config.get('bag_metadata', {"Internal-Sender-Identifier":
                                                           "deriva@%s" % self.server_url})
            bag_ro = stob(bag_config.get('bag_ro', "True"))
            bdb.ensure_bag_path_exists(bag_path)
            bag = bdb.make_bag(bag_path, algs=bag_algorithms, metadata=bag_metadata)
            if bag_ro:
                # Prefer an explicit Contact-Name from bag-info, falling back
                # to whatever identity attributes are available.
                ro_author_name = bag.info.get("Contact-Name",
                                              identity.get('full_name',
                                                           identity.get('display_name',
                                                                        identity.get('id', None))))
                ro_author_orcid = bag.info.get("Contact-Orcid")
                ro_manifest = ro.init_ro_manifest(author_name=ro_author_name, author_orcid=ro_author_orcid)
                bag_metadata.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})

        file_list = list()
        base_path = bag_path if bag_path else self.output_dir
        for query in catalog_config['queries']:
            query_path = query['query_path']
            output_format = query['output_format']
            output_processor = query.get("output_format_processor")
            format_args = query.get('output_format_params', None)
            output_path = query.get('output_path', '')

            try:
                download_processor = findProcessor(output_format, output_processor)
                processor = download_processor(self.envars,
                                               bag=create_bag,
                                               catalog=self.catalog,
                                               store=self.store,
                                               query=query_path,
                                               base_path=base_path,
                                               sub_path=output_path,
                                               format_args=format_args,
                                               remote_file_manifest=remote_file_manifest,
                                               ro_manifest=ro_manifest,
                                               ro_author_name=ro_author_name,
                                               ro_author_orcid=ro_author_orcid)
                file_list.extend(processor.process())
            except Exception as e:
                logging.error(format_exception(e))
                if create_bag:
                    # Remove the partially populated bag before propagating.
                    bdb.cleanup_bag(bag_path)
                raise

        if create_bag:
            try:
                if ro_manifest:
                    ro.write_bag_ro_metadata(ro_manifest, bag_path)
                if not os.path.isfile(remote_file_manifest):
                    remote_file_manifest = None
                bdb.make_bag(bag_path, algs=bag_algorithms, remote_file_manifest=remote_file_manifest, update=True)
            except Exception as e:
                # logging.critical replaces the deprecated logging.fatal alias.
                logging.critical("Exception while updating bag manifests: %s", format_exception(e))
                bdb.cleanup_bag(bag_path)
                raise
            finally:
                # The scratch manifest is no longer needed once the bag is updated.
                if remote_file_manifest and os.path.isfile(remote_file_manifest):
                    os.remove(remote_file_manifest)

            logging.info('Created bag: %s' % bag_path)

            if bag_archiver is not None:
                try:
                    archive = bdb.archive_bag(bag_path, bag_archiver.lower())
                    bdb.cleanup_bag(bag_path)
                    return [archive]
                except Exception as e:
                    # Bug fix: the exception detail was previously passed as an
                    # extra argument with no %s placeholder in the format
                    # string, so logging dropped it with an internal error.
                    logging.error("Exception while creating data bag archive: %s", format_exception(e))
                    raise

            return [bag_path]

        return file_list
def update_bag(outdir):
    """Refresh the manifests of the bag at *outdir*, then serialize it.

    Returns the path of the resulting zip archive.
    """
    bdbag_api.make_bag(outdir, update=True)
    archive_path = bdbag_api.archive_bag(outdir, "zip")
    return archive_path
Beispiel #31
0
    def start_deriva_flow(self,
                          data_path,
                          dcc_id,
                          catalog_id=None,
                          schema=None,
                          server=None,
                          dataset_acls=None,
                          output_dir=None,
                          delete_dir=False,
                          handle_git_repos=True,
                          dry_run=False,
                          test_sub=False,
                          verbose=False,
                          **kwargs):
        """Start the Globus Automate Flow to ingest CFDE data into DERIVA.

        Arguments:
            data_path (str): The path to the data to ingest into DERIVA. The path can be:
                    1) A directory to be formatted into a BDBag
                    2) A Git repository to be copied into a BDBag
                    3) A premade BDBag directory
                    4) A premade BDBag in an archive file
            dcc_id (str): The CFDE-recognized DCC ID for this submission.
            catalog_id (int or str): The ID of the DERIVA catalog to ingest into.
                    Default None, to create a new catalog.
            schema (str): The named schema or schema file link to validate data against.
                    Default None, to only validate against the declared TableSchema.
            server (str): The DERIVA server to ingest to.
                    Default None, to use the Action Provider-set default.
            dataset_acls (dict): The DERIVA ACL(s) to set on the final dataset.
                    Default None, to use the CFDE default ACLs.
            output_dir (str): The path to create an output directory in. The resulting
                    BDBag archive will be named after this directory.
                    If not set, the directory will be turned into a BDBag in-place.
                    For Git repositories, this is automatically set, but can be overridden.
                    If data_path is a file, this has no effect.
                    This dir MUST NOT be in the `data_path` directory or any subdirectories.
                    Default None.
            delete_dir (bool): Should the output_dir be deleted after submission?
                    Has no effect if output_dir is not specified.
                    For Git repositories, this is always True.
                    Default False.
            handle_git_repos (bool): Should Git repositories be detected and handled?
                    When this is False, Git repositories are handled as simple directories
                    instead of Git repositories.
                    Default True.
            dry_run (bool): Should the data be validated and bagged without starting the Flow?
                    When True, does not ingest into DERIVA or start the Globus Automate Flow,
                    and the return value will not have valid DERIVA Flow information.
                    Default False.
            test_sub (bool): Should the submission be run in "test mode" where
                    the submission will be inegsted into DERIVA and immediately deleted?
                    When True, the data wil not remain in DERIVA to be viewed and the
                    Flow will terminate before any curation step.
            verbose (bool): Should intermediate status messages be printed out?
                    Default False.

        Keyword Arguments:
            force_http (bool): Should the data be sent using HTTP instead of Globus Transfer,
                    even if Globus Transfer is available? Because Globus Transfer is more
                    robust than HTTP, it is highly recommended to leave this False.
                    Default False.

        Other keyword arguments are passed directly to the ``make_bag()`` function of the
        BDBag API (see https://github.com/fair-research/bdbag for details).
        """
        if verbose:
            print("Startup: Validating input")
        data_path = os.path.abspath(data_path)
        if not os.path.exists(data_path):
            raise FileNotFoundError(
                "Path '{}' does not exist".format(data_path))

        if catalog_id in self.catalogs.keys():
            if schema:
                raise ValueError(
                    "You may not specify a schema ('{}') when ingesting to "
                    "a named catalog ('{}'). Retry without specifying "
                    "a schema.".format(schema, catalog_id))
            schema = self.catalogs[catalog_id]
        # Pull out known kwargs
        force_http = kwargs.pop("force_http", False)

        if handle_git_repos:
            if verbose:
                print("Checking for a Git repository")
            # If Git repo, set output_dir appropriately
            try:
                repo = git.Repo(data_path, search_parent_directories=True)
            # Not Git repo
            except git.InvalidGitRepositoryError:
                if verbose:
                    print("Not a Git repo")
            # Path not found, turn into standard FileNotFoundError
            except git.NoSuchPathError:
                raise FileNotFoundError(
                    "Path '{}' does not exist".format(data_path))
            # Is Git repo
            else:
                if verbose:
                    print("Git repo found, collecting metadata")
                # Needs to not have slash at end - is known Git repo already, slash
                # interferes with os.path.basename/dirname
                if data_path.endswith("/"):
                    data_path = data_path[:-1]
                # Set output_dir to new dir named with HEAD commit hash
                new_dir_name = "{}_{}".format(os.path.basename(data_path),
                                              str(repo.head.commit))
                output_dir = os.path.join(os.path.dirname(data_path),
                                          new_dir_name)
                # Delete temp dir after archival
                delete_dir = True

        # If dir and not already BDBag, make BDBag
        if os.path.isdir(data_path) and not bdbag_api.is_bag(data_path):
            if verbose:
                print("Creating BDBag out of directory '{}'".format(data_path))
            # If output_dir specified, copy data to output dir first
            if output_dir:
                if verbose:
                    print("Copying data to '{}' before creating BDBag".format(
                        output_dir))
                output_dir = os.path.abspath(output_dir)
                # If shutil.copytree is called when the destination dir is inside the source dir
                # by more than one layer, it will recurse infinitely.
                # (e.g. /source => /source/dir/dest)
                # Exactly one layer is technically okay (e.g. /source => /source/dest),
                # but it's easier to forbid all parent/child dir cases.
                # Check for this error condition by determining if output_dir is a child
                # of data_path.
                if os.path.commonpath([data_path]) == os.path.commonpath(
                    [data_path, output_dir]):
                    raise ValueError(
                        "The output_dir ('{}') must not be in data_path ('{}')"
                        .format(output_dir, data_path))
                try:
                    shutil.copytree(data_path, output_dir)
                except FileExistsError:
                    raise FileExistsError(
                        ("The output directory must not exist. "
                         "Delete '{}' to submit.\nYou can set delete_dir=True "
                         "to avoid this issue in the future."
                         ).format(output_dir))
                # Process new dir instead of old path
                data_path = output_dir
            # If output_dir not specified, never delete data dir
            else:
                delete_dir = False
            # Make bag
            bdbag_api.make_bag(data_path, **kwargs)
            if not bdbag_api.is_bag(data_path):
                raise ValueError(
                    "Failed to create BDBag from {}".format(data_path))
            elif verbose:
                print("BDBag created at '{}'".format(data_path))

        # If dir (must be BDBag at this point), archive
        if os.path.isdir(data_path):
            if verbose:
                print("Archiving BDBag at '{}' using '{}'".format(
                    data_path, CONFIG["ARCHIVE_FORMAT"]))
            new_data_path = bdbag_api.archive_bag(data_path,
                                                  CONFIG["ARCHIVE_FORMAT"])
            if verbose:
                print("BDBag archived to file '{}'".format(new_data_path))
            # If requested (e.g. Git repo copied dir), delete data dir
            if delete_dir:
                if verbose:
                    print("Removing old directory '{}'".format(data_path))
                shutil.rmtree(data_path)
            # Overwrite data_path - don't care about dir for uploading
            data_path = new_data_path

        # Validate TableSchema in BDBag
        if verbose:
            print("Validating TableSchema in BDBag '{}'".format(data_path))
        validation_res = ts_validate(data_path, schema=schema)
        if not validation_res["is_valid"]:
            return {
                "success":
                False,
                "error":
                ("TableSchema invalid due to the following errors: \n{}\n".
                 format(validation_res["error"]))
            }
        elif verbose:
            print("Validation successful")

        # Now BDBag is archived file
        # Set path on destination
        dest_path = "{}{}".format(self.flow_info["cfde_ep_path"],
                                  os.path.basename(data_path))

        # If doing dry run, stop here before making Flow input
        if dry_run:
            return {
                "success":
                True,
                "message":
                "Dry run validated successfully. No data was transferred."
            }

        # Set up Flow
        if verbose:
            print("Creating input for Flow")
        # If local EP exists (and not force_http), can use Transfer
        # Local EP fetched now in case GCP started after Client creation
        local_endpoint = globus_sdk.LocalGlobusConnectPersonal().endpoint_id
        if local_endpoint and not force_http:
            if verbose:
                print(
                    "Using local Globus Connect Personal Endpoint '{}'".format(
                        local_endpoint))
            # Populate Transfer fields in Flow
            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": local_endpoint,
                "source_path": data_path,
                "cfde_ep_id": self.flow_info["cfde_ep_id"],
                "cfde_ep_path": dest_path,
                "cfde_ep_url": self.flow_info["cfde_ep_url"],
                "is_directory": False,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server
        # Otherwise, we must PUT the BDBag on the server
        else:
            if verbose:
                print("No Globus Endpoint detected; using HTTP upload instead")
            headers = {}
            self.__https_authorizer.set_authorization_header(headers)
            data_url = "{}{}".format(self.flow_info["cfde_ep_url"], dest_path)

            with open(data_path, 'rb') as bag_file:
                bag_data = bag_file.read()

            put_res = requests.put(data_url, data=bag_data, headers=headers)

            # Regenerate headers on 401
            if put_res.status_code == 401:
                self.__https_authorizer.handle_missing_authorization()
                self.__https_authorizer.set_authorization_header(headers)
                put_res = requests.put(data_url,
                                       data=bag_data,
                                       headers=headers)

            # Error message on failed PUT or any unexpected response
            if put_res.status_code >= 300:
                return {
                    "success":
                    False,
                    "error":
                    ("Could not upload BDBag to server (error {}):\n{}".format(
                        put_res.status_code, put_res.content))
                }
            elif put_res.status_code != 200:
                print(
                    "Warning: HTTP upload returned status code {}, which was unexpected."
                    .format(put_res.status_code))

            if verbose:
                print("Upload successful to '{}': {} {}".format(
                    data_url, put_res.status_code, put_res.content))

            flow_id = self.flow_info["flow_id"]
            flow_input = {
                "source_endpoint_id": False,
                "data_url": data_url,
                "test_sub": test_sub,
                "dcc_id": dcc_id
            }
            if catalog_id:
                flow_input["catalog_id"] = str(catalog_id)
            if server:
                flow_input["server"] = server

        if verbose:
            print("Flow input populated:\n{}".format(
                json.dumps(flow_input, indent=4, sort_keys=True)))
        # Get Flow scope
        flow_def = self.flow_client.get_flow(flow_id)
        flow_scope = flow_def["globus_auth_scope"]
        # Start Flow
        if verbose:
            print("Starting Flow - Submitting data")
        try:
            flow_res = self.flow_client.run_flow(flow_id, flow_scope,
                                                 flow_input)
        except globus_sdk.GlobusAPIError as e:
            if e.http_status == 404:
                return {
                    "success":
                    False,
                    "error":
                    ("Could not access ingest Flow. Are you in the CFDE DERIVA "
                     "Demo Globus Group? Check your membership or apply for access "
                     "here: https://app.globus.org/groups/a437abe3-c9a4-11e9-b441-"
                     "0efb3ba9a670/about")
                }
            else:
                raise
        self.last_flow_run = {
            "flow_id": flow_id,
            "flow_instance_id": flow_res["action_id"]
        }
        if verbose:
            print("Flow started successfully.")

        return {
            "success":
            True,
            "message":
            ("Started DERIVA ingest Flow\nFlow ID: {}\nFlow Instance ID: {}".
             format(flow_id, flow_res["action_id"])),
            "flow_id":
            flow_id,
            "flow_instance_id":
            flow_res["action_id"],
            "cfde_dest_path":
            dest_path,
            "http_link":
            "{}{}".format(self.flow_info["cfde_ep_url"], dest_path),
            "globus_web_link":
            ("https://app.globus.org/file-manager?origin_id={}&origin_path={}".
             format(self.flow_info["cfde_ep_id"], os.path.dirname(dest_path)))
        }