Ejemplo n.º 1
0
def generate_ro_manifest(bag_path,
                         overwrite=False,
                         config_file=DEFAULT_CONFIG_FILE):
    """Auto-generate the Research Object (RO) manifest of a bag.

    When ``metadata/manifest.json`` already exists under *bag_path* and
    *overwrite* is false, the existing manifest is read and updated in place.
    Otherwise a new manifest is initialised from the bag's ``Contact-Name``
    and ``Contact-Orcid`` bag-info fields.  Every fetch entry and every
    payload file of the bag is then recorded in the manifest, the manifest is
    written back into the bag, and the bag is saved.

    :param bag_path: path of the bag directory.
    :param overwrite: start from a fresh manifest even if one already exists.
    :param config_file: configuration file the identifier resolvers are read
        from; ``DEFAULT_ID_RESOLVERS`` is used when the configuration is
        empty or has no resolver entry.
    """
    bag = bdbagit.BDBag(bag_path)
    bag_ro_metadata_path = os.path.abspath(
        os.path.join(bag_path, "metadata", "manifest.json"))
    exists = os.path.isfile(bag_ro_metadata_path)
    if exists and not overwrite:
        logger.info("Auto-generating RO manifest: update existing file.")
        ro_metadata = bdbro.read_bag_ro_metadata(bag_path)
    else:
        # The conditional must be parenthesised: "%" binds tighter than
        # "if/else", so without the parentheses the overwrite case would log
        # only the bare string "overwrite existing file".
        logger.info(
            "Auto-generating RO manifest: %s." %
            ("creating new file" if not exists
             else "overwrite existing file"))
        ro_metadata = bdbro.init_ro_manifest(
            author_name=bag.info.get("Contact-Name"),
            author_orcid=bag.info.get("Contact-Orcid"),
            creator_name=bdbro.BAG_CREATOR_NAME,
            creator_uri=bdbro.BAG_CREATOR_URI)

    config = read_config(config_file)
    resolvers = config.get(
        ID_RESOLVER_TAG,
        DEFAULT_ID_RESOLVERS) if config else DEFAULT_ID_RESOLVERS
    fetched = bag.fetch_entries()
    local = bag.payload_files()

    # Remote (fetch.txt) entries: record the source URL and where the file is
    # bundled inside the bag.
    for url, _length, filename in fetched:
        if url.startswith(("minid:", "ark:")):
            # Identifier-style URLs are rewritten as HTTP URLs on the first
            # configured resolver.
            url = "".join(["http://", resolvers[0], "/", url])
        bdbro.add_file_metadata(ro_metadata,
                                source_url=url,
                                bundled_as=bdbro.make_bundled_as(
                                    folder=os.path.dirname(filename),
                                    filename=os.path.basename(filename)),
                                update_existing=True)

    # Local payload files: paths are normalised to forward slashes.
    for path in local:
        bdbro.add_file_metadata(ro_metadata,
                                local_path=path.replace("\\", "/"),
                                bundled_as=bdbro.make_bundled_as(),
                                update_existing=True)

    bdbro.write_bag_ro_metadata(ro_metadata, bag_path)

    # A bag declaring the plain BDBag profile is switched to the RO profile
    # now that it carries an RO manifest.
    profile = bag.info.get(BAG_PROFILE_TAG)
    if profile == BDBAG_PROFILE_ID:
        bag.info.update({BAG_PROFILE_TAG: BDBAG_RO_PROFILE_ID})
    bag.save()
Ejemplo n.º 2
0
    def test_generate_ro_manifest_update(self):
        """Regenerating the RO manifest in update mode keeps existing entries.

        A bag is created and its RO manifest generated from scratch.  An extra
        aggregate is then added by hand, the manifest is regenerated with
        ``overwrite=False``, and every aggregate that was present after the
        first generation must still carry the same ``bundledAs`` URI.
        """
        header = self.getTestHeader(
            'create bag with auto-generation of RO manifest in update mode')
        logger.info(header)
        try:
            fetch_manifest = ospj(self.test_config_dir,
                                  'test-fetch-manifest.json')
            bdb.make_bag(self.test_data_dir,
                         algs=['md5', 'sha1', 'sha256', 'sha512'],
                         remote_file_manifest=fetch_manifest)

            # First pass: build the manifest from scratch and snapshot its
            # aggregates keyed by URI.
            bdb.generate_ro_manifest(self.test_data_dir, overwrite=True)
            ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
            baseline = {
                aggregate["uri"]: aggregate
                for aggregate in ro.get("aggregates", [])
            }

            # Add an entry by hand and persist it before the update pass.
            bdbro.add_file_metadata(ro,
                                    local_path="../data/FAKE.txt",
                                    bundled_as=bdbro.make_bundled_as())
            bdbro.write_bag_ro_metadata(ro, self.test_data_dir)

            # Second pass: update mode must leave the snapshot entries alone.
            bdb.generate_ro_manifest(self.test_data_dir, overwrite=False)
            ro = bdbro.read_bag_ro_metadata(self.test_data_dir)
            for aggregate in ro.get("aggregates", []):
                uri = aggregate["uri"]
                if uri not in baseline:
                    continue
                self.assertTrue(
                    aggregate["bundledAs"]["uri"] ==
                    baseline[uri]["bundledAs"]["uri"])

        except Exception as e:
            self.fail(get_typed_exception(e))
Ejemplo n.º 3
0
 def downloadFiles(self, input_manifest):
     """Download every file listed in a JSON-stream download manifest.

     Each line of *input_manifest* is parsed as one JSON object.  The
     object's ``url`` is required; ``filename`` is optional and, when
     absent, is derived from the ``Content-Disposition`` header of a HEAD
     request (store-backed URLs) or from the last path segment of the URL.
     When ``self.ro_manifest`` is set, each downloaded file is also recorded
     in it.

     The manifest file is removed when this method exits, whether it
     succeeds or raises.

     :param input_manifest: path of the manifest file to read.
     :return: list of output paths (``subdir/filename``, or just
         ``filename`` when there is no sub-directory), in manifest order.
     :raises RuntimeError: if an entry has no ``url`` or the size of a
         downloaded file differs from the ``Content-Length`` header.
     """
     logging.info("Retrieving file(s)...")
     try:
         with open(input_manifest, "r") as in_file:
             file_list = list()
             for line in in_file:
                 entry = json.loads(line)
                 url = entry.get('url')
                 if not url:
                     raise RuntimeError(
                         "Missing required attribute \"url\" in download manifest entry %s" % json.dumps(entry))
                 store = self.getHatracStore(url)
                 filename = entry.get('filename')
                 # The sub-directory template may reference any manifest
                 # entry field in addition to the processor's own variables.
                 envvars = self.envars.copy()
                 envvars.update(entry)
                 subdir = self.sub_path.format(**envvars)
                 if not filename:
                     if store:
                         head = store.head(url, headers=self.HEADERS)
                         content_disposition = head.headers.get("Content-Disposition") if head.ok else None
                         if content_disposition:
                             filename = parse_content_disposition(content_disposition)
                         else:
                             # Fall back to the URL's last path segment.  The
                             # previous code took basename(filename) here,
                             # but filename is known to be empty/None in
                             # this branch.  Text after the first ":" is
                             # dropped (presumably a version suffix on the
                             # object name -- confirm).
                             filename = os.path.basename(url).split(":")[0]
                     else:
                         filename = os.path.basename(url)
                 file_path = os.path.abspath(os.path.join(
                     self.base_path, 'data' if self.is_bag else '', subdir, filename))
                 output_dir = os.path.dirname(file_path)
                 self.makeDirs(output_dir)
                 if store:
                     resp = store.get_obj(url, self.HEADERS, file_path)
                     url = self.getExternalUrl(url)
                 else:
                     url = self.getExternalUrl(url)
                     file_path, resp = self.getExternalFile(url, file_path, self.HEADERS)
                 length = int(resp.headers.get('Content-Length'))
                 content_type = resp.headers.get("Content-Type")
                 file_bytes = os.path.getsize(file_path)
                 if length != file_bytes:
                     # Actual on-disk size first, expected (Content-Length)
                     # second, to match the wording of the message.
                     raise RuntimeError(
                         "File size of %s does not match expected size of %s for file %s" %
                         (file_bytes, length, file_path))
                 output_path = ''.join([subdir, "/", filename]) if subdir else filename
                 if self.ro_manifest:
                     ro.add_file_metadata(self.ro_manifest,
                                          source_url=url,
                                          local_path=output_path,
                                          media_type=content_type,
                                          retrieved_on=ro.make_retrieved_on(),
                                          retrieved_by=ro.make_retrieved_by(
                                              self.ro_author_name, orcid=self.ro_author_orcid),
                                          bundled_as=ro.make_bundled_as())
                 file_list.append(output_path)
             return file_list
     finally:
         os.remove(input_manifest)
Ejemplo n.º 4
0
def add_remote_file_manifest_to_ro(ro_manifest, entries):
    """Record remote-file-manifest entries as aggregates of an RO manifest.

    Each item of *entries* is unpacked as a 6-item sequence; only its first
    item (a minid) and fourth item (a URI) are used.  One aggregate per entry
    is added under ``NAME2THING + minid``.  After all entries are processed,
    a single ``oa:describing`` annotation is added with the arguments
    ``'../'`` and ``'../data/README'``.

    :param ro_manifest: RO manifest object to update.
    :param entries: iterable of 6-item remote file manifest records.
    """
    for record in entries:
        minid, _, _, uri, _, _ = record
        aggregate_uri = NAME2THING + minid
        ro.add_aggregate(
            ro_manifest,
            aggregate_uri,
            mediatype=None,
            conforms_to=ro.BAG_CONFORMS_TO,
            bundled_as=ro.make_bundled_as(None, '', uri),
        )

    describing = {"@id": "oa:describing"}
    ro.add_annotation(ro_manifest, '../', '../data/README',
                      motivatedBy=describing)
Ejemplo n.º 5
0
    def process(self):
        """Run the catalog query and register its output.

        The query is issued with an ``accept`` header set to
        ``self.content_type``.  When both ``self.ro_manifest`` and
        ``self.ro_file_provenance`` are set, provenance for the output file
        is added to the RO manifest.

        :return: ``self.outputs``, updated with an entry for
            ``self.output_relpath``.
        """
        # NOTE(review): this is the same object as self.HEADERS, so the
        # "accept" header set here stays on self.HEADERS afterwards --
        # confirm that is intended.
        request_headers = self.HEADERS
        request_headers.update({'accept': self.content_type})
        # The response object is kept only for the duration of this call; it
        # is not inspected here.
        resp = self.catalogQuery(request_headers)

        record_provenance = self.ro_manifest and self.ro_file_provenance
        if record_provenance:
            retrieved_on = ro.make_retrieved_on()
            retrieved_by = ro.make_retrieved_by(
                self.ro_author_name, orcid=self.ro_author_orcid)
            ro.add_file_metadata(
                self.ro_manifest,
                source_url=self.url,
                local_path=self.output_relpath,
                media_type=self.content_type,
                retrieved_on=retrieved_on,
                retrieved_by=retrieved_by,
                bundled_as=ro.make_bundled_as(),
            )

        output_record = {
            LOCAL_PATH_KEY: self.output_abspath,
            SOURCE_URL_KEY: self.url,
        }
        self.outputs.update({self.output_relpath: output_record})
        return self.outputs
Ejemplo n.º 6
0
 def createRemoteFileManifest(self):
     """Convert the JSON-stream output file into remote file manifest lines.

     Every line of ``self.output_abspath`` is parsed as JSON, passed through
     ``self.createManifestEntry`` and appended (one JSON object per line) to
     the file named by the ``remote_file_manifest`` argument.  When
     ``self.ro_manifest`` is set, each entry is also recorded there.  The
     input file is removed once it has been fully processed.

     :return: path of the remote file manifest relative to
         ``self.base_path``.
     """
     logging.info("Creating remote file manifest")
     source_path = self.output_abspath
     manifest_path = self.args.get("remote_file_manifest")
     with open(source_path, "r") as source:
         with open(manifest_path, "a") as sink:
             for raw_line in source:
                 # Each input line carries the fields needed to build one
                 # bdbag remote file manifest entry.
                 record = self.createManifestEntry(json.loads(raw_line))
                 sink.write(json.dumps(record) + "\n")
                 if not self.ro_manifest:
                     continue
                 source_url = record["url"]
                 media_type = record.get("content_type")
                 target = record["filename"]
                 ro.add_file_metadata(
                     self.ro_manifest,
                     source_url=source_url,
                     media_type=media_type,
                     bundled_as=ro.make_bundled_as(
                         folder=os.path.dirname(target),
                         filename=os.path.basename(target)))
     os.remove(source_path)
     return os.path.relpath(manifest_path, self.base_path)
Ejemplo n.º 7
0
    def process(self):
        """Register the output file and, if enabled, its RO provenance.

        When both ``self.ro_manifest`` and ``self.ro_file_provenance`` are
        set, file metadata for ``self.output_relpath`` is added to the RO
        manifest, with the media type guessed from ``self.output_abspath``.
        When ``self.delete_input`` is set, ``self._delete_input()`` is called
        afterwards.

        :return: ``self.outputs``, updated with an entry for
            ``self.output_relpath``.
        """
        record_provenance = self.ro_manifest and self.ro_file_provenance
        if record_provenance:
            media_type = guess_content_type(self.output_abspath)
            retrieved_on = ro.make_retrieved_on()
            retrieved_by = ro.make_retrieved_by(self.ro_author_name,
                                                orcid=self.ro_author_orcid)
            ro.add_file_metadata(self.ro_manifest,
                                 source_url=self.url,
                                 local_path=self.output_relpath,
                                 media_type=media_type,
                                 retrieved_on=retrieved_on,
                                 retrieved_by=retrieved_by,
                                 bundled_as=ro.make_bundled_as())

        if self.delete_input:
            self._delete_input()

        output_record = {
            LOCAL_PATH_KEY: self.output_abspath,
            SOURCE_URL_KEY: self.url,
        }
        self.outputs.update({self.output_relpath: output_record})
        return self.outputs
 def downloadFiles(self, input_manifest):
     """Download every file listed in a JSON-stream download manifest.

     Each line of *input_manifest* is parsed as one JSON object.  Entries
     without a ``url`` are skipped with a warning.  ``filename`` is optional
     and, when absent, is derived from the ``Content-Disposition`` header of
     a HEAD request (store-backed URLs) or from the last path segment of the
     URL.  When ``self.ro_manifest`` is set, each downloaded file is also
     recorded in it.

     The manifest file is removed when this method exits, whether it
     succeeds or raises.

     :param input_manifest: path of the manifest file to read.
     :return: dict mapping each output path (``subdir/filename``, or just
         ``filename`` when there is no sub-directory) to a dict holding the
         absolute local path under ``LOCAL_PATH_KEY``.
     :raises DerivaDownloadError: if a HEAD request or a store transfer
         fails with ``requests.HTTPError``, or the size of a downloaded file
         differs from the ``Content-Length`` header.
     """
     logging.info(
         "Attempting to download file(s) based on the results of query: %s"
         % self.query)
     try:
         with open(input_manifest, "r") as in_file:
             file_list = dict()
             for line in in_file:
                 entry = json.loads(line)
                 url = entry.get('url')
                 if not url:
                     logging.warning(
                         "Skipping download due to missing required attribute \"url\" in download manifest entry %s"
                         % json.dumps(entry))
                     continue
                 store = self.getHatracStore(url)
                 filename = entry.get('filename')
                 # The sub-directory template may reference any manifest
                 # entry field in addition to the processor's own variables.
                 envvars = self.envars.copy()
                 envvars.update(entry)
                 subdir = self.sub_path.format(**envvars)
                 if not filename:
                     if store:
                         try:
                             head = store.head(url, headers=self.HEADERS)
                         except requests.HTTPError as e:
                             raise DerivaDownloadError(
                                 "HEAD request for [%s] failed: %s" %
                                 (url, e))
                         content_disposition = head.headers.get(
                             "Content-Disposition") if head.ok else None
                         if content_disposition:
                             filename = parse_content_disposition(
                                 content_disposition)
                         else:
                             # Fall back to the URL's last path segment.  The
                             # previous code took basename(filename) here,
                             # but filename is known to be empty/None in
                             # this branch.  Text after the first ":" is
                             # dropped (presumably a version suffix on the
                             # object name -- confirm).
                             filename = os.path.basename(url).split(":")[0]
                     else:
                         filename = os.path.basename(url)
                 file_path = os.path.abspath(
                     os.path.join(self.base_path,
                                  'data' if self.is_bag else '', subdir,
                                  filename))
                 output_dir = os.path.dirname(file_path)
                 make_dirs(output_dir)
                 if store:
                     try:
                         resp = store.get_obj(url, self.HEADERS, file_path)
                     except requests.HTTPError as e:
                         raise DerivaDownloadError(
                             "File [%s] transfer failed: %s" %
                             (file_path, e))
                     url = self.getExternalUrl(url)
                 else:
                     url = self.getExternalUrl(url)
                     file_path, resp = self.getExternalFile(
                         url, file_path, self.HEADERS)
                 length = int(resp.headers.get('Content-Length'))
                 content_type = resp.headers.get("Content-Type")
                 file_bytes = os.path.getsize(file_path)
                 if length != file_bytes:
                     # Actual on-disk size first, expected (Content-Length)
                     # second, to match the wording of the message.
                     raise DerivaDownloadError(
                         "File size of %s does not match expected size of %s for file %s"
                         % (file_bytes, length, file_path))
                 output_path = ''.join([subdir, "/", filename
                                        ]) if subdir else filename
                 if self.ro_manifest:
                     ro.add_file_metadata(
                         self.ro_manifest,
                         source_url=url,
                         local_path=output_path,
                         media_type=content_type,
                         retrieved_on=ro.make_retrieved_on(),
                         retrieved_by=ro.make_retrieved_by(
                             self.ro_author_name,
                             orcid=self.ro_author_orcid),
                         bundled_as=ro.make_bundled_as())
                 file_list.update(
                     {output_path: {
                         LOCAL_PATH_KEY: file_path
                     }})
             return file_list
     finally:
         os.remove(input_manifest)