def downloadFiles(self, input_manifest):
    """Download every file listed in a JSON-lines download manifest.

    Each line of ``input_manifest`` is parsed as a JSON object that must
    carry a ``url`` attribute and may carry a ``filename`` attribute. The
    entry's attributes, layered over a copy of ``self.envars``, are used as
    substitution fields for ``self.sub_path`` to produce the output
    sub-directory of that entry.

    The manifest file is always removed when this method exits, whether it
    returns normally or raises.

    :param input_manifest: path of the manifest file to read (and delete).
    :return: list of output paths (``subdir/filename``, or just
        ``filename`` when the sub-directory is empty) in manifest order.
    :raises RuntimeError: if an entry has no ``url``, or if the size of a
        downloaded file differs from the ``Content-Length`` response header.
    """
    logging.info("Retrieving file(s)...")
    try:
        with open(input_manifest, "r") as in_file:
            file_list = []
            for line in in_file:
                entry = json.loads(line)
                url = entry.get('url')
                if not url:
                    raise RuntimeError(
                        "Missing required attribute \"url\" in download manifest entry %s" % json.dumps(entry))

                store = self.getHatracStore(url)
                filename = entry.get('filename')
                envvars = self.envars.copy()
                envvars.update(entry)
                subdir = self.sub_path.format(**envvars)

                if not filename:
                    if store:
                        head = store.head(url, headers=self.HEADERS)
                        content_disposition = head.headers.get("Content-Disposition") if head.ok else None
                        if content_disposition:
                            filename = parse_content_disposition(content_disposition)
                        else:
                            # Fall back to the last URL path segment. ``filename`` is
                            # empty on this path, so the name must come from ``url``.
                            # Everything after the first ":" is dropped (presumably an
                            # object version suffix -- confirm against the store's URLs).
                            filename = os.path.basename(url).split(":")[0]
                    else:
                        filename = os.path.basename(url)

                file_path = os.path.abspath(os.path.join(
                    self.base_path, 'data' if self.is_bag else '', subdir, filename))
                output_dir = os.path.dirname(file_path)
                self.makeDirs(output_dir)

                if store:
                    # The store is addressed with the manifest URL; the external
                    # form of the URL is only needed afterwards for the metadata.
                    resp = store.get_obj(url, self.HEADERS, file_path)
                    url = self.getExternalUrl(url)
                else:
                    url = self.getExternalUrl(url)
                    file_path, resp = self.getExternalFile(url, file_path, self.HEADERS)
                content_type = resp.headers.get("Content-Type")

                # Verify the transfer only when the server declared a size; a
                # response without Content-Length would otherwise fail on int(None).
                declared_length = resp.headers.get('Content-Length')
                if declared_length is None:
                    logging.warning(
                        "No Content-Length returned for %s; skipping size verification of file %s" %
                        (url, file_path))
                else:
                    length = int(declared_length)
                    file_bytes = os.path.getsize(file_path)
                    if length != file_bytes:
                        # Actual size on disk first, expected (declared) size second.
                        raise RuntimeError(
                            "File size of %s does not match expected size of %s for file %s" %
                            (file_bytes, length, file_path))

                output_path = ''.join([subdir, "/", filename]) if subdir else filename
                if self.ro_manifest:
                    ro.add_file_metadata(self.ro_manifest,
                                         source_url=url,
                                         local_path=output_path,
                                         media_type=content_type,
                                         retrieved_on=ro.make_retrieved_on(),
                                         retrieved_by=ro.make_retrieved_by(
                                             self.ro_author_name, orcid=self.ro_author_orcid),
                                         bundled_as=ro.make_bundled_as())
                file_list.append(output_path)
            return file_list
    finally:
        os.remove(input_manifest)
def process(self):
    """Run the catalog query and register its output.

    Issues ``self.catalogQuery`` with an ``accept`` header set to
    ``self.content_type``. When both ``self.ro_manifest`` and
    ``self.ro_file_provenance`` are truthy, provenance metadata for the
    output is added to the manifest. Finally the output is recorded in
    ``self.outputs`` under ``self.output_relpath``.

    :return: ``self.outputs`` after the entry for this output was added.
    """
    # Work on a copy so the ``accept`` header does not leak into
    # ``self.HEADERS``. NOTE(review): HEADERS looks like state shared with
    # other requests (and possibly other instances) -- confirm.
    headers = self.HEADERS.copy()
    headers.update({'accept': self.content_type})
    # NOTE(review): the response object was never used by this method; the
    # call is presumably made for its side effect of producing the output
    # at ``self.output_abspath`` -- confirm against catalogQuery.
    self.catalogQuery(headers)

    if self.ro_manifest and self.ro_file_provenance:
        ro.add_file_metadata(self.ro_manifest,
                             source_url=self.url,
                             local_path=self.output_relpath,
                             media_type=self.content_type,
                             retrieved_on=ro.make_retrieved_on(),
                             retrieved_by=ro.make_retrieved_by(
                                 self.ro_author_name, orcid=self.ro_author_orcid),
                             bundled_as=ro.make_bundled_as())

    self.outputs.update({
        self.output_relpath: {
            LOCAL_PATH_KEY: self.output_abspath,
            SOURCE_URL_KEY: self.url,
        }
    })
    return self.outputs
def process(self):
    """Record provenance for the output file and register it.

    When both ``self.ro_manifest`` and ``self.ro_file_provenance`` are
    truthy, file metadata (with a media type guessed from the output file)
    is added to the manifest. If ``self.delete_input`` is truthy the input
    is removed via ``self._delete_input()``. The output is then recorded in
    ``self.outputs`` under ``self.output_relpath``.

    :return: ``self.outputs`` after the entry for this output was added.
    """
    wants_provenance = self.ro_manifest and self.ro_file_provenance
    if wants_provenance:
        media_type = guess_content_type(self.output_abspath)
        retrieved_on = ro.make_retrieved_on()
        retrieved_by = ro.make_retrieved_by(self.ro_author_name, orcid=self.ro_author_orcid)
        bundled_as = ro.make_bundled_as()
        ro.add_file_metadata(self.ro_manifest,
                             source_url=self.url,
                             local_path=self.output_relpath,
                             media_type=media_type,
                             retrieved_on=retrieved_on,
                             retrieved_by=retrieved_by,
                             bundled_as=bundled_as)

    if self.delete_input:
        self._delete_input()

    output_record = {LOCAL_PATH_KEY: self.output_abspath, SOURCE_URL_KEY: self.url}
    self.outputs.update({self.output_relpath: output_record})
    return self.outputs
def downloadFiles(self, input_manifest):
    """Download every file listed in a JSON-lines download manifest.

    Each line of ``input_manifest`` is parsed as a JSON object. Entries
    without a ``url`` attribute are skipped with a warning. An optional
    ``filename`` attribute names the local file; otherwise the name is
    taken from the Content-Disposition header of a HEAD request (when a
    store is available for the URL) or from the URL itself. The entry's
    attributes, layered over a copy of ``self.envars``, are used as
    substitution fields for ``self.sub_path`` to produce the output
    sub-directory of that entry.

    The manifest file is always removed when this method exits, whether it
    returns normally or raises.

    :param input_manifest: path of the manifest file to read (and delete).
    :return: dict mapping each output path (``subdir/filename``, or just
        ``filename`` when the sub-directory is empty) to a dict holding the
        absolute local file path under ``LOCAL_PATH_KEY``.
    :raises DerivaDownloadError: if the HEAD request or the transfer from
        the store fails with ``requests.HTTPError``, or if the size of a
        downloaded file differs from the ``Content-Length`` response header.
    """
    logging.info(
        "Attempting to download file(s) based on the results of query: %s" % self.query)
    try:
        with open(input_manifest, "r") as in_file:
            file_list = dict()
            for line in in_file:
                entry = json.loads(line)
                url = entry.get('url')
                if not url:
                    logging.warning(
                        "Skipping download due to missing required attribute \"url\" in download manifest entry %s" %
                        json.dumps(entry))
                    continue

                store = self.getHatracStore(url)
                filename = entry.get('filename')
                envvars = self.envars.copy()
                envvars.update(entry)
                subdir = self.sub_path.format(**envvars)

                if not filename:
                    if store:
                        try:
                            head = store.head(url, headers=self.HEADERS)
                        except requests.HTTPError as e:
                            raise DerivaDownloadError(
                                "HEAD request for [%s] failed: %s" % (url, e))
                        content_disposition = head.headers.get(
                            "Content-Disposition") if head.ok else None
                        if content_disposition:
                            filename = parse_content_disposition(content_disposition)
                        else:
                            # Fall back to the last URL path segment. ``filename`` is
                            # empty on this path, so the name must come from ``url``.
                            # Everything after the first ":" is dropped (presumably an
                            # object version suffix -- confirm against the store's URLs).
                            filename = os.path.basename(url).split(":")[0]
                    else:
                        filename = os.path.basename(url)

                file_path = os.path.abspath(
                    os.path.join(self.base_path, 'data' if self.is_bag else '', subdir, filename))
                output_dir = os.path.dirname(file_path)
                make_dirs(output_dir)

                if store:
                    # The store is addressed with the manifest URL; the external
                    # form of the URL is only needed afterwards for the metadata.
                    try:
                        resp = store.get_obj(url, self.HEADERS, file_path)
                    except requests.HTTPError as e:
                        raise DerivaDownloadError(
                            "File [%s] transfer failed: %s" % (file_path, e))
                    url = self.getExternalUrl(url)
                else:
                    url = self.getExternalUrl(url)
                    file_path, resp = self.getExternalFile(
                        url, file_path, self.HEADERS)
                content_type = resp.headers.get("Content-Type")

                # Verify the transfer only when the server declared a size; a
                # response without Content-Length would otherwise fail on int(None).
                declared_length = resp.headers.get('Content-Length')
                if declared_length is None:
                    logging.warning(
                        "No Content-Length returned for %s; skipping size verification of file %s" %
                        (url, file_path))
                else:
                    length = int(declared_length)
                    file_bytes = os.path.getsize(file_path)
                    if length != file_bytes:
                        # Actual size on disk first, expected (declared) size second.
                        raise DerivaDownloadError(
                            "File size of %s does not match expected size of %s for file %s" %
                            (file_bytes, length, file_path))

                output_path = ''.join([subdir, "/", filename]) if subdir else filename
                if self.ro_manifest:
                    ro.add_file_metadata(
                        self.ro_manifest,
                        source_url=url,
                        local_path=output_path,
                        media_type=content_type,
                        retrieved_on=ro.make_retrieved_on(),
                        retrieved_by=ro.make_retrieved_by(
                            self.ro_author_name, orcid=self.ro_author_orcid),
                        bundled_as=ro.make_bundled_as())
                file_list.update({output_path: {LOCAL_PATH_KEY: file_path}})
            return file_list
    finally:
        os.remove(input_manifest)