def get_cadc_headers(uri):
    """
    Fetch the FITS headers of a CADC file and return them as text.

    The function takes advantage of the fhead feature of the CADC
    storage service and retrieves just the headers and no data,
    minimizing the transfer time.

    The file must be public, because the header retrieval is done as an
    anonymous user.

    :param uri: CADC URI. Its path component must have exactly the form
        '<archive>/<file id>'.
    :return: a string of keyword/value pairs.
    :raises ValueError: if the path of the URI does not split into exactly
        two '/'-separated parts.
    """
    file_url = parse.urlparse(uri)
    # a Subject created with no arguments carries no credentials
    subject = net.Subject()
    client = CadcDataClient(subject)
    # do a fhead on the file
    archive, file_id = file_url.path.split('/')
    # the context manager guarantees the in-memory buffer is released even
    # when the retrieval or the ASCII decoding fails
    with BytesIO() as b:
        b.name = uri
        client.get_file(archive, file_id, b, fhead=True)
        return b.getvalue().decode('ascii')
def get_file(archive, file_name, cutout=None, destination=None):
    """
    Retrieve a file through a CadcDataClient built on a default Subject.

    :param archive: archive name, passed through to
        CadcDataClient.get_file
    :param file_name: name of the file, passed through to
        CadcDataClient.get_file
    :param cutout: optional cutout specification, passed through unchanged
    :param destination: optional destination, passed through unchanged
    :return: whatever CadcDataClient.get_file returns
    """
    # net.Subject() with no arguments: no credentials are supplied
    client = CadcDataClient(net.Subject())
    return client.get_file(
        archive,
        file_name,
        cutout=cutout,
        destination=destination,
    )
def test_get_file(trans_reader_mock, basews_mock):
    """
    Exercise CadcDataClient.get_file against mocked web-service responses.

    Scenarios, in order: a plain get to a file name, a get into an already
    open file object, a get with decompress and md5 check, a get with a bad
    md5, a get that feeds chunks to a process_bytes callback, a get with a
    mismatching md5 through the callback path, and finally gets that check
    the wcs / fhead / cutout parameters sent in the transfer POST.

    :param trans_reader_mock: mock whose return_value.read() supplies the
        negotiated Transfer (presumably injected by a patch decorator that
        is outside this view - confirm)
    :param basews_mock: mock whose return_value stands in for the web
        service object; its get/post return the canned responses
        (presumably injected by a patch decorator - confirm)
    """
    # test a simple get - no decompress
    file_name = '/tmp/afile.txt'
    file_chunks = ['aaaa'.encode(), 'bbbb'.encode(), ''.encode()]
    response = Mock()
    # md5 of the complete content, reused below as the content-MD5 header
    hash_md5 = hashlib.md5()
    for i in file_chunks:
        hash_md5.update(i)
    response.headers.get.return_value = \
        'filename={}'.format('orig_file_name')
    response.raw.read.side_effect = file_chunks  # returns multiple blocks
    basews_mock.return_value.get.return_value = response
    client = CadcDataClient(auth.Subject())
    with pytest.raises(exceptions.HttpException):
        # no URLs returned in the transfer negotiations
        client.get_file('TEST', 'afile', destination=file_name)
    t = transfer.Transfer('ad:TEST/afile', 'pullFromVoSpace')
    # NOTE(review): this binds the Protocol class itself, not an instance,
    # so endpoint is set as a class attribute - confirm this is intended
    p = transfer.Protocol
    p.endpoint = Mock()
    t.protocols = [p]
    trans_reader_mock.return_value.read.return_value = t

    client.get_file('TEST', 'afile', destination=file_name, md5_check=False)
    expected_content = \
        (''.join([c.decode() for c in file_chunks])).encode()
    with open(file_name, 'rb') as f:
        assert expected_content == f.read()
    os.remove(file_name)

    # do it again with the file now open; this time a matching content-MD5
    # header is provided
    response = Mock()
    response.headers = {
        'filename': 'orig_file_name',
        'content-MD5': hash_md5.hexdigest()
    }
    # the side_effect list is re-armed for the new response
    response.raw.read.side_effect = file_chunks
    basews_mock.return_value.get.return_value = response
    with open(file_name, 'wb') as f:
        client.get_file('TEST', 'afile', destination=f)
    with open(file_name, 'rb') as f:
        assert expected_content == f.read()
    os.remove(file_name)

    # test a get with decompress and md5 check enabled
    file_name = 'bfile.txt'
    file_content = 'aaaabbbb'
    hash_md5 = hashlib.md5()
    hash_md5.update(file_content.encode())
    file_chunks = [file_content.encode(), ''.encode()]
    decoded_file_content = 'MNOPRST6789'
    decoded_file_chunks = [decoded_file_content.encode(), ''.encode()]
    response = Mock()
    # the md5 header is computed from the raw (not the decoded) content
    response.headers = \
        {'content-MD5': '{}'.format(hash_md5.hexdigest()),
         'filename': file_name}
    response.raw.read.side_effect = file_chunks
    response.raw._decode.side_effect = decoded_file_chunks
    basews_mock.return_value.get.return_value = response
    client = CadcDataClient(auth.Subject())
    # no destination is given; the file is then read back from file_name
    client.get_file('TEST', file_name=file_name, decompress=True,
                    md5_check=True)
    with open(file_name, 'r') as f:
        # note the check against the decoded content
        assert decoded_file_content == f.read()
    os.remove(file_name)

    # repeat test with a bad md5
    file_name = 'bfile.txt'
    file_content = 'ABCDEFGH12345'
    file_chunks = [file_content.encode(), ''.encode()]
    decoded_file_content = 'MNOPRST6789'
    decoded_file_chunks = [decoded_file_content.encode(), ''.encode()]
    response = Mock()
    response.headers = {'content-MD5': 'abc', 'filename': file_name}
    response.raw.read.side_effect = file_chunks
    response.raw._decode.side_effect = decoded_file_chunks
    basews_mock.return_value.get.return_value = response
    client = CadcDataClient(auth.Subject())
    with pytest.raises(exceptions.HttpException):
        client.get_file('TEST', file_name=file_name, decompress=True,
                        md5_check=True)

    # test process_bytes and send the content to /dev/null after.
    # Use no decompress
    def concatenate_chunks(chunk):
        # accumulates every decoded chunk in the module-level mycontent,
        # which must already exist when the first chunk arrives
        global mycontent
        mycontent = '{}{}'.format(mycontent, chunk.decode())

    file_name = 'bfile.txt'
    file_content = 'ABCDEFGH12345'
    # split the content into pieces of at most 5 characters
    file_chunks = [
        file_content[i:i + 5].encode()
        for i in xrange(0, len(file_content), 5)
    ]
    file_chunks.append('')  # last chunk is empty
    response = Mock()
    response.headers = {'filename': '{}.gz'.format(file_name)}
    response.raw.read.side_effect = file_chunks
    basews_mock.return_value.get.return_value = response
    client = CadcDataClient(auth.Subject())
    client.logger.setLevel(logging.INFO)
    # md5_check does not take place because no content-MD5 received
    # from server
    client.get_file('TEST', 'afile', destination='/dev/null',
                    process_bytes=concatenate_chunks)
    assert file_content == mycontent

    # failed md5 checksum
    response = Mock()
    response.headers = {
        'filename': '{}.gz'.format(file_name),
        'content-MD5': '33'
    }
    response.raw.read.side_effect = file_chunks
    basews_mock.return_value.get.return_value = response
    client = CadcDataClient(auth.Subject())
    client.logger.setLevel(logging.INFO)
    # this time a content-MD5 header is received; the test expects the
    # get to fail with an HttpException
    with pytest.raises(exceptions.HttpException):
        client.get_file('TEST', 'afile', destination='/dev/null',
                        process_bytes=concatenate_chunks)

    # test get fhead
    response = Mock()
    response.headers.get.return_value = 'filename={}.gz'.format(file_name)
    response.raw.read.side_effect = file_chunks
    response.history = []
    response.status_code = 200
    response.url = 'someurl'
    # the POST is captured so its arguments can be asserted below
    post_mock = Mock(return_value=response)
    basews_mock.return_value.post = post_mock
    file_name = 'getfile'
    archive = 'TEST'
    p.endpoint = 'http://someurl/transfer/{}/{}'.format(archive, file_name)
    client.get_file('TEST', 'getfile', decompress=True, wcs=True,
                    md5_check=False)
    # transfer document expected as the POST body in every call below
    trans_doc = \
        ('<vos:transfer xmlns:'
         'vos="http://www.ivoa.net/xml/VOSpace/v2.0">\n '
         '<vos:target>ad:TEST/getfile</vos:target>\n '
         '<vos:direction>pullFromVoSpace</vos:direction>\n '
         '<vos:protocol uri="ivo://ivoa.net/vospace/core#httpget"/>\n'
         ' <vos:protocol uri="ivo://ivoa.net/vospace/core#httpsget"/>\n'
         '</vos:transfer>\n').encode()
    # wcs=True is expected to be passed on as a request parameter
    post_mock.assert_called_with(resource=(TRANSFER_RESOURCE_ID, None),
                                 params={'wcs': True}, data=trans_doc,
                                 headers={'Content-Type': 'text/xml'})

    # fhead=True is expected to be passed on as a request parameter
    response.raw.read.side_effect = file_chunks
    post_mock.reset_mock()
    client.get_file('TEST', 'getfile', decompress=True, fhead=True,
                    md5_check=False)
    post_mock.assert_called_with(resource=(TRANSFER_RESOURCE_ID, None),
                                 params={'fhead': True}, data=trans_doc,
                                 headers={'Content-Type': 'text/xml'})

    # a simple cutout is expected to be passed on unchanged
    response.raw.read.side_effect = file_chunks
    post_mock.reset_mock()
    client.get_file('TEST', 'getfile', decompress=True, cutout='[1:1]',
                    md5_check=False)
    post_mock.assert_called_with(resource=(TRANSFER_RESOURCE_ID, None),
                                 params={'cutout': '[1:1]'}, data=trans_doc,
                                 headers={'Content-Type': 'text/xml'})

    # a nested cutout is expected to be passed on unchanged as well
    response.raw.read.side_effect = file_chunks
    post_mock.reset_mock()
    client.get_file('TEST', 'getfile', decompress=True, cutout='[[1:1], 2]',
                    md5_check=False)
    post_mock.assert_called_with(resource=(TRANSFER_RESOURCE_ID, None),
                                 params={'cutout': '[[1:1], 2]'},
                                 data=trans_doc,
                                 headers={'Content-Type': 'text/xml'})
from cadcdata import CadcDataClient
from cadcutils import net

# Input list; presumably one CADC URL per line - verify against the file.
fname = "cadcUrlList.txt"
with open(fname) as f:
    txt = [line.strip() for line in f]
print(len(txt))

# Characters 73..80 of every stripped line are used as the product id.
txt = [line[73:81] for line in txt]

for pid in txt:
    # when the slice contains a '.', its last character is dropped
    if "." in pid:
        pid = pid[:-1]
    try:
        # a new client with a default Subject is built for every file
        client = CadcDataClient(net.Subject())
        client.get_file('CFHT', pid + '.fits.fz')
        print(pid)
    except Exception as e:
        # best effort: report the problem and carry on with the next id
        print(e)
class StorageClientWrapper:
    """
    Wrap the choice between CadcDataClient and StorageInventoryClient.

    Every public method logs its begin and end at debug level. When a
    metrics object is supplied, each operation is also recorded as either
    an observation (success) or a failure, under the client name 'si'
    (StorageInventoryClient) or 'data' (CadcDataClient).
    """

    def __init__(
        self,
        subject,
        using_storage_inventory=True,
        resource_id='ivo://cadc.nrc.ca/uvic/minoc',
        metrics=None,
    ):
        """
        :param subject: net.Subject instance for authentication and
            authorization
        :param using_storage_inventory: if True will use
            StorageInventoryClient for file operations at CADC. If False will
            use CadcDataClient.
        :param resource_id: str identifies the StorageInventoryClient
            endpoint. If using_storage_inventory is set to False, it's
            unnecessary.
        :param metrics: caom2pipe.manage_composable.Metrics instance. If set,
            will track execution times, by action, from the beginning of
            the method invocation to the end of the method invocation,
            success or failure. Defaults to None, because fits2caom2 is
            a stand-alone application.
        """
        if using_storage_inventory:
            self._cadc_client = StorageInventoryClient(
                subject=subject, resource_id=resource_id
            )
        else:
            self._cadc_client = CadcDataClient(subject=subject)
        self._use_si = using_storage_inventory
        self._metrics = metrics
        self._logger = logging.getLogger(self.__class__.__name__)

    def _add_fail_metric(self, action, name):
        """Single location for the check for a self._metrics member in the
        failure case."""
        if self._metrics is not None:
            client_name = 'si' if self._use_si else 'data'
            self._metrics.observe_failure(action, client_name, name)

    def _add_metric(self, action, name, start, value):
        """Single location for the check for a self._metrics member in the
        success case."""
        if self._metrics is not None:
            client_name = 'si' if self._use_si else 'data'
            self._metrics.observe(
                start,
                StorageClientWrapper._current(),
                value,
                action,
                client_name,
                name,
            )

    def get(self, working_directory, uri):
        """
        Retrieve data.

        :param working_directory: str where the file will be retrieved to.
            Assumes the same machine as this function is being called from.
        :param uri: str this is an Artifact URI, representing the file to
            be retrieved.
        :raises exceptions.UnexpectedException: if the retrieval fails for
            any reason; the original exception is attached as the cause.
        """
        self._logger.debug(f'Begin get for {uri} in {working_directory}')
        start = StorageClientWrapper._current()
        try:
            archive, f_name = self._decompose(uri)
            fqn = path.join(working_directory, f_name)
            if self._use_si:
                self._cadc_client.cadcget(uri, dest=fqn)
            else:
                self._cadc_client.get_file(archive, f_name, destination=fqn)
        except Exception as e:
            self._add_fail_metric('get', uri)
            self._logger.debug(traceback.format_exc())
            raise exceptions.UnexpectedException(
                f'Did not retrieve {uri} because {e}'
            ) from e
        # the metric value is the size on disk of the retrieved file
        self._add_metric('get', uri, start, stat(fqn).st_size)
        self._logger.debug('End get')

    def get_head(self, uri):
        """
        Retrieve FITS file header data.

        :param uri: str that is an Artifact URI, representing the file for
            which to retrieve headers
        :return: list of fits.Header instances
        :raises exceptions.UnexpectedException: if the retrieval or the
            header parsing fails; the original exception is attached as the
            cause.
        """
        self._logger.debug(f'Begin get_head for {uri}')
        start = StorageClientWrapper._current()
        try:
            # the context manager releases the buffer on failure as well
            with BytesIO() as b:
                b.name = uri
                if self._use_si:
                    self._cadc_client.cadcget(uri, b, fhead=True)
                else:
                    archive, f_name = StorageClientWrapper._decompose(uri)
                    self._cadc_client.get_file(archive, f_name, b, fhead=True)
                fits_header = b.getvalue().decode('ascii')
            # the metric value is the length of the header text
            self._add_metric('get_head', uri, start, len(fits_header))
            temp = make_headers_from_string(fits_header)
            self._logger.debug('End get_head')
            return temp
        except Exception as e:
            # same action label as the success metric, so that successes
            # and failures of this method can be correlated
            self._add_fail_metric('get_head', uri)
            self._logger.debug(traceback.format_exc())
            self._logger.error(e)
            raise exceptions.UnexpectedException(
                f'Did not retrieve {uri} header because {e}'
            ) from e

    def info(self, uri):
        """
        Retrieve the descriptive metadata associated with a file.

        :param uri: str that is an Artifact URI, representing the file for
            which to retrieve metadata
        :return: cadcdata.FileInfo instance, no scheme for md5sum, or None
            when the client reports the file as not found
        """
        self._logger.debug(f'Begin info for {uri}')
        try:
            if self._use_si:
                result = self._cadc_client.cadcinfo(uri)
                # make the result look like the other possible ways to
                # obtain metadata
                result.md5sum = result.md5sum.replace('md5:', '')
            else:
                archive, f_name = StorageClientWrapper._decompose(uri)
                temp = self._cadc_client.get_file_info(archive, f_name)
                result = FileInfo(
                    id=uri,
                    size=temp.get('size'),
                    file_type=temp.get('type'),
                    md5sum=temp.get('md5sum').replace('md5:', '')
                )
        except exceptions.NotFoundException:
            self._logger.info(f'cadcinfo:: {uri} not found')
            result = None
        self._logger.debug('End info')
        return result

    def put(self, working_directory, uri, stream='default'):
        """
        Store a file at CADC.

        The current working directory is changed to working_directory for
        the duration of the call and restored afterwards.

        :param working_directory: str fully-qualified name of where to find
            the file on the local machine
        :param uri: str that is an Artifact URI, representing the file to
            be stored at CADC.
        :param stream: str representing the namespace used by the
            CadcDataClient. Not required if using the StorageInventoryClient.
            'default' is default name for a lately-created ad archive.
        :raises exceptions.UnexpectedException: if storing fails for any
            reason; the original exception is attached as the cause.
        """
        self._logger.debug(f'Begin put for {uri} in {working_directory}')
        start = self._current()
        cwd = getcwd()
        archive, f_name = StorageClientWrapper._decompose(uri)
        fqn = path.join(working_directory, f_name)
        chdir(working_directory)
        try:
            local_meta = get_local_file_info(fqn)
            encoding = get_file_encoding(fqn)
            if self._use_si:
                # replace only when the file already exists at CADC
                replace = True
                cadc_meta = self.info(uri)
                if cadc_meta is None:
                    replace = False
                self._logger.debug(
                    f'uri {uri} src {fqn} replace {replace} file_type '
                    f'{local_meta.file_type} encoding {encoding} md5_checksum '
                    f'{local_meta.md5sum}'
                )
                self._cadc_client.cadcput(
                    uri,
                    src=fqn,
                    replace=replace,
                    file_type=local_meta.file_type,
                    file_encoding=encoding,
                    md5_checksum=local_meta.md5sum,
                )
            else:
                # archive and f_name were already derived from the uri above
                # libmagic does a worse job with guessing file types
                # than ad for .fits.gz => it will say 'binary'
                self._logger.debug(
                    f'archive {archive} f_name {f_name} archive_stream '
                    f'{stream} mime_type {local_meta.file_type} '
                    f'mime_encoding {encoding} md5_check True '
                )
                self._cadc_client.put_file(
                    archive,
                    f_name,
                    archive_stream=stream,
                    mime_type=local_meta.file_type,
                    mime_encoding=encoding,
                    md5_check=True,
                )
            self._logger.info(f'Stored {fqn} at CADC.')
        except Exception as e:
            self._add_fail_metric('put', uri)
            self._logger.debug(traceback.format_exc())
            self._logger.error(e)
            raise exceptions.UnexpectedException(
                f'Failed to store data with {e}'
            ) from e
        finally:
            # always return to the directory the caller was in
            chdir(cwd)
        # only reached on success, so local_meta is always bound here
        self._add_metric('put', uri, start, local_meta.size)
        self._logger.debug('End put')

    def remove(self, uri):
        """
        Delete a file from CADC storage.

        :param uri: str that is an Artifact URI, representing the file to
            be removed from CADC.
        :raises NotImplementedError: when the wrapper uses CadcDataClient.
        :raises exceptions.UnexpectedException: if the removal fails; the
            original exception is attached as the cause.
        """
        self._logger.debug(f'Begin remove for {uri}')
        start = StorageClientWrapper._current()
        if self._use_si:
            try:
                self._cadc_client.cadcremove(uri)
            except Exception as e:
                self._add_fail_metric('remove', uri)
                self._logger.debug(traceback.format_exc())
                self._logger.error(e)
                raise exceptions.UnexpectedException(
                    f'Did not remove {uri} because {e}'
                ) from e
        else:
            raise NotImplementedError(
                'No remove functionality for CadcDataClient'
            )
        self._add_metric('remove', uri, start, value=None)
        self._logger.debug('End remove')

    @staticmethod
    def _current():
        """Encapsulate returning UTC now in microsecond resolution."""
        return datetime.now(tz=timezone.utc).timestamp()

    @staticmethod
    def _decompose(uri):
        """Split the path of a URI into (directory part, file name), e.g.
        'ad:TEST/afile.fits' yields ('TEST', 'afile.fits')."""
        temp = urlparse(uri)
        return path.dirname(temp.path), path.basename(temp.path)