def query_async(self, payload, cache=True, public=True, science=True, view_format='raw'): """ Perform a generic query with user-specified payload Parameters ---------- payload : dict A dictionary of payload keywords that are accepted by the ALMA archive system. You can look these up by examining the forms at http://almascience.org/aq or using the `help` method cache : bool Cache the query? public : bool Return only publicly available datasets? science : bool Return only data marked as "science" in the archive? """ url = urljoin(self._get_dataarchive_url(), 'aq/') payload.update({'result_view': view_format, 'format': 'VOTABLE', 'download': 'true'}) if public: payload['public_data'] = 'public' if science: payload['science_observations'] = '=%TARGET%' self.validate_query(payload) response = self._request('GET', url, params=payload, timeout=self.TIMEOUT, cache=cache) response.raise_for_status() return response
def _login(self, username, store_password=False): # Check if already logged in loginpage = self._request("GET", "https://asa.alma.cl/cas/login", cache=False) root = BeautifulSoup(loginpage.content, 'html5lib') if root.find('div', class_='success'): log.info("Already logged in.") return True # Get password from keyring or prompt password_from_keyring = keyring.get_password("astroquery:asa.alma.cl", username) if password_from_keyring is None: if system_tools.in_ipynb(): log.warn("You may be using an ipython notebook:" " the password form will appear in your terminal.") password = getpass.getpass("{0}, enter your ALMA password:"******"\n".format(username)) else: password = password_from_keyring # Authenticate log.info("Authenticating {0} on asa.alma.cl ...".format(username)) # Do not cache pieces of the login process data = {kw: root.find('input', {'name': kw})['value'] for kw in ('lt', '_eventId', 'execution')} data['username'] = username data['password'] = password login_response = self._request("POST", "https://asa.alma.cl/cas/login", params={'service': urljoin(self.archive_url, 'rh/login')}, data=data, cache=False) authenticated = ('You have successfully logged in' in login_response.content) if authenticated: log.info("Authentication successful!") self._username = username else: log.exception("Authentication failed!") # When authenticated, save password in keyring if needed if authenticated and password_from_keyring is None and store_password: keyring.set_password("astroquery:asa.alma.cl", username, password) return authenticated
def get_cycle0_uid_contents(self, uid): """ List the file contents of a UID from Cycle 0. Will raise an error if the UID is from cycle 1+, since those data have been released in a different and more consistent format. See http://almascience.org/documents-and-tools/cycle-2/ALMAQA2Productsv1.01.pdf for details. """ # First, check if UID is in the Cycle 0 listing if uid in self.cycle0_table['uid']: cycle0id = self.cycle0_table[self.cycle0_table['uid'] == uid][0]['ID'] contents = [row['Files'] for row in self._cycle0_tarfile_content if cycle0id in row['ID']] return contents else: info_url = urljoin(self._get_dataarchive_url(), 'documents-and-tools/cycle-2/ALMAQA2Productsv1.01.pdf') raise ValueError("Not a Cycle 0 UID. See {0} for details about" " cycle 1+ data release formats.".format(info_url))
def _cycle0_tarfile_content(self): """ In principle, this is a static file, but we'll retrieve it just in case """ if not hasattr(self, '_cycle0_tarfile_content_table'): url = urljoin(self._get_dataarchive_url(), 'alma-data/archive/cycle-0-tarfile-content') response = self._request('GET', url, cache=True) # html.parser is needed because some <tr>'s have form: # <tr width="blah"> which the default parser does not pick up root = BeautifulSoup(response.content, 'html.parser') html_table = root.find('table',class_='grid listing') data = list(zip(*[(x.findAll('td')[0].text, x.findAll('td')[1].text) for x in html_table.findAll('tr')])) columns = [Column(data=data[0], name='ID'), Column(data=data[1], name='Files')] tbl = Table(columns) assert len(tbl) == response.text.count('<tr') == 8497 self._cycle0_tarfile_content_table = tbl else: tbl = self._cycle0_tarfile_content_table return tbl
def _cycle0_tarfile_content(self): """ In principle, this is a static file, but we'll retrieve it just in case """ if not hasattr(self, '_cycle0_tarfile_content_table'): url = urljoin(self._get_dataarchive_url(), 'alma-data/archive/cycle-0-tarfile-content') response = self._request('GET', url, cache=True) # html.parser is needed because some <tr>'s have form: # <tr width="blah"> which the default parser does not pick up root = BeautifulSoup(response.content, 'html.parser') html_table = root.find('table', class_='grid listing') data = list(zip(*[(x.findAll('td')[0].text, x.findAll('td')[1].text) for x in html_table.findAll('tr')])) columns = [Column(data=data[0], name='ID'), Column(data=data[1], name='Files')] tbl = Table(columns) assert len(tbl) == 8497 self._cycle0_tarfile_content_table = tbl else: tbl = self._cycle0_tarfile_content_table return tbl
def get_cycle0_uid_contents(self, uid): """ List the file contents of a UID from Cycle 0. Will raise an error if the UID is from cycle 1+, since those data have been released in a different and more consistent format. See http://almascience.org/documents-and-tools/cycle-2/ALMAQA2Productsv1.01.pdf for details. """ # First, check if UID is in the Cycle 0 listing if uid in self.cycle0_table['uid']: cycle0id = self.cycle0_table[self.cycle0_table['uid'] == uid][0]['ID'] contents = [ row['Files'] for row in self._cycle0_tarfile_content if cycle0id in row['ID'] ] return contents else: info_url = urljoin( self._get_dataarchive_url(), 'documents-and-tools/cycle-2/ALMAQA2Productsv1.01.pdf') raise ValueError("Not a Cycle 0 UID. See {0} for details about " "cycle 1+ data release formats.".format(info_url))
def stage_data(self, uids): """ Stage ALMA data Parameters ---------- uids : list or str A list of valid UIDs or a single UID. UIDs should have the form: 'uid://A002/X391d0b/X7b' Returns ------- data_file_table : Table A table containing 3 columns: the UID, the file URL (for future downloading), and the file size """ """ With log.set_level(10) INFO: Staging files... [astroquery.alma.core] DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core] DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core] DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core] DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] .DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core] """ if isinstance(uids, six.string_types): uids = [uids] if not isinstance(uids, (list, tuple, np.ndarray)): raise TypeError("Datasets must be given as a list of strings.") log.info("Staging files...") self._get_dataarchive_url() url = urljoin(self.dataarchive_url, 'rh/submission') log.debug("First request URL: {0}".format(url)) # 'ALMA+uid___A002_X391d0b_X7b' payload = {'dataset': ['ALMA+' + clean_uid(uid) for uid in uids]} log.debug("First request payload: {0}".format(payload)) self._staging_log = {'first_post_url': url} # Request staging for the UIDs # This component cannot be cached, since the returned data can change # if new data are uploaded response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response log.debug("First response URL: {0}".format(response.url)) response.raise_for_status() if 'j_spring_cas_security_check' in response.url: time.sleep(1) # CANNOT cache this stage: it not a real data page! results in # infinite loops response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response if 'j_spring_cas_security_check' in response.url: log.warn("Staging request was not successful. Try again?") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: raise RemoteServiceError("Could not access data. This error " "can arise if the data are private and " "you do not have access rights or are " "not logged in.") request_id = response.url.split("/")[-2] assert len(request_id) == 36 self._staging_log['request_id'] = request_id log.debug("Request ID: {0}".format(request_id)) # Submit a request for the specific request ID identified above submission_url = urljoin(self.dataarchive_url, os.path.join('rh/submission', request_id)) log.debug("Submission URL: {0}".format(submission_url)) self._staging_log['submission_url'] = submission_url staging_submission = self._request('GET', submission_url, cache=True) self._staging_log['staging_submission'] = staging_submission staging_submission.raise_for_status() data_page_url = staging_submission.url self._staging_log['data_page_url'] = data_page_url dpid = data_page_url.split("/")[-1] assert len(dpid) == 9 self._staging_log['staging_page_id'] = dpid # CANNOT cache this step: please_wait will happen infinitely data_page = self._request('GET', data_page_url, cache=False) self._staging_log['data_page'] = data_page data_page.raise_for_status() has_completed = False while not has_completed: time.sleep(1) summary = self._request('GET', os.path.join(data_page_url, 'summary'), cache=False) summary.raise_for_status() print(".", end='') sys.stdout.flush() has_completed = summary.json()['complete'] self._staging_log['summary'] = summary summary.raise_for_status() self._staging_log['json_data'] = json_data = summary.json() username = self._username if hasattr(self, '_username') else 'anonymous' # templates: # https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/ # 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar # uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar url_decomposed = urlparse(data_page_url) base_url = ('{uri.scheme}://{uri.netloc}/' 'dataPortal/requests/{username}/' '{staging_page_id}/ALMA'.format(uri=url_decomposed, staging_page_id=dpid, username=username, )) tbl = self._json_summary_to_table(json_data, base_url=base_url) # staging_root = BeautifulSoup(data_page.content) # downloadFileURL = staging_root.find('form').attrs['action'] # data_list_url = os.path.split(downloadFileURL)[0] # # Old version, unreliable: data_list_url = staging_submission.url # log.debug("Data list URL: {0}".format(data_list_url)) # self._staging_log['data_list_url'] = data_list_url # time.sleep(1) # data_list_page = self._request('GET', data_list_url, cache=True) # self._staging_log['data_list_page'] = data_list_page # data_list_page.raise_for_status() # if 'Error' in data_list_page.text: # errormessage = staging_root.find( # 'div', id='errorContent').string.strip() # raise RemoteServiceError(errormessage) # tbl = self._parse_staging_request_page(data_list_page) return tbl
def _login(self, username=None, store_password=False, reenter_password=False): """ Login to the ALMA Science Portal. Parameters ---------- username : str, optional Username to the ALMA Science Portal. If not given, it should be specified in the config file. store_password : bool, optional Stores the password securely in your keyring. Default is False. reenter_password : bool, optional Asks for the password even if it is already stored in the keyring. This is the way to overwrite an already stored passwork on the keyring. Default is False. """ if username is None: if not self.USERNAME: raise LoginError("If you do not pass a username to login(), " "you should configure a default one!") else: username = self.USERNAME # Check if already logged in loginpage = self._request("GET", "https://asa.alma.cl/cas/login", cache=False) root = BeautifulSoup(loginpage.content, 'html5lib') if root.find('div', class_='success'): log.info("Already logged in.") return True # Get password from keyring or prompt if reenter_password is False: password_from_keyring = keyring.get_password( "astroquery:asa.alma.cl", username) else: password_from_keyring = None if password_from_keyring is None: if system_tools.in_ipynb(): log.warning("You may be using an ipython notebook:" " the password form will appear in your terminal.") password = getpass.getpass("{0}, enter your ALMA password:"******"\n".format(username)) else: password = password_from_keyring # Authenticate log.info("Authenticating {0} on asa.alma.cl ...".format(username)) # Do not cache pieces of the login process data = { kw: root.find('input', {'name': kw})['value'] for kw in ('lt', '_eventId', 'execution') } data['username'] = username data['password'] = password login_response = self._request( "POST", "https://asa.alma.cl/cas/login", params={'service': urljoin(self.archive_url, 'rh/login')}, data=data, cache=False) authenticated = ('You have successfully logged in' in login_response.text) if authenticated: log.info("Authentication successful!") self.USERNAME = username else: log.exception("Authentication failed!") # When authenticated, save password in keyring if needed if authenticated and password_from_keyring is None and store_password: keyring.set_password("astroquery:asa.alma.cl", username, password) return authenticated
def stage_data(self, uids): """ Stage ALMA data Parameters ---------- uids : list or str A list of valid UIDs or a single UID. UIDs should have the form: 'uid://A002/X391d0b/X7b' Returns ------- data_file_table : Table A table containing 3 columns: the UID, the file URL (for future downloading), and the file size """ """ With log.set_level(10) INFO: Staging files... [astroquery.alma.core] DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core] DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core] DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core] DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] .DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core] """ if isinstance(uids, six.string_types + (np.bytes_, )): uids = [uids] if not isinstance(uids, (list, tuple, np.ndarray)): raise TypeError("Datasets must be given as a list of strings.") log.info("Staging files...") self._get_dataarchive_url() url = urljoin(self.dataarchive_url, 'rh/submission') log.debug("First request URL: {0}".format(url)) # 'ALMA+uid___A002_X391d0b_X7b' payload = {'dataset': ['ALMA+' + clean_uid(uid) for uid in uids]} log.debug("First request payload: {0}".format(payload)) self._staging_log = {'first_post_url': url} # Request staging for the UIDs # This component cannot be cached, since the returned data can change # if new data are uploaded response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response log.debug("First response URL: {0}".format(response.url)) if 'login' in response.url: raise ValueError( "You must login before downloading this data set.") if response.status_code == 405: if hasattr(self, '_last_successful_staging_log'): log.warning( "Error 405 received. If you have previously staged " "the same UIDs, the result returned is probably " "correct, otherwise you may need to create a fresh " "astroquery.Alma instance.") return self._last_successful_staging_log['result'] else: raise HTTPError( "Received an error 405: this may indicate you " "have already staged the data. Try downloading " "the file URLs directly with download_files.") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: time.sleep(1) # CANNOT cache this stage: it not a real data page! results in # infinite loops response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response if 'j_spring_cas_security_check' in response.url: log.warning("Staging request was not successful. Try again?") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: raise RemoteServiceError("Could not access data. This error " "can arise if the data are private and " "you do not have access rights or are " "not logged in.") request_id = response.url.split("/")[-2] self._staging_log['request_id'] = request_id log.debug("Request ID: {0}".format(request_id)) # Submit a request for the specific request ID identified above submission_url = urljoin(self.dataarchive_url, url_helpers.join('rh/submission', request_id)) log.debug("Submission URL: {0}".format(submission_url)) self._staging_log['submission_url'] = submission_url staging_submission = self._request('GET', submission_url, cache=True) self._staging_log['staging_submission'] = staging_submission staging_submission.raise_for_status() data_page_url = staging_submission.url self._staging_log['data_page_url'] = data_page_url dpid = data_page_url.split("/")[-1] self._staging_log['staging_page_id'] = dpid # CANNOT cache this step: please_wait will happen infinitely data_page = self._request('GET', data_page_url, cache=False) self._staging_log['data_page'] = data_page data_page.raise_for_status() has_completed = False while not has_completed: time.sleep(1) summary = self._request('GET', url_helpers.join(data_page_url, 'summary'), cache=False) summary.raise_for_status() print(".", end='') sys.stdout.flush() has_completed = summary.json()['complete'] self._staging_log['summary'] = summary summary.raise_for_status() self._staging_log['json_data'] = json_data = summary.json() username = self.USERNAME if self.USERNAME else 'anonymous' # templates: # https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/ # 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar # uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar url_decomposed = urlparse(data_page_url) base_url = ('{uri.scheme}://{uri.netloc}/' 'dataPortal/requests/{username}/' '{staging_page_id}/ALMA'.format( uri=url_decomposed, staging_page_id=dpid, username=username, )) tbl = self._json_summary_to_table(json_data, base_url=base_url) self._staging_log['result'] = tbl self._staging_log['file_urls'] = tbl['URL'] self._last_successful_staging_log = self._staging_log return tbl
def query_async(self, payload, cache=True, public=True, science=True, max_retries=5, get_html_version=False, get_query_payload=False, **kwargs): """ Perform a generic query with user-specified payload Parameters ---------- payload : dict A dictionary of payload keywords that are accepted by the ALMA archive system. You can look these up by examining the forms at http://almascience.org/aq or using the `help` method cache : bool Cache the query? (note: HTML queries *cannot* be cached using the standard caching mechanism because the URLs are different each time public : bool Return only publicly available datasets? science : bool Return only data marked as "science" in the archive? """ url = urljoin(self._get_dataarchive_url(), 'aq/') payload.update(kwargs) if get_html_version: payload.update({ 'result_view': 'observation', 'format': 'URL', 'download': 'true' }) else: payload.update({ 'result_view': 'raw', 'format': 'VOTABLE', 'download': 'true' }) if public: payload['public_data'] = 'public' if science: payload['science_observations'] = '=%TARGET%' self.validate_query(payload) if get_query_payload: return payload response = self._request('GET', url, params=payload, timeout=self.TIMEOUT, cache=cache and not get_html_version) self._last_response = response response.raise_for_status() if get_html_version: if 'run' not in response.text: if max_retries > 0: log.info( "Failed query. Retrying up to {0} more times".format( max_retries)) return self.query_async( payload=payload, cache=False, public=public, science=science, max_retries=max_retries - 1, get_html_version=get_html_version, get_query_payload=get_query_payload, **kwargs) raise RemoteServiceError( "Incorrect return from HTML table query.") response2 = self._request( 'GET', "{0}/{1}/{2}".format(self._get_dataarchive_url(), 'aq', response.text), params={'query_url': response.url.split("?")[-1]}, timeout=self.TIMEOUT, cache=False, ) self._last_response = response2 response2.raise_for_status() if len(response2.text) == 0: if max_retries > 0: log.info( "Failed (empty) query. Retrying up to {0} more times". format(max_retries)) return self.query_async( payload=payload, cache=cache, public=public, science=science, max_retries=max_retries - 1, get_html_version=get_html_version, get_query_payload=get_query_payload, **kwargs) raise RemoteServiceError("Empty return.") return response2 else: return response
def _login(self, username=None, store_password=False, reenter_password=False): """ Login to the ALMA Science Portal. Parameters ---------- username : str, optional Username to the ALMA Science Portal. If not given, it should be specified in the config file. store_password : bool, optional Stores the password securely in your keyring. Default is False. reenter_password : bool, optional Asks for the password even if it is already stored in the keyring. This is the way to overwrite an already stored passwork on the keyring. Default is False. """ if username is None: if not self.USERNAME: raise LoginError("If you do not pass a username to login(), " "you should configure a default one!") else: username = self.USERNAME # Check if already logged in loginpage = self._request("GET", "https://asa.alma.cl/cas/login", cache=False) root = BeautifulSoup(loginpage.content, 'html5lib') if root.find('div', class_='success'): log.info("Already logged in.") return True # Get password from keyring or prompt if reenter_password is False: password_from_keyring = keyring.get_password( "astroquery:asa.alma.cl", username) else: password_from_keyring = None if password_from_keyring is None: if system_tools.in_ipynb(): log.warning("You may be using an ipython notebook:" " the password form will appear in your terminal.") password = getpass.getpass("{0}, enter your ALMA password:"******"\n".format(username)) else: password = password_from_keyring # Authenticate log.info("Authenticating {0} on asa.alma.cl ...".format(username)) # Do not cache pieces of the login process data = {kw: root.find('input', {'name': kw})['value'] for kw in ('lt', '_eventId', 'execution')} data['username'] = username data['password'] = password login_response = self._request("POST", "https://asa.alma.cl/cas/login", params={'service': urljoin(self.archive_url, 'rh/login')}, data=data, cache=False) authenticated = ('You have successfully logged in' in login_response.text) if authenticated: log.info("Authentication successful!") self.USERNAME = username else: log.exception("Authentication failed!") # When authenticated, save password in keyring if needed if authenticated and password_from_keyring is None and store_password: keyring.set_password("astroquery:asa.alma.cl", username, password) return authenticated
def stage_data(self, uids): """ Stage ALMA data Parameters ---------- uids : list or str A list of valid UIDs or a single UID. UIDs should have the form: 'uid://A002/X391d0b/X7b' cache : True This is *forced* true, because the ALMA servers don't support repeats of the same request. Whether to cache the staging process. This should generally be left as False when used interactively. Returns ------- data_file_table : Table A table containing 3 columns: the UID, the file URL (for future downloading), and the file size """ """ With log.set_level(10) INFO: Staging files... [astroquery.alma.core] DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core] DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core] DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core] DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] .DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core] """ if isinstance(uids, six.string_types): uids = [uids] if not isinstance(uids, (list, tuple, np.ndarray)): raise TypeError("Datasets must be given as a list of strings.") log.info("Staging files...") self._get_dataarchive_url() url = urljoin(self.dataarchive_url, 'rh/submission') log.debug("First request URL: {0}".format(url)) #'ALMA+uid___A002_X391d0b_X7b' #payload = [('dataset','ALMA+'+clean_uid(uid)) for uid in uids] payload = {'dataset':['ALMA+'+clean_uid(uid) for uid in uids]} log.debug("First request payload: {0}".format(payload)) self._staging_log = {'first_post_url':url} # Request staging for the UIDs # This component cannot be cached, since the returned data can change # if new data are uploaded response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response log.debug("First response URL: {0}".format(response.url)) response.raise_for_status() if 'j_spring_cas_security_check' in response.url: time.sleep(1) # CANNOT cache this stage: it not a real data page! results in # infinite loops response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response if 'j_spring_cas_security_check' in response.url: log.warn("Staging request was not successful. Try again?") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: raise RemoteServiceError("Could not access data. This error " "can arise if the data are private and " "you do not have access rights or are " "not logged in.") request_id = response.url.split("/")[-2] assert len(request_id) == 36 self._staging_log['request_id'] = request_id log.debug("Request ID: {0}".format(request_id)) # Submit a request for the specific request ID identified above submission_url = urljoin(self.dataarchive_url, os.path.join('rh/submission', request_id)) log.debug("Submission URL: {0}".format(submission_url)) self._staging_log['submission_url'] = submission_url has_completed = False staging_submission = self._request('GET', submission_url, cache=True) self._staging_log['staging_submission'] = staging_submission staging_submission.raise_for_status() data_page_url = staging_submission.url dpid = data_page_url.split("/")[-1] assert len(dpid) == 9 self._staging_log['staging_page_id'] = dpid while not has_completed: time.sleep(1) # CANNOT cache this step: please_wait will happen infinitely data_page = self._request('GET', data_page_url, cache=False) if 'Please wait' not in data_page.text: has_completed = True print(".",end='') self._staging_log['data_page'] = data_page data_page.raise_for_status() staging_root = BeautifulSoup(data_page.content) downloadFileURL = staging_root.find('form').attrs['action'] data_list_url = os.path.split(downloadFileURL)[0] # Old version, unreliable: data_list_url = staging_submission.url log.debug("Data list URL: {0}".format(data_list_url)) self._staging_log['data_list_url'] = data_list_url time.sleep(1) data_list_page = self._request('GET', data_list_url, cache=True) self._staging_log['data_list_page'] = data_list_page data_list_page.raise_for_status() if 'Error' in data_list_page.text: errormessage = root.find('div', id='errorContent').string.strip() raise RemoteServiceError(errormessage) tbl = self._parse_staging_request_page(data_list_page) return tbl
def _absurl_from_url(url, base_url): if url[:4] != 'http': return urlparse.urljoin(base_url, url) return url
def stage_data(self, uids): """ Stage ALMA data Parameters ---------- uids : list or str A list of valid UIDs or a single UID. UIDs should have the form: 'uid://A002/X391d0b/X7b' Returns ------- data_file_table : Table A table containing 3 columns: the UID, the file URL (for future downloading), and the file size """ """ With log.set_level(10) INFO: Staging files... [astroquery.alma.core] DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core] DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core] DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core] DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] .DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core] """ if isinstance(uids, six.string_types): uids = [uids] if not isinstance(uids, (list, tuple, np.ndarray)): raise TypeError("Datasets must be given as a list of strings.") log.info("Staging files...") self._get_dataarchive_url() url = urljoin(self.dataarchive_url, 'rh/submission') log.debug("First request URL: {0}".format(url)) #'ALMA+uid___A002_X391d0b_X7b' #payload = [('dataset','ALMA+'+clean_uid(uid)) for uid in uids] payload = {'dataset': ['ALMA+' + clean_uid(uid) for uid in uids]} log.debug("First request payload: {0}".format(payload)) self._staging_log = {'first_post_url': url} # Request staging for the UIDs # This component cannot be cached, since the returned data can change # if new data are uploaded response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response log.debug("First response URL: {0}".format(response.url)) response.raise_for_status() if 'j_spring_cas_security_check' in response.url: time.sleep(1) # CANNOT cache this stage: it not a real data page! results in # infinite loops response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response if 'j_spring_cas_security_check' in response.url: log.warn("Staging request was not successful. Try again?") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: raise RemoteServiceError("Could not access data. This error " "can arise if the data are private and " "you do not have access rights or are " "not logged in.") request_id = response.url.split("/")[-2] assert len(request_id) == 36 self._staging_log['request_id'] = request_id log.debug("Request ID: {0}".format(request_id)) # Submit a request for the specific request ID identified above submission_url = urljoin(self.dataarchive_url, os.path.join('rh/submission', request_id)) log.debug("Submission URL: {0}".format(submission_url)) self._staging_log['submission_url'] = submission_url staging_submission = self._request('GET', submission_url, cache=True) self._staging_log['staging_submission'] = staging_submission staging_submission.raise_for_status() data_page_url = staging_submission.url self._staging_log['data_page_url'] = data_page_url dpid = data_page_url.split("/")[-1] assert len(dpid) == 9 self._staging_log['staging_page_id'] = dpid # CANNOT cache this step: please_wait will happen infinitely data_page = self._request('GET', data_page_url, cache=False) self._staging_log['data_page'] = data_page data_page.raise_for_status() has_completed = False while not has_completed: time.sleep(1) summary = self._request('GET', os.path.join(data_page_url, 'summary'), cache=False) summary.raise_for_status() print(".", end='') sys.stdout.flush() has_completed = summary.json()['complete'] self._staging_log['summary'] = summary summary.raise_for_status() self._staging_log['json_data'] = json_data = summary.json() username = self._username if hasattr(self, '_username') else 'anonymous' # templates: # https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/ # 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar # uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar url_decomposed = urlparse(data_page_url) base_url = ('{uri.scheme}://{uri.netloc}/' 'dataPortal/requests/{username}/' '{staging_page_id}/ALMA'.format( uri=url_decomposed, staging_page_id=dpid, username=username, )) tbl = self._json_summary_to_table(json_data, base_url=base_url) # staging_root = BeautifulSoup(data_page.content) # downloadFileURL = staging_root.find('form').attrs['action'] # data_list_url = os.path.split(downloadFileURL)[0] # # Old version, unreliable: data_list_url = staging_submission.url # log.debug("Data list URL: {0}".format(data_list_url)) # self._staging_log['data_list_url'] = data_list_url # time.sleep(1) # data_list_page = self._request('GET', data_list_url, cache=True) # self._staging_log['data_list_page'] = data_list_page # data_list_page.raise_for_status() # if 'Error' in data_list_page.text: # errormessage = staging_root.find('div', id='errorContent').string.strip() # raise RemoteServiceError(errormessage) # tbl = self._parse_staging_request_page(data_list_page) return tbl
def stage_data(self, uids): """ Stage ALMA data Parameters ---------- uids : list or str A list of valid UIDs or a single UID. UIDs should have the form: 'uid://A002/X391d0b/X7b' Returns ------- data_file_table : Table A table containing 3 columns: the UID, the file URL (for future downloading), and the file size """ """ With log.set_level(10) INFO: Staging files... [astroquery.alma.core] DEBUG: First request URL: https://almascience.eso.org/rh/submission [astroquery.alma.core] DEBUG: First request payload: {'dataset': [u'ALMA+uid___A002_X3b3400_X90f']} [astroquery.alma.core] DEBUG: First response URL: https://almascience.eso.org/rh/checkAuthenticationStatus/3f98de33-197e-4692-9afa-496842032ea9/submission [astroquery.alma.core] DEBUG: Request ID: 3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] DEBUG: Submission URL: https://almascience.eso.org/rh/submission/3f98de33-197e-4692-9afa-496842032ea9 [astroquery.alma.core] .DEBUG: Data list URL: https://almascience.eso.org/rh/requests/anonymous/786823226 [astroquery.alma.core] """ if isinstance(uids, six.string_types + (np.bytes_,)): uids = [uids] if not isinstance(uids, (list, tuple, np.ndarray)): raise TypeError("Datasets must be given as a list of strings.") log.info("Staging files...") self._get_dataarchive_url() url = urljoin(self.dataarchive_url, 'rh/submission') log.debug("First request URL: {0}".format(url)) # 'ALMA+uid___A002_X391d0b_X7b' payload = {'dataset': ['ALMA+' + clean_uid(uid) for uid in uids]} log.debug("First request payload: {0}".format(payload)) self._staging_log = {'first_post_url': url} # Request staging for the UIDs # This component cannot be cached, since the returned data can change # if new data are uploaded response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response log.debug("First response URL: {0}".format(response.url)) if response.status_code == 405: if hasattr(self,'_last_successful_staging_log'): log.warning("Error 405 received. If you have previously staged " "the same UIDs, the result returned is probably " "correct, otherwise you may need to create a fresh " "astroquery.Alma instance.") return self._last_successful_staging_log['result'] else: raise HTTPError("Received an error 405: this may indicate you " "have already staged the data. Try downloading " "the file URLs directly with download_files.") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: time.sleep(1) # CANNOT cache this stage: it not a real data page! results in # infinite loops response = self._request('POST', url, data=payload, timeout=self.TIMEOUT, cache=False) self._staging_log['initial_response'] = response if 'j_spring_cas_security_check' in response.url: log.warning("Staging request was not successful. Try again?") response.raise_for_status() if 'j_spring_cas_security_check' in response.url: raise RemoteServiceError("Could not access data. This error " "can arise if the data are private and " "you do not have access rights or are " "not logged in.") request_id = response.url.split("/")[-2] self._staging_log['request_id'] = request_id log.debug("Request ID: {0}".format(request_id)) # Submit a request for the specific request ID identified above submission_url = urljoin(self.dataarchive_url, url_helpers.join('rh/submission', request_id)) log.debug("Submission URL: {0}".format(submission_url)) self._staging_log['submission_url'] = submission_url staging_submission = self._request('GET', submission_url, cache=True) self._staging_log['staging_submission'] = staging_submission staging_submission.raise_for_status() data_page_url = staging_submission.url self._staging_log['data_page_url'] = data_page_url dpid = data_page_url.split("/")[-1] self._staging_log['staging_page_id'] = dpid # CANNOT cache this step: please_wait will happen infinitely data_page = self._request('GET', data_page_url, cache=False) self._staging_log['data_page'] = data_page data_page.raise_for_status() has_completed = False while not has_completed: time.sleep(1) summary = self._request('GET', url_helpers.join(data_page_url, 'summary'), cache=False) summary.raise_for_status() print(".", end='') sys.stdout.flush() has_completed = summary.json()['complete'] self._staging_log['summary'] = summary summary.raise_for_status() self._staging_log['json_data'] = json_data = summary.json() username = self.USERNAME if self.USERNAME else 'anonymous' # templates: # https://almascience.eso.org/dataPortal/requests/keflavich/946895898/ALMA/ # 2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar/2013.1.00308.S_uid___A001_X196_X93_001_of_001.tar # uid___A002_X9ee74a_X26f0/2013.1.00308.S_uid___A002_X9ee74a_X26f0.asdm.sdm.tar url_decomposed = urlparse(data_page_url) base_url = ('{uri.scheme}://{uri.netloc}/' 'dataPortal/requests/{username}/' '{staging_page_id}/ALMA'.format(uri=url_decomposed, staging_page_id=dpid, username=username, )) tbl = self._json_summary_to_table(json_data, base_url=base_url) self._staging_log['result'] = tbl self._staging_log['file_urls'] = tbl['URL'] self._last_successful_staging_log = self._staging_log return tbl
def query_async(self, payload, cache=True, public=True, science=True, max_retries=5, get_html_version=False, get_query_payload=False, **kwargs): """ Perform a generic query with user-specified payload Parameters ---------- payload : dict A dictionary of payload keywords that are accepted by the ALMA archive system. You can look these up by examining the forms at http://almascience.org/aq or using the `help` method cache : bool Cache the query? (note: HTML queries *cannot* be cached using the standard caching mechanism because the URLs are different each time public : bool Return only publicly available datasets? science : bool Return only data marked as "science" in the archive? """ url = urljoin(self._get_dataarchive_url(), 'aq/') payload.update(kwargs) if get_html_version: payload.update({'result_view': 'observation', 'format': 'URL', 'download': 'true'}) else: payload.update({'result_view': 'raw', 'format': 'VOTABLE', 'download': 'true'}) if public: payload['public_data'] = 'public' if science: payload['science_observations'] = '=%TARGET%' self.validate_query(payload) if get_query_payload: return payload response = self._request('GET', url, params=payload, timeout=self.TIMEOUT, cache=cache and not get_html_version) self._last_response = response response.raise_for_status() if get_html_version: if 'run' not in response.text: if max_retries > 0: log.info("Failed query. Retrying up to {0} more times" .format(max_retries)) return self.query_async(payload=payload, cache=False, public=public, science=science, max_retries=max_retries-1, get_html_version=get_html_version, get_query_payload=get_query_payload, **kwargs) raise RemoteServiceError("Incorrect return from HTML table query.") response2 = self._request('GET', "{0}/{1}/{2}".format( self._get_dataarchive_url(), 'aq', response.text), params={'query_url': response.url.split("?")[-1]}, timeout=self.TIMEOUT, cache=False, ) self._last_response = response2 response2.raise_for_status() if len(response2.text) == 0: if max_retries > 0: log.info("Failed (empty) query. Retrying up to {0} more times" .format(max_retries)) return self.query_async(payload=payload, cache=cache, public=public, science=science, max_retries=max_retries-1, get_html_version=get_html_version, get_query_payload=get_query_payload, **kwargs) raise RemoteServiceError("Empty return.") return response2 else: return response