def test_select_dimension(self):
    # nosetests -s -v dlstats.tests.test_xml_utils:UtilsTestCase.test_select_dimension
    print()

    dimension_keys = ["A", "B", "C"]
    dimensions = {
        "A": {"a1": "a1 lib"},
        "B": {"b1": "b1 lib", "b2": "b2 lib"},
        "C": {"c1": "c1 lib", "c2": "c2 lib", "c3": "c3 lib"},
    }

    # Each case: (choice, expected key, expected position, expected values).
    cases = [
        ("max", "C", 2, ["c1", "c2", "c3"]),
        ("min", "A", 0, ["a1"]),
        ("avg", "B", 1, ["b1", "b2"]),
        (None, "B", 1, ["b1", "b2"]),
    ]
    for choice, expected_key, expected_position, expected_values in cases:
        position, key, dimension_values = xml_utils.select_dimension(
            dimension_keys, dimensions, choice=choice)
        self.assertEqual(key, expected_key)
        self.assertEqual(position, expected_position)
        self.assertEqual(sorted(dimension_values), expected_values)

    # Empty inputs: no key is selected and no values are returned.
    position, key, dimension_values = xml_utils.select_dimension(
        [], {}, choice="min")
    self.assertEqual(key, None)
    self.assertEqual(position, 0)
    self.assertEqual(sorted(dimension_values), [])
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples parsed from per-dimension SDMX data files.

    The dataset is split along one dimension chosen by ``select_dimension()``;
    one data file is downloaded per dimension value and streamed through
    ``self.xml_data.process()``. A final ``(None, None)`` sentinel is yielded
    when all keys have been processed.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dsd_id,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # Build an SDMX key selecting only this value of the chosen dimension.
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        #http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (
            self.dataset_code, key)
        if not self._is_good_url(
                url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
            print("bypass url[%s]" % url)
            continue

        headers = SDMX_DATA_HEADERS
        filename = "data-%s-%s.xml" % (self.dataset_code,
                                       key.replace(".", "_"))
        download = Downloader(
            url=url,
            filename=filename,
            store_filepath=self.store_path,
            headers=headers,
            use_existing_file=self.fetcher.use_existing_file,
            #client=self.fetcher.requests_client
        )
        filepath, response = download.get_filepath_and_response()

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            # No usable file was produced for this key: skip it.
            continue

        if response:
            self._add_url_cache(url, response.status_code)
            # BUG FIX: these checks were "elif" branches after "if response:"
            # and therefore unreachable — empty results and HTTP errors were
            # silently processed as if they were valid data files.
            if response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            if response.status_code >= 400:
                # raise_for_status() raises HTTPError itself; the original
                # "raise response.raise_for_status()" outer raise was dead.
                response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    yield None, None
def _get_data_by_dimension(self):
    """Stream ``(row, err)`` tuples from one SDMX data file per dimension value.

    Chooses a splitting dimension via ``select_dimension()``, downloads the
    data file for each of its values and yields every parsed row. Ends with a
    ``(None, None)`` sentinel.
    """
    self.xml_data = XMLData(
        provider_name=self.provider_name,
        dataset_code=self.dataset_code,
        xml_dsd=self.xml_dsd,
        dsd_id=self.dsd_id,
        frequencies_supported=FREQUENCIES_SUPPORTED,
    )

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # SDMX key selecting only this value of the chosen dimension.
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        # http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
        if not self._is_good_url(url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
            print("bypass url[%s]" % url)
            continue

        headers = SDMX_DATA_HEADERS
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(
            url=url,
            filename=filename,
            store_filepath=self.store_path,
            headers=headers,
            use_existing_file=self.fetcher.use_existing_file,
            # client=self.fetcher.requests_client
        )
        filepath, response = download.get_filepath_and_response()

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            # Nothing downloaded for this key: move on.
            continue

        if response:
            self._add_url_cache(url, response.status_code)
            # BUG FIX: the status-code checks below were unreachable "elif"
            # branches guarded by "if response:" — an empty result or an HTTP
            # error response was processed as if it were valid data.
            if response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            if response.status_code >= 400:
                # raise_for_status() raises by itself; no outer "raise" needed.
                response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples for an INSEE dataset, one URL per dimension value.

    A splitting dimension is chosen (``"avg"`` size by default, ``"max"`` for
    IPC-2015-COICOP, which is too large otherwise — TODO confirm rationale),
    then each value of that dimension is fetched and parsed. Ends with a
    ``(None, None)`` sentinel.
    """
    dimension_keys, dimensions = self._get_dimensions_from_dsd()

    choice = "avg"
    if self.dataset_code in ["IPC-2015-COICOP"]:
        choice = "max"

    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice=choice)
    count_dimensions = len(dimension_keys)

    logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
        choice, _key, len(dimension_values), self.provider_name, self.dataset_code))

    for dimension_value in dimension_values:
        # For each value of the selected dimension, build an URL key.
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (
            self.dataset_code, key)
        if self._is_good_url(url) is False:
            logger.warning("bypass not good url[%s]" % url)
            continue

        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              #NOT USE FOR INSEE
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        # Idiom fix: was "if not response is None".
        if response is not None:
            self._add_url_cache(url, response.status_code)

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            # No usable file for this key: skip it.
            continue

        if response and response.status_code == HTTP_ERROR_NO_RESULT:
            continue
        elif response and response.status_code >= 400:
            # raise_for_status() raises HTTPError itself for >= 400; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

        #self.dataset.update_database(save_only=True)

    yield None, None
def test_select_dimension(self):
    # nosetests -s -v dlstats.tests.test_xml_utils:UtilsTestCase.test_select_dimension
    print()

    dimensions = {
        "A": {"a1": "a1 lib"},
        "B": {"b1": "b1 lib", "b2": "b2 lib"},
        "C": {"c1": "c1 lib", "c2": "c2 lib", "c3": "c3 lib"},
    }
    dimension_keys = list(dimensions.keys())

    def check(choice, want_key, want_position, want_values,
              keys=dimension_keys, dims=dimensions):
        # Run select_dimension and verify (position, key, values).
        position, key, dimension_values = xml_utils.select_dimension(
            keys, dims, choice=choice)
        self.assertEqual(key, want_key)
        self.assertEqual(position, want_position)
        self.assertEqual(sorted(dimension_values), want_values)

    check("max", "C", 2, ["c1", "c2", "c3"])    # largest value set
    check("min", "A", 0, ["a1"])                # smallest value set
    check("avg", "B", 1, ["b1", "b2"])          # closest to the average size
    check(None, "B", 1, ["b1", "b2"])           # default behaves like "avg"

    # Degenerate case: no dimensions at all.
    check("min", None, 0, [], keys=[], dims={})
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples, splitting the dataset on its largest dimension.

    For each value of the dimension chosen with ``choice="max"``, a partial
    SDMX key (value at the chosen position, ``"."`` wildcards elsewhere) is
    fetched and parsed. Ends with a ``(None, None)`` sentinel.
    """
    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each value of the dimension, build an URL key:
        # the value at the selected position, "." wildcards elsewhere.
        local_count = 0
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code >= 400 and response.status_code < 500:
            # Client errors (no data for this key): skip silently.
            continue
        elif response.status_code >= 500:
            # raise_for_status() raises HTTPError itself for >= 500; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err
            local_count += 1

        # The source API caps results; warn when a key may be truncated.
        if local_count >= 2999:
            logger.warning(
                "TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]" % (self.dataset_code, key))

        #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples for an INSEE dataset split on one dimension.

    Uses ``choice="avg"`` by default and ``"max"`` for IPC-2015-COICOP.
    Each value of the chosen dimension becomes one data URL; every parsed row
    is yielded, ending with a ``(None, None)`` sentinel.
    """
    dimension_keys, dimensions = get_dimensions_from_dsd(
        self.xml_dsd, self.provider_name, self.dataset_code)

    choice = "avg"
    if self.dataset_code in ["IPC-2015-COICOP"]:
        choice = "max"

    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice=choice)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each value of the dimension, build an URL key:
        # the value at the selected position, "." wildcards elsewhere.
        sdmx_key = []
        for i in range(count_dimensions):
            if i == position:
                sdmx_key.append(dimension_value)
            else:
                sdmx_key.append(".")
        key = "".join(sdmx_key)

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (
            self.dataset_code, key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              #client=self.fetcher.requests_client
                              )
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code == HTTP_ERROR_NO_RESULT:
            # Empty result for this key: nothing to parse.
            continue
        elif response.status_code >= 400:
            # raise_for_status() raises HTTPError itself for >= 400; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

        #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples, one data file per value of the largest dimension.

    Builds the XML parser, splits the dataset with ``choice="max"``, downloads
    each partial key and streams the parsed rows. Ends with a ``(None, None)``
    sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # SDMX key: the value at the selected position, "." wildcards elsewhere.
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code >= 400 and response.status_code < 500:
            # Client errors (no data for this key): skip silently.
            continue
        elif response.status_code >= 500:
            # raise_for_status() raises HTTPError itself for >= 500; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

        #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples for an IMF dataset split on its largest dimension.

    One URL per value of the dimension chosen with ``choice="max"``; every
    parsed row is yielded and a warning is logged when a key approaches the
    source's result cap. Ends with a ``(None, None)`` sentinel.
    """
    dimension_keys, dimensions = get_dimensions_from_dsd(
        self.xml_dsd, self.provider_name, self.dataset_code)
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each value of the dimension, build an URL key:
        # the value at the selected position, "." wildcards elsewhere.
        local_count = 0
        sdmx_key = []
        for i in range(count_dimensions):
            if i == position:
                sdmx_key.append(dimension_value)
            else:
                sdmx_key.append(".")
        key = "".join(sdmx_key)

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code >= 400 and response.status_code < 500:
            # Client errors (no data for this key): skip silently.
            continue
        elif response.status_code >= 500:
            # raise_for_status() raises HTTPError itself for >= 500; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err
            local_count += 1

        # The source API caps results; warn when a key may be truncated.
        if local_count >= 2999:
            logger.warning("TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]" % (self.dataset_code, key))

        #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples by downloading one partial key per dimension value.

    Constructs the XML parser, chooses the largest dimension for splitting
    (``choice="max"``), then fetches and parses each partial data file.
    Terminates with a ``(None, None)`` sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions, choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # SDMX key: the value at the selected position, "." wildcards elsewhere.
        sdmx_key = []
        for i in range(count_dimensions):
            if i == position:
                sdmx_key.append(dimension_value)
            else:
                sdmx_key.append(".")
        key = "".join(sdmx_key)

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code >= 400 and response.status_code < 500:
            # Client errors (no data for this key): skip silently.
            continue
        elif response.status_code >= 500:
            # raise_for_status() raises HTTPError itself for >= 500; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

        #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield ``(row, err)`` tuples from the ECB SDMX web service.

    Splits the dataset on the dimension chosen by ``select_dimension()``,
    downloads each partial key with conditional-GET support (If-Modified-Since
    based on the dataset's stored ``Last-Modified`` metadata) and streams the
    parsed rows. Ends with a ``(None, None)`` sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = get_dimensions_from_dsd(
        self.xml_dsd, self.provider_name, self.dataset_code)
    position, _key, dimension_values = select_dimension(
        dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # SDMX key: the value at the selected position, "." wildcards elsewhere.
        sdmx_key = []
        for i in range(count_dimensions):
            if i == position:
                sdmx_key.append(dimension_value)
            else:
                sdmx_key.append(".")
        key = "".join(sdmx_key)

        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (
            self.dataset_code, key)

        # BUG FIX: copy the shared module-level header dict before mutating
        # it — the original "headers = SDMX_DATA_HEADERS" made
        # headers["If-Modified-Since"] leak into every later request.
        headers = dict(SDMX_DATA_HEADERS)
        last_modified = None
        if self.dataset.metadata and "Last-Modified" in self.dataset.metadata:
            headers["If-Modified-Since"] = self.dataset.metadata["Last-Modified"]
            last_modified = self.dataset.metadata["Last-Modified"]

        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              headers=headers,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code == HTTP_ERROR_NOT_MODIFIED:
            # Source unchanged since last fetch: nothing to re-process.
            msg = "Reject dataset updated for provider[%s] - dataset[%s] - update-date[%s]"
            logger.warning(msg % (self.provider_name, self.dataset_code, last_modified))
            continue
        elif response.status_code == HTTP_ERROR_NO_RESULT:
            continue
        elif response.status_code >= 400:
            # raise_for_status() raises HTTPError itself for >= 400; the
            # original "raise response.raise_for_status()" was dead code.
            response.raise_for_status()

        # Remember the server's Last-Modified for the next conditional GET.
        if "Last-Modified" in response.headers:
            if not self.dataset.metadata:
                self.dataset.metadata = {}
            self.dataset.metadata["Last-Modified"] = response.headers["Last-Modified"]

        for row, err in self.xml_data.process(filepath):
            yield row, err

        #self.dataset.update_database(save_only=True)

    yield None, None