def _load_dsd(self):
    """Download and process the DSD (data structure definition) for this dataset.

    #TODO: there is one DSD per series group (around 400):
    - download 1 DSD shared by several datasets - 668 datasets

    Falls back to :meth:`_load_dsd_by_element` when the server rejects the
    full DSD request or no file was produced.
    """
    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dsd_id,
                          headers=SDMX_METADATA_HEADERS,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath, response = download.get_filepath_and_response()

    # response is None when a cached file was reused (use_existing_file).
    if response:
        if response.status_code == HTTP_ERROR_LONG_RESPONSE:
            # Response too long for the server: load the DSD piece by piece.
            self._load_dsd_by_element()
            return
        elif response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself (and returns
            # None otherwise), so the previous `raise response.raise_for_status()`
            # was wrong -- call it directly.
            response.raise_for_status()

    if not os.path.exists(filepath):
        self._load_dsd_by_element()
        return

    self.fetcher.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._set_dataset()
def _load(self):
    """Download this dataset's SDMX data file and hand it to the XML processor."""

    #TODO: DSD -- disabled dataflow step kept for reference:
    #   url = "xxx/%s" % self.dataset_code
    #   download = Downloader(url=url, filename="dataflow-%s.xml" % self.dataset_code)
    #   self.xml_dsd.process(download.get_filepath())

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            ns_tag_data=self.ns_tag_data,
                            #dimension_keys=self.xml_dsd.dimension_keys
                            )

    data_url = "https://www.destatis.de/sddsplus/%s.xml" % self.dataset_code
    downloader = Downloader(url=data_url,
                            filename="data-%s.xml" % self.dataset_code)

    #TODO: response and exception
    try:
        filepath, _response = downloader.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        # Log the unexpected HTTP error, then propagate it unchanged.
        logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _load_file(self):
    """Download the dataset archive and record its release date.

    Raises errors.RejectUpdatedDataset when the remote file is not newer
    than the stored dataset's last update.
    """
    downloader = Downloader(
        url=self.url,
        filename="data-%s.zip" % self.dataset_code,
        store_filepath=self.get_store_path(),
        use_existing_file=self.fetcher.use_existing_file,
    )
    self.filepath, response = downloader.get_filepath_and_response()

    if self.filepath:
        # Mark the downloaded archive for cleanup once processed.
        self.fetcher.for_delete.append(self.filepath)

    # Example header -- Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
    last_modified = response.headers["Last-Modified"]
    self.release_date = clean_datetime(
        datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S GMT"))

    if self.dataset.last_update and self.dataset.last_update >= self.release_date:
        raise errors.RejectUpdatedDataset(
            provider_name=self.provider_name,
            dataset_code=self.dataset_code,
            comments="update-date[%s]" % self.release_date,
        )

    self.dataset.last_update = self.release_date
def _load_dsd(self):
    """Download and process the DSD (data structure definition) for this dataset.

    #TODO: there is one DSD per series group (around 400):
    - download 1 DSD shared by several datasets - 668 datasets

    Falls back to :meth:`_load_dsd_by_element` when the server rejects the
    full DSD request or no file was produced.
    """
    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dsd_id,
                          headers=SDMX_METADATA_HEADERS,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath, response = download.get_filepath_and_response()

    # response is None when a cached file was reused (use_existing_file).
    if response:
        if response.status_code == HTTP_ERROR_LONG_RESPONSE:
            # Response too long for the server: load the DSD piece by piece.
            self._load_dsd_by_element()
            return
        elif response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself (and returns
            # None otherwise), so the previous `raise response.raise_for_status()`
            # was wrong -- call it directly.
            response.raise_for_status()

    if not os.path.exists(filepath):
        self._load_dsd_by_element()
        return

    self.fetcher.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._set_dataset()
def _load_file(self):
    """Download the dataset archive and record its release date.

    Raises errors.RejectUpdatedDataset when the remote file is not newer
    than the stored dataset's last update.
    """
    filename = "data-%s.zip" % self.dataset_code
    download = Downloader(url=self.url,
                          filename=filename,
                          store_filepath=self.get_store_path(),
                          use_existing_file=self.fetcher.use_existing_file)
    self.filepath, response = download.get_filepath_and_response()

    if self.filepath:
        # Mark the downloaded archive for cleanup once processed.
        self.fetcher.for_delete.append(self.filepath)

    # FIX: with use_existing_file the Downloader can reuse a cached file and
    # return response=None (see the `if response:` guards used elsewhere in
    # this file); reading response.headers unconditionally crashed then.
    if response is None:
        return

    release_date_str = response.headers['Last-Modified']
    #Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
    self.release_date = clean_datetime(
        datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

    if self.dataset.last_update and self.dataset.last_update >= self.release_date:
        comments = "update-date[%s]" % self.release_date
        raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                          dataset_code=self.dataset_code,
                                          comments=comments)

    self.dataset.last_update = self.release_date
def _get_data_by_dimension(self):
    """Yield (row, err) pairs by querying the ECB data one dimension value at a time.

    Terminates the stream with a final (None, None) sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dsd_id,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        #http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
        if not self._is_good_url(url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
            print("bypass url[%s]" % url)
            continue

        headers = SDMX_DATA_HEADERS
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              headers=headers,
                              use_existing_file=self.fetcher.use_existing_file,
                              #client=self.fetcher.requests_client
                              )
        filepath, response = download.get_filepath_and_response()

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            # No usable file for this key: skip it.
            continue

        if response:
            self._add_url_cache(url, response.status_code)
            # FIX: the NO_RESULT / >=400 checks used to live in `elif`
            # branches after `if response:` and were therefore unreachable.
            if response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response.status_code >= 400:
                # raise_for_status() raises on its own; no outer `raise`.
                response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs by querying the ECB data one dimension value at a time.

    Terminates the stream with a final (None, None) sentinel.
    """
    self.xml_data = XMLData(
        provider_name=self.provider_name,
        dataset_code=self.dataset_code,
        xml_dsd=self.xml_dsd,
        dsd_id=self.dsd_id,
        frequencies_supported=FREQUENCIES_SUPPORTED,
    )

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        # http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
        if not self._is_good_url(url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
            print("bypass url[%s]" % url)
            continue

        headers = SDMX_DATA_HEADERS
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(
            url=url,
            filename=filename,
            store_filepath=self.store_path,
            headers=headers,
            use_existing_file=self.fetcher.use_existing_file,
            # client=self.fetcher.requests_client
        )
        filepath, response = download.get_filepath_and_response()

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            # No usable file for this key: skip it.
            continue

        if response:
            self._add_url_cache(url, response.status_code)
            # FIX: the NO_RESULT / >=400 checks used to live in `elif`
            # branches after `if response:` and were therefore unreachable.
            if response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response.status_code >= 400:
                # raise_for_status() raises on its own; no outer `raise`.
                response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs for each value of the selected INSEE dimension.

    Terminates the stream with a final (None, None) sentinel.
    """
    dimension_keys, dimensions = self._get_dimensions_from_dsd()

    # IPC-2015-COICOP needs the dimension with the most values to keep
    # each request small enough.
    choice = "max" if self.dataset_code in ["IPC-2015-COICOP"] else "avg"

    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice=choice)
    count_dimensions = len(dimension_keys)

    logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
        choice, _key, len(dimension_values), self.provider_name, self.dataset_code))

    for dimension_value in dimension_values:
        # For each dimension value, build the URL key.
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
        # NOTE: kept as a strict `is False` test -- _is_good_url may return
        # a non-boolean for unknown URLs, which must not be skipped.
        if self._is_good_url(url) is False:
            logger.warning("bypass not good url[%s]" % url)
            continue

        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              #NOT USE FOR INSEE
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if response is not None:
            self._add_url_cache(url, response.status_code)

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            continue

        if response and response.status_code == HTTP_ERROR_NO_RESULT:
            continue
        elif response and response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs for each value of the selected INSEE dimension.

    Terminates the stream with a final (None, None) sentinel.
    """
    dimension_keys, dimensions = self._get_dimensions_from_dsd()

    # IPC-2015-COICOP needs the dimension with the most values to keep
    # each request small enough.
    choice = "max" if self.dataset_code in ["IPC-2015-COICOP"] else "avg"

    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice=choice)
    count_dimensions = len(dimension_keys)

    logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
        choice, _key, len(dimension_values), self.provider_name, self.dataset_code))

    for dimension_value in dimension_values:
        # For each dimension value, build the URL key.
        key = get_key_for_dimension(count_dimensions, position, dimension_value)

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
        # NOTE: kept as a strict `is False` test -- _is_good_url may return
        # a non-boolean for unknown URLs, which must not be skipped.
        if self._is_good_url(url) is False:
            logger.warning("bypass not good url[%s]" % url)
            continue

        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              #NOT USE FOR INSEE
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if response is not None:
            self._add_url_cache(url, response.status_code)

        if filepath and os.path.exists(filepath):
            self.fetcher.for_delete.append(filepath)
        else:
            continue

        if response and response.status_code == HTTP_ERROR_NO_RESULT:
            continue
        elif response and response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs for each value of the selected IMF dimension.

    Terminates the stream with a final (None, None) sentinel.
    """
    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each dimension value, build the SDMX key: wildcard "." at
        # every position except the selected dimension.
        local_count = 0

        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if 400 <= response.status_code < 500:
            # Client error (typically no data for this key): skip slice.
            continue
        elif response.status_code >= 500:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err
            local_count += 1

        if local_count >= 2999:
            logger.warning(
                "TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]" % (self.dataset_code, key))

    #self.dataset.update_database(save_only=True)

    yield None, None
def _load(self):
    """Load the ECB dataflow, DSD and data for this dataset.

    Side effects: sets self.dsd_id, self.dataset.name, the dimension and
    attribute code lists, self.xml_data and self.rows.

    Raises:
        Exception: when no DSD id can be extracted from the dataflow.
        requests.exceptions.HTTPError: re-raised after logging on download failure.
    """

    # 1) Dataflow message: yields the DSD id for this dataset.
    url = "http://sdw-wsrest.ecb.int/service/dataflow/ECB/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="dataflow-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())
    self.dsd_id = self.xml_dsd.dsd_id

    if not self.dsd_id:
        msg = "DSD ID not found for provider[%s] - dataset[%s]" % (self.provider_name, self.dataset_code)
        raise Exception(msg)

    # 2) Data structure definition (with children references: codelists etc.).
    url = "http://sdw-wsrest.ecb.int/service/datastructure/ECB/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    # Flatten the parsed DSD dimensions/attributes into plain code lists.
    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    # 3) The data itself (full dataset in one request).
    url = "http://sdw-wsrest.ecb.int/service/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    #TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        # "AUTRE ERREUR HTTP" = "other HTTP error" -- logged then re-raised.
        logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _get_data_by_dimension(self):
    """Yield (row, err) pairs for each value of the selected INSEE dimension.

    Terminates the stream with a final (None, None) sentinel.
    """
    dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                         self.provider_name,
                                                         self.dataset_code)

    # IPC-2015-COICOP needs the dimension with the most values to keep
    # each request small enough.
    choice = "max" if self.dataset_code in ["IPC-2015-COICOP"] else "avg"

    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice=choice)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each dimension value, build the URL key: wildcard "." at
        # every position except the selected dimension.
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              #client=self.fetcher.requests_client
                              )
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code == HTTP_ERROR_NO_RESULT:
            # Empty result for this key: nothing to process.
            continue
        elif response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs, querying the data one dimension value at a time.

    Terminates the stream with a final (None, None) sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # Build the SDMX key: wildcard "." at every position except the
        # selected dimension.
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if 400 <= response.status_code < 500:
            # Client error (typically no data for this key): skip slice.
            continue
        elif response.status_code >= 500:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs for each value of the selected dimension.

    Terminates the stream with a final (None, None) sentinel.
    """
    dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                         self.provider_name,
                                                         self.dataset_code)
    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # For each dimension value, build the SDMX key: wildcard "." at
        # every position except the selected dimension.
        local_count = 0

        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if 400 <= response.status_code < 500:
            # Client error (typically no data for this key): skip slice.
            continue
        elif response.status_code >= 500:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err
            local_count += 1

        if local_count >= 2999:
            logger.warning("TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]" % (self.dataset_code, key))

    #self.dataset.update_database(save_only=True)

    yield None, None
def _get_data_by_dimension(self):
    """Yield (row, err) pairs, querying the data one dimension value at a time.

    Terminates the stream with a final (None, None) sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = self._get_dimensions_from_dsd()
    position, _key, dimension_values = select_dimension(dimension_keys,
                                                        dimensions,
                                                        choice="max")
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # Build the SDMX key: wildcard "." at every position except the
        # selected dimension.
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "%s/%s" % (self._get_url_data(), key)
        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if 400 <= response.status_code < 500:
            # Client error (typically no data for this key): skip slice.
            continue
        elif response.status_code >= 500:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None
def _load(self):
    """Load the INSEE DSD and data for this dataset.

    The DSD id equals the dataset code for INSEE. Side effects: sets
    self.dsd_id, self.dataset.name, the dimension and attribute code lists,
    self.xml_data and self.rows.

    Raises:
        requests.exceptions.HTTPError: re-raised after logging on download failure.
    """
    self.dsd_id = self.dataset_code

    # 1) Data structure definition (with children references: codelists etc.).
    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    # Flatten the parsed DSD dimensions/attributes into plain code lists.
    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    # 2) The data itself (full dataset in one request).
    url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    #TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        # "AUTRE ERREUR HTTP" = "other HTTP error" -- logged then re-raised.
        logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def load_datas(self):
    """Download the zip archive, record its release date, and open it for reading."""
    downloader = Downloader(url=self.url,
                            filename="data-%s.zip" % self.dataset_code,
                            store_filepath=self.fetcher.store_path,
                            use_existing_file=self.fetcher.use_existing_file,
                            #client=self.fetcher.requests_client
                            )
    filepath, response = downloader.get_filepath_and_response()

    # response is None when a cached file is reused: keep the previous
    # release date in that case.
    if response:
        # Example -- Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
        last_modified = response.headers['Last-Modified']
        self.release_date = datetime.strptime(last_modified,
                                              "%a, %d %b %Y %H:%M:%S GMT")
        self._is_updated(self.release_date)

    self.zipfile = zipfile.ZipFile(filepath)
    self.excel_filenames = iter(self.zipfile.namelist())
def _get_data_by_dimension(self):
    """Yield (row, err) pairs from the ECB, using conditional GETs per key.

    Stores/reads a Last-Modified value in self.dataset.metadata to skip
    unchanged data. Terminates the stream with a final (None, None) sentinel.
    """
    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                         self.provider_name,
                                                         self.dataset_code)
    position, _key, dimension_values = select_dimension(dimension_keys, dimensions)
    count_dimensions = len(dimension_keys)

    for dimension_value in dimension_values:
        # Build the SDMX key: wildcard "." at every position except the
        # selected dimension.
        key = "".join(dimension_value if i == position else "."
                      for i in range(count_dimensions))

        url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)

        # FIX: copy the shared header dict. The original assigned
        # `headers = SDMX_DATA_HEADERS` and then mutated it, leaking
        # If-Modified-Since into the module-level dict and thus into every
        # later request made by this process.
        headers = dict(SDMX_DATA_HEADERS)

        last_modified = None
        if self.dataset.metadata and "Last-Modified" in self.dataset.metadata:
            last_modified = self.dataset.metadata["Last-Modified"]
            headers["If-Modified-Since"] = last_modified

        filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              headers=headers,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if filepath:
            self.fetcher.for_delete.append(filepath)

        if response.status_code == HTTP_ERROR_NOT_MODIFIED:
            # 304: nothing changed since last_modified -- skip this key.
            msg = "Reject dataset updated for provider[%s] - dataset[%s] - update-date[%s]"
            logger.warning(msg % (self.provider_name, self.dataset_code, last_modified))
            continue
        elif response.status_code == HTTP_ERROR_NO_RESULT:
            continue
        elif response.status_code >= 400:
            # FIX: raise_for_status() raises HTTPError itself; the previous
            # `raise response.raise_for_status()` was wrong.
            response.raise_for_status()

        # Remember the server's Last-Modified for the next run.
        if "Last-Modified" in response.headers:
            if not self.dataset.metadata:
                self.dataset.metadata = {}
            self.dataset.metadata["Last-Modified"] = response.headers["Last-Modified"]

        for row, err in self.xml_data.process(filepath):
            yield row, err

    #self.dataset.update_database(save_only=True)

    yield None, None