def __init__(self, dataset, dataset_doc=None):
    """
    :param Datasets dataset: Datasets instance
    """
    super().__init__(dataset)
    self.dataset_doc = dataset_doc
    self.store_path = self.get_store_path()

    # TODO: get this info from the DSD instead of going through dataflows
    self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
    self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

    if self.dataset_doc and self.dataset_doc["enable"]:
        #self.last_update = self.dataset_doc["last_update"]
        self.last_update = self.dataset_doc["download_last"]
    else:
        self.last_update = self.dataset.download_last  #self.dataset.last_update

    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.fetcher.xml_sdmx)
    self.xml_dsd.concepts = self.fetcher._concepts
    self.xml_dsd.codelists = self.fetcher._codelists
    self._load_dsd()

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    self.rows = self._get_data_by_dimension()
class INSEE_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()

        if "series_last_update" not in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        # TODO: get this info from the DSD instead of going through dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists
        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()

    def _load_dsd_by_element(self):
        # FIXME: codelists and concepts missing?
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _load_dsd(self):
        """
        TODO: there is one DSD per group of series (around 400):
        - one downloaded DSD is shared by several datasets
        - 668 datasets in total
        """
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _set_dataset(self):
        dataset = dataset_converter(self.xml_dsd, self.dataset_code,
                                    dsd_id=self.dsd_id)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)

        count_dimensions = len(dimension_keys)

        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
            choice, _key, len(dimension_values),
            self.provider_name, self.dataset_code))

        for dimension_value in dimension_values:
            # For each value of the selected dimension, build a URL key
            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,  # NOT USED FOR INSEE
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if response is not None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            else:
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _is_updated(self, bson):
        """Verify whether the series has changed.

        Return True if the series must be updated,
        False if it is already up to date.
        """
        if bson["key"] not in self.dataset.metadata["series_last_update"]:
            self.dataset.metadata["series_last_update"][bson["key"]] = bson.get('last_update')
            return True

        last_update = self.dataset.metadata["series_last_update"][bson["key"]]
        series_updated = bson.get('last_update')
        if not series_updated:
            return True
        if series_updated > last_update:
            return True

        return False

    def clean_field(self, bson):
        bson["attributes"].pop("IDBANK", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        if not self._is_updated(bson):
            raise errors.RejectUpdatedSeries(provider_name=self.provider_name,
                                             dataset_code=self.dataset_code,
                                             key=bson.get('key'))
        return bson
class INSEE(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='National Institute of Statistics and Economic Studies',
                                  version=VERSION,
                                  region='France',
                                  website='http://www.insee.fr',
                                  terms_of_use='http://www.insee.fr/en/service/default.asp?page=rediffusion/rediffusion.htm',
                                  fetcher=self)

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name,
                                store_filepath=self.store_path,
                                use_existing_file=self.use_existing_file)

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._categorisations_categories = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()

    def _load_structure_dataflows(self, force=False):

        if self._dataflows and not force:
            return

        self.provider_verify()

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

        if self.refresh_meta is False:
            self._dataflows = self._structure_get("dataflows")
            if self._dataflows:
                self.xml_dsd.dataflows = self._dataflows
                logger.info("load structure [dataflows] from metadata for url[%s]" % url)
                return

        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        self._structure_put("dataflows", url, **self._dataflows)

    def _load_structure_datatree(self, force=False):

        if self._categoryschemes and self._categorisations and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categoryschemes = self._structure_get("categoryschemes")
            if self._categoryschemes:
                logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
        """
        if not self._categoryschemes:
            download = Downloader(url=url,
                                  filename="categoryscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categoryschemes = self.xml_dsd.categories
            #self._structure_put("categoryschemes", url, **self._categoryschemes)

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categorisations = self._structure_get("categorisation")
            if self._categorisations:
                self._categorisations_categories = self._structure_get("categorisations_categories")
                logger.info("load structure [categorisation] from metadata for url[%s]" % url)
        """
        if not self._categorisations:
            download = Downloader(url=url,
                                  filename="categorisation.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categorisations = self.xml_dsd.categorisations
            self._categorisations_categories = self.xml_dsd.categorisations_categories
            #self._structure_put("categorisation", url, **self._categorisations)
            #self._structure_put("categorisations_categories", url, **self._categorisations_categories)

    def _load_structure_concepts(self, force=False):

        if self._dataflows and self._concepts and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name

        if self.refresh_meta is False:
            self._concepts = self._structure_get("concepts")
            if self._concepts:
                self.xml_dsd.concepts = self._concepts
                logger.info("load structure [concepts] from metadata for url[%s]" % url)

        if not self._concepts:
            download = Downloader(url=url,
                                  filename="conceptscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._concepts = self.xml_dsd.concepts
            self._structure_put("concepts", url, **self._concepts)

    def load_datasets_first(self):
        self._load_structure_datatree()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build the data_tree from the structure data"""

        self._load_structure_datatree()

        categories = []

        position = 0
        for category_code, category in self._categoryschemes.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self._categorisations_categories:
                categorisation_ids = self._categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self._categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    #dataset = self.xml_dsd.dataflows[dataflow_id]
                    if dataflow_id not in self._dataflows:
                        logger.critical("dataflow not found [%s]" % dataflow_id)
                        continue
                    dataset = self._dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure_dataflows()
        self._load_structure_concepts()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           fetcher=self)
        dataset.last_update = clean_datetime()

        insee_data = INSEE_Data(dataset)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()

    def get_calendar(self):
        datasets = {d['name']: d['dataset_code'] for d in self.datasets_list()}

        DATEEXP = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+")
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'

        d = pq(url=url, parser='html')

        for li in d('div#contenu')('ul.liens')("li.princ-ind"):
            try:
                # April 21, 2016 08:45 - INSEE
                text = pq(li)("p.info")[0].text
                _date = datetime.strptime(DATEEXP.match(text).group(),
                                          '%B %d, %Y %H:%M')

                #/en/themes/indicateur.asp?id=105
                url1 = "http://www.insee.fr%s" % pq(li)("a")[0].get("href")
                page2 = pq(url=url1, parser='html')

                # 'http://www.bdm.insee.fr/bdm2/choixCriteres.action?request_locale=en&codeGroupe=1007'
                url2 = page2("div#savoirplus")('p')('a')[0].get("href")
                page3 = pq(url=url2, parser='html')

                #telechargeSDMX-ML?lien=CLIMAT-AFFAIRES&groupeLibc=CLIMAT-AFFAIRES
                dataset_code = page3("a#exportSDMX")[0].get("href").split("=")[-1]
                #print("dataset_code : ", dataset_code)

                if dataset_code in datasets:
                    yield {
                        'action': "update-dataset",
                        "kwargs": {"provider_name": self.provider_name,
                                   "dataset_code": dataset_code},
                        "period_type": "date",
                        "period_kwargs": {
                            # run 2 minutes after the announced release time;
                            # timedelta avoids minute overflow
                            # (needs: from datetime import timedelta)
                            "run_date": _date + timedelta(minutes=2),
                            "timezone": 'Europe/Paris'
                        }
                    }
            except Exception as err:
                logger.exception(err)
class ECB_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()

        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)
        self.xml_dsd.concepts = self.fetcher._concepts

        self._load()

        self.rows = self._get_data_by_dimension()

    def _load(self):

        url = "http://sdw-wsrest.ecb.int/service/datastructure/ECB/%s?references=all" % self.dsd_id
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dsd-%s.xml" % self.dataset_code,
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.fetcher.use_existing_file)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                             self.provider_name,
                                                             self.dataset_code)

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            # Build an SDMX key with the selected value at `position`
            # and every other dimension left empty (wildcard).
            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)

            # copy to avoid mutating the shared module-level headers dict
            headers = dict(SDMX_DATA_HEADERS)

            last_modified = None
            if self.dataset.metadata and "Last-Modified" in self.dataset.metadata:
                headers["If-Modified-Since"] = self.dataset.metadata["Last-Modified"]
                last_modified = self.dataset.metadata["Last-Modified"]

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  headers=headers,
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)

            if response.status_code == HTTP_ERROR_NOT_MODIFIED:
                msg = "Reject dataset updated for provider[%s] - dataset[%s] - update-date[%s]"
                logger.warning(msg % (self.provider_name,
                                      self.dataset_code,
                                      last_modified))
                continue
            elif response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response.status_code >= 400:
                response.raise_for_status()

            if "Last-Modified" in response.headers:
                if not self.dataset.metadata:
                    self.dataset.metadata = {}
                self.dataset.metadata["Last-Modified"] = response.headers["Last-Modified"]

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd, self.dataset_code)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def clean_field(self, bson):
        bson = super().clean_field(bson)
        bson["attributes"].pop("TITLE", None)
        bson["attributes"].pop("TITLE_COMPL", None)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        bson["last_update"] = self.dataset.last_update
        return bson
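# Illustrative standalone sketch of the conditional-download pattern used in
# _get_data_by_dimension above: send the stored Last-Modified value back as
# If-Modified-Since, and treat a 304 (HTTP_ERROR_NOT_MODIFIED) as "skip, data
# unchanged". Plain requests is used here; the Downloader wrapper is assumed to
# do the equivalent.
import requests

def fetch_if_modified(url, last_modified=None):
    headers = {}
    if last_modified:
        headers["If-Modified-Since"] = last_modified
    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        return None, last_modified  # not modified: nothing to process
    response.raise_for_status()
    return response.content, response.headers.get("Last-Modified")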
class ECB_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()
        self.last_modified = None

        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]
        self.agency_id = self.fetcher._dataflows[self.dataset_code]["attrs"].get("agencyID")

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)
        #self.xml_dsd.concepts = self.fetcher._concepts

        self._load()

        self.rows = self._get_data_by_dimension()

    def _load(self):

        url = "http://sdw-wsrest.ecb.int/service/datastructure/%s/%s?references=all" % (self.agency_id, self.dsd_id)
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dsd-%s.xml" % self.dataset_code,
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.fetcher.use_existing_file)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            # http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
            if not self._is_good_url(url,
                                     good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
                logger.warning("bypass url[%s]" % url)
                continue

            headers = SDMX_DATA_HEADERS

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  headers=headers,
                                  use_existing_file=self.fetcher.use_existing_file,
                                  #client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            else:
                continue

            # cache the result, then honor no-result / error statuses
            # (in the original these checks were dead elif branches)
            if response:
                self._add_url_cache(url, response.status_code)

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

        yield None, None

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd, self.dataset_code)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def clean_field(self, bson):
        bson["attributes"].pop("TITLE", None)
        bson["attributes"].pop("TITLE_COMPL", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        bson["last_update"] = self.dataset.last_update
        return bson
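# Illustrative sketch (assumption): judging by the inline loop in the older
# ECB_Data._get_data_by_dimension above, get_key_for_dimension builds a partial
# SDMX series key with one dimension filled in and every other position left
# empty, so the REST query matches all values of the remaining dimensions.
def sketch_key_for_dimension(count_dimensions, position, dimension_value):
    parts = []
    for i in range(count_dimensions):
        if i == position:
            parts.append(dimension_value)  # the selected dimension value
        else:
            parts.append(".")              # empty slot: wildcard
    return "".join(parts)

# With 4 dimensions and the value "A" at position 0:
# sketch_key_for_dimension(4, 0, "A") == "A..."
# -> http://sdw-wsrest.ecb.int/service/data/IEAQ/A...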
class INSEE(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='National Institute of Statistics and Economic Studies',
                                  version=VERSION,
                                  region='France',
                                  website='http://www.insee.fr',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()

    def _add_metadata(self):
        return
        #TODO:
        self.provider.metadata = {
            "web": {
                "remote_series": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(key)s",
                "remote_datasets": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(dataset_code)s",
                "remote_category": None,
            }
        }

    def _load_structure(self, force=False):

        if self._dataflows and not force:
            return

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="conceptscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build the data_tree from the structure data"""

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           last_update=clean_datetime(),
                           fetcher=self)

        query = {'provider_name': self.provider_name,
                 "dataset_code": dataset_code}
        dataset_doc = self.db[constants.COL_DATASETS].find_one(query)

        insee_data = INSEE_Data(dataset, dataset_doc=dataset_doc)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()

    def get_calendar(self):
        """Parse the agenda of new releases and schedule jobs"""

        name_list = {d['name']: d['dataset_code'] for d in self.datasets_list()}

        DATEEXP = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+")
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'
        page = download_page(url)
        agenda = etree.HTML(page)
        ul = agenda.find('.//div[@id="contenu"]').find('.//ul[@class="liens"]')
        for li in ul.iterfind('li'):
            text = li.find('p[@class="info"]').text
            _date = datetime.strptime(DATEEXP.match(text).group(),
                                      '%B %d, %Y %H:%M')
            href = li.find('.//a').get('href')
            groups = self._parse_theme(urljoin('http://www.insee.fr', href))
            for group in groups:
                group_info = self._parse_group_page(group['url'])
                yield {
                    'action': "update_node",
                    "kwargs": {"provider_name": self.provider_name,
                               "dataset_code": name_list[group_info['name']]},
                    "period_type": "date",
                    "period_kwargs": {
                        # run 5 minutes after the announced release time;
                        # timedelta avoids minute overflow
                        # (needs: from datetime import timedelta)
                        "run_date": _date + timedelta(minutes=5),
                        # pytz.country_timezones('fr') returns a list;
                        # take the first entry ('Europe/Paris')
                        "timezone": pytz.country_timezones('fr')[0]
                    }
                }

    def _parse_theme(self, url):
        """Find the updated code group and its URL"""
        page = download_page(url)
        theme = etree.HTML(page)
        p = theme.find('.//div[@id="savoirplus"]').find('p')
        groups = []
        for a in p.iterfind('.//a'):
            groups.append({'code': a.text[1:], 'url': a.get('href')})
        return groups

    def _parse_group_page(self, url):
        """Find the updated dataset code"""
        page = download_page(url)
        group = etree.HTML(page)
        div = group.find('.//div[@id="contenu"]')
        name = div.find('.//h1').text
        # this will be useful if we change the way we download INSEE data
        url = div.find('.//a[@id="exportSDMX"]').get('href')
        return {'name': name, 'url': url}
class ECB(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='European Central Bank',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://www.ecb.europa.eu',
                                  terms_of_use='https://www.ecb.europa.eu/home/disclaimer/html/index.en.html',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None

        #self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree"""

        if self._dataflows and not force:
            return

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def build_data_tree(self):

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning("dataflow[%s] is not in xml_dsd.dataflows" % (dataflow_id))
                        continue
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            if len(cat["datasets"]) > 0:
                categories.append(cat)

        return categories

    def _parse_agenda(self):

        download = utils.Downloader(store_filepath=self.store_path,
                                    url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                                    filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)

        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():

            if entry['dataflow_key'] in datasets:

                scheduled_date = entry.pop("scheduled_date")
                run_date = datetime.strptime(scheduled_date,
                                             "%d/%m/%Y %H:%M CET")

                yield {
                    'action': 'update-dataset',
                    'kwargs': {'provider_name': self.provider_name,
                               'dataset_code': entry['dataflow_key']},
                    'period_type': 'date',
                    'period_kwargs': {'run_date': run_date,
                                      'timezone': 'CET'}
                }

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           fetcher=self)
        dataset.last_update = utils.clean_datetime()

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()
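# Hypothetical usage sketch: get_calendar() yields plain dicts (action, kwargs,
# period_type, period_kwargs) that a scheduler outside this code is presumably
# expected to turn into dated jobs. As above, the bare ECB() call assumes the
# Fetcher base class needs no further arguments here.
if __name__ == "__main__":
    fetcher = ECB()
    for job in fetcher.get_calendar():
        print(job["action"], job["kwargs"],
              "at", job["period_kwargs"]["run_date"])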
class ECB(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='European Central Bank',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://www.ecb.europa.eu',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None

        self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree"""

        if self._dataflows and not force:
            return

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def build_data_tree(self):

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning("dataflow[%s] is not in xml_dsd.dataflows" % (dataflow_id))
                        continue
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def _parse_agenda(self):

        download = utils.Downloader(store_filepath=self.store_path,
                                    url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                                    filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)

        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {
                    'action': 'update_node',
                    'kwargs': {'provider_name': self.provider_name,
                               'dataset_code': entry['dataflow_key']},
                    'period_type': 'date',
                    'period_kwargs': {
                        'run_date': datetime.strptime(entry['scheduled_date'],
                                                      "%d/%m/%Y %H:%M CET"),
                        'timezone': pytz.timezone('CET')
                    }
                }

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=utils.clean_datetime(),
                           fetcher=self)

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()