def __init__(self, dataset, dataset_doc=None):
    """
    :param Datasets dataset: Datasets instance
    """
    super().__init__(dataset)
    self.dataset_doc = dataset_doc
    self.store_path = self.get_store_path()

    # TODO: get this info from the DSD instead of going through dataflows
    self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
    self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

    if self.dataset_doc and self.dataset_doc["enable"]:
        #self.last_update = self.dataset_doc["last_update"]
        self.last_update = self.dataset_doc["download_last"]
    else:
        self.last_update = self.dataset.download_last  #self.dataset.last_update

    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.fetcher.xml_sdmx)
    self.xml_dsd.concepts = self.fetcher._concepts
    self.xml_dsd.codelists = self.fetcher._codelists
    self._load_dsd()

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    self.rows = self._get_data_by_dimension()
class INSEE_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()

        if "series_last_update" not in self.dataset.metadata:
            self.dataset.metadata["series_last_update"] = {}

        # TODO: get this info from the DSD instead of going through dataflows
        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.fetcher.xml_sdmx)
        self.xml_dsd.concepts = self.fetcher._concepts
        self.xml_dsd.codelists = self.fetcher._codelists
        self._load_dsd()

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()

    def _load_dsd_by_element(self):
        # FIXME: codelists and concepts missing?
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _load_dsd(self):
        """
        TODO: there is one DSD per group of series (around 400):
        - one downloaded DSD is shared by several datasets
        - 668 datasets in total
        """
        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _set_dataset(self):
        dataset = dataset_converter(self.xml_dsd, self.dataset_code,
                                    dsd_id=self.dsd_id)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)

        count_dimensions = len(dimension_keys)

        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (
            choice, _key, len(dimension_values),
            self.provider_name, self.dataset_code))

        for dimension_value in dimension_values:
            # For each value of the selected dimension, build a URL key
            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,  # NOT USED FOR INSEE
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if response is not None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            else:
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _is_updated(self, bson):
        """Verify whether the series has changed.

        Return True if the series must be updated,
        False if it is already up to date.
        """
        if bson["key"] not in self.dataset.metadata["series_last_update"]:
            self.dataset.metadata["series_last_update"][bson["key"]] = bson.get('last_update')
            return True

        last_update = self.dataset.metadata["series_last_update"][bson["key"]]
        series_updated = bson.get('last_update')
        if not series_updated:
            return True
        if series_updated > last_update:
            return True

        return False

    def clean_field(self, bson):
        bson["attributes"].pop("IDBANK", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        if not self._is_updated(bson):
            raise errors.RejectUpdatedSeries(provider_name=self.provider_name,
                                             dataset_code=self.dataset_code,
                                             key=bson.get('key'))
        return bson
class INSEE(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='National Institute of Statistics and Economic Studies',
                                  version=VERSION,
                                  region='France',
                                  website='http://www.insee.fr',
                                  terms_of_use='http://www.insee.fr/en/service/default.asp?page=rediffusion/rediffusion.htm',
                                  fetcher=self)

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name,
                                store_filepath=self.store_path,
                                use_existing_file=self.use_existing_file)

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._categorisations_categories = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()

    def _load_structure_dataflows(self, force=False):

        if self._dataflows and not force:
            return

        self.provider_verify()

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

        if self.refresh_meta is False:
            self._dataflows = self._structure_get("dataflows")
            if self._dataflows:
                self.xml_dsd.dataflows = self._dataflows
                logger.info("load structure [dataflows] from metadata for url[%s]" % url)
                return

        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        self._structure_put("dataflows", url, **self._dataflows)

    def _load_structure_datatree(self, force=False):

        if self._categoryschemes and self._categorisations and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categoryschemes = self._structure_get("categoryschemes")
            if self._categoryschemes:
                logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
        """
        if not self._categoryschemes:
            download = Downloader(url=url,
                                  filename="categoryscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categoryschemes = self.xml_dsd.categories
            #self._structure_put("categoryschemes", url, **self._categoryschemes)

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categorisations = self._structure_get("categorisation")
            if self._categorisations:
                self._categorisations_categories = self._structure_get("categorisations_categories")
                logger.info("load structure [categorisation] from metadata for url[%s]" % url)
        """
        if not self._categorisations:
            download = Downloader(url=url,
                                  filename="categorisation.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categorisations = self.xml_dsd.categorisations
            self._categorisations_categories = self.xml_dsd.categorisations_categories
            #self._structure_put("categorisation", url, **self._categorisations)
            #self._structure_put("categorisations_categories", url, **self._categorisations_categories)

    def _load_structure_concepts(self, force=False):

        if self._dataflows and self._concepts and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name

        if self.refresh_meta is False:
            self._concepts = self._structure_get("concepts")
            if self._concepts:
                self.xml_dsd.concepts = self._concepts
                logger.info("load structure [concepts] from metadata for url[%s]" % url)

        if not self._concepts:
            download = Downloader(url=url,
                                  filename="conceptscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._concepts = self.xml_dsd.concepts
            self._structure_put("concepts", url, **self._concepts)

    def load_datasets_first(self):
        self._load_structure_datatree()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build the data_tree from the structure data"""

        self._load_structure_datatree()

        categories = []

        position = 0
        for category_code, category in self._categoryschemes.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self._categorisations_categories:
                categorisation_ids = self._categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self._categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    #dataset = self.xml_dsd.dataflows[dataflow_id]
                    if dataflow_id not in self._dataflows:
                        logger.critical("dataflow not found [%s]" % dataflow_id)
                        continue
                    dataset = self._dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure_dataflows()
        self._load_structure_concepts()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           fetcher=self)
        dataset.last_update = clean_datetime()

        insee_data = INSEE_Data(dataset)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()

    def get_calendar(self):
        datasets = {d['name']: d['dataset_code'] for d in self.datasets_list()}

        DATEEXP = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+")
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'

        d = pq(url=url, parser='html')

        for li in d('div#contenu')('ul.liens')("li.princ-ind"):
            try:
                # April 21, 2016 08:45 - INSEE
                text = pq(li)("p.info")[0].text
                _date = datetime.strptime(DATEEXP.match(text).group(),
                                          '%B %d, %Y %H:%M')

                #/en/themes/indicateur.asp?id=105
                url1 = "http://www.insee.fr%s" % pq(li)("a")[0].get("href")
                page2 = pq(url=url1, parser='html')

                # 'http://www.bdm.insee.fr/bdm2/choixCriteres.action?request_locale=en&codeGroupe=1007'
                url2 = page2("div#savoirplus")('p')('a')[0].get("href")
                page3 = pq(url=url2, parser='html')

                #telechargeSDMX-ML?lien=CLIMAT-AFFAIRES&groupeLibc=CLIMAT-AFFAIRES
                dataset_code = page3("a#exportSDMX")[0].get("href").split("=")[-1]
                #print("dataset_code : ", dataset_code)

                if dataset_code in datasets:
                    yield {
                        'action': "update-dataset",
                        "kwargs": {"provider_name": self.provider_name,
                                   "dataset_code": dataset_code},
                        "period_type": "date",
                        "period_kwargs": {
                            # run 2 minutes after the announced release time;
                            # timedelta avoids minute overflow
                            # (needs: from datetime import timedelta)
                            "run_date": _date + timedelta(minutes=2),
                            "timezone": 'Europe/Paris'
                        }
                    }
            except Exception as err:
                logger.exception(err)
class ECB_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()

        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)
        self.xml_dsd.concepts = self.fetcher._concepts

        self._load()

        self.rows = self._get_data_by_dimension()

    def _load(self):

        url = "http://sdw-wsrest.ecb.int/service/datastructure/ECB/%s?references=all" % self.dsd_id
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dsd-%s.xml" % self.dataset_code,
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.fetcher.use_existing_file)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                             self.provider_name,
                                                             self.dataset_code)

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            # Build an SDMX key with the selected value at `position`
            # and every other dimension left empty (wildcard).
            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)

            # copy to avoid mutating the shared module-level headers dict
            headers = dict(SDMX_DATA_HEADERS)

            last_modified = None
            if self.dataset.metadata and "Last-Modified" in self.dataset.metadata:
                headers["If-Modified-Since"] = self.dataset.metadata["Last-Modified"]
                last_modified = self.dataset.metadata["Last-Modified"]

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  headers=headers,
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)

            if response.status_code == HTTP_ERROR_NOT_MODIFIED:
                msg = "Reject dataset updated for provider[%s] - dataset[%s] - update-date[%s]"
                logger.warning(msg % (self.provider_name,
                                      self.dataset_code,
                                      last_modified))
                continue
            elif response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response.status_code >= 400:
                response.raise_for_status()

            if "Last-Modified" in response.headers:
                if not self.dataset.metadata:
                    self.dataset.metadata = {}
                self.dataset.metadata["Last-Modified"] = response.headers["Last-Modified"]

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd, self.dataset_code)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def clean_field(self, bson):
        bson = super().clean_field(bson)
        bson["attributes"].pop("TITLE", None)
        bson["attributes"].pop("TITLE_COMPL", None)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        bson["last_update"] = self.dataset.last_update
        return bson
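# Illustrative standalone sketch of the conditional-download pattern used in
# _get_data_by_dimension above: send the stored Last-Modified value back as
# If-Modified-Since, and treat a 304 (HTTP_ERROR_NOT_MODIFIED) as "skip, data
# unchanged". Plain requests is used here; the Downloader wrapper is assumed to
# do the equivalent.
import requests

def fetch_if_modified(url, last_modified=None):
    headers = {}
    if last_modified:
        headers["If-Modified-Since"] = last_modified
    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        return None, last_modified  # not modified: nothing to process
    response.raise_for_status()
    return response.content, response.headers.get("Last-Modified")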
class ECB_Data(SeriesIterator):

    def __init__(self, dataset):
        """
        :param Datasets dataset: Datasets instance
        """
        super().__init__(dataset)
        self.store_path = self.get_store_path()
        self.last_modified = None

        self.dataset.name = self.fetcher._dataflows[self.dataset_code]["name"]
        self.dsd_id = self.fetcher._dataflows[self.dataset_code]["dsd_id"]
        self.agency_id = self.fetcher._dataflows[self.dataset_code]["attrs"].get("agencyID")

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)
        #self.xml_dsd.concepts = self.fetcher._concepts

        self._load()

        self.rows = self._get_data_by_dimension()

    def _load(self):

        url = "http://sdw-wsrest.ecb.int/service/datastructure/%s/%s?references=all" % (self.agency_id, self.dsd_id)
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dsd-%s.xml" % self.dataset_code,
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.fetcher.use_existing_file)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()

    def _get_dimensions_from_dsd(self):
        return get_dimensions_from_dsd(self.xml_dsd, self.provider_name,
                                       self.dataset_code)

    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            # http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
            if not self._is_good_url(url,
                                     good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
                logger.warning("bypass url[%s]" % url)
                continue

            headers = SDMX_DATA_HEADERS

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  headers=headers,
                                  use_existing_file=self.fetcher.use_existing_file,
                                  #client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            else:
                continue

            # cache the result, then honor no-result / error statuses
            # (in the original these checks were dead elif branches)
            if response:
                self._add_url_cache(url, response.status_code)

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

        yield None, None

    def _set_dataset(self):

        dataset = dataset_converter(self.xml_dsd, self.dataset_code)
        self.dataset.dimension_keys = dataset["dimension_keys"]
        self.dataset.attribute_keys = dataset["attribute_keys"]
        self.dataset.concepts = dataset["concepts"]
        self.dataset.codelists = dataset["codelists"]

    def clean_field(self, bson):
        bson["attributes"].pop("TITLE", None)
        bson["attributes"].pop("TITLE_COMPL", None)
        bson = super().clean_field(bson)
        return bson

    def build_series(self, bson):
        self.dataset.add_frequency(bson["frequency"])
        bson["last_update"] = self.dataset.last_update
        return bson
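# Illustrative sketch (assumption): judging by the inline loop in the older
# ECB_Data._get_data_by_dimension above, get_key_for_dimension builds a partial
# SDMX series key with one dimension filled in and every other position left
# empty, so the REST query matches all values of the remaining dimensions.
def sketch_key_for_dimension(count_dimensions, position, dimension_value):
    parts = []
    for i in range(count_dimensions):
        if i == position:
            parts.append(dimension_value)  # the selected dimension value
        else:
            parts.append(".")              # empty slot: wildcard
    return "".join(parts)

# With 4 dimensions and the value "A" at position 0:
# sketch_key_for_dimension(4, 0, "A") == "A..."
# -> http://sdw-wsrest.ecb.int/service/data/IEAQ/A...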
class INSEE(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='National Institute of Statistics and Economic Studies',
                                  version=VERSION,
                                  region='France',
                                  website='http://www.insee.fr',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()

    def _add_metadata(self):
        return
        #TODO:
        self.provider.metadata = {
            "web": {
                "remote_series": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(key)s",
                "remote_datasets": "http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=%(dataset_code)s",
                "remote_category": None,
            }
        }

    def _load_structure(self, force=False):

        if self._dataflows and not force:
            return

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
        download = Downloader(url=url,
                              filename="conceptscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()

    def build_data_tree(self):
        """Build the data_tree from the structure data"""

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           last_update=clean_datetime(),
                           fetcher=self)

        query = {'provider_name': self.provider_name,
                 "dataset_code": dataset_code}
        dataset_doc = self.db[constants.COL_DATASETS].find_one(query)

        insee_data = INSEE_Data(dataset, dataset_doc=dataset_doc)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()

    def get_calendar(self):
        """Parse the agenda of new releases and schedule jobs"""

        name_list = {d['name']: d['dataset_code'] for d in self.datasets_list()}

        DATEEXP = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)[ ]+\d+[ ]*,[ ]+\d+[ ]+\d+:\d+")
        url = 'http://www.insee.fr/en/service/agendas/agenda.asp'
        page = download_page(url)
        agenda = etree.HTML(page)
        ul = agenda.find('.//div[@id="contenu"]').find('.//ul[@class="liens"]')
        for li in ul.iterfind('li'):
            text = li.find('p[@class="info"]').text
            _date = datetime.strptime(DATEEXP.match(text).group(),
                                      '%B %d, %Y %H:%M')
            href = li.find('.//a').get('href')
            groups = self._parse_theme(urljoin('http://www.insee.fr', href))
            for group in groups:
                group_info = self._parse_group_page(group['url'])
                yield {
                    'action': "update_node",
                    "kwargs": {"provider_name": self.provider_name,
                               "dataset_code": name_list[group_info['name']]},
                    "period_type": "date",
                    "period_kwargs": {
                        # run 5 minutes after the announced release time;
                        # timedelta avoids minute overflow
                        # (needs: from datetime import timedelta)
                        "run_date": _date + timedelta(minutes=5),
                        # pytz.country_timezones('fr') returns a list;
                        # take the first entry ('Europe/Paris')
                        "timezone": pytz.country_timezones('fr')[0]
                    }
                }

    def _parse_theme(self, url):
        """Find the updated code group and its URL"""
        page = download_page(url)
        theme = etree.HTML(page)
        p = theme.find('.//div[@id="savoirplus"]').find('p')
        groups = []
        for a in p.iterfind('.//a'):
            groups.append({'code': a.text[1:], 'url': a.get('href')})
        return groups

    def _parse_group_page(self, url):
        """Find the updated dataset code"""
        page = download_page(url)
        group = etree.HTML(page)
        div = group.find('.//div[@id="contenu"]')
        name = div.find('.//h1').text
        # this will be useful if we change the way we download INSEE data
        url = div.find('.//a[@id="exportSDMX"]').get('href')
        return {'name': name, 'url': url}
class ECB(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='European Central Bank',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://www.ecb.europa.eu',
                                  terms_of_use='https://www.ecb.europa.eu/home/disclaimer/html/index.en.html',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None

        #self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree"""

        if self._dataflows and not force:
            return

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def build_data_tree(self):

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning("dataflow[%s] is not in xml_dsd.dataflows" % (dataflow_id))
                        continue
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            if len(cat["datasets"]) > 0:
                categories.append(cat)

        return categories

    def _parse_agenda(self):

        download = utils.Downloader(store_filepath=self.store_path,
                                    url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                                    filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)

        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():

            if entry['dataflow_key'] in datasets:

                scheduled_date = entry.pop("scheduled_date")
                run_date = datetime.strptime(scheduled_date,
                                             "%d/%m/%Y %H:%M CET")

                yield {
                    'action': 'update-dataset',
                    'kwargs': {'provider_name': self.provider_name,
                               'dataset_code': entry['dataflow_key']},
                    'period_type': 'date',
                    'period_kwargs': {'run_date': run_date,
                                      'timezone': 'CET'}
                }

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           fetcher=self)
        dataset.last_update = utils.clean_datetime()

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()
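# Hypothetical usage sketch: get_calendar() yields plain dicts (action, kwargs,
# period_type, period_kwargs) that a scheduler outside this code is presumably
# expected to turn into dated jobs. As above, the bare ECB() call assumes the
# Fetcher base class needs no further arguments here.
if __name__ == "__main__":
    fetcher = ECB()
    for job in fetcher.get_calendar():
        print(job["action"], job["kwargs"],
              "at", job["period_kwargs"]["run_date"])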
class ECB(Fetcher):

    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='European Central Bank',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://www.ecb.europa.eu',
                                  fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None

        self.requests_client = requests.Session()

    def _load_structure(self, force=False):
        """Load structure and build data_tree"""

        if self._dataflows and not force:
            return

        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        url = "http://sdw-wsrest.ecb.int/service/dataflow/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="dataflow.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://sdw-wsrest.ecb.int/service/categoryscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categoryscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://sdw-wsrest.ecb.int/service/categorisation/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="categorisation.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations

        url = "http://sdw-wsrest.ecb.int/service/conceptscheme/%s" % self.provider_name
        download = utils.Downloader(store_filepath=self.store_path,
                                    url=url,
                                    filename="conceptscheme.xml",
                                    headers=SDMX_METADATA_HEADERS,
                                    use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts

    def build_data_tree(self):

        self._load_structure()

        categories = []

        position = 0
        for category_code, category in self.xml_dsd.categories.items():
            parent_ids = self.xml_dsd.iter_parent_category_id(category)

            parent = None
            all_parents = None
            if parent_ids:
                all_parents = parent_ids.copy()
                parent = parent_ids.pop()
            else:
                position += 1

            cat = {
                "provider_name": self.provider_name,
                "category_code": category_code,
                "name": category["name"],
                "position": position,
                "parent": parent,
                "all_parents": all_parents,
                "datasets": [],
                "doc_href": None,
                "metadata": {}
            }
            if category_code in self.xml_dsd.categorisations_categories:
                categorisation_ids = self.xml_dsd.categorisations_categories[category_code]
                for categorisation_id in categorisation_ids:
                    categorisation = self.xml_dsd.categorisations[categorisation_id]
                    dataflow_id = categorisation["dataflow"]["id"]
                    if dataflow_id not in self.xml_dsd.dataflows:
                        logger.warning("dataflow[%s] is not in xml_dsd.dataflows" % (dataflow_id))
                        continue
                    dataset = self.xml_dsd.dataflows[dataflow_id]
                    cat["datasets"].append({
                        "dataset_code": dataset['id'],
                        "name": dataset["name"],
                        "last_update": None,
                        "metadata": {"dsd_id": dataset["dsd_id"]}
                    })
            categories.append(cat)

        return categories

    def _parse_agenda(self):

        download = utils.Downloader(store_filepath=self.store_path,
                                    url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                                    filename="statscall.html")
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            agenda = lxml.html.parse(fp)
        self.for_delete.append(filepath)

        regex_date = re.compile(r"Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n', '')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self._parse_agenda():
            if entry['dataflow_key'] in datasets:
                yield {
                    'action': 'update_node',
                    'kwargs': {'provider_name': self.provider_name,
                               'dataset_code': entry['dataflow_key']},
                    'period_type': 'date',
                    'period_kwargs': {
                        'run_date': datetime.strptime(entry['scheduled_date'],
                                                      "%d/%m/%Y %H:%M CET"),
                        'timezone': pytz.timezone('CET')
                    }
                }

    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=utils.clean_datetime(),
                           fetcher=self)

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()

    def load_datasets_first(self):
        self._load_structure()
        return super().load_datasets_first()