def test_update_database(self):

        # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_update_database

        self._collections_is_empty()

        f = Fetcher(provider_name="p1", 
                    db=self.db)

        p = Providers(name="p1", 
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)
        id = p.update_database()
        self.assertIsNotNone(id)
        self.assertIsInstance(id, ObjectId)
        self.assertIsNotNone(self.db[constants.COL_PROVIDERS].find_one({'_id': ObjectId(id)}))
        
        bson = self.db[constants.COL_PROVIDERS].find_one({"name": "p1"})
        self.assertIsNotNone(bson)
        
        self.assertEqual(bson["name"], "p1")
        self.assertEqual(bson["website"], "http://www.example.com")

    def test_version_field(self):

        # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_version_field

        self._collections_is_empty()

        f = Fetcher(provider_name="p1", 
                    db=self.db)

        with self.assertRaises(MultipleInvalid):
            Providers(name="p1", 
                      long_name="Provider One",
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)

        p = Providers(name="p1", 
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)
        p.update_database()        

        self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)
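
The two tests above pin down the provider-registration contract: version is a required field (omitting it raises MultipleInvalid) and update_database() upserts exactly one document per provider name. A minimal sketch of that flow, assuming the Fetcher/Providers API lives in dlstats.fetchers._commons (as the test paths suggest) and that db is a pymongo database like the one in the test fixtures:

from dlstats import constants                       # assumed import, as used in the tests
from dlstats.fetchers._commons import Fetcher, Providers  # assumed module path

f = Fetcher(provider_name="p1", db=db)              # db: pymongo database (fixture assumption)
p = Providers(name="p1",
              long_name="Provider One",
              version=1,                            # required: omitting it raises MultipleInvalid
              region="Dreamland",
              website="http://www.example.com",
              fetcher=f)
provider_id = p.update_database()                   # upserts; returns the document's ObjectId
assert db[constants.COL_PROVIDERS].count() == 1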
Example #3
    def __init__(self, db=None, **kwargs):        
        super().__init__(provider_name='DESTATIS', db=db, **kwargs)
        
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Statistisches Bundesamt',
                                      version=VERSION,
                                      region='Germany',
                                      website='https://www.destatis.de',
                                      fetcher=self)

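        # refresh the stored provider document whenever the fetcher's VERSION constant changes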
        if self.provider.version != VERSION:
            self.provider.update_database()
Example #4
    def __init__(self, **kwargs):
        super().__init__(provider_name='INSEE', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='National Institute of Statistics and Economic Studies',
            version=VERSION,
            region='France',
            website='http://www.insee.fr',
            terms_of_use=
            'http://www.insee.fr/en/service/default.asp?page=rediffusion/rediffusion.htm',
            fetcher=self)

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name,
                                store_filepath=self.store_path,
                                use_existing_file=self.use_existing_file)

        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)

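        # structure caches: initialized empty here and, presumably, filled lazily while parsing the SDMX structures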
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._categorisations_categories = None
        self._concepts = None
        self._codelists = OrderedDict()

        self.requests_client = requests.Session()
Example #5
    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='DESTATIS', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Statistisches Bundesamt',
                                  version=VERSION,
                                  region='Germany',
                                  website='https://www.destatis.de',
                                  fetcher=self)
Example #6
    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='OECD', db=db, **kwargs)
        self.provider_name = 'OECD'
        self.provider = Providers(name=self.provider_name,
                                  long_name='Organisation for Economic Co-operation and Development',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.oecd.org',
                                  fetcher=self)
Example #7
    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='IMF', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name="International Monetary Fund",
                                  version=VERSION,
                                  region='world',
                                  website='http://www.imf.org/',
                                  fetcher=self)
Example #8
    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='FED', db=db, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Federal Reserve',
                                  version=VERSION,
                                  region='US',
                                  website='http://www.federalreserve.gov',
                                  fetcher=self)
Example #9
    def __init__(self, **kwargs):
        super().__init__(provider_name='DUMMY', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Dummy Fetcher',
                                  version=VERSION,
                                  region='World',
                                  website='http://www.example.org',
                                  fetcher=self)
Example #10
    def __init__(self, db=None):

        super().__init__(provider_name='WorldBank', db=db)

        self.provider = Providers(name=self.provider_name,
                                  long_name='World Bank',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.worldbank.org/',
                                  fetcher=self)
Example #11
    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)
        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION,
                                  region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html',
                                  fetcher=self)
        self.datasets_dict = {}
        self.selected_codes = ['GDP.Amount']
Example #12
    def test_build_data_tree(self):

        # nosetests -s -v dlstats.tests.fetchers.test_ecb:FetcherTestCase.test_build_data_tree

        self._register_urls_data_tree()

        self.fetcher.build_data_tree()

        # self.maxDiff = None

        provider = self.fetcher.provider
        self.assertEqual(provider.count_data_tree(), 12)

        """
        pprint(provider.data_tree)
        with open(DATA_TREE_FP, "w") as fp:
            json.dump(provider.data_tree, fp, sort_keys=False)
        """

        new_provider = Providers(fetcher=self.fetcher, **provider.bson)

        with open(DATA_TREE_FP) as fp:
            local_data_tree = json.load(fp, object_pairs_hook=OrderedDict)
            new_provider.data_tree = local_data_tree
            # self.assertEqual(provider.data_tree, new_provider.data_tree)

        filter_datasets = provider.datasets(category_filter="ECB.MOBILE_NAVI.06")
        self.assertEqual(len(filter_datasets), 6)
        self.assertEqual(filter_datasets[0]["dataset_code"], "BOP")
        self.assertEqual(filter_datasets[-1]["dataset_code"], "TRD")

        for d in provider.data_tree:
            schemas.data_tree_schema(d)

        provider.update_database()

        doc = self.db[constants.COL_PROVIDERS].find_one({"name": self.fetcher.provider_name})
        self.assertIsNotNone(doc)
        for i, d in enumerate(doc["data_tree"]):
            self.assertEqual(doc["data_tree"][i], provider.data_tree[i])

        count = len(self.fetcher.datasets_list())
        self.assertEqual(count, DATAFLOW_COUNT)
Example #13
    def __init__(self, **kwargs):
        super().__init__(provider_name='BDF', version=2, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Banque de France',
                                  version=2,
                                  region='France',
                                  website='http://webstat.banque-france.fr/',
                                  fetcher=self)
        self.categories_filter = ['concept']
Example #14
    def __init__(self, **kwargs):
        super().__init__(provider_name='BIS', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Bank for International Settlements',
            version=VERSION,
            region='World',
            website='http://www.bis.org',
            terms_of_use='https://www.bis.org/terms_conditions.htm',
            fetcher=self)
Example #15
    def __init__(self, **kwargs):
        super().__init__(provider_name='ESRI', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Economic and Social Research Institute, Cabinet Office',
            version=VERSION,
            region='Japan',
            website='http://www.esri.cao.go.jp/index-e.html',
            fetcher=self)

        self.categories_filter = ['SNA']

    def test_unique_constraint(self):

        # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_unique_constraint

        self._collections_is_empty()

        f = Fetcher(provider_name="p1", 
                    db=self.db)

        p = Providers(name="p1", 
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)
        p.update_database()        

        self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 1)
        
        existing_provider = dict(name="p1")
        
        with self.assertRaises(DuplicateKeyError):
            self.db[constants.COL_PROVIDERS].insert(existing_provider)

        p = Providers(name="p2", 
                      long_name="Provider One",
                      version=1,                      
                      region="Dreamland",
                      website="http://www.example.com",
                      fetcher=f)
        p.update_database()

        self.assertEqual(self.db[constants.COL_PROVIDERS].count(), 2)
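
(The DuplicateKeyError above shows that the providers collection carries a unique index on name: a raw insert of a second document with name "p1" fails, while update_database() for the new name "p2" adds a second document.)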
Example #17
    def __init__(self, **kwargs):
        super().__init__(provider_name='IMF', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name="International Monetary Fund",
            version=VERSION,
            region='World',
            website='http://www.imf.org/',
            terms_of_use='http://www.imf.org/external/terms.htm',
            fetcher=self)

        self.requests_client = requests.Session()
Example #18
    def __init__(self, **kwargs):
        super().__init__(provider_name='OECD', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Organisation for Economic Co-operation and Development',
            version=VERSION,
            region='World',
            website='http://www.oecd.org',
            terms_of_use='http://www.oecd.org/termsandconditions/',
            fetcher=self)

        self.requests_client = requests.Session()
Example #19
    def __init__(self, **kwargs):
        super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Eurostat',
            version=VERSION,
            region='Europe',
            website='http://ec.europa.eu/eurostat',
            terms_of_use=
            'http://ec.europa.eu/eurostat/about/our-partners/copyright',
            fetcher=self)

        self.categories_filter = [
            'nama10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo',  # We harvest demo because we need demo_pjanbroad.
            'lfsi_act_q',
            'euroind',
            'pop',
            'labour',
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False
Example #20
    def __init__(self, **kwargs):
        super().__init__(provider_name='BEA', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Bureau of Economic Analysis',
            region='USA',
            version=VERSION,
            website='http://www.bea.gov',
            terms_of_use='http://www.bea.gov/about/BEAciting.htm',
            fetcher=self)

        self._datasets_settings = None
        self._current_urls = {}
Example #21
    def __init__(self, db=None):
        super().__init__(provider_name="BIS", db=db)

        if not self.provider:
            self.provider = Providers(
                name=self.provider_name,
                long_name="Bank for International Settlements",
                version=VERSION,
                region="world",
                website="http://www.bis.org",
                fetcher=self,
            )
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

    def test_add_data_tree(self):
        
        # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_add_data_tree

        f = Fetcher(provider_name="p1", is_indexes=False)

        p = Providers(name="p1",
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)
        
        self.assertEqual(len(p.data_tree), 1)
        p.data_tree[0]["category_code"] = p.name
        p.data_tree[0]["long_name"] = p.long_name
        p.data_tree[0]["website"] = p.website
        
        p.update_database()
        
        minimal_category = { 'category_code': "c0", 'name': "p1"}
        p.add_category(minimal_category)
        
        data_tree = [
             {'category_code': 'p1',
              'datasets': [],
              'description': None,
              'doc_href': 'http://www.example.com',
              'exposed': False,
              'last_update': None,
              'name': 'p1'},
             {'category_code': 'p1.c0',
              'datasets': [],
              'description': None,
              'doc_href': None,
              'exposed': False,
              'last_update': None,
              'name': 'p1'}
        ]        
        
        self.assertEqual(p.data_tree, data_tree)
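
(Note the expected tree above: add_category prefixes the provider's root code, so the minimal category c0 is stored under the dotted code p1.c0.)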
Example #23
    def __init__(self, **kwargs):
        super().__init__(provider_name='ECB', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='European Central Bank',
            version=VERSION,
            region='Europe',
            website='http://www.ecb.europa.eu',
            terms_of_use=
            'https://www.ecb.europa.eu/home/disclaimer/html/index.en.html',
            fetcher=self)

        self.xml_sdmx = None
        self.xml_dsd = None

        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
        self._concepts = None
Example #24
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='ECB', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                      long_name='European Central Bank',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://www.ecb.europa.eu',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
            
        self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
        self.sdmx.timeout = 90
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
Example #25
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='INSEE', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                      long_name='National Institute of Statistics and Economic Studies',
                                      version=VERSION,
                                      region='France',
                                      website='http://www.insee.fr',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
            
        
        self.sdmx = sdmx or Request(agency='INSEE')
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
Example #26
    def __init__(self, **kwargs):
        super().__init__(provider_name='WORLDBANK', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='World Bank',
            version=VERSION,
            region='World',
            website='http://www.worldbank.org/',
            terms_of_use='http://data.worldbank.org/summary-terms-of-use',
            fetcher=self)

        self.api_url = 'http://api.worldbank.org/v2/'

        self.requests_client = requests.Session()

        self.blacklist = [
            '13',  # Enterprise Surveys
            '26',  # Corporate scorecard # datacatalog id="89"    
            '29',  # Global Social Protection
            '31',  # Country Policy and Institutional Assessment (CPIA)
            '36',  # Statistical Capacity Indicators # datacatalog id="8"
            '37',  # LAC Equity Lab
            '41',  # Country Partnership Strategy for India
            '44',  # Readiness for Investment in Sustainable Energy (RISE)
            '45',  # INDO-DAPOER
        ]
        """
        A Exclure:
        economycoverage: WLD, EAP, ECA, LAC, MNA, SAS, SSA, HIC, LMY, IBRD, IDA
        numberofeconomies: 214
        topics: 
        mobileapp: ???
        > Les données agrégés par régions sont aussi dans les countries mais avec un id="NA" dans region
        <wb:region id="NA">Aggregates</wb:region>
        """

        self._available_countries = None
        self._available_countries_by_name = None
Example #27
    def __init__(self, **kwargs):
        super().__init__(provider_name="EUROSTAT", version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name="Eurostat",
            version=VERSION,
            region="Europe",
            website="http://ec.europa.eu/eurostat",
            terms_of_use="http://ec.europa.eu/eurostat/about/our-partners/copyright",
            fetcher=self,
        )

        self.categories_filter = [
            "nama_10",
            "namq_10",
            "nasa_10",
            "nasq_10",
            "naid_10",
            "nama",
            "namq",
            "nasa",
            "nasq",
            "gov",
            "ert",
            "irt",
            "prc",
            "bop",
            "bop_6",
            "demo_pjanbroad",
            "lfsi_act_q",
            "euroind",
            "pop",
            "labour",
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False
Example #28
    def __init__(self, db=None):
        super().__init__(provider_name='Eurostat', db=db)

        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Eurostat',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://ec.europa.eu/eurostat',
                                      fetcher=self)
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

        self.selected_codes = [
            'nama_10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo_pjanbroad',
            'lfsi_act_q'
        ]
        self.selected_datasets = {}
        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.dataset_url = None
Example #29
    def __init__(self, **kwargs):
        super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

        self.provider = Providers(name=self.provider_name,
                                  long_name='Eurostat',
                                  version=VERSION,
                                  region='Europe',
                                  website='http://ec.europa.eu/eurostat',
                                  terms_of_use='http://ec.europa.eu/eurostat/about/our-partners/copyright',
                                  fetcher=self)

        self.categories_filter = [
            'nama10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo',  # We harvest demo because we need demo_pjanbroad.
            'lfsi_act_q',
            'euroind',
            'pop',
            'labour',
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False
Example #30
class BIS(Fetcher):
    def __init__(self, db=None):
        super().__init__(provider_name="BIS", db=db)

        if not self.provider:
            self.provider = Providers(
                name=self.provider_name,
                long_name="Bank for International Settlements",
                version=VERSION,
                region="world",
                website="http://www.bis.org",
                fetcher=self,
            )
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

    def upsert_dataset(self, dataset_code):

        start = time.time()

        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown" + dataset_code)

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=DATASETS[dataset_code]["name"],
            doc_href=DATASETS[dataset_code]["doc_href"],
            fetcher=self,
        )

        fetcher_data = BIS_Data(dataset, url=DATASETS[dataset_code]["url"], filename=DATASETS[dataset_code]["filename"])

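        # skip the full load when the source file's release date has not advanced (see the else branch below)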
        if fetcher_data.is_updated():

            dataset.series.data_iterator = fetcher_data
            dataset.update_database()

            # TODO: clean datas (file temp)

            end = time.time() - start
            logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))

            self.update_metas(dataset_code)

            end = time.time() - start
            logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        else:
            logger.info(
                "upsert dataset[%s] bypassed because it is already up to date - release_date[%s]"
                % (dataset_code, fetcher_data.release_date)
            )

    def load_datasets_first(self):
        start = time.time()
        logger.info("first load fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)

        end = time.time() - start
        logger.info("first load fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):

        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category(
                {"name": dataset["name"], "category_code": category_code, "doc_href": dataset["doc_href"]}
            )
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree

    def parse_agenda(self):

        agenda = etree.HTML(get_agenda())
        table = agenda.find(".//table")
        # only one table
        rows = table[0].findall("tr")
        # skipping first row
        cells = rows[1].findall("td")
        agenda = []
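        # two leading placeholders keep the months row aligned with the (title1, title2) columns of the data rows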
        months = [None, None]

        for c in rows[1].iterfind("td"):
            content = c.find("strong")
            if content.text is None:
                content = content.find("strong")
            months.append(datetime.datetime.strptime(content.text, "%B %Y"))
        agenda.append(months)
        ir = 2

        def get_links_text(cell):
            txt = []
            for link in cell.findall("a"):
                if link.text:
                    txt.append(link.text)
            return txt

        def _get_dates(cells):
            item = []
            for c in cells:
                if c.text[0] != chr(160):
                    item.append(re.match(r"\d\d|\d", c.text).group(0))
                else:
                    item.append(None)
            return item

        while ir < len(rows):
            cells = rows[ir].findall("td")

            content = cells[0]
            if content.text is None:
                content = content.find("a")
            item = [content.text]

            if cells[0].get("rowspan") == "2":
                two_rows = True
                content = cells[1].find("a")
                item.append(content.text)
                offset = 2
            else:
                two_rows = False
                item.append(None)
                offset = 1

            item.extend(_get_dates(cells[offset:]))

            agenda.append(item)
            ir += 1

            if two_rows:
                cells = rows[ir].findall("td")
                links = get_links_text(cells[0])
                for content in links:
                    item = [item[0]]
                    item.append(content)
                    item.extend(_get_dates(cells[1:]))
                    agenda.append(item)
                ir += 1
        return agenda

    def get_calendar(self):
        agenda = self.parse_agenda()

        dataset_codes = [d["dataset_code"] for d in self.datasets_list()]

        """First line - exclude first 2 columns (title1, title2)"""
        months = agenda[0][2:]

        """All line moins first list"""
        periods = agenda[1:]

        def _get_dataset_code(title):
            for key, d in DATASETS.items():
                if title in d.get("agenda_titles", []):
                    return key
            return None

        for period in periods:
            title = period[0]
            if period[1]:
                title = "%s %s" % (title, period[1])

            dataset_code = _get_dataset_code(title)
            if not dataset_code:
                logger.info("excluding calendar action for unimplemented dataset[%s]" % title)
                continue
            if dataset_code not in dataset_codes:
                logger.info("excluding calendar action for dataset[%s]" % title)
                continue

            days = period[2:]
            scheds = [d for d in zip(months, days) if d[1] is not None]

            for date_base, day in scheds:
                yield {
                    "action": "update_node",
                    "kwargs": {"provider_name": self.provider_name, "dataset_code": dataset_code},
                    "period_type": "date",
                    "period_kwargs": {
                        "run_date": datetime.datetime(date_base.year, date_base.month, int(day), 8, 0, 0),
                        "timezone": pytz.country_timezones(AGENDA["country"]),
                    },
                }
Example #31
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""

    def __init__(self, **kwargs):
        super().__init__(provider_name="EUROSTAT", version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name="Eurostat",
            version=VERSION,
            region="Europe",
            website="http://ec.europa.eu/eurostat",
            terms_of_use="http://ec.europa.eu/eurostat/about/our-partners/copyright",
            fetcher=self,
        )

        self.categories_filter = [
            "nama_10",
            "namq_10",
            "nasa_10",
            "nasq_10",
            "naid_10",
            "nama",
            "namq",
            "nasa",
            "nasq",
            "gov",
            "ert",
            "irt",
            "prc",
            "bop",
            "bop_6",
            "demo_pjanbroad",
            "lfsi_act_q",
            "euroind",
            "pop",
            "labour",
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False

    def _is_updated_catalog(self, creation_date):

        if not self.provider.from_db:
            self.provider_verify()

        if not self.provider.metadata:
            self.provider.metadata = {}

        if not "creation_date" in self.provider.metadata:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

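        # a newer creationDate in the table of contents means the upstream catalog has changed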
        if creation_date > self.provider.metadata["creation_date"]:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        return False

    def build_data_tree(self):
        """Builds the data tree
        """

        download = Downloader(
            url=self.url_table_of_contents,
            filename="table_of_contents.xml",
            store_filepath=self.store_path,
            use_existing_file=self.use_existing_file,
        )
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath, events=["end"], tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):

            position += 1

            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        "all_parents": all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None,
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(datetime.strptime(creation_date_str, "%Y%m%dT%H%M"))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()", namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()", namespaces=TABLE_OF_CONTENT_NSMAP
            )
            category_code = parent_codes[-1]

            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], "%d.%m.%Y")
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], "%d.%m.%Y")
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                },
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        """
        self.get_selected_datasets()

        doc = self.db[constants.COL_DATASETS].find_one(
            {"provider_name": self.provider_name, "dataset_code": dataset_code}, {"dataset_code": 1, "last_update": 1}
        )

        dataset_settings = self.selected_datasets[dataset_code]

        if doc and doc["last_update"] >= dataset_settings["last_update"]:
            comments = "update-date[%s]" % doc["last_update"]
            raise errors.RejectUpdatedDataset(
                provider_name=self.provider_name, dataset_code=dataset_code, comments=comments
            )

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=dataset_settings["name"],
            doc_href=dataset_settings["metadata"].get("doc_href"),
            last_update=None,
            fetcher=self,
        )
        dataset.last_update = dataset_settings["last_update"]

        dataset.series.data_iterator = EurostatData(dataset)

        return dataset.update_database()

    def get_calendar(self):

        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {"day": "*", "hour": 11, "minute": 1, "timezone": "Europe/Paris"},
        }

        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {"provider_name": self.provider_name},
            "period_kwargs": {"day": "*", "hour": 23, "minute": 1, "timezone": "Europe/Paris"},
        }

    def load_datasets_update(self):

        datasets_list = self.datasets_list()
        if not self.updated_catalog and not self.force_update:
            msg = "update aborted: catalog has not been updated"
            logger.warning(msg)
            return

        dataset_codes = [d["dataset_code"] for d in datasets_list]

        # TODO: enable ?
        cursor = self.db[constants.COL_DATASETS].find(
            {"provider_name": self.provider_name, "dataset_code": {"$in": dataset_codes}},
            {"dataset_code": 1, "last_update": 1},
        )

        selected_datasets = {s["dataset_code"]: s for s in cursor}

        for dataset in datasets_list:
            dataset_code = dataset["dataset_code"]

            last_update_from_catalog = dataset["last_update"]
            last_update_from_dataset = selected_datasets.get(dataset_code, {}).get("last_update")

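            # re-harvest only datasets that are new or whose catalog last_update is newer than the stored one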
            if (dataset_code not in selected_datasets) or (last_update_from_catalog > last_update_from_dataset):
                try:
                    self.wrap_upsert_dataset(dataset_code)
                except Exception as err:
                    if isinstance(err, errors.MaxErrors):
                        raise
                    msg = "error for provider[%s] - dataset[%s]: %s"
                    logger.critical(msg % (self.provider_name, dataset_code, str(err)))
            else:
                msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]"
                logger.info(
                    msg % (self.provider_name, dataset_code, last_update_from_dataset, last_update_from_catalog)
                )
Example #32
class FED(Fetcher):
    
    def __init__(self, db=None, **kwargs):        
        super().__init__(provider_name='FED', db=db, **kwargs)
        
        self.provider = Providers(name=self.provider_name,
                                  long_name='Federal Reserve',
                                  version=VERSION,
                                  region='US',
                                  website='http://www.federalreserve.gov',
                                  fetcher=self)

    def build_data_tree(self, force_update=False):
        
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        
        return self.provider.data_tree

    def upsert_dataset(self, dataset_code):
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        # TODO: check whether the dataset already exists, or update it!

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)
        
        _data = FED_Data(dataset=dataset, 
                         url=DATASETS[dataset_code]['url'])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        return result

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()
Example #33
class DESTATIS(Fetcher):
    
    def __init__(self, db=None, **kwargs):        
        super().__init__(provider_name='DESTATIS', db=db, **kwargs)
        
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Statistisches Bundesamt',
                                      version=VERSION,
                                      region='Germany',
                                      website='https://www.destatis.de',
                                      fetcher=self)

        if self.provider.version != VERSION:
            self.provider.update_database()

    def build_data_tree(self, force_update=False):
        
        return []
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        
        return self.provider.data_tree
        """

    def upsert_dataset(self, dataset_code):
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        # TODO: check whether the dataset already exists, or update it!

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)
        
        _data = DESTATIS_Data(dataset=dataset, 
                              ns_tag_data=DATASETS[dataset_code]["ns_tag_data"])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        return result

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()
Example #34
class Esri(Fetcher):
    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)         
        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION,
                                  region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html',
                                  fetcher=self)
        self.datasets_dict = {}
        self.selected_codes = ['GDP.Amount']
        
    def build_data_tree(self, force_update=False):
        """Build data_tree from ESRI site parsing
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

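        # recursively mirror the parsed ESRI site tree into provider categories and datasets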
        def make_node(data, parent_key):
            _category = dict(name=data['name'],
                             category_code=data['category_code'])
            _category_key = self.provider.add_category(_category,
                                                       parent_code=parent_key)
            if 'children' in data:
                for c in data['children']:
                    make_node(c, _category_key)
            if 'datasets' in data:
                for d in data['datasets']:
                    self.provider.add_dataset(dict(dataset_code=d['dataset_code'],
                                                   name=d['name'],
                                                   last_update=d['release_date'],
                                                   metadata={'url': d['url'],
                                                             'doc_href': d['doc_href']}),
                                              _category_key)
        try:
            for data in parse_esri_site():
                make_node(data, self.provider_name)
        except Exception as err:
            logger.error(err)   
            raise                             

    def get_selected_datasets(self):
        """Collects the dataset codes that are in data_tree
        below the ones indicated in "selected_codes" provided in configuration
        :returns: list of dict of dataset settings"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    # necessary for test mock
    def make_url(self):
        return self.dataset_settings['metadata']['url']

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        self.dataset_settings = self.selected_datasets[dataset_code]
        url = self.make_url()
        dataset = Datasets(self.provider_name, dataset_code,
                           fetcher=self)
        dataset.name = self.dataset_settings['name']
        dataset.doc_href = self.dataset_settings['metadata']['doc_href']
        dataset.last_update = self.dataset_settings['last_update']
        data_iterator = EsriData(dataset, url, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    # TO BE FINISHED    
    def parse_sna_agenda(self):
        #TODO: use Downloader
        download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                              filename="agenda_sna.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)
        
    # TO BE FINISHED
    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self.parse_agenda():

            if entry['dataflow_key'] in datasets:

                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('Asia/Tokyo')
                       }
                      }

    # TODO: load earlier versions to get revisions
    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        
        self.provider.update_database()
        self.build_data_tree()
        self.upsert_data_tree()

        datasets_list = [d for d in self.get_selected_datasets().keys()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
Example #35
class IMF(Fetcher):

    def __init__(self, db=None, **kwargs):        
        super().__init__(provider_name='IMF', db=db, **kwargs)
        
        self.provider = Providers(name=self.provider_name, 
                                  long_name="International Monetary Fund",
                                  version=VERSION, 
                                  region='world', 
                                  website='http://www.imf.org/', 
                                  fetcher=self)

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code) 

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
        
    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        if dataset_code == 'WEO':
            for u in self.weo_urls:
                self.upsert_weo_issue(u, dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)
        
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    @property
    def weo_urls(self):

        webpage = requests.get('http://www.imf.org/external/ns/cs.aspx?id=28')
        
        # TODO: replace with BeautifulSoup?
        html = etree.HTML(webpage.text)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]
        
        # The last links of the WEO webpage lead to data we don't want to pull.
        links = links[:-16]
        # These are other links we don't want.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10] + 'download.aspx' for link in links]
        
        output = []

        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            final_link = final_link[0].values()
            output.append(link[:-13]+final_link[0])

        # we need to handle the issues in chronological order
        return sorted(output)
        
    def upsert_weo_issue(self, url, dataset_code):
        
        settings = DATASETS[dataset_code]
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name=settings['name'], 
                           doc_href=settings['doc_href'], 
                           fetcher=self)
        
        weo_data = WeoData(dataset, url)
        dataset.last_update = weo_data.release_date        
        dataset.attribute_list.update_entry('flags', 'e', 'Estimated')
        dataset.series.data_iterator = weo_data
        try:
            dataset.update_database()
            self.update_metas(dataset_code)
        except Exception as err:
            logger.error(str(err))

    def upsert_categories(self):
        data_tree = {'name': 'IMF',
                     'category_code': 'imf_root',
                     'children': [{'name': 'WEO' , 
                                   'category_code': 'WEO',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)
Example #36
class WorldBank(Fetcher):

    def __init__(self, db=None):

        super().__init__(provider_name='WorldBank', db=db)

        self.provider = Providers(name=self.provider_name,
                                  long_name='World Bank',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.worldbank.org/',
                                  fetcher=self)

    def upsert_categories(self):
        data_tree = {'name': 'World Bank',
                     'category_code': 'worldbank_root',
                     'children': [{'name': 'GEM' , 
                                   'category_code': 'GEM',
                                   'exposed': True,
                                   'children': []}]}
        self.provider.add_data_tree(data_tree)

    def upsert_dataset(self, dataset_code):
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        #TODO return the _id field of the corresponding dataset. Update the category accordingly.
        if dataset_code == 'GEM':
            self.upsert_gem(dataset_code)
        else:
            raise Exception("This dataset is unknown: " + dataset_code)
        self.update_metas(dataset_code)        
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def upsert_gem(self, dataset_code):
        d = DATASETS[dataset_code]
        url = d['url']
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name=d['name'], 
                           doc_href=d['doc_href'], 
                           fetcher=self)
        gem_data = GemData(dataset, url)
        dataset.last_update = gem_data.release_date
        dataset.series.data_iterator = gem_data
        dataset.update_database()
        
    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        self.upsert_dataset('GEM')  
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def download(self, dataset_code=None, url=None):

        filepath_dir = os.path.abspath(os.path.join(tempfile.gettempdir(), 
                                        self.provider_name))
        
        filepath = "%s.zip" % os.path.abspath(os.path.join(filepath_dir, dataset_code))

        if not os.path.exists(filepath_dir):
            os.makedirs(filepath_dir, exist_ok=True)
            
        if os.path.exists(filepath):
            os.remove(filepath)
            
        if logger.isEnabledFor(logging.INFO):
            logger.info("store file to [%s]" % filepath)

        start = time.time()
        try:
            response = requests.get(url, 
                                    #TODO: timeout=self.timeout, 
                                    stream=True,
                                    allow_redirects=True,
                                    verify=False)

            if not response.ok:
                msg = "download url[%s] - status_code[%s] - reason[%s]" % (url, 
                                                                           response.status_code, 
                                                                           response.reason)
                logger.error(msg)
                raise Exception(msg)

            with open(filepath, 'wb') as f:
                # without an explicit chunk_size, iter_content() yields one byte at a time
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

            end = time.time() - start
            logger.info("download file[%s] - END - time[%.3f seconds]" % (url, end))

            return response.headers['Last-Modified'], filepath

        # ConnectTimeout subclasses ConnectionError, so it must be caught first
        except requests.exceptions.ConnectTimeout as err:
            raise Exception("Connect Timeout") from err
        except requests.exceptions.ReadTimeout as err:
            raise Exception("Read Timeout") from err
        except requests.exceptions.ConnectionError as err:
            raise Exception("Connection Error") from err
        except Exception as err:
            raise Exception("Not captured exception : %s" % str(err)) from err
Exemple #37
class ECB(Fetcher):
    
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='ECB', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                      long_name='European Central Bank',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://www.ecb.europa.eu',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
            
        self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
        self.sdmx.timeout = 90
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Load structure and build data_tree
        """
        
        if (self._dataflows and self._categoryschemes and self._categorisations) and not force:
            return
        
        '''Force the URL so that only the ECB agency is selected'''
        categoryschemes_response = self.sdmx.get(resource_type='categoryscheme', url='http://sdw-wsrest.ecb.int/service/categoryscheme/%s?references=parentsandsiblings' % self.provider_name)
        self._categorisations = categoryschemes_response.msg.categorisations
        self._categoryschemes = categoryschemes_response.msg.categoryschemes
        self._dataflows = categoryschemes_response.msg.dataflows
        
    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        
        self._load_structure()

        for category in self._categoryschemes.aslist():
            
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
             
            for subcategory in category.values():
                
                if subcategory.id not in self._categorisations:
                    continue
                
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                                              parent_code=category_key)
                
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en), _subcategory_key)                        
                except Exception as err:
                    logger.error(err)   
                    raise                             

        return self.provider.data_tree
        
    def parse_agenda(self):
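        """Scrapes the ECB statistics release calendar and yields one dict per
        entry, with 'dataflow_key', 'reference_period' and 'scheduled_date' keys."""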
        download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                              filename="statscall.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)
        
        regex_date = re.compile("Reference period: (.*)")
        regex_dataset = re.compile(".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n','')
            yield item

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self.parse_agenda():

            if entry['dataflow_key'] in datasets:

                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('CET')
                       }
                      }

    def upsert_dataset(self, dataset_code):
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        #TODO: check whether the dataset already exists and update accordingly!

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=datetime.now(),
                           fetcher=self)
        
        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        return result

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self._load_structure()
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()
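
# A minimal sketch of consuming get_calendar(), not part of the original
# source: each yielded entry names a dataset and the datetime at which it
# should be refreshed.
def _demo_ecb_calendar(db):
    fetcher = ECB(db=db)
    for entry in fetcher.get_calendar():
        print(entry['kwargs']['dataset_code'],
              entry['period_kwargs']['run_date'])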
Exemple #38
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""
    
    def __init__(self, db=None):
        super().__init__(provider_name='Eurostat', db=db)
        
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Eurostat',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://ec.europa.eu/eurostat',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
        
        self.selected_codes = [
            'nama_10', 
            'namq_10', 
            'nasa_10', 
            'nasq_10', 
            'naid_10',
            'nama', 
            'namq', 
            'nasa', 
            'nasq', 
            'gov', 
            'ert', 
            'irt', 
            'prc', 
            'bop', 
            'bop_6',
            'demo_pjanbroad', 
            'lfsi_act_q'
        ]
        self.selected_datasets = {}
        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.dataset_url = None
        
    def build_data_tree(self, force_update=False):
        """Builds the data tree
        
        Categories are created only for branches in which at least one <code>
        of the branch appears in selected_codes.
        
        The same filter applies to datasets: the parent's category_code is
        checked against selected_codes.
        """
        
        start = time.time()
        logger.info("build_data_tree provider[%s] - START" % self.provider_name)
        
        if self.provider.count_data_tree() > 1 and not force_update:
            logger.info("use existing data-tree for provider[%s]" % self.provider_name)
            return self.provider.data_tree

        filepath = self.get_table_of_contents()

        it = etree.iterparse(filepath, events=['end'])

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.selected_codes: 
                if _select in parent_codes:
                    return True
            return False

        for event, element in it:
            if event == 'end':

                if element.tag == fixtag_toc('nt', 'branch'):

                    for child in element.iterchildren(tag=fixtag_toc('nt', 'children')):

                        _parent_codes = xpath_parent_codes(child)
                        _parents = xpath_ancestor_branch(child)

                        if not is_selected(_parent_codes):
                            continue

                        for parent in _parents:
                            _parent_code = xpath_code(parent)[0]
                            _parent_title = xpath_title(parent)[0]

                            '''Extract the left-hand part of the parent categories path'''
                            _parent_categories = ".".join(_parent_codes[:_parent_codes.index(_parent_code)])

                            _category = {"category_code": _parent_code, "name": _parent_title}
                            _parent = None
                            if _parent_categories:
                                _parent = self.provider._category_key(_parent_categories)

                            try:
                                _key = self.provider.add_category(_category, _parent)
                            except:
                                # no handling: this only fails when the category already exists
                                pass

                        datasets = xpath_datasets(child)

                        for dataset in datasets:
                            parent_codes = xpath_parent_codes(dataset)
                            dataset_code = xpath_code(dataset)[0]
                            category_code = self.provider._category_key(".".join(parent_codes))

                            '''Check that at least one category_code is in selected_codes'''
                            if not is_selected(parent_codes):
                                continue

                            name = xpath_title(dataset)[0]
                            last_update = xpath_ds_last_update(dataset)
                            last_modified = xpath_ds_last_modified(dataset)
                            doc_href = xpath_ds_metadata_html(dataset)
                            data_start = xpath_ds_data_start(dataset)
                            data_end = xpath_ds_data_end(dataset)
                            values = xpath_ds_values(dataset)

                            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
                            if last_modified:
                                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                                last_update = max(last_update, last_modified)

                            _dataset = {
                                "dataset_code": dataset_code, 
                                "name": name,
                                "last_update": last_update,
                                "metadata": {
                                    "doc_href": first_element_xpath(doc_href),
                                    "data_start": first_element_xpath(data_start),
                                    "data_end": first_element_xpath(data_end),
                                    "values": int(first_element_xpath(values, default="0")),
                                }
                            }
                            self.provider.add_dataset(_dataset, category_code)
                            # clear the lxml element (the loop variable), not the new dict
                            dataset.clear()
                        child.clear()
                    element.clear()

        end = time.time() - start
        logger.info("build_data_tree load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

        return self.provider.data_tree
    
    def get_table_of_contents(self):
        return Downloader(url=self.url_table_of_contents, 
                          filename="table_of_contents.xml").get_filepath()

    def get_selected_datasets(self):
        """Collects the dataset codes that are in table of contents
        below the ones indicated in "selected_codes" provided in configuration
        :returns: list of dict of dataset settings"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        dataset_settings = self.selected_datasets[dataset_code]

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name=dataset_settings["name"], 
                           doc_href=dataset_settings["metadata"].get("doc_href"), 
                           last_update=dataset_settings["last_update"], 
                           fetcher=self)

        data_iterator = EurostatData(dataset, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def load_datasets_first(self):
        self.get_selected_datasets()

        start = time.time()
        logger.info("first load provider[%s] - START" % (self.provider_name))

        for dataset_code in self.selected_datasets.keys():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("first load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
        
    def load_datasets_update(self):
        self.get_selected_datasets()
        
        start = time.time()
        logger.info("update provider[%s] - START" % (self.provider_name))

        selected_datasets = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name, 'dataset_code': {'$in': list(self.selected_datasets.keys())}},
            {'dataset_code': 1, 'last_update': 1})
        selected_datasets = {s['dataset_code'] : s for s in selected_datasets}

        for dataset_code, dataset in self.selected_datasets.items():
            if (dataset_code not in selected_datasets) or (selected_datasets[dataset_code]['last_update'] < dataset['last_update']):
                try:
                    self.upsert_dataset(dataset_code)
                except Exception as err:
                    logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
Exemple #39
class INSEE(Fetcher):
    
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='INSEE', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                      long_name='National Institute of Statistics and Economic Studies',
                                      version=VERSION,
                                      region='France',
                                      website='http://www.insee.fr',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()

        self.sdmx = sdmx or Request(agency='INSEE')
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
    
    def _load_structure(self, force=False):
        
        if self._dataflows and not force:
            return
        
        """
        #http://www.bdm.insee.fr/series/sdmx/categoryscheme
        categoryscheme_response = self.sdmx.get(resource_type='categoryscheme', params={"references": None})
        logger.debug(categoryscheme_response.url)
        self._categoryschemes = categoryscheme_response.msg.categoryschemes
    
        #http://www.bdm.insee.fr/series/sdmx/categorisation
        categorisation_response = self.sdmx.get(resource_type='categorisation')
        logger.debug(categorisation_response.url)
        self._categorisations = categorisation_response.msg.categorisations
        """
    
        #http://www.bdm.insee.fr/series/sdmx/dataflow
        dataflows_response = self.sdmx.get(resource_type='dataflow')    
        logger.debug(dataflows_response.url)
        self._dataflows = dataflows_response.msg.dataflows

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        for dataset_code in self.datasets_list():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        
        self._load_structure()
        
        for dataset_code, dataset in self._dataflows.items():

            if "en" in dataset.name:
                name = dataset.name.en
            else:
                name = dataset.name.fr
            
            self.provider.add_dataset(dict(dataset_code=dataset_code, name=name), self.provider_name)
            
        return self.provider.data_tree

        # NOTE: the categoryscheme walk below is unreachable while the
        # categoryscheme/categorisation requests in _load_structure stay disabled
        for category in self._categoryschemes.aslist():
            
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
             
            for subcategory in category.values():
                
                if subcategory.id not in self._categorisations:
                    continue
                
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                                              parent_code=category_key)
                
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en), _subcategory_key)                        
                except Exception as err:
                    logger.error(err)   
                    raise                             

        return self.provider.data_tree
    
    def upsert_dataset(self, dataset_code):

        #self.load_structure(force=False)
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        #if not dataset_code in self._dataflows:
        #    raise Exception("This dataset is unknown: %s" % dataset_code)
        
        #dataflow = self._dataflows[dataset_code]
        
        #cat = self.db[constants.COL_CATEGORIES].find_one({'category_code': dataset_code})
        #dataset.name = cat['name']
        #dataset.doc_href = cat['doc_href']
        #dataset.last_update = cat['last_update']

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           #name=dataflow.name.en,
                           doc_href=None,
                           last_update=datetime.now(), #TODO:
                           fetcher=self)
        
        dataset_doc = self.db[constants.COL_DATASETS].find_one({'provider_name': self.provider_name,
                                                                "dataset_code": dataset_code})
        
        insee_data = INSEE_Data(dataset=dataset,
                                dataset_doc=dataset_doc, 
                                #dataflow=dataflow, 
                                #sdmx=self.sdmx
                                )
        dataset.series.data_iterator = insee_data
        result = dataset.update_database()
        
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        """
        > IDBANK: to be determined dynamically from the website?
        doc_href of a series: http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=001694226
        > GROUP CODE: Balance des Paiements mensuelle - Compte de capital
        http://www.bdm.insee.fr/bdm2/choixCriteres?codeGroupe=1556
        """
        return result
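
# A minimal usage sketch, not part of the original source; the dataflow code
# below is purely illustrative and does not come from this file.
def _demo_insee_dataset(db):
    fetcher = INSEE(db=db)
    fetcher.upsert_dataset('ILLUSTRATIVE-DATAFLOW-CODE')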
Exemple #40
class OECD(Fetcher):
    
    def __init__(self, db=None, **kwargs):
        super().__init__(provider_name='OECD', db=db, **kwargs)
        self.provider = Providers(name=self.provider_name, 
                                  long_name='Organisation for Economic Co-operation and Development',
                                  version=VERSION,
                                  region='world',
                                  website='http://www.oecd.org', 
                                  fetcher=self)

    def upsert_dataset(self, dataset_code, datas=None):
        
        start = time.time()
        
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown" + dataset_code)
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name=DATASETS[dataset_code]['name'], 
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           fetcher=self)
        
        fetcher_data = OECD_Data(dataset)
        dataset.series.data_iterator = fetcher_data
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))

        self.update_metas(dataset_code)
        
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def datasets_list(self):
        return DATASETS.keys()

    def datasets_long_list(self):
        return [(key, dataset['name']) for key, dataset in DATASETS.items()]

    def upsert_all_datasets(self):
        start = time.time()
        logger.info("update fetcher[%s] - START" % (self.provider_name))
        
        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code) 
        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
        
    def upsert_categories(self):
        
        data_tree = {'name': 'OECD',
                     'category_code': 'oecd_root',
                     'children': []}
        
        for dataset_code in DATASETS.keys():
            data_tree['children'].append({'name': DATASETS[dataset_code]['name'], 
                                          'category_code': dataset_code,
                                          'exposed': True,
                                          'children': None})

        self.provider.add_data_tree(data_tree)    
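
# A minimal sketch, not part of the original source: load every dataset
# declared in DATASETS, then publish the category tree.
def _demo_oecd_full_load(db):
    fetcher = OECD(db=db)
    fetcher.upsert_all_datasets()
    fetcher.upsert_categories()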
Exemple #41
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""
    def __init__(self, **kwargs):
        super().__init__(provider_name='EUROSTAT', version=VERSION, **kwargs)

        self.provider = Providers(
            name=self.provider_name,
            long_name='Eurostat',
            version=VERSION,
            region='Europe',
            website='http://ec.europa.eu/eurostat',
            terms_of_use=
            'http://ec.europa.eu/eurostat/about/our-partners/copyright',
            fetcher=self)

        self.categories_filter = [
            'nama_10',
            'namq_10',
            'nasa_10',
            'nasq_10',
            'naid_10',
            'nama',
            'namq',
            'nasa',
            'nasq',
            'gov',
            'ert',
            'irt',
            'prc',
            'bop',
            'bop_6',
            'demo',  # We harvest demo because we need demo_pjanbroad.
            'lfsi_act_q',
            'euroind',
            'pop',
            'labour',
        ]

        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.updated_catalog = False

    def _is_updated_catalog(self, creation_date):
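        """Returns True when `creation_date` is newer than the date stored in
        the provider metadata, persisting the new date as a side effect."""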

        if not self.provider.from_db:
            self.provider_verify()

        if not self.provider.metadata:
            self.provider.metadata = {}

        if not "creation_date" in self.provider.metadata:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        if creation_date > self.provider.metadata["creation_date"]:
            self.provider.metadata["creation_date"] = creation_date
            self.provider.update_database()
            return True

        return False

    def build_data_tree(self):
        """Builds the data tree
        """

        download = Downloader(url=self.url_table_of_contents,
                              filename="table_of_contents.xml",
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath,
                             events=['end'],
                             tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):

            position += 1

            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if not category_code in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        'all_parents': all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(
                    datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg %
                                   (creation_date,
                                    self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                         namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                namespaces=TABLE_OF_CONTENT_NSMAP)
            category_code = parent_codes[-1]

            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                }
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        """
        self.get_selected_datasets()

        doc = self.db[constants.COL_DATASETS].find_one(
            {
                'provider_name': self.provider_name,
                'dataset_code': dataset_code
            }, {
                'dataset_code': 1,
                'last_update': 1
            })

        dataset_settings = self.selected_datasets[dataset_code]

        if doc and doc['last_update'] >= dataset_settings['last_update']:
            comments = "update-date[%s]" % doc['last_update']
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=dataset_code,
                                              comments=comments)

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=dataset_settings["name"],
            doc_href=dataset_settings["metadata"].get("doc_href"),
            last_update=None,
            fetcher=self)
        dataset.last_update = dataset_settings["last_update"]

        dataset.series.data_iterator = EurostatData(dataset)

        return dataset.update_database()

    def get_calendar(self):

        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {
                "provider_name": self.provider_name
            },
            "period_kwargs": {
                "day": '*',
                "hour": 11,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }

        yield {
            "action": "update-fetcher",
            "period_type": "cron",
            "kwargs": {
                "provider_name": self.provider_name
            },
            "period_kwargs": {
                "day": '*',
                "hour": 23,
                "minute": 1,
                "timezone": 'Europe/Paris'
            }
        }

    def load_datasets_update(self):

        datasets_list = self.datasets_list()
        if not self.updated_catalog and not self.force_update:
            msg = "update aborted: eurostat catalog has not changed"
            logger.warning(msg)
            return

        dataset_codes = [d["dataset_code"] for d in datasets_list]

        #TODO: enable ?
        cursor = self.db[constants.COL_DATASETS].find(
            {
                'provider_name': self.provider_name,
                'dataset_code': {
                    '$in': dataset_codes
                }
            }, {
                'dataset_code': 1,
                'last_update': 1
            })

        selected_datasets = {s['dataset_code']: s for s in cursor}

        for dataset in datasets_list:
            dataset_code = dataset["dataset_code"]

            last_update_from_catalog = dataset['last_update']
            last_update_from_dataset = selected_datasets.get(
                dataset_code, {}).get('last_update')

            if (dataset_code not in selected_datasets) or (
                    last_update_from_catalog > last_update_from_dataset):
                try:
                    self.wrap_upsert_dataset(dataset_code)
                except Exception as err:
                    if isinstance(err, errors.MaxErrors):
                        raise
                    msg = "error for provider[%s] - dataset[%s]: %s"
                    logger.critical(
                        msg % (self.provider_name, dataset_code, str(err)))
            else:
                msg = "bypass update - provider[%s] - dataset[%s] - last-update-dataset[%s] - last-update-catalog[%s]"
                logger.info(
                    msg % (self.provider_name, dataset_code,
                           last_update_from_dataset, last_update_from_catalog))