def test_add_data_tree(self):
        
        # nosetests -s -v dlstats.tests.fetchers.test__commons:DBProviderTestCase.test_add_data_tree

        f = Fetcher(provider_name="p1", is_indexes=False)

        p = Providers(name="p1",
                      long_name="Provider One",
                      version=1,
                      region="Dreamland",
                      website="http://www.example.com", 
                      fetcher=f)
        
        self.assertEqual(len(p.data_tree), 1)
        p.data_tree[0]["category_code"] = p.name
        p.data_tree[0]["long_name"] = p.long_name
        p.data_tree[0]["website"] = p.website
        
        p.update_database()
        
        minimal_category = { 'category_code': "c0", 'name': "p1"}
        p.add_category(minimal_category)
        
        data_tree = [
             {'category_code': 'p1',
              'datasets': [],
              'description': None,
              'doc_href': 'http://www.example.com',
              'exposed': False,
              'last_update': None,
              'name': 'p1'},
             {'category_code': 'p1.c0',
              'datasets': [],
              'description': None,
              'doc_href': None,
              'exposed': False,
              'last_update': None,
              'name': 'p1'}
        ]        
        
        self.assertEqual(p.data_tree, data_tree)
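
        # Note (inferred from the expected data_tree above): a child category
        # gets a dotted code of the form "<parent>.<category_code>", so "c0"
        # added under the root "p1" becomes "p1.c0".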
Example #2
class FED(Fetcher):
    
    def __init__(self, db=None, **kwargs):        
        super().__init__(provider_name='FED', db=db, **kwargs)
        
        self.provider = Providers(name=self.provider_name,
                                  long_name='Federal Reserve',
                                  version=VERSION,
                                  region='US',
                                  website='http://www.federalreserve.gov',
                                  fetcher=self)

    def build_data_tree(self, force_update=False):
        
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category({"name": dataset["name"],
                                                       "category_code": category_code,
                                                       "doc_href": dataset["doc_href"]})
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)
        
        return self.provider.data_tree
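
    # Sketch (assumption, not the actual module constant): given the keys
    # accessed in this class, DATASETS is expected to be a module-level dict
    # mapping a dataset/category code to its settings, e.g.:
    #
    #   DATASETS = {
    #       "<dataset_code>": {"name": "...", "doc_href": "http://...", "url": "http://..."},
    #       ...
    #   }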

    def upsert_dataset(self, dataset_code):
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        #TODO: check whether the dataset already exists, or update it!

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=datetime.now(),
                           fetcher=self)
        
        _data = FED_Data(dataset=dataset, 
                         url=DATASETS[dataset_code]['url'])
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        return result

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()
Example #3
class Esri(Fetcher):
    def __init__(self, db=None):
        super().__init__(provider_name='ESRI', db=db)         
        self.provider = Providers(name=self.provider_name,
                                  long_name='Economic and Social Research Institute, Cabinet Office',
                                  version=VERSION,
                                  region='Japan',
                                  website='http://www.esri.cao.go.jp/index-e.html',
                                  fetcher=self)
        self.datasets_dict = {}
        self.selected_codes = ['GDP.Amount']
        
    def build_data_tree(self, force_update=False):
        """Build data_tree from ESRI site parsing
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        def make_node(data, parent_key):
            _category = dict(name=data['name'],
                             category_code=data['category_code'])
            _category_key = self.provider.add_category(_category,
                                                       parent_code=parent_key)
            if 'children' in data:
                for c in data['children']:
                    make_node(c, _category_key)
            if 'datasets' in data:
                for d in data['datasets']:
                    self.provider.add_dataset(dict(dataset_code=d['dataset_code'],
                                                   name=d['name'],
                                                   last_update=d['release_date'],
                                                   metadata={'url': d['url'],
                                                             'doc_href': d['doc_href']}),
                                              _category_key)
        try:
            for data in parse_esri_site():
                make_node(data, self.provider_name)
        except Exception as err:
            logger.error(err)   
            raise                             

    def get_selected_datasets(self):
        """Collects the dataset codes that are in data_tree
        below the ones indicated in "selected_codes" provided in configuration
        :returns: list of dict of dataset settings"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets
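
    # Illustration (not in the original code): with the default
    # selected_codes = ['GDP.Amount'] above, the filter built in
    # get_selected_datasets() is the single regex ".*GDP.Amount.*", so
    # datasets_list() keeps any dataset whose category path matches it.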

    # necessary for test mock
    def make_url(self):
        return self.dataset_settings['metadata']['url']

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        self.dataset_settings = self.selected_datasets[dataset_code]
        url = self.make_url()
        dataset = Datasets(self.provider_name, dataset_code,
                           fetcher=self)
        dataset.name = self.dataset_settings['name']
        dataset.doc_href = self.dataset_settings['metadata']['doc_href']
        dataset.last_update = self.dataset_settings['last_update']
        data_iterator = EsriData(dataset, url, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    # TO BE FINISHED    
    def parse_sna_agenda(self):
        #TODO: use Downloader
        download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                              filename="agenda_sna.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)
        
    # TO BE FINISHED
    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self.parse_agenda():

            if entry['dataflow_key'] in datasets:

                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('Asia/Tokyo')
                       }
                      }

    # TODO: load earlier versions to get revisions
    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        
        self.provider.update_database()
        self.build_data_tree()
        self.upsert_data_tree()

        datasets_list = [d for d in self.get_selected_datasets().keys()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
Example #4
class ECB(Fetcher):
    
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='ECB', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                      long_name='European Central Bank',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://www.ecb.europa.eu',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
            
        self.sdmx = sdmx or ECBRequest(agency=self.provider_name)
        self.sdmx.timeout = 90
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None

    def _load_structure(self, force=False):
        """Load structure and build data_tree
        """
        
        if (self._dataflows and self._categoryschemes and self._categorisations) and not force:
            return
        
        '''Force the URL so that only the ECB agency's structures are selected'''
        categoryschemes_response = self.sdmx.get(resource_type='categoryscheme', url='http://sdw-wsrest.ecb.int/service/categoryscheme/%s?references=parentsandsiblings' % self.provider_name)
        self._categorisations = categoryschemes_response.msg.categorisations
        self._categoryschemes = categoryschemes_response.msg.categoryschemes
        self._dataflows = categoryschemes_response.msg.dataflows
        
    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        
        self._load_structure()
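
        # Traversal sketch (how the loop below uses the SDMX structures): each
        # category scheme contains categories with sub-categories; the
        # categorisations map a sub-category id to dataflow ids; every dataflow
        # reached this way is added to the data_tree as a dataset under its
        # sub-category.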

        for category in self._categoryschemes.aslist():
            
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
             
            for subcategory in category.values():
                
                if not subcategory.id in self._categorisations:
                    continue
                
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                           parent_code=category_key)
                
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en), _subcategory_key)                        
                except Exception as err:
                    logger.error(err)   
                    raise                             

        return self.provider.data_tree
        
    def parse_agenda(self):
        #TODO: use Downloader
        download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                              filename="statscall.html")
        with open(download.get_filepath(), 'rb') as fp:
            agenda = lxml.html.parse(fp)
        
        regex_date = re.compile("Reference period: (.*)")
        regex_dataset = re.compile(r".*Dataset: (.*)\)")
        entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                               '//div[@class="ecb-faytdd"]/*/dd')[2:]
        entries = zip(entries[::2], entries[1::2])
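        # Assumed page layout (illustration only): each (dt, dd) pair carries
        # the scheduled date in the <dt> element, while the <dd> element holds
        # one child ending in "(Dataset: <code>)" and one child starting with
        # "Reference period: ...", which the two regexes above extract.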
        for entry in entries:
            item = {}
            match_key = regex_dataset.match(entry[1][0].text_content())
            item['dataflow_key'] = match_key.groups()[0]
            match_date = regex_date.match(entry[1][1].text_content())
            item['reference_period'] = match_date.groups()[0]
            item['scheduled_date'] = entry[0].text_content().replace('\n','')
            yield(item)

    def get_calendar(self):
        datasets = [d["dataset_code"] for d in self.datasets_list()]

        for entry in self.parse_agenda():

            if entry['dataflow_key'] in datasets:

                yield {'action': 'update_node',
                       'kwargs': {'provider_name': self.provider_name,
                                  'dataset_code': entry['dataflow_key']},
                       'period_type': 'date',
                       'period_kwargs': {'run_date': datetime.strptime(
                           entry['scheduled_date'], "%d/%m/%Y %H:%M CET"),
                           'timezone': pytz.timezone('CET')
                       }
                      }

    def upsert_dataset(self, dataset_code):
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        #TODO: check whether the dataset already exists, or update it!

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=datetime.now(),
                           fetcher=self)
        
        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        result = dataset.update_database()
        
        _data = None

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        return result

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        self._load_structure()
        self.provider.update_database()
        self.upsert_data_tree()

        datasets_list = [d["dataset_code"] for d in self.datasets_list()]
        for dataset_code in datasets_list:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("datasets first load. provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()
Example #5
class BIS(Fetcher):
    def __init__(self, db=None):
        super().__init__(provider_name="BIS", db=db)

        if not self.provider:
            self.provider = Providers(
                name=self.provider_name,
                long_name="Bank for International Settlements",
                version=VERSION,
                region="world",
                website="http://www.bis.org",
                fetcher=self,
            )
            self.provider.update_database()

        if self.provider.version != VERSION:
            self.provider.update_database()

    def upsert_dataset(self, dataset_code):

        start = time.time()

        logger.info("upsert dataset[%s] - START" % (dataset_code))

        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown" + dataset_code)

        dataset = Datasets(
            provider_name=self.provider_name,
            dataset_code=dataset_code,
            name=DATASETS[dataset_code]["name"],
            doc_href=DATASETS[dataset_code]["doc_href"],
            fetcher=self,
        )

        fetcher_data = BIS_Data(dataset, url=DATASETS[dataset_code]["url"], filename=DATASETS[dataset_code]["filename"])

        if fetcher_data.is_updated():

            dataset.series.data_iterator = fetcher_data
            dataset.update_database()

            # TODO: clean datas (file temp)

            end = time.time() - start
            logger.info("upsert dataset[%s] - END-BEFORE-METAS - time[%.3f seconds]" % (dataset_code, end))

            self.update_metas(dataset_code)

            end = time.time() - start
            logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        else:
            logger.info(
                "upsert dataset[%s] skipped: already up to date for release_date[%s]"
                % (dataset_code, fetcher_data.release_date)
            )

    def load_datasets_first(self):
        start = time.time()
        logger.info("first load fetcher[%s] - START" % (self.provider_name))

        for dataset_code in DATASETS.keys():
            self.upsert_dataset(dataset_code)

        end = time.time() - start
        logger.info("first load fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):

        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree

        for category_code, dataset in DATASETS.items():
            category_key = self.provider.add_category(
                {"name": dataset["name"], "category_code": category_code, "doc_href": dataset["doc_href"]}
            )
            _dataset = {"name": dataset["name"], "dataset_code": category_code}
            self.provider.add_dataset(_dataset, category_key)

        return self.provider.data_tree

    def parse_agenda(self):

        agenda = etree.HTML(get_agenda())
        table = agenda.find(".//table")
        # only one table
        rows = table[0].findall("tr")
        # skipping first row
        cells = rows[1].findall("td")
        agenda = []
        months = [None, None]

        for c in rows[1].iterfind("td"):
            content = c.find("strong")
            if content.text is None:
                content = content.find("strong")
            months.append(datetime.datetime.strptime(content.text, "%B %Y"))
        agenda.append(months)
        ir = 2

        def get_links_text(cell):
            txt = []
            for link in cell.findall("a"):
                if link.text:
                    txt.append(link.text)
            return txt

        def _get_dates(cells):
            item = []
            for ic, c in enumerate(cells):
                if c.text[0] != chr(160):
                    item.append(re.match(r"\d\d|\d", c.text).group(0))
                else:
                    item.append(None)
            return item

        while ir < len(rows):
            cells = rows[ir].findall("td")

            content = cells[0]
            if content.text is None:
                content = content.find("a")
            item = [content.text]

            if cells[0].get("rowspan") == "2":
                two_rows = True
                content = cells[1].find("a")
                item.append(content.text)
                offset = 2
            else:
                two_rows = False
                item.append(None)
                offset = 1

            item.extend(_get_dates(cells[offset:]))

            agenda.append(item)
            ir += 1

            if two_rows:
                cells = rows[ir].findall("td")
                links = get_links_text(cells[0])
                for content in links:
                    item = [item[0]]
                    item.append(content)
                    item.extend(_get_dates(cells[1:]))
                    agenda.append(item)
                ir += 1
        return agenda

    def get_calendar(self):
        agenda = self.parse_agenda()

        dataset_codes = [d["dataset_code"] for d in self.datasets_list()]

        """First line - exclude first 2 columns (title1, title2)"""
        months = agenda[0][2:]

        """All line moins first list"""
        periods = agenda[1:]
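
        # Shape of `agenda` as built by parse_agenda() above (illustration):
        # agenda[0] is [None, None, <month datetime>, ...] and each following
        # row looks roughly like
        #   ["Dataset title", "Subtitle or None", "12", None, "7", ...]
        # i.e. two title columns followed by one day-of-month (or None) per month.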

        def _get_dataset_code(title):
            for key, d in DATASETS.items():
                if title in d.get("agenda_titles", []):
                    return key
            return None

        for period in periods:
            title = period[0]
            if period[1]:
                title = "%s %s" % (title, period[1])

            dataset_code = _get_dataset_code(title)
            if not dataset_code:
                logger.info("exclude calendar action for not implemented dataset[%s]" % title)
                continue
            if not dataset_code in dataset_codes:
                logger.info("exclude calendar action for dataset[%s]" % title)
                continue

            days = period[2:]
            scheds = [d for d in zip(months, days) if not d[1] is None]

            for date_base, day in scheds:
                yield {
                    "action": "update_node",
                    "kwargs": {"provider_name": self.provider_name, "dataset_code": dataset_code},
                    "period_type": "date",
                    "period_kwargs": {
                        "run_date": datetime.datetime(date_base.year, date_base.month, int(day), 8, 0, 0),
                        "timezone": pytz.country_timezones(AGENDA["country"]),
                    },
                }
Example #6
class INSEE(Fetcher):
    
    def __init__(self, db=None, sdmx=None, **kwargs):        
        super().__init__(provider_name='INSEE', db=db, **kwargs)

        if not self.provider:        
            self.provider = Providers(name=self.provider_name,
                                     long_name='National Institute of Statistics and Economic Studies',
                                     version=VERSION,
                                     region='France',
                                     website='http://www.insee.fr',
                                     fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
            
        
        self.sdmx = sdmx or Request(agency='INSEE')
        
        self._dataflows = None
        self._categoryschemes = None
        self._categorisations = None
    
    def _load_structure(self, force=False):
        
        if self._dataflows and not force:
            return
        
        """
        #http://www.bdm.insee.fr/series/sdmx/categoryscheme
        categoryscheme_response = self.sdmx.get(resource_type='categoryscheme', params={"references": None})
        logger.debug(categoryscheme_response.url)
        self._categoryschemes = categoryscheme_response.msg.categoryschemes
    
        #http://www.bdm.insee.fr/series/sdmx/categorisation
        categorisation_response = self.sdmx.get(resource_type='categorisation')
        logger.debug(categorisation_response.url)
        self._categorisations = categorisation_response.msg.categorisations
        """
    
        #http://www.bdm.insee.fr/series/sdmx/dataflow
        dataflows_response = self.sdmx.get(resource_type='dataflow')    
        logger.debug(dataflows_response.url)
        self._dataflows = dataflows_response.msg.dataflows

    def load_datasets_first(self):
        start = time.time()        
        logger.info("datasets first load. provider[%s] - START" % (self.provider_name))
        
        for dataset_code in [d["dataset_code"] for d in self.datasets_list()]:
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update fetcher[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

    def load_datasets_update(self):
        #TODO: 
        self.load_datasets_first()

    def build_data_tree(self, force_update=False):
        """Build data_tree from structure datas
        """
        if self.provider.count_data_tree() > 1 and not force_update:
            return self.provider.data_tree
        
        self._load_structure()
        
        for dataset_code, dataset in self._dataflows.items():

            name = dataset.name
            if "en" in dataset.name:
                name = dataset.name.en
            else:
                name = dataset.name.fr
            
            self.provider.add_dataset(dict(dataset_code=dataset_code, name=name), self.provider_name)
            
        return self.provider.data_tree

        for category in self._categoryschemes.aslist():
            
            _category = dict(name=category.name.en,
                             category_code=category.id)
            category_key = self.provider.add_category(_category)
             
            for subcategory in category.values():
                
                if not subcategory.id in self._categorisations:
                    continue
                
                _subcategory = dict(name=subcategory.name.en,
                                    category_code=subcategory.id)
                _subcategory_key = self.provider.add_category(_subcategory,
                                           parent_code=category_key)
                
                try:
                    _categorisation = self._categorisations[subcategory.id]
                    for i in _categorisation:
                        _d = self._dataflows[i.artefact.id]
                        self.provider.add_dataset(dict(dataset_code=_d.id, name=_d.name.en), _subcategory_key)                        
                except Exception as err:
                    logger.error(err)   
                    raise                             

        return self.provider.data_tree
    
    def upsert_dataset(self, dataset_code):

        #self.load_structure(force=False)
        
        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))
        
        #if not dataset_code in self._dataflows:
        #    raise Exception("This dataset is unknown: %s" % dataset_code)
        
        #dataflow = self._dataflows[dataset_code]
        
        #cat = self.db[constants.COL_CATEGORIES].find_one({'category_code': dataset_code})
        #dataset.name = cat['name']
        #dataset.doc_href = cat['doc_href']
        #dataset.last_update = cat['last_update']

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           #name=dataflow.name.en,
                           doc_href=None,
                           last_update=datetime.now(), #TODO:
                           fetcher=self)
        
        dataset_doc = self.db[constants.COL_DATASETS].find_one({'provider_name': self.provider_name,
                                                                "dataset_code": dataset_code})
        
        insee_data = INSEE_Data(dataset=dataset,
                                dataset_doc=dataset_doc, 
                                #dataflow=dataflow, 
                                #sdmx=self.sdmx
                                )
        dataset.series.data_iterator = insee_data
        result = dataset.update_database()
        
        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))
        
        """
        > IDBANK: to be determined dynamically from the site?
        doc_href of a series: http://www.bdm.insee.fr/bdm2/affichageSeries?idbank=001694226
        > GROUP CODE: "Balance des Paiements mensuelle - Compte de capital" (monthly balance of payments - capital account)
        http://www.bdm.insee.fr/bdm2/choixCriteres?codeGroupe=1556
        """
        return result
Example #7
class Eurostat(Fetcher):
    """Class for managing the SDMX endpoint from eurostat in dlstats."""
    
    def __init__(self, db=None):
        super().__init__(provider_name='Eurostat', db=db)
        
        if not self.provider:
            self.provider = Providers(name=self.provider_name,
                                      long_name='Eurostat',
                                      version=VERSION,
                                      region='Europe',
                                      website='http://ec.europa.eu/eurostat',
                                      fetcher=self)
            self.provider.update_database()
        
        if self.provider.version != VERSION:
            self.provider.update_database()
        
        self.selected_codes = [
            'nama_10', 
            'namq_10', 
            'nasa_10', 
            'nasq_10', 
            'naid_10',
            'nama', 
            'namq', 
            'nasa', 
            'nasq', 
            'gov', 
            'ert', 
            'irt', 
            'prc', 
            'bop', 
            'bop_6',
            'demo_pjanbroad', 
            'lfsi_act_q'
        ]
        self.selected_datasets = {}
        self.url_table_of_contents = "http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?sort=1&file=table_of_contents.xml"
        self.dataset_url = None
        
    def build_data_tree(self, force_update=False):
        """Builds the data tree
        
        To create the categories, only the branches whose <code> elements
        include at least one entry from selected_codes are kept.

        The same applies to the datasets: the parent's category_code is
        checked against selected_codes.
        """
        
        start = time.time()
        logger.info("build_data_tree provider[%s] - START" % self.provider_name)
        
        if self.provider.count_data_tree() > 1 and not force_update:
            logger.info("use existing data-tree for provider[%s]" % self.provider_name)
            return self.provider.data_tree

        filepath = self.get_table_of_contents()

        it = etree.iterparse(filepath, events=['end'])

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.selected_codes: 
                if _select in parent_codes:
                    return True
            return False
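
        # Example (illustration only): if selected_codes were ['nama_10'], then
        #   is_selected(['data', 'economy', 'nama_10']) -> True
        #   is_selected(['data', 'environment'])        -> False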

        for event, element in it:
            if event == 'end':

                if element.tag == fixtag_toc('nt', 'branch'):

                    for child in element.iterchildren(tag=fixtag_toc('nt', 'children')):

                        _parent_codes = xpath_parent_codes(child)
                        _parents = xpath_ancestor_branch(child)

                        if not is_selected(_parent_codes):
                            continue

                        for parent in _parents:
                            _parent_code = xpath_code(parent)[0]
                            _parent_title = xpath_title(parent)[0]

                            '''Extract the left-hand part of the parent category codes'''
                            _parent_categories = ".".join(_parent_codes[:_parent_codes.index(_parent_code)])
                            _category = None
                            _parent = None

                            if not _parent_categories or len(_parent_categories) == 0:
                                _category = {"category_code": _parent_code, "name": _parent_title}
                            else:
                                _parent = self.provider._category_key(_parent_categories)
                                _category = {"category_code": _parent_code, "name": _parent_title}

                            try:
                                _key = self.provider.add_category(_category, _parent)
                            except:
                                # not captured: we only check whether the category already exists
                                pass

                        datasets = xpath_datasets(child)

                        for dataset in datasets:
                            parent_codes = xpath_parent_codes(dataset)
                            dataset_code = xpath_code(dataset)[0]
                            category_code = self.provider._category_key(".".join(parent_codes))

                            '''Check whether at least one of the category_code values is in selected_codes'''
                            if not is_selected(parent_codes):
                                continue

                            name = xpath_title(dataset)[0]
                            last_update = xpath_ds_last_update(dataset)
                            last_modified = xpath_ds_last_modified(dataset)
                            doc_href = xpath_ds_metadata_html(dataset)
                            data_start = xpath_ds_data_start(dataset)
                            data_end = xpath_ds_data_end(dataset)
                            values = xpath_ds_values(dataset)

                            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
                            if last_modified:
                                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                                last_update = max(last_update, last_modified)

                            _dataset = {
                                "dataset_code": dataset_code,
                                "name": name,
                                "last_update": last_update,
                                "metadata": {
                                    "doc_href": first_element_xpath(doc_href),
                                    "data_start": first_element_xpath(data_start),
                                    "data_end": first_element_xpath(data_end),
                                    "values": int(first_element_xpath(values, default="0")),
                                }
                            }
                            self.provider.add_dataset(_dataset, category_code)
                            # free the parsed XML element (iterparse memory handling)
                            dataset.clear()
                        child.clear()
                    element.clear()

        end = time.time() - start
        logger.info("build_data_tree load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))

        return self.provider.data_tree
    
    def get_table_of_contents(self):
        return Downloader(url=self.url_table_of_contents, 
                              filename="table_of_contents.xml").get_filepath()

    def get_selected_datasets(self):
        """Collects the dataset codes that are in table of contents
        below the ones indicated in "selected_codes" provided in configuration
        :returns: list of dict of dataset settings"""
        category_filter = [".*%s.*" % d for d in self.selected_codes]
        category_filter = "|".join(category_filter)
        self.selected_datasets = {d['dataset_code']: d for d in self.datasets_list(category_filter=category_filter)}
        return self.selected_datasets

    def upsert_dataset(self, dataset_code):
        """Updates data in Database for selected datasets
        :dset: dataset_code
        :returns: None"""
        self.get_selected_datasets()

        start = time.time()
        logger.info("upsert dataset[%s] - START" % (dataset_code))

        dataset_settings = self.selected_datasets[dataset_code]

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name=dataset_settings["name"], 
                           doc_href=dataset_settings["metadata"].get("doc_href"), 
                           last_update=dataset_settings["last_update"], 
                           fetcher=self)

        data_iterator = EurostatData(dataset, filename=dataset_code)
        dataset.series.data_iterator = data_iterator
        dataset.update_database()

        end = time.time() - start
        logger.info("upsert dataset[%s] - END - time[%.3f seconds]" % (dataset_code, end))

    def load_datasets_first(self):
        self.get_selected_datasets()

        start = time.time()
        logger.info("first load provider[%s] - START" % (self.provider_name))

        for dataset_code in self.selected_datasets.keys():
            try:
                self.upsert_dataset(dataset_code)
            except Exception as err:
                logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("first load provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))
        
    def load_datasets_update(self):
        self.get_selected_datasets()
        
        start = time.time()
        logger.info("update provider[%s] - START" % (self.provider_name))

        selected_datasets = self.db[constants.COL_DATASETS].find(
            {'provider_name': self.provider_name, 'dataset_code': {'$in': list(self.selected_datasets.keys())}},
            {'dataset_code': 1, 'last_update': 1})
        selected_datasets = {s['dataset_code'] : s for s in selected_datasets}

        for dataset_code, dataset in self.selected_datasets.items():
            if (dataset_code not in selected_datasets) or (selected_datasets[dataset_code]['last_update'] < dataset['last_update']):
                try:
                    self.upsert_dataset(dataset_code)
                except Exception as err:
                    logger.fatal("error for dataset[%s]: %s" % (dataset_code, str(err)))

        end = time.time() - start
        logger.info("update provider[%s] - END - time[%.3f seconds]" % (self.provider_name, end))