Example #1
    def _load_file(self):

        filename = "data-%s.zip" % (self.dataset_code)
        download = Downloader(
            url=self.url,
            filename=filename,
            store_filepath=self.get_store_path(),
            use_existing_file=self.fetcher.use_existing_file,
        )
        self.filepath, response = download.get_filepath_and_response()

        if self.filepath:
            self.fetcher.for_delete.append(self.filepath)

        release_date_str = response.headers["Last-Modified"]
        # Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
        self.release_date = clean_datetime(datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

        if self.dataset.last_update and self.dataset.last_update >= self.release_date:
            comments = "update-date[%s]" % self.release_date
            raise errors.RejectUpdatedDataset(
                provider_name=self.provider_name, dataset_code=self.dataset_code, comments=comments
            )

        self.dataset.last_update = self.release_date
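
A side note on the header parsing in Example #1: `strptime` with `%a`/`%b` only works under an English locale, since those directives are locale-dependent. A locale-independent alternative using the standard library (a sketch, not what the fetcher itself does):

# Parse an RFC 2822 date such as the Last-Modified header above.
# email.utils.parsedate_to_datetime is locale-independent, unlike strptime
# with "%a, %d %b %Y %H:%M:%S GMT".
from email.utils import parsedate_to_datetime

release_date = parsedate_to_datetime("Tue, 05 Apr 2016 15:05:11 GMT")
print(release_date)  # 2016-04-05 15:05:11+00:00 (tz-aware; GMT maps to UTC)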
Example #2
    def __init__(self, dataset=None):
        super().__init__(dataset)

        self.store_path = self.get_store_path()
        self.xml_dsd = XMLStructure(provider_name=self.provider_name)

        self._load_dsd()

        if self.dataset.last_update and self.xml_dsd.last_update:

            if self.dataset.last_update > self.xml_dsd.last_update:
                comments = "update-date[%s]" % self.xml_dsd.last_update
                raise errors.RejectUpdatedDataset(
                    provider_name=self.provider_name,
                    dataset_code=self.dataset.dataset_code,
                    comments=comments)

        self.dataset.last_update = clean_datetime(self.xml_dsd.last_update)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dataset_code,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        self.rows = self._get_data_by_dimension()
Example #3
    def __init__(self, dataset=None):
        super().__init__(dataset)

        self.store_path = self.get_store_path()
        self.xml_dsd = XMLStructure(provider_name=self.provider_name)
        
        self._load_dsd()

        if self.dataset.last_update and self.xml_dsd.last_update:
            
            if self.dataset.last_update > self.xml_dsd.last_update:
                comments = "update-date[%s]" % self.xml_dsd.last_update
                raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                                  dataset_code=self.dataset.dataset_code,
                                                  comments=comments)
        
        self.dataset.last_update = clean_datetime(self.xml_dsd.last_update)        

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dataset_code,
                                frequencies_supported=FREQUENCIES_SUPPORTED)
        
        self.rows = self._get_data_by_dimension()
Example #4
    def _load_file(self):

        filename = "data-%s.zip" % (self.dataset_code)
        download = Downloader(
            url=self.url,
            filename=filename,
            store_filepath=self.get_store_path(),
            use_existing_file=self.fetcher.use_existing_file,
        )
        self.filepath, response = download.get_filepath_and_response()

        if self.filepath:
            self.fetcher.for_delete.append(self.filepath)

        release_date_str = response.headers['Last-Modified']
        #Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
        self.release_date = clean_datetime(
            datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

        if self.dataset.last_update and self.dataset.last_update >= self.release_date:
            comments = "update-date[%s]" % self.release_date
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=self.dataset_code,
                                              comments=comments)

        self.dataset.last_update = self.release_date
Example #5
    def _get_release_date(self, url, sheet):
        if 'Section' in url:
            release_datesheet = sheet.cell_value(4, 0)[15:]  #April 28, 2016
        elif 'ITA-XLS' in url or 'IIP-XLS' in url:
            release_datesheet = sheet.cell_value(3, 0)[14:].split('-')[0]
        else:
            release_datesheet = sheet.cell_value(3, 0)[14:]

        return clean_datetime(datetime.strptime(release_datesheet.strip(), "%B %d, %Y"))
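
The fixed offsets `[15:]` and `[14:]` in Example #5 assume the sheet cell starts with a prefix of known length followed by a date such as "April 28, 2016". A minimal sketch of that assumption with a hypothetical cell value, splitting on the colon instead of hard-coding offsets:

from datetime import datetime

cell = "Release Date:  April 28, 2016"  # hypothetical cell content
release = datetime.strptime(cell.split(":", 1)[1].strip(), "%B %d, %Y")
print(release)  # 2016-04-28 00:00:00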
Example #6
    def clean_field(self, bson):

        if not "start_ts" in bson or not bson.get("start_ts"):
            if bson["frequency"] == "A":
                year = int(get_year(bson["values"][0]["period"]))
                bson["start_ts"] = clean_datetime(datetime(year, 1, 1), rm_hour=True, rm_minute=True, rm_second=True, rm_microsecond=True, rm_tzinfo=True) 
            else:
                bson["start_ts"] = clean_datetime(pandas.Period(ordinal=bson["start_date"], freq=bson["frequency"]).start_time.to_datetime())

        if not "end_ts" in bson or not bson.get("end_ts"):
            if bson["frequency"] == "A":
                year = int(get_year(bson["values"][-1]["period"]))
                bson["end_ts"] = clean_datetime(datetime(year, 12, 31), rm_hour=True, rm_minute=True, rm_second=True, rm_microsecond=True, rm_tzinfo=True) 
            else:
                bson["end_ts"] = clean_datetime(pandas.Period(ordinal=bson["end_date"], freq=bson["frequency"]).end_time.to_datetime())
        
        dimensions = bson.pop("dimensions")
        attributes = bson.pop("attributes", {})
        new_dimensions = {}
        new_attributes = {}
        
        for key, value in dimensions.items():
            new_dimensions[slugify(key, save_order=True)] = slugify(value, save_order=True)

        if attributes:
            for key, value in attributes.items():
                new_attributes[slugify(key, save_order=True)] = slugify(value, save_order=True)
            
        bson["dimensions"] = new_dimensions

        if attributes:
            bson["attributes"] = new_attributes
        else:
            bson["attributes"] = None
            
        for value in bson["values"]:
            if not value.get("attributes"):
                continue
            attributes_obs = {}
            for k, v in value.get("attributes").items():
                attributes_obs[slugify(k, save_order=True)] = slugify(v, save_order=True)
            value["attributes"] = attributes_obs
        
        return bson
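
`clean_datetime` itself is not shown in these examples. Below is a minimal sketch of a helper with the same keyword interface, inferred from the calls above; the defaults and exact behavior are assumptions, not the library's actual implementation:

from datetime import datetime

def clean_datetime(dt=None, rm_hour=False, rm_minute=False, rm_second=False,
                   rm_microsecond=True, rm_tzinfo=True):
    # Normalize a datetime by zeroing selected fields; defaults are guesses
    # based on the keyword arguments used in Example #6.
    dt = dt or datetime.utcnow()
    fields = {}
    if rm_hour:
        fields["hour"] = 0
    if rm_minute:
        fields["minute"] = 0
    if rm_second:
        fields["second"] = 0
    if rm_microsecond:
        fields["microsecond"] = 0
    if rm_tzinfo:
        fields["tzinfo"] = None
    return dt.replace(**fields)

print(clean_datetime(datetime(2016, 4, 5, 15, 5, 11, 123456)))
# -> 2016-04-05 15:05:11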
Example #7
    def update_database(self, save_only=False):

        self.fetcher.hook_before_dataset(self)
        
        try:
            if not save_only:
                if self.fetcher.async_mode and self.fetcher.async_framework == "gevent":
                    self.series.process_series_data_async()
                else:
                    self.series.process_series_data()
        except Exception:
            self.fetcher.errors += 1
            logger.critical(last_error())
            if self.fetcher.max_errors and self.fetcher.errors >= self.fetcher.max_errors:
                msg = "The maximum number of errors is exceeded for provider[%s] - dataset[%s]. MAX[%s]"
                raise errors.MaxErrors(msg % (self.provider_name,
                                              self.dataset_code,
                                              self.fetcher.max_errors))
        finally:
            now = clean_datetime()
    
            if not self.download_first:
                self.download_first = now
    
            self.download_last = now
    
            schemas.dataset_schema(self.bson)
            
            if not self.is_recordable():
                self.enable = False
                msg = "disable dataset[%s] for provider[%s]"
                logger.warning(msg % (self.dataset_code, 
                                      self.provider_name))
            else:
                self.enable = True

            if logger.isEnabledFor(logging.INFO):    
                msg_stats = "STATS dataset-update: provider[%s] - dataset[%s] - accepts[%s] - rejects[%s] - inserts[%s] - updates[%s]"
                logger.info(msg_stats % (self.provider_name,
                                         self.dataset_code,
                                         self.series.count_accepts,
                                         self.series.count_rejects,
                                         self.series.count_inserts,
                                         self.series.count_updates))
            
            if save_only:
                self.series.reset_counters()
                        
            result = self.update_mongo_collection(constants.COL_DATASETS,
                                                  ['provider_name', 
                                                   'dataset_code'],
                                                  self.bson)
    
            self.fetcher.hook_after_dataset(self)
    
            return result
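
One subtlety in Example #7: the `return result` sits inside the `finally` block, so it replaces any exception in flight, including the `errors.MaxErrors` raised in the `except` clause. A standalone demonstration of that Python behavior:

def swallows():
    try:
        raise RuntimeError("lost")
    finally:
        # A return inside finally discards the in-flight exception.
        return "returned instead"

print(swallows())  # prints "returned instead"; RuntimeError never propagates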
Example #8
    def _get_release_date(self, url, sheet):
        if 'Section' in url:
            release_datesheet = sheet.cell_value(4, 0)[15:]  #April 28, 2016
        elif 'ITA-XLS' in url or 'IIP-XLS' in url:
            release_datesheet = sheet.cell_value(3, 0)[14:].split('-')[0]
        else:
            release_datesheet = sheet.cell_value(3, 0)[14:]

        return clean_datetime(
            datetime.strptime(release_datesheet.strip(), "%B %d, %Y"))
Example #9
    def upsert_dataset(self, dataset_code):
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name="My Dataset Name",
                           last_update=clean_datetime(), 
                           fetcher=self)
        
        fetcher_data = DUMMY_Data(dataset)
        dataset.series.data_iterator = fetcher_data

        return dataset.update_database()
Example #10
    def upsert_dataset(self, dataset_code):
        
        self._load_structure()
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           last_update=utils.clean_datetime(),
                           fetcher=self)

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()
Example #11
    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=self.provider.website,
                           fetcher=self)
        dataset.last_update = utils.clean_datetime()

        _data = ECB_Data(dataset=dataset)
        dataset.series.data_iterator = _data
        return dataset.update_database()
Example #12
    def upsert_dataset(self, dataset_code):
        
        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code, 
                           name="My Dataset Name",
                           last_update=clean_datetime(), 
                           fetcher=self)
        dataset.codelists = {
            'COUNTRY': {'FRA': 'France'},
            'OBS_STATUS': {'A': "A"}
        }
        fetcher_data = DUMMY_Data(dataset)
        dataset.series.data_iterator = fetcher_data

        return dataset.update_database()
Example #13
    def upsert_dataset(self, dataset_code):

        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown" + dataset_code)

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           fetcher=self)
        dataset.last_update = clean_datetime()

        dataset.series.data_iterator = OECD_Data(
            dataset, sdmx_filter=DATASETS[dataset_code]['sdmx_filter'])

        return dataset.update_database()
Example #14
    def upsert_dataset(self, dataset_code):

        if not DATASETS.get(dataset_code):
            raise Exception("This dataset is unknown: " + dataset_code)

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=DATASETS[dataset_code]['name'],
                           doc_href=DATASETS[dataset_code]['doc_href'],
                           last_update=clean_datetime(),
                           fetcher=self)

        dataset.series.data_iterator = OECD_Data(dataset,
                                                 sdmx_filter=DATASETS[dataset_code]['sdmx_filter'])

        return dataset.update_database()
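
Examples #13 and #14 rely on a module-level `DATASETS` registry whose shape can be inferred from the keys they read. A hypothetical entry, for illustration only; the code and values are assumed, not taken from the real fetcher:

DATASETS = {
    "MEI": {                                   # hypothetical dataset code
        "name": "Main Economic Indicators",    # assumed display name
        "doc_href": "http://stats.oecd.org",   # assumed documentation URL
        "sdmx_filter": "",                     # SDMX key filter passed to OECD_Data
    },
}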
Example #15
    def upsert_dataset(self, dataset_code):
        
        self.get_selected_datasets()
        
        dataset_settings = self.selected_datasets[dataset_code]

        #http://data.worldbank.org/indicator/AG.AGR.TRAC.NO
        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=dataset_settings["name"],
                           last_update=clean_datetime(),
                           fetcher=self)
        
        dataset.series.data_iterator = WorldBankAPIData(dataset, dataset_settings)
        
        return dataset.update_database()
Example #16
    def upsert_dataset(self, dataset_code):

        self._load_structure_dataflows()
        self._load_structure_concepts()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           fetcher=self)
        dataset.last_update = clean_datetime()

        insee_data = INSEE_Data(dataset)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()
Example #17
    def upsert_dataset(self, dataset_code):

        self._load_structure_dataflows()
        self._load_structure_concepts()

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           fetcher=self)
        dataset.last_update = clean_datetime()

        insee_data = INSEE_Data(dataset)
        dataset.series.data_iterator = insee_data

        return dataset.update_database()
Example #18
    def upsert_dataset(self, dataset_code):

        self.get_selected_datasets()

        dataset_settings = self.selected_datasets[dataset_code]

        dataset = Datasets(
            provider_name=self.provider_name, dataset_code=dataset_code, name=dataset_settings["name"], fetcher=self
        )

        if dataset_code in DATASETS:
            dataset.series.data_iterator = ExcelData(dataset, DATASETS[dataset_code]["url"])
            dataset.doc_href = DATASETS[dataset_code]["doc_href"]
        else:
            dataset.last_update = clean_datetime()
            dataset.series.data_iterator = WorldBankAPIData(dataset, dataset_settings)

        return dataset.update_database()
Example #19
    def upsert_dataset(self, dataset_code):

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name="My Dataset Name",
                           last_update=clean_datetime(),
                           fetcher=self)
        dataset.codelists = {
            'COUNTRY': {
                'FRA': 'France'
            },
            'OBS_STATUS': {
                'A': "A"
            }
        }
        fetcher_data = DUMMY_Data(dataset)
        dataset.series.data_iterator = fetcher_data

        return dataset.update_database()
Example #20
    def upsert_dataset(self, dataset_code):

        self._load_structure()

        dataset = Datasets(provider_name=self.provider_name, 
                           dataset_code=dataset_code,
                           name=None,
                           doc_href=None,
                           last_update=clean_datetime(),
                           fetcher=self)
        
        query = {'provider_name': self.provider_name, 
                 "dataset_code": dataset_code}        
        dataset_doc = self.db[constants.COL_DATASETS].find_one(query)
        
        insee_data = INSEE_Data(dataset,
                                dataset_doc=dataset_doc)
        dataset.series.data_iterator = insee_data
        
        return dataset.update_database()
Example #21
    def upsert_dataset(self, dataset_code):

        self.get_selected_datasets()

        dataset_settings = self.selected_datasets[dataset_code]

        dataset = Datasets(provider_name=self.provider_name,
                           dataset_code=dataset_code,
                           name=dataset_settings["name"],
                           fetcher=self)

        if dataset_code in DATASETS:
            dataset.series.data_iterator = ExcelData(
                dataset, DATASETS[dataset_code]["url"])
            dataset.doc_href = DATASETS[dataset_code]["doc_href"]
        else:
            dataset.last_update = clean_datetime()
            dataset.series.data_iterator = WorldBankAPIData(
                dataset, dataset_settings)

        return dataset.update_database()
Example #22
    def _process(self):
        
        for current_indicator in self.indicators:
            self.current_indicator = current_indicator
            
            #if not self.current_indicator["id"] == "CC.EST":
            #    continue
            
            count = 0
            
            if self.current_indicator["id"] in self.blacklist_indicator:
                continue
            
            is_release_controled = False
            is_rejected = False
            
            slug_indicator = slugify(self.current_indicator["id"], save_order=True)
            
            for current_country in self.countries_to_process:
                self.current_country = current_country
            
                logger.info("Fetching dataset[%s] - indicator[%s] - country[%s]" % (self.dataset_code, 
                                                                                    self.current_indicator["id"], 
                                                                                    self.current_country))
    
                release_date, datas = self._download_values(self.current_country,
                                                            self.current_indicator["id"])
            
                if not datas:
                    continue
                
                self.release_date = clean_datetime(datetime.strptime(release_date, '%Y-%m-%d'))

                if is_release_controled is False:
                    
                    is_release_controled = True
                    
                    if self.dataset.metadata["indicators"].get(slug_indicator):
                        
                        if self.release_date >= self.dataset.metadata["indicators"][slug_indicator]:
                            msg = "Reject series updated for provider[%s] - dataset[%s] - key[%s]"
                            logger.info(msg % (self.provider_name, 
                                               self.dataset_code, 
                                               self.current_indicator["id"]))
                            
                            is_rejected = True
                            break

                    self.dataset.metadata["indicators"][slug_indicator] = self.release_date
                
                count += 1
    
                yield {"datas": datas}, None
            
            if not is_rejected:
                logger.info("TOTAL - dataset[%s] - indicator[%s] - count[%s]" % (self.dataset_code,
                                                                                 self.current_indicator["id"],
                                                                                 count))
                if count == 0:
                    logger.warning("EMPTY dataset[%s] - indicator[%s]"  % (self.dataset_code,
                                                                       self.current_indicator["id"]))

        yield None, None
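
`_process` in Example #22 yields `({"datas": ...}, None)` tuples and ends with a `(None, None)` sentinel. A sketch of a consumer loop matching that protocol; the real series pipeline is not shown here, and the error slot's semantics are assumed:

def consume(rows):
    for payload, err in rows:
        if payload is None:   # final (None, None) sentinel ends the stream
            break
        if err is not None:   # error slot: assumed to mark a rejected row
            continue
        print("got %d observations" % len(payload["datas"]))

consume(iter([({"datas": [1, 2, 3]}, None), (None, None)]))  # got 3 observations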
Example #23
    def build_data_tree(self):
        """
        http://api.worldbank.org/v2/datacatalog?format=xml&per_page=20
        http://api.worldbank.org/v2/datacatalog/3?format=json&per_page=20

        > the catalog again, but restricted to selected fields:
        http://api.worldbank.org/v2/datacatalog/metatypes/name;type;acronym?format=json&per_page=200
        http://api.worldbank.org/v2/datacatalog/metatypes/type;url;lastrevisiondate?format=json&per_page=50

        > check whether numberofeconomies equals the number of series?

        > calendar: updatefrequency, updateschedule

        > use detailpageurl for doc_href
        
        datacatalog": [
            {
                id": "3",
                "metatype": [
                    {
                    "id": "name",
                    "value": "Global Economic Monitor"
                    },
                    {
                        "id": "acronym",
                        "value": "GEM"
                    },
                    {
                        "id": "description",
                        "value": "Providing...."
                    },
                    {
                        "id": "url",
                        "value": "http://databank.worldbank.org/data/views/variableselection/selectvariables.aspx?source=global-economic-monitor-(gem)"                        
                    },
                    {
                        "id": "apisourceid",    !!! lien avec id source !
                        "value": "15"
                    }                    
                    
            },         
        ]        
        """

        categories = []
        
        position = 0
        
        for page in self.download_json('sources'):
            for source in page[1]:
        
                if source["id"] in self.blacklist:
                    continue
                
                position += 1
                
                cat = {
                    "provider_name": self.provider_name,
                    "category_code": source["code"],
                    "name": source["name"],
                    #TODO: "doc_href": ?,
                    "position": position,
                    "datasets": [{
                        "name": source["name"], 
                        "dataset_code": source["code"],
                        "last_update": None, 
                        "metadata": {"id": source["id"]}
                    }]
                }
                categories.append(cat)

        return categories

        # NOTE: everything below this return is unreachable; it is an
        # alternative, catalog-based implementation kept disabled.
        """
        http://api.worldbank.org/v2/datacatalog?format=json&per_page=20

        FIXME: via the catalog, some datasets are missing; only these appear:
        ADI                  | Africa Development Indicators                                          | 2013-02-22
        DB                   | Doing Business                                                         | 2015-11-24
        EdStats              | Education Statistics                                                   | 2016-03-04
        GEM                  | Global Economic Monitor                                                | 2016-03-22
        GEP                  | Global Economic Prospects                                              | 2016-01-06
        GFDD                 | Global Financial Development                                           | 2015-09-14
        GPE                  | GPE Results Forms Database                                             | 2013-01-10
        Global Findex        | Global Financial Inclusion (Global Findex) Database                    | 2015-04-15
        IDA                  | IDA Results Measurement System                                         | 2015-12-30
        IDS                  | International Debt Statistics                                          | 2015-12-16
        JOBS                 | Jobs                                                                   | 2015-09-21
        MDGs                 | Millennium Development Goals                                           | 2015-11-16
        QEDS/GDDS            | Quarterly External Debt Statistics GDDS (New)                          | 2016-01-28
        QEDS/SDDS            | Quarterly External Debt Statistics SDDS (New)                          | 2016-01-28
        SE4ALL               | Sustainable Energy for All                                             | 2015-09-09
        WDI                  | World Development Indicators                                           | 2016-02-17
        WGI                  | Worldwide Governance Indicators                                        | 2015-09-25        
        """
        
        for page in self.download_json('datacatalog'):
            
            for source in page["datacatalog"]:
                name = None
                is_time_series = False
                dataset_id = None
                dataset_code = None
                doc_href = None
                last_update = None
                metadata = {}
                for value in source["metatype"]:
                    if value["id"] == "type" and value["value"] == "Time series":
                        is_time_series = True
                    elif value["id"] == "name":
                        name = value["value"]
                    elif value["id"] == "acronym":
                        dataset_code = value["value"]
                    elif value["id"] == "apisourceid":
                        metadata["id"] = value["value"]
                        dataset_id = value["value"]
                    elif value["id"] == "detailpageurl":
                        doc_href = value["value"]
                    elif value["id"] == "lastrevisiondate":
                        print("Date: ", value["value"])
                        if value["value"].lower() == "current":
                            last_update = clean_datetime()
                        else:
                            try:
                                last_update = clean_datetime(datetime.strptime(value["value"], '%d-%b-%Y')) #17-Feb-2016
                            except Exception:
                                pass
                    elif value["id"] == "updatefrequency":
                        metadata["updatefrequency"] = value["value"]  
                    elif value["id"] == "updateschedule":
                        metadata["updateschedule"] = value["value"]  
                
                if not dataset_id or is_time_series is False or not dataset_code or dataset_id in self.blacklist:
                    continue
                
                position += 1
                
                cat = {
                    "provider_name": self.provider_name,
                    "category_code": dataset_code,
                    "name": name,
                    "doc_href": doc_href,
                    "position": position,
                    "datasets": [{
                        "dataset_code": dataset_code,
                        "name": name, 
                        "last_update": last_update or clean_datetime(), 
                        "metadata": metadata
                    }]
                }
                categories.append(cat)
        
        return categories
Example #24
def series_update(new_bson, old_bson=None, last_update=None):

    if not new_bson or not isinstance(new_bson, dict):
        raise ValueError("new_bson is missing or is not a dict instance")

    if old_bson and not isinstance(old_bson, dict):
        raise ValueError("old_bson is not a dict instance")

    if not "values" in new_bson:
        raise ValueError("not values field in new_bson")

    if old_bson and not "values" in old_bson:
        raise ValueError("not values field in old_bson")
    
    if not isinstance(new_bson["values"][0], dict):
        raise ValueError("Invalid format for this series : %s" % new_bson)

    if new_bson["start_date"] > new_bson["end_date"]:
        raise errors.RejectInvalidSeries("Invalid dates. start_date > end_date",
                                         provider_name=new_bson["provider_name"],
                                         dataset_code=new_bson["dataset_code"],
                                         bson=new_bson) 

    #FIXME:
    """
    if new_bson["frequency"] != "D" and len(new_bson["values"]) > 1:
        count_obs = (new_bson["end_date"] - new_bson["start_date"]) +1
        if len(new_bson["values"]) != count_obs:
            msg = "Missing values for provider[%s] - dataset[%s] - current[%s] - attempt[%s]" % (new_bson["provider_name"],
                                                                     new_bson["dataset_code"],
                                                                     len(new_bson["values"]),
                                                                     count_obs)
            raise Exception(msg)
    """

    _last_update = None
    if new_bson.get('last_update'):
        _last_update = clean_datetime(new_bson.pop('last_update', None))
    else:
        _last_update = clean_datetime(last_update)
    
    new_bson.pop('last_update', None)
        
    #TODO: replace missing values with a unique string: NaN

    series_set_release_date(new_bson, _last_update)

    if not old_bson:
        if not IS_SCHEMAS_VALIDATION_DISABLE:
            schemas.series_schema(new_bson)
        return new_bson
    else:
        changed = series_revisions(new_bson, old_bson, _last_update)
        
        if not changed:
            changed = series_is_changed(new_bson, old_bson)
            
        if not changed:
            return

        if not IS_SCHEMAS_VALIDATION_DISABLE:
            schemas.series_schema(new_bson)
        
    return new_bson
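
For orientation, a minimal shape of `new_bson` as `series_update` reads it. The fields and encodings are inferred from the accesses above and from Example #6 (`start_date`/`end_date` as pandas Period ordinals); the `value` key is an assumption, and the authoritative schema lives in `schemas.series_schema`:

new_bson = {
    "provider_name": "DUMMY",
    "dataset_code": "dataset-1",
    "frequency": "A",
    "start_date": 40,   # pandas Period ordinal: Period(ordinal=40, freq="A") is 2010
    "end_date": 41,
    "values": [
        {"period": "2010", "value": "1.0", "attributes": None},
        {"period": "2011", "value": "1.5", "attributes": None},
    ],
    "last_update": None,
}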
Example #25
    def build_data_tree(self):
        """Builds the data tree
        """

        download = Downloader(url=self.url_table_of_contents,
                              filename="table_of_contents.xml",
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath,
                             events=['end'],
                             tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):

            position += 1

            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        'all_parents': all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(
                    datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg %
                                   (creation_date,
                                    self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                         namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                namespaces=TABLE_OF_CONTENT_NSMAP)
            category_code = parent_codes[-1]

            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                }
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories
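
Example #25 streams the table of contents with `etree.iterparse` but keeps the parsed tree in memory. The usual lxml companion pattern, shown here as a general sketch (not part of the fetcher above), clears each element after processing so memory stays bounded on large documents:

from lxml import etree

def process(elem):
    pass  # stand-in for the per-dataset handling in build_data_tree

for event, elem in etree.iterparse("table_of_contents.xml", events=["end"],
                                   tag="{urn:eu.europa.ec.eurostat.navtree}leaf"):
    process(elem)
    elem.clear()                           # free the element's own subtree
    while elem.getprevious() is not None:  # drop already-processed siblings
        del elem.getparent()[0]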
Example #26
    def build_data_tree(self):
        """
        http://api.worldbank.org/v2/datacatalog?format=xml&per_page=20
        http://api.worldbank.org/v2/datacatalog/3?format=json&per_page=20

        > the catalog again, but restricted to selected fields:
        http://api.worldbank.org/v2/datacatalog/metatypes/name;type;acronym?format=json&per_page=200
        http://api.worldbank.org/v2/datacatalog/metatypes/type;url;lastrevisiondate?format=json&per_page=50

        > check whether numberofeconomies equals the number of series?

        > calendar: updatefrequency, updateschedule

        > use detailpageurl for doc_href
        
        datacatalog": [
            {
                id": "3",
                "metatype": [
                    {
                    "id": "name",
                    "value": "Global Economic Monitor"
                    },
                    {
                        "id": "acronym",
                        "value": "GEM"
                    },
                    {
                        "id": "description",
                        "value": "Providing...."
                    },
                    {
                        "id": "url",
                        "value": "http://databank.worldbank.org/data/views/variableselection/selectvariables.aspx?source=global-economic-monitor-(gem)"                        
                    },
                    {
                        "id": "apisourceid",    !!! lien avec id source !
                        "value": "15"
                    }                    
                    
            },         
        ]        
        """

        categories = []

        position = 0

        for page in self.download_json('sources'):
            for source in page[1]:

                if source["id"] in self.blacklist:
                    continue

                position += 1

                cat = {
                    "provider_name": self.provider_name,
                    "category_code": source["code"],
                    "name": source["name"],
                    #TODO: "doc_href": ?,
                    "position": position,
                    "datasets": [{
                        "name": source["name"],
                        "dataset_code": source["code"],
                        "last_update": None,
                        "metadata": {"id": source["id"]}
                    }]
                }
                categories.append(cat)

        return categories

        # NOTE: everything below this return is unreachable; it is an
        # alternative, catalog-based implementation kept disabled.
        """
        http://api.worldbank.org/v2/datacatalog?format=json&per_page=20

        FIXME: via the catalog, some datasets are missing; only these appear:
        ADI                  | Africa Development Indicators                                          | 2013-02-22
        DB                   | Doing Business                                                         | 2015-11-24
        EdStats              | Education Statistics                                                   | 2016-03-04
        GEM                  | Global Economic Monitor                                                | 2016-03-22
        GEP                  | Global Economic Prospects                                              | 2016-01-06
        GFDD                 | Global Financial Development                                           | 2015-09-14
        GPE                  | GPE Results Forms Database                                             | 2013-01-10
        Global Findex        | Global Financial Inclusion (Global Findex) Database                    | 2015-04-15
        IDA                  | IDA Results Measurement System                                         | 2015-12-30
        IDS                  | International Debt Statistics                                          | 2015-12-16
        JOBS                 | Jobs                                                                   | 2015-09-21
        MDGs                 | Millennium Development Goals                                           | 2015-11-16
        QEDS/GDDS            | Quarterly External Debt Statistics GDDS (New)                          | 2016-01-28
        QEDS/SDDS            | Quarterly External Debt Statistics SDDS (New)                          | 2016-01-28
        SE4ALL               | Sustainable Energy for All                                             | 2015-09-09
        WDI                  | World Development Indicators                                           | 2016-02-17
        WGI                  | Worldwide Governance Indicators                                        | 2015-09-25        
        """

        for page in self.download_json('datacatalog'):

            for source in page["datacatalog"]:
                name = None
                is_time_series = False
                dataset_id = None
                dataset_code = None
                doc_href = None
                last_update = None
                metadata = {}
                for value in source["metatype"]:
                    if value["id"] == "type" and value[
                            "value"] == "Time series":
                        is_time_series = True
                    elif value["id"] == "name":
                        name = value["value"]
                    elif value["id"] == "acronym":
                        dataset_code = value["value"]
                    elif value["id"] == "apisourceid":
                        metadata["id"] = value["value"]
                        dataset_id = value["value"]
                    elif value["id"] == "detailpageurl":
                        doc_href = value["value"]
                    elif value["id"] == "lastrevisiondate":
                        print("Date: ", value["value"])
                        if value["value"].lower() == "current":
                            last_update = clean_datetime()
                        else:
                            try:
                                last_update = clean_datetime(
                                    datetime.strptime(value["value"],
                                                      '%d-%b-%Y'))  #17-Feb-2016
                            except Exception:
                                pass
                    elif value["id"] == "updatefrequency":
                        metadata["updatefrequency"] = value["value"]
                    elif value["id"] == "updateschedule":
                        metadata["updateschedule"] = value["value"]

                if not dataset_id or is_time_series is False or not dataset_code or dataset_id in self.blacklist:
                    continue

                position += 1

                cat = {
                    "provider_name": self.provider_name,
                    "category_code": dataset_code,
                    "name": name,
                    "doc_href": doc_href,
                    "position": position,
                    "datasets": [{
                        "dataset_code": dataset_code,
                        "name": name,
                        "last_update": last_update or clean_datetime(),
                        "metadata": metadata
                    }]
                }
                categories.append(cat)

        return categories
Example #27
    def _process(self):

        for current_indicator in self.indicators:
            self.current_indicator = current_indicator

            #if not self.current_indicator["id"] == "CC.EST":
            #    continue

            count = 0

            if self.current_indicator["id"] in self.blacklist_indicator:
                continue

            is_release_controled = False
            is_rejected = False

            slug_indicator = slugify(self.current_indicator["id"],
                                     save_order=True)

            for current_country in self.countries_to_process:
                self.current_country = current_country

                logger.info(
                    "Fetching dataset[%s] - indicator[%s] - country[%s]" %
                    (self.dataset_code, self.current_indicator["id"],
                     self.current_country))

                release_date, datas = self._download_values(
                    self.current_country, self.current_indicator["id"])

                if not datas:
                    continue

                self.release_date = clean_datetime(
                    datetime.strptime(release_date, '%Y-%m-%d'))

                if is_release_controled is False:

                    is_release_controled = True

                    if self.dataset.metadata["indicators"].get(slug_indicator):

                        if self.release_date >= self.dataset.metadata["indicators"][slug_indicator]:
                            msg = "Reject series updated for provider[%s] - dataset[%s] - key[%s]"
                            logger.info(msg %
                                        (self.provider_name, self.dataset_code,
                                         self.current_indicator["id"]))

                            is_rejected = True
                            break

                    self.dataset.metadata["indicators"][
                        slug_indicator] = self.release_date
                    self.dataset.last_update = clean_datetime()

                count += 1

                yield {"datas": datas}, None

            if not is_rejected:
                logger.info(
                    "TOTAL - dataset[%s] - indicator[%s] - count[%s]" %
                    (self.dataset_code, self.current_indicator["id"], count))
                if count == 0:
                    logger.warning(
                        "EMPTY dataset[%s] - indicator[%s]" %
                        (self.dataset_code, self.current_indicator["id"]))

        yield None, None
Example #28
    def build_data_tree(self):
        """Builds the data tree
        """

        download = Downloader(
            url=self.url_table_of_contents,
            filename="table_of_contents.xml",
            store_filepath=self.store_path,
            use_existing_file=self.use_existing_file,
        )
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath, events=["end"], tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):

            position += 1

            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        "all_parents": all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None,
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(datetime.strptime(creation_date_str, "%Y%m%dT%H%M"))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()", namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()", namespaces=TABLE_OF_CONTENT_NSMAP
            )
            category_code = parent_codes[-1]

            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], "%d.%m.%Y")
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], "%d.%m.%Y")
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                },
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories