def _load_file(self):
    filename = "data-%s.zip" % self.dataset_code
    download = Downloader(url=self.url,
                          filename=filename,
                          store_filepath=self.get_store_path(),
                          use_existing_file=self.fetcher.use_existing_file)
    self.filepath, response = download.get_filepath_and_response()

    if self.filepath:
        self.fetcher.for_delete.append(self.filepath)

    # Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
    release_date_str = response.headers["Last-Modified"]
    self.release_date = clean_datetime(
        datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

    if self.dataset.last_update and self.dataset.last_update >= self.release_date:
        comments = "update-date[%s]" % self.release_date
        raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                          dataset_code=self.dataset_code,
                                          comments=comments)

    self.dataset.last_update = self.release_date

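# A minimal standalone sketch (illustration only, not part of the fetcher) of
# the Last-Modified parsing above; the header uses the RFC 7231 fixed-date
# format shown in the comment:
from datetime import datetime

release_date_str = "Tue, 05 Apr 2016 15:05:11 GMT"
release_date = datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT")
assert release_date == datetime(2016, 4, 5, 15, 5, 11)
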
def __init__(self, dataset=None):
    super().__init__(dataset)

    self.store_path = self.get_store_path()
    self.xml_dsd = XMLStructure(provider_name=self.provider_name)

    self._load_dsd()

    if self.dataset.last_update and self.xml_dsd.last_update:
        if self.dataset.last_update > self.xml_dsd.last_update:
            comments = "update-date[%s]" % self.xml_dsd.last_update
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=self.dataset.dataset_code,
                                              comments=comments)

    self.dataset.last_update = clean_datetime(self.xml_dsd.last_update)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            frequencies_supported=FREQUENCIES_SUPPORTED)

    self.rows = self._get_data_by_dimension()

def _get_release_date(self, url, sheet):
    if 'Section' in url:
        release_datesheet = sheet.cell_value(4, 0)[15:]  # April 28, 2016
    elif 'ITA-XLS' in url or 'IIP-XLS' in url:
        release_datesheet = sheet.cell_value(3, 0)[14:].split('-')[0]
    else:
        release_datesheet = sheet.cell_value(3, 0)[14:]
    return clean_datetime(datetime.strptime(release_datesheet.strip(), "%B %d, %Y"))

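# Hedged illustration of the fixed-offset slicing above. The exact BEA cell
# text is an assumption; the [15:] / [14:] offsets imply headers shaped like:
text_section = "Release date:  April 28, 2016"       # hypothetical cell (4, 0)
text_ita = "Release date: April 28, 2016-revised"    # hypothetical cell (3, 0)
assert text_section[15:].strip() == "April 28, 2016"
assert text_ita[14:].split('-')[0].strip() == "April 28, 2016"
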
def clean_field(self, bson):

    if not bson.get("start_ts"):
        if bson["frequency"] == "A":
            year = int(get_year(bson["values"][0]["period"]))
            bson["start_ts"] = clean_datetime(datetime(year, 1, 1),
                                              rm_hour=True, rm_minute=True,
                                              rm_second=True, rm_microsecond=True,
                                              rm_tzinfo=True)
        else:
            # Timestamp.to_datetime() is the pre-0.21 pandas API
            # (newer pandas spells it .to_pydatetime())
            bson["start_ts"] = clean_datetime(
                pandas.Period(ordinal=bson["start_date"],
                              freq=bson["frequency"]).start_time.to_datetime())

    if not bson.get("end_ts"):
        if bson["frequency"] == "A":
            year = int(get_year(bson["values"][-1]["period"]))
            bson["end_ts"] = clean_datetime(datetime(year, 12, 31),
                                            rm_hour=True, rm_minute=True,
                                            rm_second=True, rm_microsecond=True,
                                            rm_tzinfo=True)
        else:
            bson["end_ts"] = clean_datetime(
                pandas.Period(ordinal=bson["end_date"],
                              freq=bson["frequency"]).end_time.to_datetime())

    dimensions = bson.pop("dimensions")
    attributes = bson.pop("attributes", {})
    new_dimensions = {}
    new_attributes = {}

    for key, value in dimensions.items():
        new_dimensions[slugify(key, save_order=True)] = slugify(value, save_order=True)

    if attributes:
        for key, value in attributes.items():
            new_attributes[slugify(key, save_order=True)] = slugify(value, save_order=True)

    bson["dimensions"] = new_dimensions

    if attributes:
        bson["attributes"] = new_attributes
    else:
        bson["attributes"] = None

    for value in bson["values"]:
        if not value.get("attributes"):
            continue
        attributes_obs = {}
        for k, v in value.get("attributes").items():
            attributes_obs[slugify(k, save_order=True)] = slugify(v, save_order=True)
        value["attributes"] = attributes_obs

    return bson

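# clean_datetime itself is not shown in this listing. A minimal sketch that is
# consistent with the rm_* keywords used above (an assumption, not the
# project's actual implementation):
from datetime import datetime

def clean_datetime(dt=None, rm_hour=False, rm_minute=False, rm_second=False,
                   rm_microsecond=True, rm_tzinfo=True):
    dt = dt or datetime.now()
    kwargs = {}
    if rm_hour:
        kwargs["hour"] = 0
    if rm_minute:
        kwargs["minute"] = 0
    if rm_second:
        kwargs["second"] = 0
    if rm_microsecond:
        kwargs["microsecond"] = 0
    if rm_tzinfo:
        kwargs["tzinfo"] = None
    return dt.replace(**kwargs)
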
def update_database(self, save_only=False):

    self.fetcher.hook_before_dataset(self)

    try:
        if not save_only:
            if self.fetcher.async_mode and self.fetcher.async_framework == "gevent":
                self.series.process_series_data_async()
            else:
                self.series.process_series_data()
    except Exception:
        self.fetcher.errors += 1
        logger.critical(last_error())
        if self.fetcher.max_errors and self.fetcher.errors >= self.fetcher.max_errors:
            msg = "The maximum number of errors is exceeded for provider[%s] - dataset[%s]. MAX[%s]"
            raise errors.MaxErrors(msg % (self.provider_name,
                                          self.dataset_code,
                                          self.fetcher.max_errors))
    finally:
        now = clean_datetime()
        if not self.download_first:
            self.download_first = now
        self.download_last = now

        schemas.dataset_schema(self.bson)

        if not self.is_recordable():
            self.enable = False
            msg = "disable dataset[%s] for provider[%s]"
            logger.warning(msg % (self.dataset_code, self.provider_name))
        else:
            self.enable = True

        if logger.isEnabledFor(logging.INFO):
            msg_stats = "STATS dataset-update: provider[%s] - dataset[%s] - accepts[%s] - rejects[%s] - inserts[%s] - updates[%s]"
            logger.info(msg_stats % (self.provider_name,
                                     self.dataset_code,
                                     self.series.count_accepts,
                                     self.series.count_rejects,
                                     self.series.count_inserts,
                                     self.series.count_updates))

        if save_only:
            self.series.reset_counters()

        result = self.update_mongo_collection(constants.COL_DATASETS,
                                              ['provider_name', 'dataset_code'],
                                              self.bson)

        self.fetcher.hook_after_dataset(self)

        return result

def upsert_dataset(self, dataset_code):

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name="My Dataset Name",
                       last_update=clean_datetime(),
                       fetcher=self)

    fetcher_data = DUMMY_Data(dataset)
    dataset.series.data_iterator = fetcher_data

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self._load_structure()

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=None,
                       doc_href=self.provider.website,
                       last_update=utils.clean_datetime(),
                       fetcher=self)

    _data = ECB_Data(dataset=dataset)
    dataset.series.data_iterator = _data

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self._load_structure()

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=None,
                       doc_href=self.provider.website,
                       fetcher=self)

    dataset.last_update = utils.clean_datetime()

    _data = ECB_Data(dataset=dataset)
    dataset.series.data_iterator = _data

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name="My Dataset Name",
                       last_update=clean_datetime(),
                       fetcher=self)

    dataset.codelists = {
        'COUNTRY': {'FRA': 'France'},
        'OBS_STATUS': {'A': "A"}
    }

    fetcher_data = DUMMY_Data(dataset)
    dataset.series.data_iterator = fetcher_data

    return dataset.update_database()

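# Design note shared by every upsert_dataset variant in this listing: build a
# Datasets document, attach a data_iterator that yields series, then let
# update_database() drive iteration, error counting and persistence. A hedged
# call sketch (names are hypothetical and require a configured fetcher):
#
#   fetcher = DUMMY(db=mongo_db)
#   result = fetcher.upsert_dataset("DSET1")
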
def upsert_dataset(self, dataset_code):

    if not DATASETS.get(dataset_code):
        raise Exception("This dataset is unknown: " + dataset_code)

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=DATASETS[dataset_code]['name'],
                       doc_href=DATASETS[dataset_code]['doc_href'],
                       fetcher=self)

    dataset.last_update = clean_datetime()

    dataset.series.data_iterator = OECD_Data(dataset,
                                             sdmx_filter=DATASETS[dataset_code]['sdmx_filter'])

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    if not DATASETS.get(dataset_code):
        raise Exception("This dataset is unknown: " + dataset_code)

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=DATASETS[dataset_code]['name'],
                       doc_href=DATASETS[dataset_code]['doc_href'],
                       last_update=clean_datetime(),
                       fetcher=self)

    dataset.series.data_iterator = OECD_Data(dataset,
                                             sdmx_filter=DATASETS[dataset_code]['sdmx_filter'])

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self.get_selected_datasets()

    dataset_settings = self.selected_datasets[dataset_code]

    # http://data.worldbank.org/indicator/AG.AGR.TRAC.NO
    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=dataset_settings["name"],
                       last_update=clean_datetime(),
                       fetcher=self)

    dataset.series.data_iterator = WorldBankAPIData(dataset, dataset_settings)

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self._load_structure_dataflows()
    self._load_structure_concepts()

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=None,
                       doc_href=None,
                       fetcher=self)

    dataset.last_update = clean_datetime()

    insee_data = INSEE_Data(dataset)
    dataset.series.data_iterator = insee_data

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self.get_selected_datasets()

    dataset_settings = self.selected_datasets[dataset_code]

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=dataset_settings["name"],
                       fetcher=self)

    if dataset_code in DATASETS:
        dataset.series.data_iterator = ExcelData(dataset, DATASETS[dataset_code]["url"])
        dataset.doc_href = DATASETS[dataset_code]["doc_href"]
    else:
        dataset.last_update = clean_datetime()
        dataset.series.data_iterator = WorldBankAPIData(dataset, dataset_settings)

    return dataset.update_database()

def upsert_dataset(self, dataset_code):

    self._load_structure()

    dataset = Datasets(provider_name=self.provider_name,
                       dataset_code=dataset_code,
                       name=None,
                       doc_href=None,
                       last_update=clean_datetime(),
                       fetcher=self)

    query = {'provider_name': self.provider_name, "dataset_code": dataset_code}
    dataset_doc = self.db[constants.COL_DATASETS].find_one(query)

    insee_data = INSEE_Data(dataset, dataset_doc=dataset_doc)
    dataset.series.data_iterator = insee_data

    return dataset.update_database()

def _process(self):
    for current_indicator in self.indicators:
        self.current_indicator = current_indicator

        #if not self.current_indicator["id"] == "CC.EST":
        #    continue

        count = 0

        if self.current_indicator["id"] in self.blacklist_indicator:
            continue

        is_release_controled = False
        is_rejected = False
        slug_indicator = slugify(self.current_indicator["id"], save_order=True)

        for current_country in self.countries_to_process:
            self.current_country = current_country
            logger.info("Fetching dataset[%s] - indicator[%s] - country[%s]" % (
                self.dataset_code, self.current_indicator["id"], self.current_country))

            release_date, datas = self._download_values(self.current_country,
                                                        self.current_indicator["id"])
            if not datas:
                continue

            self.release_date = clean_datetime(datetime.strptime(release_date, '%Y-%m-%d'))

            if is_release_controled is False:
                is_release_controled = True

                if self.dataset.metadata["indicators"].get(slug_indicator):
                    if self.release_date >= self.dataset.metadata["indicators"][slug_indicator]:
                        msg = "Reject series updated for provider[%s] - dataset[%s] - key[%s]"
                        logger.info(msg % (self.provider_name,
                                           self.dataset_code,
                                           self.current_indicator["id"]))
                        is_rejected = True
                        break

                self.dataset.metadata["indicators"][slug_indicator] = self.release_date

            count += 1
            yield {"datas": datas}, None

        if not is_rejected:
            logger.info("TOTAL - dataset[%s] - indicator[%s] - count[%s]" % (
                self.dataset_code, self.current_indicator["id"], count))
            if count == 0:
                logger.warning("EMPTY dataset[%s] - indicator[%s]" % (
                    self.dataset_code, self.current_indicator["id"]))

    yield None, None

def build_data_tree(self):
    """
    http://api.worldbank.org/v2/datacatalog?format=xml&per_page=20
    http://api.worldbank.org/v2/datacatalog/3?format=json&per_page=20

    > Still the catalog, but restricting the returned fields:
    http://api.worldbank.org/v2/datacatalog/metatypes/name;type;acronym?format=json&per_page=200
    http://api.worldbank.org/v2/datacatalog/metatypes/type;url;lastrevisiondate?format=json&per_page=50

    > Check whether numberofeconomies == number of series?
    > calendar: updatefrequency, updateschedule
    > use: detailpageurl for doc_href

    "datacatalog": [
        {
            "id": "3",
            "metatype": [
                {"id": "name", "value": "Global Economic Monitor"},
                {"id": "acronym", "value": "GEM"},
                {"id": "description", "value": "Providing...."},
                {"id": "url", "value": "http://databank.worldbank.org/data/views/variableselection/selectvariables.aspx?source=global-economic-monitor-(gem)"},
                {"id": "apisourceid", "value": "15"}   <- !!! links to the source id!
            ]
        },
    ]
    """
    categories = []
    position = 0

    for page in self.download_json('sources'):
        for source in page[1]:

            if source["id"] in self.blacklist:
                continue

            position += 1
            cat = {
                "provider_name": self.provider_name,
                "category_code": source["code"],
                "name": source["name"],
                #TODO: "doc_href": ?,
                "position": position,
                "datasets": [{
                    "name": source["name"],
                    "dataset_code": source["code"],
                    "last_update": None,
                    "metadata": {"id": source["id"]}
                }]
            }
            categories.append(cat)

    return categories

    # Everything below the return is unreachable: an alternative implementation
    # based on the /datacatalog endpoint, kept in the source for reference.
    """
    http://api.worldbank.org/v2/datacatalog?format=json&per_page=20

    FIXME: via the catalog, datasets are missing. It only lists:

    ADI           | Africa Development Indicators                         | 2013-02-22
    DB            | Doing Business                                        | 2015-11-24
    EdStats       | Education Statistics                                  | 2016-03-04
    GEM           | Global Economic Monitor                               | 2016-03-22
    GEP           | Global Economic Prospects                             | 2016-01-06
    GFDD          | Global Financial Development                          | 2015-09-14
    GPE           | GPE Results Forms Database                            | 2013-01-10
    Global Findex | Global Financial Inclusion (Global Findex) Database   | 2015-04-15
    IDA           | IDA Results Measurement System                        | 2015-12-30
    IDS           | International Debt Statistics                         | 2015-12-16
    JOBS          | Jobs                                                  | 2015-09-21
    MDGs          | Millennium Development Goals                          | 2015-11-16
    QEDS/GDDS     | Quarterly External Debt Statistics GDDS (New)         | 2016-01-28
    QEDS/SDDS     | Quarterly External Debt Statistics SDDS (New)         | 2016-01-28
    SE4ALL        | Sustainable Energy for All                            | 2015-09-09
    WDI           | World Development Indicators                          | 2016-02-17
    WGI           | Worldwide Governance Indicators                       | 2015-09-25
    """
    for page in self.download_json('datacatalog'):
        for source in page["datacatalog"]:
            name = None
            is_time_series = False
            dataset_id = None
            dataset_code = None
            doc_href = None
            last_update = None
            metadata = {}

            for value in source["metatype"]:
                if value["id"] == "type" and value["value"] == "Time series":
                    is_time_series = True
                elif value["id"] == "name":
                    name = value["value"]
                elif value["id"] == "acronym":
                    dataset_code = value["value"]
                elif value["id"] == "apisourceid":
                    metadata["id"] = value["value"]
                    dataset_id = value["value"]
                elif value["id"] == "detailpageurl":
                    doc_href = value["value"]
                elif value["id"] == "lastrevisiondate":
                    print("Date: ", value["value"])
                    if value["value"].lower() == "current":
                        last_update = clean_datetime()
                    else:
                        try:
                            last_update = clean_datetime(
                                datetime.strptime(value["value"], '%d-%b-%Y'))  # 17-Feb-2016
                        except ValueError:
                            pass
                elif value["id"] == "updatefrequency":
                    metadata["updatefrequency"] = value["value"]
                elif value["id"] == "updateschedule":
                    metadata["updateschedule"] = value["value"]

            if not dataset_id or is_time_series is False or not dataset_code or dataset_id in self.blacklist:
                continue

            position += 1
            cat = {
                "provider_name": self.provider_name,
                "category_code": dataset_code,
                "name": name,
                "doc_href": doc_href,
                "position": position,
                "datasets": [{
                    "dataset_code": dataset_code,
                    "name": name,
                    "last_update": last_update or clean_datetime(),
                    "metadata": metadata
                }]
            }
            categories.append(cat)

    return categories

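# Hedged sketch of what download_json('sources') presumably wraps (an
# assumption based on the World Bank API v2 URLs in the docstring above; the
# API returns a [paging_header, rows] pair per page):
import requests

resp = requests.get("http://api.worldbank.org/v2/sources",
                    params={"format": "json", "per_page": 100})
header, sources = resp.json()
print(sources[0]["id"], sources[0]["code"], sources[0]["name"])
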
def series_update(new_bson, old_bson=None, last_update=None):

    if not new_bson or not isinstance(new_bson, dict):
        raise ValueError("no new_bson or not dict instance")

    if old_bson and not isinstance(old_bson, dict):
        raise ValueError("old_bson is not dict instance")

    if "values" not in new_bson:
        raise ValueError("no values field in new_bson")

    if old_bson and "values" not in old_bson:
        raise ValueError("no values field in old_bson")

    if not isinstance(new_bson["values"][0], dict):
        raise ValueError("Invalid format for this series : %s" % new_bson)

    if new_bson["start_date"] > new_bson["end_date"]:
        raise errors.RejectInvalidSeries("Invalid dates. start_date > end_date",
                                         provider_name=new_bson["provider_name"],
                                         dataset_code=new_bson["dataset_code"],
                                         bson=new_bson)

    #FIXME:
    """
    if new_bson["frequency"] != "D" and len(new_bson["values"]) > 1:
        count_obs = (new_bson["end_date"] - new_bson["start_date"]) + 1
        if len(new_bson["values"]) != count_obs:
            msg = "Missing values for provider[%s] - dataset[%s] - current[%s] - attempt[%s]" % (
                new_bson["provider_name"], new_bson["dataset_code"],
                len(new_bson["values"]), count_obs)
            raise Exception(msg)
    """

    _last_update = None
    if new_bson.get('last_update'):
        _last_update = clean_datetime(new_bson.pop('last_update', None))
    else:
        _last_update = clean_datetime(last_update)
        new_bson.pop('last_update', None)

    #TODO: missing values should be replaced by a unique string: NaN

    series_set_release_date(new_bson, _last_update)

    if not old_bson:
        if not IS_SCHEMAS_VALIDATION_DISABLE:
            schemas.series_schema(new_bson)
        return new_bson

    changed = series_revisions(new_bson, old_bson, _last_update)
    if not changed:
        changed = series_is_changed(new_bson, old_bson)
    if not changed:
        return

    if not IS_SCHEMAS_VALIDATION_DISABLE:
        schemas.series_schema(new_bson)
    return new_bson

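# start_date / end_date in these series documents are pandas Period ordinals
# (integers), which is why (end_date - start_date) + 1 counts observations in
# the FIXME block above. Quick round-trip illustration:
import pandas

p = pandas.Period("2016-04", freq="M")
ordinal = p.ordinal                                   # integer stored in the document
assert pandas.Period(ordinal=ordinal, freq="M") == p  # ordinal + freq recover the period
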
def build_data_tree(self):
    """Builds the data tree"""

    download = Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml",
                          store_filepath=self.store_path,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()

    categories = []
    categories_keys = []

    it = etree.iterparse(filepath, events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

    def is_selected(parent_codes):
        """parent_codes is an array of category_code"""
        for _select in self.categories_filter:
            if _select in parent_codes:
                return True
        return False

    def get_category(category_code):
        for c in categories:
            if c["category_code"] == category_code:
                return c

    def create_categories(parent_codes, parent_titles, position):
        position += 1
        for i in range(len(parent_codes)):
            category_code = parent_codes.pop()
            name = parent_titles.pop()
            all_parents = parent_codes.copy()
            parent = None
            if all_parents:
                parent = all_parents[-1]
            if not category_code in categories_keys:
                _category = {
                    "provider_name": self.provider_name,
                    "category_code": category_code,
                    "name": name,
                    "position": position + i,
                    "parent": parent,
                    "all_parents": all_parents,
                    "datasets": [],
                    "doc_href": None,
                    "metadata": None
                }
                categories_keys.append(category_code)
                categories.append(_category)

    position = 0
    is_verify_creation_date = False

    for event, dataset in it:

        if is_verify_creation_date is False:
            _root = dataset.getroottree().getroot()
            creation_date_str = _root.attrib.get("creationDate")
            creation_date = clean_datetime(datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

            if self._is_updated_catalog(creation_date) is False:
                msg = "no update from eurostat catalog. current[%s] - db[%s]"
                logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                if not self.force_update:
                    return []

            is_verify_creation_date = True
            if not self.force_update:
                self.updated_catalog = True

        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)
        if not is_selected(parent_codes):
            continue

        parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                      namespaces=TABLE_OF_CONTENT_NSMAP)

        category_code = parent_codes[-1]
        create_categories(parent_codes, parent_titles, position)
        category = get_category(category_code)

        name = xpath_title(dataset)[0]
        last_update = xpath_ds_last_update(dataset)
        last_modified = xpath_ds_last_modified(dataset)
        doc_href = xpath_ds_metadata_html(dataset)
        data_start = xpath_ds_data_start(dataset)
        data_end = xpath_ds_data_end(dataset)
        values = xpath_ds_values(dataset)

        last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
        if last_modified:
            last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
            last_update = max(last_update, last_modified)

        dataset_code = xpath_code(dataset)[0]

        _dataset = {
            "dataset_code": dataset_code,
            "name": name,
            "last_update": clean_datetime(last_update),
            "metadata": {
                "doc_href": first_element_xpath(doc_href),
                "data_start": first_element_xpath(data_start),
                "data_end": first_element_xpath(data_end),
                "values": int(first_element_xpath(values, default="0")),
            }
        }
        category["datasets"].append(_dataset)

    self.for_delete.append(filepath)

    return categories

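# The iterparse call above streams <leaf> elements one at a time. A common
# companion (not used in the original) is clearing each element after
# processing so the parsed tree does not grow unboundedly; self-contained
# sketch with an inline document:
import io
from lxml import etree

xml = b'<root xmlns="urn:eu.europa.ec.eurostat.navtree"><leaf>a</leaf><leaf>b</leaf></root>'
for event, elem in etree.iterparse(io.BytesIO(xml), events=('end',),
                                   tag="{urn:eu.europa.ec.eurostat.navtree}leaf"):
    print(elem.text)  # process the leaf here
    elem.clear()      # release the subtree once handled
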
def _process(self):
    for current_indicator in self.indicators:
        self.current_indicator = current_indicator

        #if not self.current_indicator["id"] == "CC.EST":
        #    continue

        count = 0

        if self.current_indicator["id"] in self.blacklist_indicator:
            continue

        is_release_controled = False
        is_rejected = False
        slug_indicator = slugify(self.current_indicator["id"], save_order=True)

        for current_country in self.countries_to_process:
            self.current_country = current_country
            logger.info("Fetching dataset[%s] - indicator[%s] - country[%s]" % (
                self.dataset_code, self.current_indicator["id"], self.current_country))

            release_date, datas = self._download_values(self.current_country,
                                                        self.current_indicator["id"])
            if not datas:
                continue

            self.release_date = clean_datetime(datetime.strptime(release_date, '%Y-%m-%d'))

            if is_release_controled is False:
                is_release_controled = True

                if self.dataset.metadata["indicators"].get(slug_indicator):
                    if self.release_date >= self.dataset.metadata["indicators"][slug_indicator]:
                        msg = "Reject series updated for provider[%s] - dataset[%s] - key[%s]"
                        logger.info(msg % (self.provider_name,
                                           self.dataset_code,
                                           self.current_indicator["id"]))
                        is_rejected = True
                        break

                self.dataset.metadata["indicators"][slug_indicator] = self.release_date
                self.dataset.last_update = clean_datetime()

            count += 1
            yield {"datas": datas}, None

        if not is_rejected:
            logger.info("TOTAL - dataset[%s] - indicator[%s] - count[%s]" % (
                self.dataset_code, self.current_indicator["id"], count))
            if count == 0:
                logger.warning("EMPTY dataset[%s] - indicator[%s]" % (
                    self.dataset_code, self.current_indicator["id"]))

    yield None, None
