def _load_structure(self, force=False):

    if self._dataflows and not force:
        return

    self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.xml_sdmx)

    url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="dataflow.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._dataflows = self.xml_dsd.dataflows

    url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="categoryscheme.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._categoryschemes = self.xml_dsd.categories

    url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="categorisation.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._categorisations = self.xml_dsd.categorisations

    url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="conceptscheme.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._concepts = self.xml_dsd.concepts
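# The four blocks above repeat the same download / register-for-delete / process cycle and
# differ only in the SDMX resource name. A minimal refactoring sketch, assuming the
# Downloader/XMLStructure behaviour shown above; the helper name _fetch_structure is
# hypothetical and not part of the original code.

def _fetch_structure(self, resource):
    """Hypothetical helper: download one SDMX structure resource, register the
    downloaded file for cleanup, and feed it to the XML structure parser."""
    url = "http://www.bdm.insee.fr/series/sdmx/%s/%s" % (resource, self.provider_name)
    download = Downloader(url=url,
                          filename="%s.xml" % resource,
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)

# With such a helper, _load_structure would reduce to four calls, e.g.
#   self._fetch_structure("dataflow"); self._dataflows = self.xml_dsd.dataflows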
def _load(self):

    url = "http://sdw-wsrest.ecb.int/service/dataflow/ECB/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="dataflow-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dsd_id = self.xml_dsd.dsd_id
    if not self.dsd_id:
        msg = "DSD ID not found for provider[%s] - dataset[%s]" % (self.provider_name, self.dataset_code)
        raise Exception(msg)

    url = "http://sdw-wsrest.ecb.int/service/datastructure/ECB/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    url = "http://sdw-wsrest.ecb.int/service/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    # TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        logger.critical("unexpected HTTP error: %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _load_structure_datatree(self, force=False):

    if self._categoryschemes and self._categorisations and not force:
        return

    self._load_structure_dataflows(force)

    url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
    """
    if self.refresh_meta is False:
        self._categoryschemes = self._structure_get("categoryschemes")
        if self._categoryschemes:
            logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
    """
    if not self._categoryschemes:
        download = Downloader(url=url,
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories
        #self._structure_put("categoryschemes", url, **self._categoryschemes)

    url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
    """
    if self.refresh_meta is False:
        self._categorisations = self._structure_get("categorisation")
        if self._categorisations:
            self._categorisations_categories = self._structure_get("categorisations_categories")
            logger.info("load structure [categorisation] from metadata for url[%s]" % url)
    """
    if not self._categorisations:
        download = Downloader(url=url,
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations
        self._categorisations_categories = self.xml_dsd.categorisations_categories
def _load(self):

    download = Downloader(url=self.dataset_url,
                          filename="data-%s.zip" % self.dataset_code,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)

    filepaths = extract_zip_file(download.get_filepath())
    dsd_fp = filepaths[self.dataset_code + ".dsd.xml"]
    data_fp = filepaths[self.dataset_code + ".sdmx.xml"]
    self.fetcher.for_delete.append(dsd_fp)
    self.fetcher.for_delete.append(data_fp)

    self.xml_dsd.process(dsd_fp)
    self._set_dataset()

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            # TODO: frequencies_supported=FREQUENCIES_SUPPORTED
                            )

    self.rows = self.xml_data.process(data_fp)
def _load_structure_dataflows(self, force=False):

    if self._dataflows and not force:
        return

    self.provider_verify()

    url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

    if self.refresh_meta is False:
        self._dataflows = self._structure_get("dataflows")
        if self._dataflows:
            self.xml_dsd.dataflows = self._dataflows
            logger.info("load structure [dataflows] from metadata for url[%s]" % url)
            return

    download = Downloader(url=url,
                          filename="dataflow.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file,
                          client=self.requests_client)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._dataflows = self.xml_dsd.dataflows

    self._structure_put("dataflows", url, **self._dataflows)
def _load_datas(self):

    store_filepath = self.get_store_path()
    # TODO: timeout, replace
    download = Downloader(url=self.dataset_url,
                          filename=self.filename,
                          store_filepath=store_filepath)

    return download.get_filepath()
def weo_urls(self):

    download = Downloader(url='http://www.imf.org/external/ns/cs.aspx?id=28',
                          filename="weo.html",
                          store_filepath=self.store_path)

    filepath = download.get_filepath()
    with open(filepath, 'rb') as fp:
        webpage = fp.read()

    self.fetcher.for_delete.append(filepath)

    # TODO: replace with BeautifulSoup?
    html = etree.HTML(webpage)
    hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
    links = [href.values() for href in hrefs]

    # The last links of the WEO webpage lead to data we don't want to pull.
    links = links[:-16]
    # These are other links we don't want.
    links.pop(-8)
    links.pop(-10)

    links = [link[0][:-10] + 'download.aspx' for link in links]

    output = []
    for link in links:
        webpage = requests.get(link)
        html = etree.HTML(webpage.text)
        final_link = html.xpath("//div[@id = 'content']//table//a['href']")
        output.append(link[:-13] + final_link[0].values()[0])

    # we need to handle the issues in chronological order
    return sorted(output)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              store_filepath=self.store_path,
                              filename=self.filename,
                              use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # register the extracted file for cleanup as well
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.dimension_keys = self.dimension_keys
    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def get_data_directory(self):
    """Get directory content for one dataset

    Returns a directory dict
    """
    dirname = self.dataset_code

    download = Downloader(url=self.dataset_url,
                          filename="index.html",
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)

    with open(download.get_filepath()) as f:
        html = etree.HTML(f.read())

    directory = {}
    for br in html.xpath('.//br'):
        text = br.tail
        if not text:
            continue
        entry = text.strip().split()
        filename = br.getnext().text
        splitdate = entry[0].split('/')
        (hour, minute) = entry[1].split(':')
        if entry[2] == 'PM' and int(hour) < 12:
            hour = str(int(hour) + 12)
        directory[filename] = {
            'year': int(splitdate[2]),
            'month': int(splitdate[0]),
            'day': int(splitdate[1]),
            'hour': int(hour),
            'minute': int(minute),
        }
    return directory
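# A minimal usage sketch for the directory dict returned above: rebuild a datetime per file
# and compare it with a previously stored timestamp to decide whether to re-download.
# The names newer_entries and last_download are hypothetical; only the dict layout comes
# from get_data_directory(). datetime mirrors the import already used in this module.

from datetime import datetime

def newer_entries(directory, last_download):
    """Yield filenames whose server timestamp is more recent than last_download."""
    for filename, entry in directory.items():
        stamp = datetime(entry['year'], entry['month'], entry['day'],
                         entry['hour'], entry['minute'])
        if stamp > last_download:
            yield filename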
def get_dimension_data(self, filename, fmt):
    """Parses a code file for one dimension

    Returns a tuple of two dicts
    """
    download = Downloader(url=os.path.join(self.dataset_url, filename),
                          filename=filename,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()

    entries1 = {}
    entries2 = {}
    with open(filepath) as source_file:
        data = csv.reader(source_file, delimiter='\t')
        fields = next(data)
        for row in data:
            if len(row) == 0:
                continue
            if fmt == 1:
                entries1[row[0]] = row[1]
            elif fmt == 2:
                entries1[row[1]] = row[2]
            elif fmt == 3:
                entries1[row[0]] = row[2]
                entries2[row[1]] = row[1]
            elif fmt == 4:
                entries1[row[0]] = row[0]
                entries2[row[1]] = row[3]
            else:
                raise Exception("fmt {} doesn't exist".format(fmt))

    return (entries1, entries2)
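# The fmt codes above are compact but cryptic. A small standalone sketch of what each
# variant extracts from one tab-separated row; the helper name describe_fmt and the sample
# values are hypothetical, only the column indexing mirrors get_dimension_data().

def describe_fmt(row, fmt):
    """Return the (entries1, entries2) pairs one row would produce for a given fmt."""
    if fmt == 1:
        return {row[0]: row[1]}, {}
    if fmt == 2:
        return {row[1]: row[2]}, {}
    if fmt == 3:
        return {row[0]: row[2]}, {row[1]: row[1]}
    if fmt == 4:
        return {row[0]: row[0]}, {row[1]: row[3]}
    raise ValueError("fmt %s doesn't exist" % fmt)

# e.g. describe_fmt(["US", "USD", "United States", "Dollar"], 3)
# -> ({'US': 'United States'}, {'USD': 'USD'})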
def _load_xls(self):

    url_xls = make_xls_url(self.dataset_code)
    download = Downloader(url=url_xls,
                          filename=self.dataset_code + '_info.xls',
                          store_filepath=self.get_store_path(),
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()
    return filepath
def _load_datas(self):
    """Return the 2 filepaths (dsd and data) extracted from the zip archive"""

    store_filepath = self.get_store_path()
    download = Downloader(url=self.dataset_url,
                          filename=self.filename,
                          store_filepath=store_filepath)

    return extract_zip_file(download.get_filepath())
def _load_datas(self):

    # TODO: timeout, replace
    download = Downloader(url=self.dataset_url,
                          filename=self.dataset_code,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)
    return filepath
def _get_agenda(self):

    download = Downloader(url=AGENDA['url'],
                          filename=AGENDA['filename'],
                          store_filepath=self.store_path)
    filepath = download.get_filepath()

    with open(filepath, 'rb') as fp:
        content = fp.read()

    self.for_delete.append(filepath)

    return content
def get_series_filepath(self):
    """Download the series file for a dataset

    Returns its local filepath
    """
    filename = self.dataset_code + '.series'
    download = Downloader(url=os.path.join(self.dataset_url, filename),
                          filename=filename,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    return download.get_filepath()
def _load_datas(self):

    filepath = list()

    if self.dataset_url:
        download = Downloader(url=self.dataset_url,
                              filename=self.dataset_code,
                              store_filepath=self.get_store_path(),
                              use_existing_file=self.fetcher.use_existing_file)
        filepath.append(download.get_filepath())
    else:
        _filter_name, _filter_codes = get_filter(self.dataset_code)
        for code in _filter_codes:
            url = "http://webstat.banque-france.fr/en/export.do?node=DATASETS_%s&%s=%s&exportType=sdmx" % (self.dataset_code, _filter_name, code)
            name = self.dataset_code + "_%s" % code
            download = Downloader(url=url,
                                  filename=name,
                                  store_filepath=self.get_store_path(),
                                  use_existing_file=self.fetcher.use_existing_file)
            filepath.append(download.get_filepath())

    return filepath
def _load_dsd(self):

    url = self._get_url_dsd()
    download = Downloader(store_filepath=self.store_path,
                          url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._set_dataset()
def _load(self):

    download = Downloader(url=self.url,
                          filename="data-%s.xml" % self.dataset_code,
                          #headers=SDMX_DATA_HEADERS
                          )

    data_fp, dsd_fp = extract_zip_file(download.get_filepath())

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            #dimension_keys=self.xml_dsd.dimension_keys
                            )

    self.rows = self.xml_data.process(data_fp)
def _load_dsd_by_element(self):
    # FIXME: codelists and concepts missing?
    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
    download = Downloader(url=url,
                          filename="datastructure-%s.xml" % self.dsd_id,
                          headers=SDMX_METADATA_HEADERS,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._set_dataset()
def iter_row(self, url, filename, store_path, use_existing_file):

    download = Downloader(url=url,
                          filename=filename,
                          store_filepath=store_path,
                          use_existing_file=use_existing_file)
    filepath = download.get_filepath()

    with open(filepath) as source_file:
        data = csv.reader(source_file, delimiter='\t')
        fields = [f.strip() for f in next(data)]

        # for pc dataset
        if 'footnotes' in fields:
            i = fields.index('footnotes')
            fields[i] = 'footnote_codes'

        # check that data are in the right order
        assert fields == ['series_id', 'year', 'period', 'value', 'footnote_codes']

        for row in data:
            yield [elem.strip() for elem in row]
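# A minimal consumption sketch for the generator above. The helper name
# count_values_per_series is hypothetical; the column order is the one asserted in
# iter_row() (series_id, year, period, value, footnote_codes).

def count_values_per_series(rows):
    """Count observations per series_id from iter_row() output."""
    counts = {}
    for series_id, year, period, value, footnote_codes in rows:
        counts[series_id] = counts.get(series_id, 0) + 1
    return counts

# e.g. counts = count_values_per_series(obj.iter_row(url, filename, store_path, True))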
def _process(self):
    for url in self.urls:

        # TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        date_str = match(r".*WEO(\w{7})", url).groups()[0]    # Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")    # 2006-09-01 00:00:00

        if not self._is_updated():
            msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
            logger.info(msg % (self.dataset_code, self.release_date))
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]" % url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:

            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            self.years = self.sheet.fieldnames[8:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)

            for row in self.sheet:
                if not row or not row.get('Country Group Name'):
                    break
                yield row, None

    yield None, None
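# A small standalone sketch of the release-date extraction used above: the WEO file name
# embeds month and year, e.g. ".../WEOSep2006all.xls" -> "Sep2006" -> 2006-09-01. The
# sample URL follows the pattern given in the comment above; the imports mirror the ones
# the fetcher already relies on.

from datetime import datetime
from re import match

url = "http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls"
date_str = match(r".*WEO(\w{7})", url).groups()[0]        # "Sep2006"
release_date = datetime.strptime(date_str, "%b%Y")        # datetime(2006, 9, 1)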
def _process(self):
    for url in self.urls:

        # TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        date_str = match(r".*WEO(\w{7})", url).groups()[0]    # Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")    # 2006-09-01 00:00:00

        if not self.is_updated():
            msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
            logger.info(msg % (self.dataset_code, self.release_date))
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]" % url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:

            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            self.years = self.sheet.fieldnames[9:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)

            for row in self.sheet:
                if not row or not row.get('Country'):
                    break
                yield row, None

        #self.dataset.update_database(save_only=True)

    yield None, None
def download_all_sources():
    """Download all dataset files (if they do not already exist) and store them in a local temp directory

    Store in /[TMP_DIR]/[PROVIDER_NAME]/[DATASET_CODE]/[FILENAME]

    Return a dict where each key is a filename and each value is the full filepath
    """

    filepaths = {}

    for dataset_code, dataset in DATASETS.items():
        store_filepath = os.path.abspath(os.path.join(tempfile.gettempdir(), PROVIDER_NAME, dataset_code))
        download = Downloader(url=dataset["url"],
                              filename=dataset["filename"],
                              store_filepath=store_filepath)  # TODO: timeout, replace
        filepaths[dataset["filename"]] = os.path.abspath(os.path.join(store_filepath, dataset["filename"]))
        logger.info("Download file[%s]" % download.get_filepath())

    return filepaths
def parse_agenda(self):
    # TODO: use Downloader
    download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                          filename="statscall.html")
    with open(download.get_filepath(), 'rb') as fp:
        agenda = lxml.html.parse(fp)

    regex_date = re.compile("Reference period: (.*)")
    regex_dataset = re.compile(r".*Dataset: (.*)\)")

    entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                           '//div[@class="ecb-faytdd"]/*/dd')[2:]
    entries = zip(entries[::2], entries[1::2])

    for entry in entries:
        item = {}
        match_key = regex_dataset.match(entry[1][0].text_content())
        item['dataflow_key'] = match_key.groups()[0]
        match_date = regex_date.match(entry[1][1].text_content())
        item['reference_period'] = match_date.groups()[0]
        item['scheduled_date'] = entry[0].text_content().replace('\n', '')
        yield item
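# A minimal consumption sketch for the agenda generator above; it relies only on the item
# keys yielded by parse_agenda() ("dataflow_key", "reference_period", "scheduled_date").
# The fetcher variable is a placeholder for an instance exposing parse_agenda().

for item in fetcher.parse_agenda():
    logger.info("dataflow[%s] period[%s] scheduled[%s]" % (
        item["dataflow_key"], item["reference_period"], item["scheduled_date"]))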
def _get_sheet(self, url, filename, sheet_name):

    if url in self._current_urls:
        filepath = self._current_urls[url]
    else:
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self._current_urls[url] = filepath

    zipfile_ = zipfile.ZipFile(filepath)
    section = zipfile_.namelist()[0]
    file_contents = zipfile_.read(section)
    excel_book = xlrd.open_workbook(file_contents=file_contents)

    return excel_book.sheet_by_name(sheet_name)
def _load(self):

    self.dsd_id = self.dataset_code

    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    # TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        logger.critical("unexpected HTTP error: %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _get_sheet(self, url, filename, sheet_name):

    if url in self._current_urls:
        filepath = self._current_urls[url]
    else:
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

    zipfile_ = zipfile.ZipFile(filepath)
    section = zipfile_.namelist()[0]
    file_contents = zipfile_.read(section)
    excel_book = xlrd.open_workbook(file_contents=file_contents)

    return excel_book.sheet_by_name(sheet_name)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        store_filepath = self.get_store_path()
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              filename=self.filename,
                              store_filepath=store_filepath)
        filepath = extract_zip_file(download.get_filepath())
        kwargs["filepath"] = filepath
    else:
        kwargs["fileobj"] = io.StringIO(datas, newline="\n")

    kwargs["date_format"] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs["headers_line"] = DATASETS[self.dataset.dataset_code]["lines"]["headers"]

    self.rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.last_update = self.release_date

    self.start_date = pandas.Period(self.periods[0], freq=self.frequency)
    self.end_date = pandas.Period(self.periods[-1], freq=self.frequency)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              store_filepath=self.store_path,
                              filename=self.filename,
                              use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # register the extracted file for cleanup as well
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.dimension_keys = self.dimension_keys

    # TODO: if "frequency" in self.dataset.dimension_keys:
    #    self.dataset.set_dimension_frequency("frequency")

    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def build_data_tree(self):
    """Builds the data tree
    """
    download = Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml",
                          store_filepath=self.store_path,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()

    categories = []
    categories_keys = []

    it = etree.iterparse(filepath,
                         events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

    def is_selected(parent_codes):
        """parent_codes is an array of category_code"""
        for _select in self.categories_filter:
            if _select in parent_codes:
                return True
        return False

    def get_category(category_code):
        for c in categories:
            if c["category_code"] == category_code:
                return c

    def create_categories(parent_codes, parent_titles, position):
        position += 1
        for i in range(len(parent_codes)):
            category_code = parent_codes.pop()
            name = parent_titles.pop()
            all_parents = parent_codes.copy()
            parent = None
            if all_parents:
                parent = all_parents[-1]
            if not category_code in categories_keys:
                _category = {
                    "provider_name": self.provider_name,
                    "category_code": category_code,
                    "name": name,
                    "position": position + i,
                    "parent": parent,
                    "all_parents": all_parents,
                    "datasets": [],
                    "doc_href": None,
                    "metadata": None,
                }
                categories_keys.append(category_code)
                categories.append(_category)

    position = 0
    is_verify_creation_date = False

    for event, dataset in it:

        if is_verify_creation_date is False:
            _root = dataset.getroottree().getroot()
            creation_date_str = _root.attrib.get("creationDate")
            creation_date = clean_datetime(datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

            if self._is_updated_catalog(creation_date) is False:
                msg = "no update from eurostat catalog. current[%s] - db[%s]"
                logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                if not self.force_update:
                    return []

            is_verify_creation_date = True
            if not self.force_update:
                self.updated_catalog = True

        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)

        if not is_selected(parent_codes):
            continue

        parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                      namespaces=TABLE_OF_CONTENT_NSMAP)

        category_code = parent_codes[-1]
        create_categories(parent_codes, parent_titles, position)

        category = get_category(category_code)

        name = xpath_title(dataset)[0]
        last_update = xpath_ds_last_update(dataset)
        last_modified = xpath_ds_last_modified(dataset)
        doc_href = xpath_ds_metadata_html(dataset)
        data_start = xpath_ds_data_start(dataset)
        data_end = xpath_ds_data_end(dataset)
        values = xpath_ds_values(dataset)

        last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
        if last_modified:
            last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
            last_update = max(last_update, last_modified)

        dataset_code = xpath_code(dataset)[0]

        _dataset = {
            "dataset_code": dataset_code,
            "name": name,
            "last_update": clean_datetime(last_update),
            "metadata": {
                "doc_href": first_element_xpath(doc_href),
                "data_start": first_element_xpath(data_start),
                "data_end": first_element_xpath(data_end),
                "values": int(first_element_xpath(values, default="0")),
            },
        }

        category["datasets"].append(_dataset)

    self.for_delete.append(filepath)

    return categories
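# A minimal sketch of how the returned tree can be walked, using only the keys built above
# ("category_code", "datasets", "dataset_code", "last_update"); the fetcher variable is a
# placeholder for an instance exposing build_data_tree().

categories = fetcher.build_data_tree()
for category in categories:
    for dataset in category["datasets"]:
        logger.info("category[%s] dataset[%s] last_update[%s]" % (
            category["category_code"],
            dataset["dataset_code"],
            dataset["last_update"]))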
def build_data_tree(self):

    categories = []

    for category_code, values in CATEGORIES.items():
        if "url" in values:
            continue
        cat = {
            "category_code": category_code,
            "name": values["name"],
            "parent": values["parent"],
            "all_parents": values["all_parents"],
            "doc_href": values["doc_href"],
            "datasets": []
        }
        categories.append(cat)

    for category_code, category in CATEGORIES.items():

        if not "url" in category:
            continue

        url = category["url"]
        #filename = category["filename"]
        filename = "%s.xls.zip" % category_code

        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

        try:
            zipfile_ = zipfile.ZipFile(filepath)
        except Exception as err:
            logger.error("bea zip error - url[%s] - filepath[%s] - error[%s]" % (url, filepath, str(err)))
            continue

        for section in zipfile_.namelist():

            if section in ['Iip_PrevT3a.xls', 'Iip_PrevT3b.xls', 'Iip_PrevT3c.xls']:
                continue

            file_contents = zipfile_.read(section)
            excel_book = xlrd.open_workbook(file_contents=file_contents)

            try:
                sheet = excel_book.sheet_by_name('Contents')

                cat = {
                    "category_code": category_code,
                    "name": category["name"],
                    "parent": category.get("parent"),
                    "all_parents": category.get("all_parents"),
                    "doc_href": None,
                    "datasets": []
                }

                dataset_base_names = {}

                first_line = 0
                for i, cell in enumerate(sheet.col(1)):
                    if "Code" in cell.value:
                        first_line = i + 2
                        break

                for i, cell in enumerate(sheet.col(1)):
                    if i < first_line:
                        continue
                    cell_row = sheet.row(i)
                    if cell_row[1].value != '':
                        dataset_code = cell_row[1].value
                        dataset_name = cell_row[2].value
                        dataset_base_names[dataset_code] = dataset_name

                for sheet_name in excel_book.sheet_names():

                    _dataset_code = sheet_name.split()[0]
                    if not _dataset_code in dataset_base_names:
                        continue

                    _dataset_name = dataset_base_names[_dataset_code]

                    frequency_name, frequency_code = _get_frequency(sheet_name)
                    if not frequency_name:
                        msg = "not frequency name for sheet[%s] - url[%s] - filename[%s]" % (sheet_name, url, filename)
                        logger.critical(msg)
                        raise Exception(msg)

                    dataset_code = "%s-%s-%s" % (category_code, _dataset_code, frequency_code.lower())
                    dataset_name = "%s - %s" % (_dataset_name, frequency_name)

                    cat["datasets"].append({
                        "name": dataset_name,
                        "dataset_code": dataset_code,
                        "last_update": self._get_release_date(url, excel_book.sheet_by_name(sheet_name)),
                        "metadata": {
                            "url": url,
                            "filename": filename,
                            "sheet_name": sheet_name
                        }
                    })

                categories.append(cat)

            except Exception as err:
                logger.error(str(err))

    return categories
def parse_sna_agenda(self):
    # TODO: use Downloader
    download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                          filename="agenda_sna.html")
    with open(download.get_filepath(), 'rb') as fp:
        agenda = lxml.html.parse(fp)
def build_data_tree(self):
    """Builds the data tree
    """
    download = Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml",
                          store_filepath=self.store_path,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()

    categories = []
    categories_keys = []

    it = etree.iterparse(filepath,
                         events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

    def is_selected(parent_codes):
        """parent_codes is an array of category_code"""
        for _select in self.categories_filter:
            if _select in parent_codes:
                return True
        return False

    def get_category(category_code):
        for c in categories:
            if c["category_code"] == category_code:
                return c

    # TODO: store the TOC date in the provider!
    def create_categories(parent_codes, parent_titles, position):
        position += 1
        for i in range(len(parent_codes)):
            category_code = parent_codes.pop()
            name = parent_titles.pop()
            all_parents = parent_codes.copy()
            parent = None
            if all_parents:
                parent = all_parents[-1]
            if not category_code in categories_keys:
                _category = {
                    "provider_name": self.provider_name,
                    "category_code": category_code,
                    "name": name,
                    "position": position + i,
                    "parent": parent,
                    "all_parents": all_parents,
                    "datasets": [],
                    "doc_href": None,
                    "metadata": None
                }
                categories_keys.append(category_code)
                categories.append(_category)

    # .getroottree().creationDate="20160225T1102"

    position = 0
    for event, dataset in it:

        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)

        if not is_selected(parent_codes):
            continue

        parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                      namespaces=TABLE_OF_CONTENT_NSMAP)

        category_code = parent_codes[-1]
        create_categories(parent_codes, parent_titles, position)

        category = get_category(category_code)

        name = xpath_title(dataset)[0]
        last_update = xpath_ds_last_update(dataset)
        last_modified = xpath_ds_last_modified(dataset)
        doc_href = xpath_ds_metadata_html(dataset)
        data_start = xpath_ds_data_start(dataset)
        data_end = xpath_ds_data_end(dataset)
        values = xpath_ds_values(dataset)

        last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
        if last_modified:
            last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
            last_update = max(last_update, last_modified)

        dataset_code = xpath_code(dataset)[0]

        _dataset = {
            "dataset_code": dataset_code,
            "name": name,
            "last_update": last_update,
            "metadata": {
                "doc_href": first_element_xpath(doc_href),
                "data_start": first_element_xpath(data_start),
                "data_end": first_element_xpath(data_end),
                "values": int(first_element_xpath(values, default="0")),
            }
        }

        category["datasets"].append(_dataset)

    self.for_delete.append(filepath)

    return categories
def get_agenda():
    download = Downloader(url=AGENDA["url"],
                          filename=AGENDA["filename"])
    with open(download.get_filepath(), "rb") as fp:
        return fp.read()
def build_data_tree(self):

    categories = []

    for category_code, values in sorted(CATEGORIES.items()):
        if "url" in values:
            continue
        cat = {
            "category_code": category_code,
            "name": values["name"],
            "parent": values["parent"],
            "all_parents": values["all_parents"],
            "doc_href": values["doc_href"],
            "datasets": []
        }
        categories.append(cat)

    for category_code, category in sorted(CATEGORIES.items()):

        if not "url" in category:
            continue

        url = category["url"]
        #filename = category["filename"]
        filename = "%s.xls.zip" % category_code

        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

        try:
            zipfile_ = zipfile.ZipFile(filepath)
        except Exception as err:
            logger.error("bea zip error - url[%s] - filepath[%s] - error[%s]" % (url, filepath, str(err)))
            continue

        for section in zipfile_.namelist():

            if section in ['Iip_PrevT3a.xls', 'Iip_PrevT3b.xls', 'Iip_PrevT3c.xls']:
                continue

            file_contents = zipfile_.read(section)
            excel_book = xlrd.open_workbook(file_contents=file_contents)

            try:
                sheet = excel_book.sheet_by_name('Contents')

                cat = {
                    "category_code": category_code,
                    "name": category["name"],
                    "parent": category.get("parent"),
                    "all_parents": category.get("all_parents"),
                    "doc_href": None,
                    "datasets": []
                }

                dataset_base_names = {}

                first_line = 0
                for i, cell in enumerate(sheet.col(1)):
                    if "Code" in cell.value:
                        first_line = i + 2
                        break

                for i, cell in enumerate(sheet.col(1)):
                    if i < first_line:
                        continue
                    cell_row = sheet.row(i)
                    if cell_row[1].value != '':
                        dataset_code = cell_row[1].value
                        dataset_name = cell_row[2].value
                        dataset_base_names[dataset_code] = dataset_name

                for sheet_name in excel_book.sheet_names():

                    _dataset_code = sheet_name.split()[0]
                    if not _dataset_code in dataset_base_names:
                        continue

                    _dataset_name = dataset_base_names[_dataset_code]

                    frequency_name, frequency_code = _get_frequency(sheet_name)
                    if not frequency_name:
                        msg = "not frequency name for sheet[%s] - url[%s] - filename[%s]" % (sheet_name, url, filename)
                        logger.critical(msg)
                        raise Exception(msg)

                    dataset_code = "%s-%s-%s" % (category_code, _dataset_code, frequency_code.lower())
                    dataset_name = "%s - %s" % (_dataset_name, frequency_name)

                    cat["datasets"].append({
                        "name": dataset_name,
                        "dataset_code": dataset_code,
                        "last_update": self._get_release_date(url, excel_book.sheet_by_name(sheet_name)),
                        "metadata": {
                            "url": url,
                            "filename": filename,
                            "sheet_name": sheet_name
                        }
                    })

                categories.append(cat)

            except Exception as err:
                logger.error(str(err))

    return categories