def _load_structure(self, force=False):

    if self._dataflows and not force:
        return

    self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
    self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                sdmx_client=self.xml_sdmx)

    url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="dataflow.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._dataflows = self.xml_dsd.dataflows

    url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="categoryscheme.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._categoryschemes = self.xml_dsd.categories

    url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="categorisation.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._categorisations = self.xml_dsd.categorisations

    url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
    download = Downloader(url=url,
                          filename="conceptscheme.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._concepts = self.xml_dsd.concepts
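# The four blocks above repeat the same download / register-for-delete / process cycle and
# differ only in the SDMX resource name. A minimal refactoring sketch, assuming the
# Downloader/XMLStructure behaviour shown above; the helper name _fetch_structure is
# hypothetical and not part of the original code.

def _fetch_structure(self, resource):
    """Hypothetical helper: download one SDMX structure resource, register the
    downloaded file for cleanup, and feed it to the XML structure parser."""
    url = "http://www.bdm.insee.fr/series/sdmx/%s/%s" % (resource, self.provider_name)
    download = Downloader(url=url,
                          filename="%s.xml" % resource,
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)
    self.xml_dsd.process(filepath)

# With such a helper, _load_structure would reduce to four calls, e.g.
#   self._fetch_structure("dataflow"); self._dataflows = self.xml_dsd.dataflows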
def _load(self):

    url = "http://sdw-wsrest.ecb.int/service/dataflow/ECB/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="dataflow-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dsd_id = self.xml_dsd.dsd_id
    if not self.dsd_id:
        msg = "DSD ID not found for provider[%s] - dataset[%s]" % (self.provider_name, self.dataset_code)
        raise Exception(msg)

    url = "http://sdw-wsrest.ecb.int/service/datastructure/ECB/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    url = "http://sdw-wsrest.ecb.int/service/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    # TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        logger.critical("unexpected HTTP error: %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _load_structure_datatree(self, force=False):

    if self._categoryschemes and self._categorisations and not force:
        return

    self._load_structure_dataflows(force)

    url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
    """
    if self.refresh_meta is False:
        self._categoryschemes = self._structure_get("categoryschemes")
        if self._categoryschemes:
            logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
    """
    if not self._categoryschemes:
        download = Downloader(url=url,
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories
        #self._structure_put("categoryschemes", url, **self._categoryschemes)

    url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
    """
    if self.refresh_meta is False:
        self._categorisations = self._structure_get("categorisation")
        if self._categorisations:
            self._categorisations_categories = self._structure_get("categorisations_categories")
            logger.info("load structure [categorisation] from metadata for url[%s]" % url)
    """
    if not self._categorisations:
        download = Downloader(url=url,
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations
        self._categorisations_categories = self.xml_dsd.categorisations_categories
def _load(self):

    download = Downloader(url=self.dataset_url,
                          filename="data-%s.zip" % self.dataset_code,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)

    filepaths = extract_zip_file(download.get_filepath())
    dsd_fp = filepaths[self.dataset_code + ".dsd.xml"]
    data_fp = filepaths[self.dataset_code + ".sdmx.xml"]
    self.fetcher.for_delete.append(dsd_fp)
    self.fetcher.for_delete.append(data_fp)

    self.xml_dsd.process(dsd_fp)
    self._set_dataset()

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            xml_dsd=self.xml_dsd,
                            dsd_id=self.dataset_code,
                            # TODO: frequencies_supported=FREQUENCIES_SUPPORTED
                            )

    self.rows = self.xml_data.process(data_fp)
def _load_structure_dataflows(self, force=False):

    if self._dataflows and not force:
        return

    self.provider_verify()

    url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

    if self.refresh_meta is False:
        self._dataflows = self._structure_get("dataflows")
        if self._dataflows:
            self.xml_dsd.dataflows = self._dataflows
            logger.info("load structure [dataflows] from metadata for url[%s]" % url)
            return

    download = Downloader(url=url,
                          filename="dataflow.xml",
                          store_filepath=self.store_path,
                          headers=SDMX_METADATA_HEADERS,
                          use_existing_file=self.use_existing_file,
                          client=self.requests_client)
    filepath = download.get_filepath()
    self.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._dataflows = self.xml_dsd.dataflows

    self._structure_put("dataflows", url, **self._dataflows)
def _load_datas(self):

    store_filepath = self.get_store_path()
    # TODO: timeout, replace
    download = Downloader(url=self.dataset_url,
                          filename=self.filename,
                          store_filepath=store_filepath)

    return download.get_filepath()
def weo_urls(self):

    download = Downloader(url='http://www.imf.org/external/ns/cs.aspx?id=28',
                          filename="weo.html",
                          store_filepath=self.store_path)

    filepath = download.get_filepath()
    with open(filepath, 'rb') as fp:
        webpage = fp.read()

    self.fetcher.for_delete.append(filepath)

    # TODO: replace with BeautifulSoup?
    html = etree.HTML(webpage)
    hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
    links = [href.values() for href in hrefs]

    # The last links of the WEO webpage lead to data we don't want to pull.
    links = links[:-16]
    # These are other links we don't want.
    links.pop(-8)
    links.pop(-10)

    links = [link[0][:-10] + 'download.aspx' for link in links]

    output = []
    for link in links:
        webpage = requests.get(link)
        html = etree.HTML(webpage.text)
        final_link = html.xpath("//div[@id = 'content']//table//a['href']")
        output.append(link[:-13] + final_link[0].values()[0])

    # we need to handle the issues in chronological order
    return sorted(output)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              store_filepath=self.store_path,
                              filename=self.filename,
                              use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # register the extracted file for cleanup as well
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.dimension_keys = self.dimension_keys
    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def get_data_directory(self):
    """Get directory content for one dataset

    Returns a directory dict
    """
    dirname = self.dataset_code

    download = Downloader(url=self.dataset_url,
                          filename="index.html",
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)

    with open(download.get_filepath()) as f:
        html = etree.HTML(f.read())

    directory = {}
    for br in html.xpath('.//br'):
        text = br.tail
        if not text:
            continue
        entry = text.strip().split()
        filename = br.getnext().text
        splitdate = entry[0].split('/')
        (hour, minute) = entry[1].split(':')
        if entry[2] == 'PM' and int(hour) < 12:
            hour = str(int(hour) + 12)
        directory[filename] = {
            'year': int(splitdate[2]),
            'month': int(splitdate[0]),
            'day': int(splitdate[1]),
            'hour': int(hour),
            'minute': int(minute),
        }
    return directory
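# A minimal usage sketch for the directory dict returned above: rebuild a datetime per file
# and compare it with a previously stored timestamp to decide whether to re-download.
# The names newer_entries and last_download are hypothetical; only the dict layout comes
# from get_data_directory(). datetime mirrors the import already used in this module.

from datetime import datetime

def newer_entries(directory, last_download):
    """Yield filenames whose server timestamp is more recent than last_download."""
    for filename, entry in directory.items():
        stamp = datetime(entry['year'], entry['month'], entry['day'],
                         entry['hour'], entry['minute'])
        if stamp > last_download:
            yield filename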
def get_dimension_data(self, filename, fmt):
    """Parses a code file for one dimension

    Returns a tuple of two dicts
    """
    download = Downloader(url=os.path.join(self.dataset_url, filename),
                          filename=filename,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()

    entries1 = {}
    entries2 = {}
    with open(filepath) as source_file:
        data = csv.reader(source_file, delimiter='\t')
        fields = next(data)
        for row in data:
            if len(row) == 0:
                continue
            if fmt == 1:
                entries1[row[0]] = row[1]
            elif fmt == 2:
                entries1[row[1]] = row[2]
            elif fmt == 3:
                entries1[row[0]] = row[2]
                entries2[row[1]] = row[1]
            elif fmt == 4:
                entries1[row[0]] = row[0]
                entries2[row[1]] = row[3]
            else:
                raise Exception("fmt {} doesn't exist".format(fmt))

    return (entries1, entries2)
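# The fmt codes above are compact but cryptic. A small standalone sketch of what each
# variant extracts from one tab-separated row; the helper name describe_fmt and the sample
# values are hypothetical, only the column indexing mirrors get_dimension_data().

def describe_fmt(row, fmt):
    """Return the (entries1, entries2) pairs one row would produce for a given fmt."""
    if fmt == 1:
        return {row[0]: row[1]}, {}
    if fmt == 2:
        return {row[1]: row[2]}, {}
    if fmt == 3:
        return {row[0]: row[2]}, {row[1]: row[1]}
    if fmt == 4:
        return {row[0]: row[0]}, {row[1]: row[3]}
    raise ValueError("fmt %s doesn't exist" % fmt)

# e.g. describe_fmt(["US", "USD", "United States", "Dollar"], 3)
# -> ({'US': 'United States'}, {'USD': 'USD'})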
def _load_xls(self):

    url_xls = make_xls_url(self.dataset_code)
    download = Downloader(url=url_xls,
                          filename=self.dataset_code + '_info.xls',
                          store_filepath=self.get_store_path(),
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()
    return filepath
def _load_datas(self):
    """Return the 2 filepaths (dsd and data) extracted from the zip archive"""

    store_filepath = self.get_store_path()
    download = Downloader(url=self.dataset_url,
                          filename=self.filename,
                          store_filepath=store_filepath)

    return extract_zip_file(download.get_filepath())
def _load_datas(self):

    # TODO: timeout, replace
    download = Downloader(url=self.dataset_url,
                          filename=self.dataset_code,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)
    return filepath
def _get_agenda(self):

    download = Downloader(url=AGENDA['url'],
                          filename=AGENDA['filename'],
                          store_filepath=self.store_path)
    filepath = download.get_filepath()

    with open(filepath, 'rb') as fp:
        content = fp.read()

    self.for_delete.append(filepath)

    return content
def get_series_filepath(self):
    """Download the series file for a dataset

    Returns its local filepath
    """
    filename = self.dataset_code + '.series'
    download = Downloader(url=os.path.join(self.dataset_url, filename),
                          filename=filename,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file)
    return download.get_filepath()
def _load_datas(self):

    filepath = list()

    if self.dataset_url:
        download = Downloader(url=self.dataset_url,
                              filename=self.dataset_code,
                              store_filepath=self.get_store_path(),
                              use_existing_file=self.fetcher.use_existing_file)
        filepath.append(download.get_filepath())
    else:
        _filter_name, _filter_codes = get_filter(self.dataset_code)
        for code in _filter_codes:
            url = "http://webstat.banque-france.fr/en/export.do?node=DATASETS_%s&%s=%s&exportType=sdmx" % (self.dataset_code, _filter_name, code)
            name = self.dataset_code + "_%s" % code
            download = Downloader(url=url,
                                  filename=name,
                                  store_filepath=self.get_store_path(),
                                  use_existing_file=self.fetcher.use_existing_file)
            filepath.append(download.get_filepath())

    return filepath
def _load_dsd(self):

    url = self._get_url_dsd()
    download = Downloader(store_filepath=self.store_path,
                          url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)

    self.xml_dsd.process(filepath)
    self._set_dataset()
def _load(self):

    download = Downloader(url=self.url,
                          filename="data-%s.xml" % self.dataset_code,
                          #headers=SDMX_DATA_HEADERS
                          )

    data_fp, dsd_fp = extract_zip_file(download.get_filepath())

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            #dimension_keys=self.xml_dsd.dimension_keys
                            )

    self.rows = self.xml_data.process(data_fp)
def _load_dsd_by_element(self):
    # FIXME: codelists and concepts missing?
    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
    download = Downloader(url=url,
                          filename="datastructure-%s.xml" % self.dsd_id,
                          headers=SDMX_METADATA_HEADERS,
                          store_filepath=self.store_path,
                          use_existing_file=self.fetcher.use_existing_file,
                          client=self.fetcher.requests_client)
    filepath = download.get_filepath()
    self.fetcher.for_delete.append(filepath)
    self.xml_dsd.process(filepath)
    self._set_dataset()
def iter_row(self, url, filename, store_path, use_existing_file):

    download = Downloader(url=url,
                          filename=filename,
                          store_filepath=store_path,
                          use_existing_file=use_existing_file)
    filepath = download.get_filepath()

    with open(filepath) as source_file:
        data = csv.reader(source_file, delimiter='\t')
        fields = [f.strip() for f in next(data)]

        # for pc dataset
        if 'footnotes' in fields:
            i = fields.index('footnotes')
            fields[i] = 'footnote_codes'

        # check that data are in the right order
        assert fields == ['series_id', 'year', 'period', 'value', 'footnote_codes']

        for row in data:
            yield [elem.strip() for elem in row]
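# A minimal consumption sketch for the generator above. The helper name
# count_values_per_series is hypothetical; the column order is the one asserted in
# iter_row() (series_id, year, period, value, footnote_codes).

def count_values_per_series(rows):
    """Count observations per series_id from iter_row() output."""
    counts = {}
    for series_id, year, period, value, footnote_codes in rows:
        counts[series_id] = counts.get(series_id, 0) + 1
    return counts

# e.g. counts = count_values_per_series(obj.iter_row(url, filename, store_path, True))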
def _process(self):
    for url in self.urls:

        # TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        date_str = match(r".*WEO(\w{7})", url).groups()[0]    # Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")    # 2006-09-01 00:00:00

        if not self._is_updated():
            msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
            logger.info(msg % (self.dataset_code, self.release_date))
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]" % url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:

            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            self.years = self.sheet.fieldnames[8:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)

            for row in self.sheet:
                if not row or not row.get('Country Group Name'):
                    break
                yield row, None

    yield None, None
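# A small standalone sketch of the release-date extraction used above: the WEO file name
# embeds month and year, e.g. ".../WEOSep2006all.xls" -> "Sep2006" -> 2006-09-01. The
# sample URL follows the pattern given in the comment above; the imports mirror the ones
# the fetcher already relies on.

from datetime import datetime
from re import match

url = "http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls"
date_str = match(r".*WEO(\w{7})", url).groups()[0]        # "Sep2006"
release_date = datetime.strptime(date_str, "%b%Y")        # datetime(2006, 9, 1)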
def _process(self):
    for url in self.urls:

        # TODO: if not url.endswith("alla.xls"):

        # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
        date_str = match(r".*WEO(\w{7})", url).groups()[0]    # Sep2006
        self.release_date = datetime.strptime(date_str, "%b%Y")    # 2006-09-01 00:00:00

        if not self.is_updated():
            msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
            logger.info(msg % (self.dataset_code, self.release_date))
            continue

        self.dataset.last_update = self.release_date

        logger.info("load url[%s]" % url)

        download = Downloader(url=url,
                              store_filepath=self.store_path,
                              filename=os.path.basename(url),
                              use_existing_file=self.fetcher.use_existing_file)
        data_filepath = download.get_filepath()
        self.fetcher.for_delete.append(data_filepath)

        with open(data_filepath, encoding='latin-1') as fp:

            self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
            self.years = self.sheet.fieldnames[9:-1]
            self.start_date = get_ordinal_from_period(self.years[0], freq=self.frequency)
            self.end_date = get_ordinal_from_period(self.years[-1], freq=self.frequency)

            for row in self.sheet:
                if not row or not row.get('Country'):
                    break
                yield row, None

        #self.dataset.update_database(save_only=True)

    yield None, None
def download_all_sources():
    """Download all dataset files (if they do not already exist) and store them in a local temp directory

    Store in /[TMP_DIR]/[PROVIDER_NAME]/[DATASET_CODE]/[FILENAME]

    Return a dict where each key is a filename and each value is the full filepath
    """

    filepaths = {}

    for dataset_code, dataset in DATASETS.items():
        store_filepath = os.path.abspath(os.path.join(tempfile.gettempdir(), PROVIDER_NAME, dataset_code))
        download = Downloader(url=dataset["url"],
                              filename=dataset["filename"],
                              store_filepath=store_filepath)  # TODO: timeout, replace
        filepaths[dataset["filename"]] = os.path.abspath(os.path.join(store_filepath, dataset["filename"]))
        logger.info("Download file[%s]" % download.get_filepath())

    return filepaths
def parse_agenda(self):
    # TODO: use Downloader
    download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                          filename="statscall.html")
    with open(download.get_filepath(), 'rb') as fp:
        agenda = lxml.html.parse(fp)

    regex_date = re.compile("Reference period: (.*)")
    regex_dataset = re.compile(r".*Dataset: (.*)\)")

    entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                           '//div[@class="ecb-faytdd"]/*/dd')[2:]
    entries = zip(entries[::2], entries[1::2])

    for entry in entries:
        item = {}
        match_key = regex_dataset.match(entry[1][0].text_content())
        item['dataflow_key'] = match_key.groups()[0]
        match_date = regex_date.match(entry[1][1].text_content())
        item['reference_period'] = match_date.groups()[0]
        item['scheduled_date'] = entry[0].text_content().replace('\n', '')
        yield item
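# A minimal consumption sketch for the agenda generator above; it relies only on the item
# keys yielded by parse_agenda() ("dataflow_key", "reference_period", "scheduled_date").
# The fetcher variable is a placeholder for an instance exposing parse_agenda().

for item in fetcher.parse_agenda():
    logger.info("dataflow[%s] period[%s] scheduled[%s]" % (
        item["dataflow_key"], item["reference_period"], item["scheduled_date"]))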
def _get_sheet(self, url, filename, sheet_name):

    if url in self._current_urls:
        filepath = self._current_urls[url]
    else:
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self._current_urls[url] = filepath

    zipfile_ = zipfile.ZipFile(filepath)
    section = zipfile_.namelist()[0]
    file_contents = zipfile_.read(section)
    excel_book = xlrd.open_workbook(file_contents=file_contents)

    return excel_book.sheet_by_name(sheet_name)
def _load(self):

    self.dsd_id = self.dataset_code

    url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
    download = Downloader(url=url,
                          filename="dsd-%s.xml" % self.dataset_code,
                          headers=SDMX_METADATA_HEADERS)
    self.xml_dsd.process(download.get_filepath())

    self.dataset.name = self.xml_dsd.dataset_name

    dimensions = OrderedDict()
    for key, item in self.xml_dsd.dimensions.items():
        dimensions[key] = item["dimensions"]
    self.dimension_list.set_dict(dimensions)

    attributes = OrderedDict()
    for key, item in self.xml_dsd.attributes.items():
        attributes[key] = item["values"]
    self.attribute_list.set_dict(attributes)

    url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
    download = Downloader(url=url,
                          filename="data-%s.xml" % self.dataset_code,
                          headers=SDMX_DATA_HEADERS)

    self.xml_data = XMLData(provider_name=self.provider_name,
                            dataset_code=self.dataset_code,
                            dimension_keys=self.xml_dsd.dimension_keys)

    # TODO: response and exception
    try:
        filepath, response = download.get_filepath_and_response()
    except requests.exceptions.HTTPError as err:
        logger.critical("unexpected HTTP error: %s" % err.response.status_code)
        raise

    self.rows = self.xml_data.process(filepath)
def _get_sheet(self, url, filename, sheet_name):

    if url in self._current_urls:
        filepath = self._current_urls[url]
    else:
        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

    zipfile_ = zipfile.ZipFile(filepath)
    section = zipfile_.namelist()[0]
    file_contents = zipfile_.read(section)
    excel_book = xlrd.open_workbook(file_contents=file_contents)

    return excel_book.sheet_by_name(sheet_name)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        store_filepath = self.get_store_path()
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              filename=self.filename,
                              store_filepath=store_filepath)
        filepath = extract_zip_file(download.get_filepath())
        kwargs["filepath"] = filepath
    else:
        kwargs["fileobj"] = io.StringIO(datas, newline="\n")

    kwargs["date_format"] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs["headers_line"] = DATASETS[self.dataset.dataset_code]["lines"]["headers"]

    self.rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.last_update = self.release_date

    self.start_date = pandas.Period(self.periods[0], freq=self.frequency)
    self.end_date = pandas.Period(self.periods[-1], freq=self.frequency)
def _load_datas(self, datas=None):

    kwargs = {}

    if not datas:
        # TODO: timeout, replace
        download = Downloader(url=self.url,
                              store_filepath=self.store_path,
                              filename=self.filename,
                              use_existing_file=self.fetcher.use_existing_file)
        zip_filepath = download.get_filepath()
        self.fetcher.for_delete.append(zip_filepath)
        filepath = extract_zip_file(zip_filepath)
        # register the extracted file for cleanup as well
        self.fetcher.for_delete.append(filepath)
        kwargs['filepath'] = filepath
    else:
        kwargs['fileobj'] = io.StringIO(datas, newline="\n")

    kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
    kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']

    self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

    self.dataset.dimension_keys = self.dimension_keys

    # TODO: if "frequency" in self.dataset.dimension_keys:
    #    self.dataset.set_dimension_frequency("frequency")

    self.dataset.last_update = self.release_date

    self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
    self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
def build_data_tree(self):
    """Builds the data tree
    """
    download = Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml",
                          store_filepath=self.store_path,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()

    categories = []
    categories_keys = []

    it = etree.iterparse(filepath,
                         events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

    def is_selected(parent_codes):
        """parent_codes is an array of category_code"""
        for _select in self.categories_filter:
            if _select in parent_codes:
                return True
        return False

    def get_category(category_code):
        for c in categories:
            if c["category_code"] == category_code:
                return c

    def create_categories(parent_codes, parent_titles, position):
        position += 1
        for i in range(len(parent_codes)):
            category_code = parent_codes.pop()
            name = parent_titles.pop()
            all_parents = parent_codes.copy()
            parent = None
            if all_parents:
                parent = all_parents[-1]
            if not category_code in categories_keys:
                _category = {
                    "provider_name": self.provider_name,
                    "category_code": category_code,
                    "name": name,
                    "position": position + i,
                    "parent": parent,
                    "all_parents": all_parents,
                    "datasets": [],
                    "doc_href": None,
                    "metadata": None,
                }
                categories_keys.append(category_code)
                categories.append(_category)

    position = 0
    is_verify_creation_date = False

    for event, dataset in it:

        if is_verify_creation_date is False:
            _root = dataset.getroottree().getroot()
            creation_date_str = _root.attrib.get("creationDate")
            creation_date = clean_datetime(datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

            if self._is_updated_catalog(creation_date) is False:
                msg = "no update from eurostat catalog. current[%s] - db[%s]"
                logger.warning(msg % (creation_date, self.provider.metadata["creation_date"]))
                if not self.force_update:
                    return []

            is_verify_creation_date = True
            if not self.force_update:
                self.updated_catalog = True

        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)

        if not is_selected(parent_codes):
            continue

        parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                      namespaces=TABLE_OF_CONTENT_NSMAP)

        category_code = parent_codes[-1]
        create_categories(parent_codes, parent_titles, position)

        category = get_category(category_code)

        name = xpath_title(dataset)[0]
        last_update = xpath_ds_last_update(dataset)
        last_modified = xpath_ds_last_modified(dataset)
        doc_href = xpath_ds_metadata_html(dataset)
        data_start = xpath_ds_data_start(dataset)
        data_end = xpath_ds_data_end(dataset)
        values = xpath_ds_values(dataset)

        last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
        if last_modified:
            last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
            last_update = max(last_update, last_modified)

        dataset_code = xpath_code(dataset)[0]

        _dataset = {
            "dataset_code": dataset_code,
            "name": name,
            "last_update": clean_datetime(last_update),
            "metadata": {
                "doc_href": first_element_xpath(doc_href),
                "data_start": first_element_xpath(data_start),
                "data_end": first_element_xpath(data_end),
                "values": int(first_element_xpath(values, default="0")),
            },
        }

        category["datasets"].append(_dataset)

    self.for_delete.append(filepath)

    return categories
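# A minimal sketch of how the returned tree can be walked, using only the keys built above
# ("category_code", "datasets", "dataset_code", "last_update"); the fetcher variable is a
# placeholder for an instance exposing build_data_tree().

categories = fetcher.build_data_tree()
for category in categories:
    for dataset in category["datasets"]:
        logger.info("category[%s] dataset[%s] last_update[%s]" % (
            category["category_code"],
            dataset["dataset_code"],
            dataset["last_update"]))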
def build_data_tree(self):

    categories = []

    for category_code, values in CATEGORIES.items():
        if "url" in values:
            continue
        cat = {
            "category_code": category_code,
            "name": values["name"],
            "parent": values["parent"],
            "all_parents": values["all_parents"],
            "doc_href": values["doc_href"],
            "datasets": []
        }
        categories.append(cat)

    for category_code, category in CATEGORIES.items():

        if not "url" in category:
            continue

        url = category["url"]
        #filename = category["filename"]
        filename = "%s.xls.zip" % category_code

        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

        try:
            zipfile_ = zipfile.ZipFile(filepath)
        except Exception as err:
            logger.error("bea zip error - url[%s] - filepath[%s] - error[%s]" % (url, filepath, str(err)))
            continue

        for section in zipfile_.namelist():

            if section in ['Iip_PrevT3a.xls', 'Iip_PrevT3b.xls', 'Iip_PrevT3c.xls']:
                continue

            file_contents = zipfile_.read(section)
            excel_book = xlrd.open_workbook(file_contents=file_contents)

            try:
                sheet = excel_book.sheet_by_name('Contents')

                cat = {
                    "category_code": category_code,
                    "name": category["name"],
                    "parent": category.get("parent"),
                    "all_parents": category.get("all_parents"),
                    "doc_href": None,
                    "datasets": []
                }

                dataset_base_names = {}

                first_line = 0
                for i, cell in enumerate(sheet.col(1)):
                    if "Code" in cell.value:
                        first_line = i + 2
                        break

                for i, cell in enumerate(sheet.col(1)):
                    if i < first_line:
                        continue
                    cell_row = sheet.row(i)
                    if cell_row[1].value != '':
                        dataset_code = cell_row[1].value
                        dataset_name = cell_row[2].value
                        dataset_base_names[dataset_code] = dataset_name

                for sheet_name in excel_book.sheet_names():

                    _dataset_code = sheet_name.split()[0]
                    if not _dataset_code in dataset_base_names:
                        continue

                    _dataset_name = dataset_base_names[_dataset_code]

                    frequency_name, frequency_code = _get_frequency(sheet_name)
                    if not frequency_name:
                        msg = "not frequency name for sheet[%s] - url[%s] - filename[%s]" % (sheet_name, url, filename)
                        logger.critical(msg)
                        raise Exception(msg)

                    dataset_code = "%s-%s-%s" % (category_code, _dataset_code, frequency_code.lower())
                    dataset_name = "%s - %s" % (_dataset_name, frequency_name)

                    cat["datasets"].append({
                        "name": dataset_name,
                        "dataset_code": dataset_code,
                        "last_update": self._get_release_date(url, excel_book.sheet_by_name(sheet_name)),
                        "metadata": {
                            "url": url,
                            "filename": filename,
                            "sheet_name": sheet_name
                        }
                    })

                categories.append(cat)

            except Exception as err:
                logger.error(str(err))

    return categories
def parse_sna_agenda(self):
    # TODO: use Downloader
    download = Downloader(url="http://www.esri.cao.go.jp/en/sna/kouhyou/kouhyou_top.html",
                          filename="agenda_sna.html")
    with open(download.get_filepath(), 'rb') as fp:
        agenda = lxml.html.parse(fp)
def build_data_tree(self):
    """Builds the data tree
    """
    download = Downloader(url=self.url_table_of_contents,
                          filename="table_of_contents.xml",
                          store_filepath=self.store_path,
                          use_existing_file=self.use_existing_file)
    filepath = download.get_filepath()

    categories = []
    categories_keys = []

    it = etree.iterparse(filepath,
                         events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

    def is_selected(parent_codes):
        """parent_codes is an array of category_code"""
        for _select in self.categories_filter:
            if _select in parent_codes:
                return True
        return False

    def get_category(category_code):
        for c in categories:
            if c["category_code"] == category_code:
                return c

    # TODO: store the TOC date in the provider!
    def create_categories(parent_codes, parent_titles, position):
        position += 1
        for i in range(len(parent_codes)):
            category_code = parent_codes.pop()
            name = parent_titles.pop()
            all_parents = parent_codes.copy()
            parent = None
            if all_parents:
                parent = all_parents[-1]
            if not category_code in categories_keys:
                _category = {
                    "provider_name": self.provider_name,
                    "category_code": category_code,
                    "name": name,
                    "position": position + i,
                    "parent": parent,
                    "all_parents": all_parents,
                    "datasets": [],
                    "doc_href": None,
                    "metadata": None
                }
                categories_keys.append(category_code)
                categories.append(_category)

    # .getroottree().creationDate="20160225T1102"

    position = 0
    for event, dataset in it:

        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)

        if not is_selected(parent_codes):
            continue

        parent_titles = dataset.xpath("ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                                      namespaces=TABLE_OF_CONTENT_NSMAP)

        category_code = parent_codes[-1]
        create_categories(parent_codes, parent_titles, position)

        category = get_category(category_code)

        name = xpath_title(dataset)[0]
        last_update = xpath_ds_last_update(dataset)
        last_modified = xpath_ds_last_modified(dataset)
        doc_href = xpath_ds_metadata_html(dataset)
        data_start = xpath_ds_data_start(dataset)
        data_end = xpath_ds_data_end(dataset)
        values = xpath_ds_values(dataset)

        last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
        if last_modified:
            last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
            last_update = max(last_update, last_modified)

        dataset_code = xpath_code(dataset)[0]

        _dataset = {
            "dataset_code": dataset_code,
            "name": name,
            "last_update": last_update,
            "metadata": {
                "doc_href": first_element_xpath(doc_href),
                "data_start": first_element_xpath(data_start),
                "data_end": first_element_xpath(data_end),
                "values": int(first_element_xpath(values, default="0")),
            }
        }

        category["datasets"].append(_dataset)

    self.for_delete.append(filepath)

    return categories
def get_agenda():
    download = Downloader(url=AGENDA["url"],
                          filename=AGENDA["filename"])
    with open(download.get_filepath(), "rb") as fp:
        return fp.read()
def build_data_tree(self):

    categories = []

    for category_code, values in sorted(CATEGORIES.items()):
        if "url" in values:
            continue
        cat = {
            "category_code": category_code,
            "name": values["name"],
            "parent": values["parent"],
            "all_parents": values["all_parents"],
            "doc_href": values["doc_href"],
            "datasets": []
        }
        categories.append(cat)

    for category_code, category in sorted(CATEGORIES.items()):

        if not "url" in category:
            continue

        url = category["url"]
        #filename = category["filename"]
        filename = "%s.xls.zip" % category_code

        download = Downloader(url=url,
                              filename=filename,
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        #self.for_delete.append(filepath)
        self._current_urls[url] = filepath

        try:
            zipfile_ = zipfile.ZipFile(filepath)
        except Exception as err:
            logger.error("bea zip error - url[%s] - filepath[%s] - error[%s]" % (url, filepath, str(err)))
            continue

        for section in zipfile_.namelist():

            if section in ['Iip_PrevT3a.xls', 'Iip_PrevT3b.xls', 'Iip_PrevT3c.xls']:
                continue

            file_contents = zipfile_.read(section)
            excel_book = xlrd.open_workbook(file_contents=file_contents)

            try:
                sheet = excel_book.sheet_by_name('Contents')

                cat = {
                    "category_code": category_code,
                    "name": category["name"],
                    "parent": category.get("parent"),
                    "all_parents": category.get("all_parents"),
                    "doc_href": None,
                    "datasets": []
                }

                dataset_base_names = {}

                first_line = 0
                for i, cell in enumerate(sheet.col(1)):
                    if "Code" in cell.value:
                        first_line = i + 2
                        break

                for i, cell in enumerate(sheet.col(1)):
                    if i < first_line:
                        continue
                    cell_row = sheet.row(i)
                    if cell_row[1].value != '':
                        dataset_code = cell_row[1].value
                        dataset_name = cell_row[2].value
                        dataset_base_names[dataset_code] = dataset_name

                for sheet_name in excel_book.sheet_names():

                    _dataset_code = sheet_name.split()[0]
                    if not _dataset_code in dataset_base_names:
                        continue

                    _dataset_name = dataset_base_names[_dataset_code]

                    frequency_name, frequency_code = _get_frequency(sheet_name)
                    if not frequency_name:
                        msg = "not frequency name for sheet[%s] - url[%s] - filename[%s]" % (sheet_name, url, filename)
                        logger.critical(msg)
                        raise Exception(msg)

                    dataset_code = "%s-%s-%s" % (category_code, _dataset_code, frequency_code.lower())
                    dataset_name = "%s - %s" % (_dataset_name, frequency_name)

                    cat["datasets"].append({
                        "name": dataset_name,
                        "dataset_code": dataset_code,
                        "last_update": self._get_release_date(url, excel_book.sheet_by_name(sheet_name)),
                        "metadata": {
                            "url": url,
                            "filename": filename,
                            "sheet_name": sheet_name
                        }
                    })

                categories.append(cat)

            except Exception as err:
                logger.error(str(err))

    return categories