Example #1
    def _load_file(self):

        filename = "data-%s.zip" % (self.dataset_code)
        download = Downloader(
            url=self.url,
            filename=filename,
            store_filepath=self.get_store_path(),
            use_existing_file=self.fetcher.use_existing_file,
        )
        self.filepath, response = download.get_filepath_and_response()

        if self.filepath:
            self.fetcher.for_delete.append(self.filepath)

        release_date_str = response.headers["Last-Modified"]
        # Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
        self.release_date = clean_datetime(datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

        if self.dataset.last_update and self.dataset.last_update >= self.release_date:
            comments = "update-date[%s]" % self.release_date
            raise errors.RejectUpdatedDataset(
                provider_name=self.provider_name, dataset_code=self.dataset_code, comments=comments
            )

        self.dataset.last_update = self.release_date
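Every example in this listing follows the same calling convention: build a Downloader with a URL, a target filename and a storage directory, then call get_filepath() or get_filepath_and_response(). A minimal usage sketch with placeholder URL and paths (only the keyword arguments and the two methods are taken from the examples; the import location is not shown in them and is assumed):

    # Downloader is assumed to be importable from the project's utility module.
    download = Downloader(
        url="http://example.org/data.zip",       # placeholder URL
        filename="data-EXAMPLE.zip",             # name of the cached local file
        store_filepath="/tmp/example-store",     # placeholder cache directory
        use_existing_file=False,                 # True re-uses a previously downloaded file
    )
    filepath, response = download.get_filepath_and_response()
    # get_filepath() is the shorter form when the HTTP response is not needed:
    # filepath = download.get_filepath()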
Example #2
    def _load_dsd(self):
        """
        #TODO: there is one DSD for each group of series (around 400):
        - download one DSD shared by several datasets
        - 668 datasets
        """

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)

        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()
Example #3
    def _load(self):

        download = Downloader(url=self.dataset_url,
                              filename="data-%s.zip" % self.dataset_code,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file)

        filepaths = extract_zip_file(download.get_filepath())
        dsd_fp = filepaths[self.dataset_code + ".dsd.xml"]
        data_fp = filepaths[self.dataset_code + ".sdmx.xml"]

        self.fetcher.for_delete.append(dsd_fp)
        self.fetcher.for_delete.append(data_fp)

        self.xml_dsd.process(dsd_fp)
        self._set_dataset()

        self.xml_data = XMLData(
            provider_name=self.provider_name,
            dataset_code=self.dataset_code,
            xml_dsd=self.xml_dsd,
            dsd_id=self.dataset_code,
            #TODO: frequencies_supported=FREQUENCIES_SUPPORTED
        )
        self.rows = self.xml_data.process(data_fp)
Example #4
    def _load_structure_dataflows(self, force=False):

        if self._dataflows and not force:
            return

        self.provider_verify()

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

        if self.refresh_meta is False:
            self._dataflows = self._structure_get("dataflows")

            if self._dataflows:
                self.xml_dsd.dataflows = self._dataflows
                logger.info(
                    "load structure [dataflows] from metadata for url[%s]" %
                    url)
                return

        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        self._structure_put("dataflows", url, **self._dataflows)
Example #5
 def _load_datas(self):
     
     store_filepath = self.get_store_path()
     # TODO: timeout, replace
     download = Downloader(url=self.dataset_url, filename=self.filename, store_filepath=store_filepath)
         
     return download.get_filepath()
Example #6
    def _load(self):
        
        #TODO: DSD
        """
        url = "xxx/%s" % self.dataset_code
        download = Downloader(url=url, 
                              filename="dataflow-%s.xml" % self.dataset_code)
        self.xml_dsd.process(download.get_filepath())
        """

        url = "https://www.destatis.de/sddsplus/%s.xml" % self.dataset_code
        download = Downloader(url=url, 
                              filename="data-%s.xml" % self.dataset_code)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                ns_tag_data=self.ns_tag_data,
                                #dimension_keys=self.xml_dsd.dimension_keys
                                )
        
        #TODO: response and exception
        try:
            filepath, response = download.get_filepath_and_response()        
        except requests.exceptions.HTTPError as err:
            logger.critical("OTHER HTTP ERROR: %s" % err.response.status_code)
            raise
            
        self.rows = self.xml_data.process(filepath)
Example #7
 def _load_datas(self, datas=None):
     
     kwargs = {}
     
     if not datas:
         # TODO: timeout, replace
         download = Downloader(url=self.url,
                               store_filepath=self.store_path, 
                               filename=self.filename,
                               use_existing_file=self.fetcher.use_existing_file)
         
         zip_filepath = download.get_filepath()
         self.fetcher.for_delete.append(zip_filepath)
         filepath = extract_zip_file(zip_filepath)
         self.fetcher.for_delete.append(filepath)
         
         kwargs['filepath'] = filepath
     else:
         kwargs['fileobj'] = io.StringIO(datas, newline="\n")
     
     kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
     kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']
     self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)
     
     self.dataset.dimension_keys = self.dimension_keys
     
     self.dataset.last_update = self.release_date
     
     self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
     self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
Example #8
    def _load_dsd(self):
        """
        #TODO: there is one DSD for each group of series (around 400):
        - download one DSD shared by several datasets
        - 668 datasets
        """

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url,
                              filename="dsd-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)

        filepath, response = download.get_filepath_and_response()

        if response:
            if response.status_code == HTTP_ERROR_LONG_RESPONSE:
                self._load_dsd_by_element()
                return
            elif response.status_code >= 400:
                response.raise_for_status()

        if not os.path.exists(filepath):
            self._load_dsd_by_element()
            return

        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()
Example #9
    def weo_urls(self):
        download = Downloader(
            url='http://www.imf.org/external/ns/cs.aspx?id=28',
            filename="weo.html",
            store_filepath=self.store_path)

        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            webpage = fp.read()

        self.fetcher.for_delete.append(filepath)

        #TODO: replace with BeautifulSoup?
        html = etree.HTML(webpage)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]

        #The last links of the WEO webpage lead to data we don't want to pull.
        links = links[:-16]
        #These are other links we don't want.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10] + 'download.aspx' for link in links]

        output = []

        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            output.append(link[:-13] + final_link[0].values()[0])

        # we need to handle the issue in chronological order
        return sorted(output)
Example #10
    def _load(self):

        download = Downloader(
            url=self.dataset_url,
            filename="data-%s.zip" % self.dataset_code,
            store_filepath=self.store_path,
            use_existing_file=self.fetcher.use_existing_file,
        )

        filepaths = extract_zip_file(download.get_filepath())
        dsd_fp = filepaths[self.dataset_code + ".dsd.xml"]
        data_fp = filepaths[self.dataset_code + ".sdmx.xml"]

        self.fetcher.for_delete.append(dsd_fp)
        self.fetcher.for_delete.append(data_fp)

        self.xml_dsd.process(dsd_fp)
        self._set_dataset()

        self.xml_data = XMLData(
            provider_name=self.provider_name,
            dataset_code=self.dataset_code,
            xml_dsd=self.xml_dsd,
            dsd_id=self.dataset_code,
            # TODO: frequencies_supported=FREQUENCIES_SUPPORTED
        )
        self.rows = self.xml_data.process(data_fp)
Example #11
    def weo_urls(self):
        download = Downloader(url='http://www.imf.org/external/ns/cs.aspx?id=28',
                              filename="weo.html",
                              store_filepath=self.store_path)
        
        filepath = download.get_filepath()
        with open(filepath, 'rb') as fp:
            webpage = fp.read()
        
        self.fetcher.for_delete.append(filepath)
            
        #TODO: replace with BeautifulSoup?
        html = etree.HTML(webpage)
        hrefs = html.xpath("//div[@id = 'content-main']/h4/a['href']")
        links = [href.values() for href in hrefs]
        
        #The last links of the WEO webpage lead to data we don't want to pull.
        links = links[:-16]
        #These are other links we don't want.
        links.pop(-8)
        links.pop(-10)
        links = [link[0][:-10]+'download.aspx' for link in links]

        output = []
    
        for link in links:
            webpage = requests.get(link)
            html = etree.HTML(webpage.text)
            final_link = html.xpath("//div[@id = 'content']//table//a['href']")
            output.append(link[:-13]+final_link[0].values()[0])
            
        # we need to handle the issue in chronological order
        return sorted(output)
Example #12
    def _load_structure_dataflows(self, force=False):

        if self._dataflows and not force:
            return

        self.provider_verify()

        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name

        if self.refresh_meta is False:
            self._dataflows = self._structure_get("dataflows")

            if self._dataflows:
                self.xml_dsd.dataflows = self._dataflows
                logger.info("load structure [dataflows] from metadata for url[%s]" % url)
                return

        download = Downloader(url=url,
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file,
                              client=self.requests_client)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        self._structure_put("dataflows", url, **self._dataflows)
Example #13
 def get_data_directory(self):
     """ Get directory content for one dataset
     Returns a directory dict
     """
     dirname = self.dataset_code
     download = Downloader(url=self.dataset_url,
                           filename="index.html",
                           store_filepath=self.store_path,
                           use_existing_file=self.fetcher.use_existing_file)
     with open(download.get_filepath()) as f:
         html = etree.HTML(f.read())
     directory = {}
     for br in html.xpath('.//br'):
         text = br.tail
         if not text:
             continue
         entry = text.strip().split()
         filename = br.getnext().text
         splitdate = entry[0].split('/')
         (hour, minute) = entry[1].split(':')
         if entry[2] == 'PM' and int(hour) < 12:
             hour = str(int(hour) + 12)
         directory[filename] = {
             'year': int(splitdate[2]),
             'month': int(splitdate[0]),
             'day': int(splitdate[1]),
             'hour': int(hour),
             'minute': int(minute),
         }
     return directory
Example #14
 def get_dimension_data(self, filename, fmt):
     """Parses code file for one dimension
     Returns a dict
     """
     download = Downloader(url=os.path.join(self.dataset_url, filename),
                           filename=filename,
                           store_filepath=self.store_path,
                           use_existing_file=self.fetcher.use_existing_file)
     filepath = download.get_filepath()
     entries1 = {}
     entries2 = {}
     with open(filepath) as source_file:
         data = csv.reader(source_file, delimiter='\t')
         fields = next(data)
         for row in data:
             if len(row) == 0:
                 continue
             if fmt == 1:
                 entries1[row[0]] = row[1]
             elif fmt == 2:
                 entries1[row[1]] = row[2]
             elif fmt == 3:
                 entries1[row[0]] = row[2]
                 entries2[row[1]] = row[1]
             elif fmt == 4:
                 entries1[row[0]] = row[0]
                 entries2[row[1]] = row[3]
             else:
                 raise Exception("fmt {} doesn't exist".format(fmt))
     return (entries1, entries2)
Example #15
    def _load_file(self):

        filename = "data-%s.zip" % (self.dataset_code)
        download = Downloader(
            url=self.url,
            filename=filename,
            store_filepath=self.get_store_path(),
            use_existing_file=self.fetcher.use_existing_file,
        )
        self.filepath, response = download.get_filepath_and_response()

        if self.filepath:
            self.fetcher.for_delete.append(self.filepath)

        release_date_str = response.headers['Last-Modified']
        #Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
        self.release_date = clean_datetime(
            datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT"))

        if self.dataset.last_update and self.dataset.last_update >= self.release_date:
            comments = "update-date[%s]" % self.release_date
            raise errors.RejectUpdatedDataset(provider_name=self.provider_name,
                                              dataset_code=self.dataset_code,
                                              comments=comments)

        self.dataset.last_update = self.release_date
Example #16
 def _load_xls(self):
     url_xls = make_xls_url(self.dataset_code)
     download = Downloader(url=url_xls,
                           filename=self.dataset_code + '_info.xls',
                           store_filepath=self.get_store_path(),
                           use_existing_file=self.fetcher.use_existing_file)
     filepath = download.get_filepath()
     return filepath
Example #17
 def _load_xls(self):
     url_xls = make_xls_url(self.dataset_code)
     download = Downloader(url=url_xls, 
                       filename=self.dataset_code + '_info.xls',
                       store_filepath=self.get_store_path(),
                       use_existing_file=self.fetcher.use_existing_file)
     filepath = download.get_filepath()
     return filepath
Example #18
    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dsd_id,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(
            dimension_keys, dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            key = get_key_for_dimension(count_dimensions, position,
                                        dimension_value)

            #http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (
                self.dataset_code, key)
            if not self._is_good_url(
                    url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
                print("bypass url[%s]" % url)
                continue

            headers = SDMX_DATA_HEADERS

            filename = "data-%s-%s.xml" % (self.dataset_code,
                                           key.replace(".", "_"))
            download = Downloader(
                url=url,
                filename=filename,
                store_filepath=self.store_path,
                headers=headers,
                use_existing_file=self.fetcher.use_existing_file,
                #client=self.fetcher.requests_client
            )
            filepath, response = download.get_filepath_and_response()

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath):
                continue

            if response:
                self._add_url_cache(url, response.status_code)

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

        yield None, None
Example #19
 def _load_datas(self):
     # TODO: timeout, replace
     download = Downloader(url=self.dataset_url,
                           filename=self.dataset_code,
                           store_filepath=self.store_path,
                           use_existing_file=self.fetcher.use_existing_file)
     filepath = download.get_filepath()
     self.fetcher.for_delete.append(filepath)
     return filepath
Example #20
    def _load_datas(self):

        store_filepath = self.get_store_path()
        download = Downloader(url=self.dataset_url, 
                              filename=self.filename, 
                              store_filepath=store_filepath)

        # Returns two filepaths (dsd and data)
        return extract_zip_file(download.get_filepath())
Example #21
 def _load_datas(self):
     # TODO: timeout, replace
     download = Downloader(url=self.dataset_url, 
                           filename=self.dataset_code,
                           store_filepath=self.store_path,
                           use_existing_file=self.fetcher.use_existing_file)
     filepath = download.get_filepath()
     self.fetcher.for_delete.append(filepath)
     return filepath
Example #22
 def get_series_filepath(self):
     """Parse series file for a dataset
     Returns a dict of dict
     """
     filename = self.dataset_code + '.series'
     download = Downloader(url=os.path.join(self.dataset_url, filename),
                           filename=filename,
                           store_filepath=self.store_path,
                           use_existing_file=self.fetcher.use_existing_file)
     return download.get_filepath()
Example #23
    def _get_agenda(self):
        download = Downloader(url=AGENDA['url'],
                              filename=AGENDA['filename'],
                              store_filepath=self.store_path)
        filepath = download.get_filepath()        

        with open(filepath, 'rb') as fp:
            content = fp.read()
            self.for_delete.append(filepath)
            return content
Example #24
    def _get_agenda(self):
        download = Downloader(url=AGENDA['url'],
                              filename=AGENDA['filename'],
                              store_filepath=self.store_path)
        filepath = download.get_filepath()

        with open(filepath, 'rb') as fp:
            content = fp.read()
            self.for_delete.append(filepath)
            return content
Example #25
    def _get_data_by_dimension(self):

        self.xml_data = XMLData(
            provider_name=self.provider_name,
            dataset_code=self.dataset_code,
            xml_dsd=self.xml_dsd,
            dsd_id=self.dsd_id,
            frequencies_supported=FREQUENCIES_SUPPORTED,
        )

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(dimension_keys, dimensions)

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            key = get_key_for_dimension(count_dimensions, position, dimension_value)

            # http://sdw-wsrest.ecb.int/service/data/IEAQ/A............
            url = "http://sdw-wsrest.ecb.int/service/data/%s/%s" % (self.dataset_code, key)
            if not self._is_good_url(url, good_codes=[200, HTTP_ERROR_NOT_MODIFIED]):
                print("bypass url[%s]" % url)
                continue

            headers = SDMX_DATA_HEADERS

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(
                url=url,
                filename=filename,
                store_filepath=self.store_path,
                headers=headers,
                use_existing_file=self.fetcher.use_existing_file,
                # client=self.fetcher.requests_client
            )
            filepath, response = download.get_filepath_and_response()

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath):
                continue

            if response:
                self._add_url_cache(url, response.status_code)

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

        yield None, None
Example #26
    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        choice = "avg"
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice=choice)

        count_dimensions = len(dimension_keys)

        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (choice, _key, len(dimension_values), self.provider_name, self.dataset_code))

        for dimension_value in dimension_values:
            '''For each value of the dimension, generate a URL key'''

            key = get_key_for_dimension(count_dimensions, position, dimension_value)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue

            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,
                                  #NOT USE FOR INSEE client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if response is not None:
                self._add_url_cache(url, response.status_code)

            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath):
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None
Example #27
    def _get_data_by_dimension(self):
        
        dimension_keys, dimensions = self._get_dimensions_from_dsd()
        
        choice = "avg" 
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"        
        
        position, _key, dimension_values = select_dimension(dimension_keys, 
                                                            dimensions, 
                                                            choice=choice)
        
        count_dimensions = len(dimension_keys)
        
        logger.info("choice[%s] - filterkey[%s] - count[%s] - provider[%s] - dataset[%s]" % (choice, _key, len(dimension_values), self.provider_name, self.dataset_code))
        
        for dimension_value in dimension_values:
            '''For each value of the dimension, generate a URL key'''
        
            key = get_key_for_dimension(count_dimensions, position, dimension_value)    

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            if self._is_good_url(url) is False:
                logger.warning("bypass not good url[%s]" % url)
                continue
            
            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url, 
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.fetcher.use_existing_file,
                                  #NOT USE FOR INSEE client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if response is not None:
                self._add_url_cache(url, response.status_code)
            
            if filepath and os.path.exists(filepath):
                self.fetcher.for_delete.append(filepath)
            elif not filepath or not os.path.exists(filepath): 
                continue

            if response and response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response and response.status_code >= 400:
                response.raise_for_status()
            
            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)
        
        yield None, None
Example #28
    def _load_dsd(self):
        url = self._get_url_dsd()
        download = Downloader(store_filepath=self.store_path,
                              url=url,
                              filename="dsd-%s.xml" % self.dataset_code,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)

        self.xml_dsd.process(filepath)
        self._set_dataset()
Example #29
 def _load_dsd(self):
     url = self._get_url_dsd()
     download = Downloader(store_filepath=self.store_path,
                           url=url, 
                           filename="dsd-%s.xml" % self.dataset_code,
                           use_existing_file=self.fetcher.use_existing_file,
                           client=self.fetcher.requests_client)
     filepath = download.get_filepath()
     self.fetcher.for_delete.append(filepath)
     
     self.xml_dsd.process(filepath)
     self._set_dataset()
Example #30
    def _get_data_by_dimension(self):

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice="max")

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:
            '''For each value of the dimension, generate a URL key'''

            local_count = 0

            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "%s/%s" % (self._get_url_data(), key)
            filename = "data-%s-%s.xml" % (self.dataset_code,
                                           key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)

            if response.status_code >= 400 and response.status_code < 500:
                continue
            elif response.status_code >= 500:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err
                local_count += 1

            if local_count >= 2999:
                logger.warning(
                    "TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]"
                    % (self.dataset_code, key))

            #self.dataset.update_database(save_only=True)

        yield None, None
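The sdmx_key loop above fixes a single dimension at the chosen position and leaves every other position empty, producing keys such as A............ for the SDMX data URLs shown earlier. A plausible sketch of what the get_key_for_dimension helper used in the other examples does (illustration only; the project's real implementation is not shown here):

    def get_key_for_dimension(count_dimensions, position, dimension_value):
        # One slot per dimension; only the selected position carries a value.
        parts = [dimension_value if i == position else ""
                 for i in range(count_dimensions)]
        return ".".join(parts)

    assert get_key_for_dimension(4, 1, "FR") == ".FR.."
    assert get_key_for_dimension(13, 0, "A") == "A" + "." * 12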
Example #31
    def _get_data_by_dimension(self):
        
        dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                                       self.provider_name,
                                                                       self.dataset_code)
        
        choice = "avg" 
        if self.dataset_code in ["IPC-2015-COICOP"]:
            choice = "max"        
        
        position, _key, dimension_values = select_dimension(dimension_keys, 
                                                            dimensions, 
                                                            choice=choice)
        
        count_dimensions = len(dimension_keys)
        
        for dimension_value in dimension_values:
            '''For each value of the dimension, generate a URL key'''
            
            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "http://www.bdm.insee.fr/series/sdmx/data/%s/%s" % (self.dataset_code, key)
            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url, 
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  #client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)

            if response.status_code == HTTP_ERROR_NO_RESULT:
                continue
            elif response.status_code >= 400:
                response.raise_for_status()
            
            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)
        
        yield None, None
Example #32
    def _load(self):

        download = Downloader(url=self.url, 
                              filename="data-%s.xml" % self.dataset_code,
                              #headers=SDMX_DATA_HEADERS        
                              )
        data_fp, dsd_fp = extract_zip_file(download.get_filepath())

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                #dimension_keys=self.xml_dsd.dimension_keys
                                )
        
        self.rows = self.xml_data.process(data_fp)
Example #33
    def _get_data_by_dimension(self):
        
        dimension_keys, dimensions = get_dimensions_from_dsd(self.xml_dsd,
                                                             self.provider_name,
                                                             self.dataset_code)
        
        position, _key, dimension_values = select_dimension(dimension_keys, dimensions, choice="max")
        
        count_dimensions = len(dimension_keys)
        
        for dimension_value in dimension_values:
            '''For each value of the dimension, generate a URL key'''
            
            local_count = 0
                        
            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "%s/%s" % (self._get_url_data(), key)
            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url, 
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  client=self.fetcher.requests_client)            
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)
            
            if response.status_code >= 400 and response.status_code < 500:
                continue
            elif response.status_code >= 500:
                response.raise_for_status()
            
            for row, err in self.xml_data.process(filepath):
                yield row, err
                local_count += 1
                
            if local_count >= 2999:
                logger.warning("TODO: VRFY - series > 2999 for provider[IMF] - dataset[%s] - key[%s]" % (self.dataset_code, key))

            #self.dataset.update_database(save_only=True)
        
        yield None, None
Example #34
    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dataset_code,
                                frequencies_supported=FREQUENCIES_SUPPORTED)

        dimension_keys, dimensions = self._get_dimensions_from_dsd()

        position, _key, dimension_values = select_dimension(dimension_keys,
                                                            dimensions,
                                                            choice="max")

        count_dimensions = len(dimension_keys)

        for dimension_value in dimension_values:

            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "%s/%s" % (self._get_url_data(), key)
            filename = "data-%s-%s.xml" % (self.dataset_code,
                                           key.replace(".", "_"))
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  client=self.fetcher.requests_client)
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)

            if response.status_code >= 400 and response.status_code < 500:
                continue
            elif response.status_code >= 500:
                response.raise_for_status()

            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)

        yield None, None
Example #35
    def _load_dsd_by_element(self):

        #FIXME: Missing codelists and concepts?

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()
Example #36
    def _load_dsd_by_element(self):

        #FIXME: Missing codelists and concepts?

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s" % self.dsd_id
        download = Downloader(url=url,
                              filename="datastructure-%s.xml" % self.dsd_id,
                              headers=SDMX_METADATA_HEADERS,
                              store_filepath=self.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              client=self.fetcher.requests_client)
        filepath = download.get_filepath()
        self.fetcher.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._set_dataset()
Example #37
    def _load_structure_datatree(self, force=False):

        if self._categoryschemes and self._categorisations and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categoryschemes = self._structure_get("categoryschemes")
            if self._categoryschemes:
                logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
        """
        if not self._categoryschemes:
            download = Downloader(url=url,
                                  filename="categoryscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categoryschemes = self.xml_dsd.categories
            #self._structure_put("categoryschemes", url, **self._categoryschemes)

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categorisations = self._structure_get("categorisation")
            if self._categorisations:
                self._categorisations_categories = self._structure_get("categorisations_categories")
                logger.info("load structure [categorisation] from metadata for url[%s]" % url)
        """

        if not self._categorisations:
            download = Downloader(url=url,
                                  filename="categorisation.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categorisations = self.xml_dsd.categorisations
            self._categorisations_categories = self.xml_dsd.categorisations_categories
Example #38
    def _get_data_by_dimension(self):

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                xml_dsd=self.xml_dsd,
                                dsd_id=self.dataset_code,
                                frequencies_supported=FREQUENCIES_SUPPORTED)
        
        dimension_keys, dimensions = self._get_dimensions_from_dsd()
        
        position, _key, dimension_values = select_dimension(dimension_keys, dimensions, choice="max")
        
        count_dimensions = len(dimension_keys)
        
        for dimension_value in dimension_values:
            
            sdmx_key = []
            for i in range(count_dimensions):
                if i == position:
                    sdmx_key.append(dimension_value)
                else:
                    sdmx_key.append(".")
            key = "".join(sdmx_key)

            url = "%s/%s" % (self._get_url_data(), key)
            filename = "data-%s-%s.xml" % (self.dataset_code, key.replace(".", "_"))
            download = Downloader(url=url, 
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  client=self.fetcher.requests_client
                                  )
            filepath, response = download.get_filepath_and_response()

            if filepath:
                self.fetcher.for_delete.append(filepath)
            
            if response.status_code >= 400 and response.status_code < 500:
                continue
            elif response.status_code >= 500:
                response.raise_for_status()
            
            for row, err in self.xml_data.process(filepath):
                yield row, err

            #self.dataset.update_database(save_only=True)
        
        yield None, None
Example #39
    def _load_structure(self, force=False):
        
        if self._dataflows and not force:
            return

        self.xml_sdmx = XMLSDMX(agencyID=self.provider_name)
        
        self.xml_dsd = XMLStructure(provider_name=self.provider_name,
                                    sdmx_client=self.xml_sdmx)       
        
        url = "http://www.bdm.insee.fr/series/sdmx/dataflow/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="dataflow.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._dataflows = self.xml_dsd.dataflows

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="categoryscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categoryschemes = self.xml_dsd.categories

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="categorisation.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._categorisations = self.xml_dsd.categorisations
        
        url = "http://www.bdm.insee.fr/series/sdmx/conceptscheme/%s" % self.provider_name
        download = Downloader(url=url, 
                              filename="conceptscheme.xml",
                              store_filepath=self.store_path,
                              headers=SDMX_METADATA_HEADERS,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()
        self.for_delete.append(filepath)
        self.xml_dsd.process(filepath)
        self._concepts = self.xml_dsd.concepts
Example #40
def download_all_sources():
    """Download all dataset files (if they do not already exist) and store them in a local temp directory

    Stored in /[TMP_DIR]/[PROVIDER_NAME]/[DATASET_CODE]/[FILENAME]

    Returns a dict mapping filename to full filepath
    """

    filepaths = {}

    for dataset_code, dataset in DATASETS.items():
        store_filepath = os.path.abspath(os.path.join(tempfile.gettempdir(), PROVIDER_NAME, dataset_code))
        download = Downloader(
            url=dataset["url"], filename=dataset["filename"], store_filepath=store_filepath
        )  # TODO: timeout, replace
        filepaths[dataset["filename"]] = os.path.abspath(os.path.join(store_filepath, dataset["filename"]))
        logger.info("Download file[%s]" % download.get_filepath())

    return filepaths
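download_all_sources only assumes that DATASETS maps each dataset code to a dict holding at least a 'url' and a 'filename'. A purely illustrative shape for that mapping (the code, URL and filename below are made up):

    DATASETS = {
        "EXAMPLE-CODE": {
            "url": "http://example.org/example-code.zip",  # placeholder URL
            "filename": "example-code.zip",
        },
    }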
Example #41
    def _process(self):
        for url in self.urls:

            #TODO: if not url.endswith("alla.xls"):

            #ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
            date_str = match(r".*WEO(\w{7})", url).groups()[0]  #Sep2006
            self.release_date = datetime.strptime(date_str,
                                                  "%b%Y")  #2006-09-01 00:00:00

            if not self._is_updated():
                msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
                logger.info(msg % (self.dataset_code, self.release_date))
                continue

            self.dataset.last_update = self.release_date

            logger.info("load url[%s]" % url)

            download = Downloader(
                url=url,
                store_filepath=self.store_path,
                filename=os.path.basename(url),
                use_existing_file=self.fetcher.use_existing_file)

            data_filepath = download.get_filepath()
            self.fetcher.for_delete.append(data_filepath)

            with open(data_filepath, encoding='latin-1') as fp:

                self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
                self.years = self.sheet.fieldnames[8:-1]
                self.start_date = get_ordinal_from_period(self.years[0],
                                                          freq=self.frequency)
                self.end_date = get_ordinal_from_period(self.years[-1],
                                                        freq=self.frequency)

                for row in self.sheet:
                    if not row or not row.get('Country Group Name'):
                        break
                    yield row, None

        yield None, None
Example #42
    def _process(self):        
        for url in self.urls:
            
            #TODO: if not url.endswith("alla.xls"):
            
            #ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
            date_str = match(r".*WEO(\w{7})", url).groups()[0] #Sep2006
            self.release_date = datetime.strptime(date_str, "%b%Y") #2006-09-01 00:00:00
            
            if not self.is_updated():
                msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
                logger.info(msg % (self.dataset_code, self.release_date))
                continue

            self.dataset.last_update = self.release_date        
                
            logger.info("load url[%s]" % url)
            
            download = Downloader(url=url,
                                  store_filepath=self.store_path, 
                                  filename=os.path.basename(url),
                                  use_existing_file=self.fetcher.use_existing_file)        
            
            data_filepath = download.get_filepath()
            self.fetcher.for_delete.append(data_filepath)
            
            with open(data_filepath, encoding='latin-1') as fp:
                
                self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
                self.years = self.sheet.fieldnames[9:-1]
                self.start_date = get_ordinal_from_period(self.years[0], 
                                                          freq=self.frequency)
                self.end_date = get_ordinal_from_period(self.years[-1], 
                                                        freq=self.frequency)
                
                for row in self.sheet:
                    if not row or not row.get('Country'):
                        break       
                    yield row, None

            #self.dataset.update_database(save_only=True)
        
        yield None, None
Example #43
 def iter_row(self, url, filename, store_path, use_existing_file):
     download = Downloader(url=url,
                           filename=filename,
                           store_filepath=store_path,
                           use_existing_file=use_existing_file)
     filepath = download.get_filepath()
     with open(filepath) as source_file:
         data = csv.reader(source_file, delimiter='\t')
         fields = [f.strip() for f in next(data)]
         # for pc dataset
         if 'footnotes' in fields:
             i = fields.index('footnotes')
             fields[i] = 'footnote_codes'
         #check that data are in the right order
         assert (fields == [
             'series_id', 'year', 'period', 'value', 'footnote_codes'
         ])
         for row in data:
             yield [elem.strip() for elem in row]
Example #44
 def parse_agenda(self):
     #TODO: use Downloader
     download = Downloader(url="http://www.ecb.europa.eu/press/calendars/statscal/html/index.en.html",
                           filename="statscall.html")
     with open(download.get_filepath(), 'rb') as fp:
         agenda = lxml.html.parse(fp)
     
     regex_date = re.compile("Reference period: (.*)")
     regex_dataset = re.compile(r".*Dataset: (.*)\)")
     entries = agenda.xpath('//div[@class="ecb-faytdd"]/*/dt | '
                            '//div[@class="ecb-faytdd"]/*/dd')[2:]
     entries = zip(entries[::2], entries[1::2])
     for entry in entries:
         item = {}
         match_key = regex_dataset.match(entry[1][0].text_content())
         item['dataflow_key'] = match_key.groups()[0]
         match_date = regex_date.match(entry[1][1].text_content())
         item['reference_period'] = match_date.groups()[0]
         item['scheduled_date'] = entry[0].text_content().replace('\n','')
         yield item
Example #45
 def _load_datas(self):
     filepath = list()
     if self.dataset_url:
         download = Downloader(url=self.dataset_url, 
                               filename=self.dataset_code,
                               store_filepath=self.get_store_path(),
                               use_existing_file=self.fetcher.use_existing_file)
         filepath.append(download.get_filepath())
     else:
         _filter_name, _filter_codes = get_filter(self.dataset_code)
         for code in _filter_codes:
             url = "http://webstat.banque-france.fr/en/export.do?node=DATASETS_%s&%s=%s&exportType=sdmx" % (self.dataset_code, _filter_name, code)              
             name = self.dataset_code + "_%s" % (code) 
             # print(name)
             download = Downloader(url=url, 
                                   filename=name,
                                   store_filepath=self.get_store_path(),
                                   use_existing_file=self.fetcher.use_existing_file)
             filepath.append(download.get_filepath())
     return filepath 
Example #46
    def _get_sheet(self, url, filename, sheet_name):
        
        if url in self._current_urls:
            filepath = self._current_urls[url]
        else:
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.use_existing_file)
            
            filepath = download.get_filepath()
            self._current_urls[url] = filepath        

        zipfile_ = zipfile.ZipFile(filepath)
        section = zipfile_.namelist()[0]
                
        file_contents = zipfile_.read(section)
                    
        excel_book = xlrd.open_workbook(file_contents=file_contents)
                    
        return excel_book.sheet_by_name(sheet_name)
Example #47
    def _get_sheet(self, url, filename, sheet_name):

        if url in self._current_urls:
            filepath = self._current_urls[url]
        else:
            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.use_existing_file)

            filepath = download.get_filepath()
            #self.for_delete.append(filepath)
            self._current_urls[url] = filepath

        zipfile_ = zipfile.ZipFile(filepath)
        section = zipfile_.namelist()[0]

        file_contents = zipfile_.read(section)

        excel_book = xlrd.open_workbook(file_contents=file_contents)

        return excel_book.sheet_by_name(sheet_name)
Example #48
    def _load_datas(self, datas=None):

        kwargs = {}

        if not datas:
            store_filepath = self.get_store_path()
            # TODO: timeout, replace
            download = Downloader(url=self.url, filename=self.filename, store_filepath=store_filepath)

            filepath = extract_zip_file(download.get_filepath())
            kwargs["filepath"] = filepath
        else:
            kwargs["fileobj"] = io.StringIO(datas, newline="\n")

        kwargs["date_format"] = "%a %b %d %H:%M:%S %Z %Y"
        kwargs["headers_line"] = DATASETS[self.dataset.dataset_code]["lines"]["headers"]
        self.rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(**kwargs)

        self.dataset.last_update = self.release_date

        self.start_date = pandas.Period(self.periods[0], freq=self.frequency)
        self.end_date = pandas.Period(self.periods[-1], freq=self.frequency)
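
Only two pieces here are not project-specific: the date format string and the pandas period bounds. A small illustration with hypothetical values (the real ones come from local_read_csv()):

from datetime import datetime

import pandas

# release date format used by the CSV source
release_date = datetime.strptime("Tue Apr 05 15:05:11 GMT 2016",
                                 "%a %b %d %H:%M:%S %Z %Y")

# first/last period labels -> pandas Period bounds (labels are hypothetical)
periods = ["1995Q1", "2016Q1"]
start_date = pandas.Period(periods[0], freq="Q")
end_date = pandas.Period(periods[-1], freq="Q")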
Example #49
0
    def load_datas(self):

        filename = "data-%s.zip" % (self.dataset_code)
        download = Downloader(url=self.url, 
                              filename=filename,
                              store_filepath=self.fetcher.store_path,
                              use_existing_file=self.fetcher.use_existing_file,
                              #client=self.fetcher.requests_client
                              )
        filepath, response = download.get_filepath_and_response()

        if response:
            release_date_str = response.headers['Last-Modified']

            # Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT
            self.release_date = datetime.strptime(release_date_str,
                                                  "%a, %d %b %Y %H:%M:%S GMT")

            self._is_updated(self.release_date)
            
        self.zipfile = zipfile.ZipFile(filepath)
        self.excel_filenames = iter(self.zipfile.namelist())
Example #50
0
    def _load(self):

        self.dsd_id = self.dataset_code

        url = "http://www.bdm.insee.fr/series/sdmx/datastructure/INSEE/%s?references=children" % self.dsd_id
        download = Downloader(url=url, 
                              filename="dsd-%s.xml" % self.dataset_code,
                              headers=SDMX_METADATA_HEADERS)
        self.xml_dsd.process(download.get_filepath())
        
        self.dataset.name = self.xml_dsd.dataset_name
        
        dimensions = OrderedDict()
        for key, item in self.xml_dsd.dimensions.items():
            dimensions[key] = item["dimensions"]
        self.dimension_list.set_dict(dimensions)
        
        attributes = OrderedDict()
        for key, item in self.xml_dsd.attributes.items():
            attributes[key] = item["values"]
        self.attribute_list.set_dict(attributes)
        
        url = "http://www.bdm.insee.fr/series/sdmx/data/%s" % self.dataset_code
        download = Downloader(url=url, 
                              filename="data-%s.xml" % self.dataset_code,
                              headers=SDMX_DATA_HEADERS)

        self.xml_data = XMLData(provider_name=self.provider_name,
                                dataset_code=self.dataset_code,
                                dimension_keys=self.xml_dsd.dimension_keys)
        
        #TODO: response and exception
        try:
            filepath, response = download.get_filepath_and_response()        
        except requests.exceptions.HTTPError as err:
            logger.critical("AUTRE ERREUR HTTP : %s" % err.response.status_code)
            raise
            
        self.rows = self.xml_data.process(filepath)
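
The except branch is the standard requests error path. A standalone sketch using plain requests in place of the project's Downloader (the dataset code in the URL is only illustrative):

import logging

import requests

logger = logging.getLogger(__name__)

url = "http://www.bdm.insee.fr/series/sdmx/data/IPC-2015"  # illustrative code
try:
    response = requests.get(url)
    response.raise_for_status()
except requests.exceptions.HTTPError as err:
    logger.critical("other HTTP error: %s" % err.response.status_code)
    raise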
Example #51
0
    def _load_datas(self, datas=None):

        kwargs = {}

        if not datas:
            # TODO: timeout, replace
            download = Downloader(
                url=self.url,
                store_filepath=self.store_path,
                filename=self.filename,
                use_existing_file=self.fetcher.use_existing_file)

            zip_filepath = download.get_filepath()
            self.fetcher.for_delete.append(zip_filepath)
            filepath = extract_zip_file(zip_filepath)
            self.fetcher.for_delete.append(filepath)

            kwargs['filepath'] = filepath
        else:
            kwargs['fileobj'] = io.StringIO(datas, newline="\n")

        kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
        kwargs['headers_line'] = DATASETS[
            self.dataset.dataset_code]['lines']['headers']
        self._file, self._rows, self.headers, self.release_date, self.dimension_keys, self.periods = local_read_csv(
            **kwargs)

        self.dataset.dimension_keys = self.dimension_keys

        #TODO: if "frequency" in self.dataset.dimension_keys:
        #    self.dataset.set_dimension_frequency("frequency")

        self.dataset.last_update = self.release_date

        self.start_date = get_ordinal_from_period(self.periods[0],
                                                  freq=self.frequency)
        self.end_date = get_ordinal_from_period(self.periods[-1],
                                                freq=self.frequency)
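
get_ordinal_from_period() is a project utility. Assuming it returns the pandas period ordinal for a label and frequency (an assumption, not shown in this snippet), the direct pandas equivalent would be:

import pandas

periods = ["1990", "2015"]   # hypothetical labels from local_read_csv()
frequency = "A"              # annual

start_date = pandas.Period(periods[0], freq=frequency).ordinal
end_date = pandas.Period(periods[-1], freq=frequency).ordinal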
Example #52
0
    def _load_structure_datatree(self, force=False):

        if self._categoryschemes and self._categorisations and not force:
            return

        self._load_structure_dataflows(force)

        url = "http://www.bdm.insee.fr/series/sdmx/categoryscheme/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categoryschemes = self._structure_get("categoryschemes")
            if self._categoryschemes:
                logger.info("load structure [categoryschemes] from metadata for url[%s]" % url)
        """
        if not self._categoryschemes:
            download = Downloader(url=url,
                                  filename="categoryscheme.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categoryschemes = self.xml_dsd.categories
            #self._structure_put("categoryschemes", url, **self._categoryschemes)

        url = "http://www.bdm.insee.fr/series/sdmx/categorisation/%s" % self.provider_name
        """
        if self.refresh_meta is False:
            self._categorisations = self._structure_get("categorisation")
            if self._categorisations:
                self._categorisations_categories = self._structure_get("categorisations_categories")
                logger.info("load structure [categorisation] from metadata for url[%s]" % url)
        """

        if not self._categorisations:
            download = Downloader(url=url,
                                  filename="categorisation.xml",
                                  store_filepath=self.store_path,
                                  headers=SDMX_METADATA_HEADERS,
                                  use_existing_file=self.use_existing_file,
                                  client=self.requests_client)
            filepath = download.get_filepath()
            self.for_delete.append(filepath)
            self.xml_dsd.process(filepath)
            self._categorisations = self.xml_dsd.categorisations
            self._categorisations_categories = self.xml_dsd.categorisations_categories
Example #53
0
    def _load_datas(self):
        filepath = list()
        if self.dataset_url:
            download = Downloader(
                url=self.dataset_url,
                filename=self.dataset_code,
                store_filepath=self.get_store_path(),
                use_existing_file=self.fetcher.use_existing_file)
            filepath.append(download.get_filepath())
        else:
            _filter_name, _filter_codes = get_filter(self.dataset_code)
            for code in _filter_codes:
                url = "http://webstat.banque-france.fr/en/export.do?node=DATASETS_%s&%s=%s&exportType=sdmx" % (
                    self.dataset_code, _filter_name, code)
                name = self.dataset_code + "_%s" % (code)
                # print(name)
                download = Downloader(
                    url=url,
                    filename=name,
                    store_filepath=self.get_store_path(),
                    use_existing_file=self.fetcher.use_existing_file)
                filepath.append(download.get_filepath())
        return filepath
Example #54
0
    def build_data_tree(self):

        categories = []

        for category_code, values in sorted(CATEGORIES.items()):
            if "url" in values:
                continue

            cat = {
                "category_code": category_code,
                "name": values["name"],
                "parent": values["parent"],
                "all_parents": values["all_parents"],
                "doc_href": values["doc_href"],
                "datasets": []
            }
            categories.append(cat)

        for category_code, category in sorted(CATEGORIES.items()):
            if not "url" in category:
                continue

            url = category["url"]
            #filename = category["filename"]
            filename = "%s.xls.zip" % category_code

            download = Downloader(url=url,
                                  filename=filename,
                                  store_filepath=self.store_path,
                                  use_existing_file=self.use_existing_file)
            filepath = download.get_filepath()
            #self.for_delete.append(filepath)

            self._current_urls[url] = filepath

            try:
                zipfile_ = zipfile.ZipFile(filepath)
            except Exception as err:
                logger.error(
                    "bea zip error - url[%s] - filepath[%s] - error[%s]" %
                    (url, filepath, str(err)))
                continue

            for section in zipfile_.namelist():

                if section in [
                        'Iip_PrevT3a.xls', 'Iip_PrevT3b.xls', 'Iip_PrevT3c.xls'
                ]:
                    continue

                file_contents = zipfile_.read(section)
                excel_book = xlrd.open_workbook(file_contents=file_contents)

                try:
                    sheet = excel_book.sheet_by_name('Contents')

                    cat = {
                        "category_code": category_code,
                        "name": category["name"],
                        "parent": category.get("parent"),
                        "all_parents": category.get("all_parents"),
                        "doc_href": None,
                        "datasets": []
                    }

                    dataset_base_names = {}

                    first_line = 0

                    for i, cell in enumerate(sheet.col(1)):
                        if "Code" in cell.value:
                            first_line = i + 2
                            break

                    for i, cell in enumerate(sheet.col(1)):
                        if i < first_line:
                            continue

                        cell_row = sheet.row(i)
                        if cell_row[1].value != '':
                            dataset_code = cell_row[1].value
                            dataset_name = cell_row[2].value

                            dataset_base_names[dataset_code] = dataset_name

                    for sheet_name in excel_book.sheet_names():

                        _dataset_code = sheet_name.split()[0]

                        if _dataset_code not in dataset_base_names:
                            continue

                        _dataset_name = dataset_base_names[_dataset_code]

                        frequency_name, frequency_code = _get_frequency(
                            sheet_name)

                        if not frequency_name:
                            msg = "not frequency name for sheet[%s] - url[%s] - filename[%s]" % (
                                sheet_name, url, filename)
                            logger.critical(msg)
                            raise Exception(msg)

                        dataset_code = "%s-%s-%s" % (category_code,
                                                     _dataset_code,
                                                     frequency_code.lower())
                        dataset_name = "%s - %s" % (_dataset_name,
                                                    frequency_name)

                        cat["datasets"].append({
                            "name":
                            dataset_name,
                            "dataset_code":
                            dataset_code,
                            "last_update":
                            self._get_release_date(
                                url, excel_book.sheet_by_name(sheet_name)),
                            "metadata": {
                                "url": url,
                                "filename": filename,
                                "sheet_name": sheet_name
                            }
                        })

                    categories.append(cat)

                except Exception as err:
                    logger.error(str(err))

        return categories
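
The scan of the 'Contents' sheet is plain xlrd logic: find the header row whose second column contains "Code", then collect code/name pairs from the rows below it. A standalone sketch of that scan, taking any xlrd sheet object (the isinstance guard is an addition for non-text cells):

def read_dataset_names(sheet):
    """Map dataset code -> dataset name from a BEA 'Contents' sheet."""
    dataset_base_names = {}

    # locate the header row: first cell in column 1 containing "Code"
    first_line = 0
    for i, cell in enumerate(sheet.col(1)):
        if isinstance(cell.value, str) and "Code" in cell.value:
            first_line = i + 2
            break

    # below the header: column 1 holds the code, column 2 the name
    for i in range(first_line, sheet.nrows):
        row = sheet.row(i)
        if row[1].value != '':
            dataset_base_names[row[1].value] = row[2].value

    return dataset_base_names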
Example #55
0
    def build_data_tree(self):
        """Builds the data tree
        """

        download = Downloader(url=self.url_table_of_contents,
                              filename="table_of_contents.xml",
                              store_filepath=self.store_path,
                              use_existing_file=self.use_existing_file)
        filepath = download.get_filepath()

        categories = []
        categories_keys = []

        it = etree.iterparse(filepath,
                             events=['end'],
                             tag="{urn:eu.europa.ec.eurostat.navtree}leaf")

        def is_selected(parent_codes):
            """parent_codes is array of category_code
            """
            for _select in self.categories_filter:
                if _select in parent_codes:
                    return True
            return False

        def get_category(category_code):
            for c in categories:
                if c["category_code"] == category_code:
                    return c

        def create_categories(parent_codes, parent_titles, position):

            position += 1

            for i in range(len(parent_codes)):
                category_code = parent_codes.pop()
                name = parent_titles.pop()
                all_parents = parent_codes.copy()
                parent = None
                if all_parents:
                    parent = all_parents[-1]
                if category_code not in categories_keys:
                    _category = {
                        "provider_name": self.provider_name,
                        "category_code": category_code,
                        "name": name,
                        "position": position + i,
                        "parent": parent,
                        'all_parents': all_parents,
                        "datasets": [],
                        "doc_href": None,
                        "metadata": None
                    }
                    categories_keys.append(category_code)
                    categories.append(_category)

        position = 0
        is_verify_creation_date = False

        for event, dataset in it:

            if is_verify_creation_date is False:
                _root = dataset.getroottree().getroot()
                creation_date_str = _root.attrib.get("creationDate")
                creation_date = clean_datetime(
                    datetime.strptime(creation_date_str, '%Y%m%dT%H%M'))

                if self._is_updated_catalog(creation_date) is False:
                    msg = "no update from eurostat catalog. current[%s] - db[%s]"
                    logger.warning(msg %
                                   (creation_date,
                                    self.provider.metadata["creation_date"]))
                    if not self.force_update:
                        return []

                is_verify_creation_date = True
                if not self.force_update:
                    self.updated_catalog = True

            parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                         namespaces=TABLE_OF_CONTENT_NSMAP)

            if not is_selected(parent_codes):
                continue

            parent_titles = dataset.xpath(
                "ancestor::nt:branch/nt:title[attribute::language='en']/text()",
                namespaces=TABLE_OF_CONTENT_NSMAP)
            category_code = parent_codes[-1]

            create_categories(parent_codes, parent_titles, position)

            category = get_category(category_code)

            name = xpath_title(dataset)[0]
            last_update = xpath_ds_last_update(dataset)
            last_modified = xpath_ds_last_modified(dataset)
            doc_href = xpath_ds_metadata_html(dataset)
            data_start = xpath_ds_data_start(dataset)
            data_end = xpath_ds_data_end(dataset)
            values = xpath_ds_values(dataset)

            last_update = datetime.strptime(last_update[0], '%d.%m.%Y')
            if last_modified:
                last_modified = datetime.strptime(last_modified[0], '%d.%m.%Y')
                last_update = max(last_update, last_modified)

            dataset_code = xpath_code(dataset)[0]
            _dataset = {
                "dataset_code": dataset_code,
                "name": name,
                "last_update": clean_datetime(last_update),
                "metadata": {
                    "doc_href": first_element_xpath(doc_href),
                    "data_start": first_element_xpath(data_start),
                    "data_end": first_element_xpath(data_end),
                    "values": int(first_element_xpath(values, default="0")),
                }
            }
            category["datasets"].append(_dataset)

        self.for_delete.append(filepath)

        return categories
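
The traversal itself is plain lxml: iterparse over the leaf elements, then ancestor xpaths for the branch codes and English titles. A trimmed, standalone sketch (assuming, as the expressions above imply, that the nt prefix maps to the navtree namespace used in the leaf tag):

from lxml import etree

TABLE_OF_CONTENT_NSMAP = {"nt": "urn:eu.europa.ec.eurostat.navtree"}

def iter_dataset_branches(filepath):
    """Yield (parent_codes, parent_titles) for each leaf dataset."""
    it = etree.iterparse(filepath,
                         events=['end'],
                         tag="{urn:eu.europa.ec.eurostat.navtree}leaf")
    for event, dataset in it:
        parent_codes = dataset.xpath("ancestor::nt:branch/nt:code/text()",
                                     namespaces=TABLE_OF_CONTENT_NSMAP)
        parent_titles = dataset.xpath(
            "ancestor::nt:branch/nt:title[attribute::language='en']/text()",
            namespaces=TABLE_OF_CONTENT_NSMAP)
        yield parent_codes, parent_titles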