Example #1
0
 def _load_datas(self, datas=None):
     """Parse the dataset CSV and store the parsed state on ``self``.

     When ``datas`` is falsy, the zip archive at ``self.url`` is obtained
     through ``Downloader`` and extracted, and the extracted path is handed
     to ``local_read_csv``. Otherwise ``datas`` is wrapped in an
     ``io.StringIO`` buffer and parsed from memory.

     Sets ``_file``, ``_rows``, ``headers``, ``release_date``,
     ``dimension_keys``, ``periods``, ``start_date`` and ``end_date`` on
     ``self``, and ``dimension_keys`` / ``last_update`` on ``self.dataset``.
     """
     kwargs = {}

     if not datas:
         # TODO: timeout, replace
         download = Downloader(url=self.url,
                               store_filepath=self.store_path,
                               filename=self.filename,
                               use_existing_file=self.fetcher.use_existing_file)

         zip_filepath = download.get_filepath()
         self.fetcher.for_delete.append(zip_filepath)
         filepath = extract_zip_file(zip_filepath)
         # Queue the extracted file as well; the archive itself is already
         # queued above. Assumes extract_zip_file returns a path -- TODO confirm.
         self.fetcher.for_delete.append(filepath)

         kwargs['filepath'] = filepath
     else:
         kwargs['fileobj'] = io.StringIO(datas, newline="\n")

     kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
     kwargs['headers_line'] = DATASETS[self.dataset.dataset_code]['lines']['headers']
     (self._file,
      self._rows,
      self.headers,
      self.release_date,
      self.dimension_keys,
      self.periods) = local_read_csv(**kwargs)

     self.dataset.dimension_keys = self.dimension_keys

     self.dataset.last_update = self.release_date

     # The first and last parsed periods bound the date range.
     self.start_date = get_ordinal_from_period(self.periods[0], freq=self.frequency)
     self.end_date = get_ordinal_from_period(self.periods[-1], freq=self.frequency)
Example #2
0
    def build_series(self, datas):
        """Assemble the series document for the current indicator and country.

        ``datas["datas"]`` is the list of observation points. Each point is
        read through its ``"value"`` and ``"date"`` keys and, when present,
        its ``"obs_status"`` key.

        Returns the series dict, with its values sorted by ordinal.

        Raises ``Exception`` when a point's frequency differs from the first
        point's, and ``errors.RejectEmptySeries`` when every value is empty.
        """
        points = datas["datas"]

        key = "%s.%s" % (self.current_indicator["id"], self.current_country)
        name = "%s - %s" % (self.current_indicator["name"], self.available_countries[self.current_country]["name"])
        # The first point decides the frequency every other point must match.
        freq = self._search_frequency(points[0])
        series = {'key': key, 'name': name, 'frequency': freq}

        observations = []
        has_value = False
        for point in points:
            point_freq = self._search_frequency(point)
            if point_freq != freq:
                raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (point_freq, freq, key))

            observation = {
                'attributes': None,
                'release_date': self.release_date,
                # A missing value (None) is stored as the empty string.
                'value': str(point["value"]).replace("None", ""),
                'ordinal': get_ordinal_from_period(point["date"], freq=freq),
                'period': point["date"],
            }
            has_value = has_value or observation["value"] != ""

            if "obs_status" in point:
                status = point.get("obs_status")
                if status and len(status) > 0:
                    observation["attributes"] = {"obs_status": status}
                    # Declare the codelist and concept the first time a status shows up.
                    if "obs_status" not in self.dataset.codelists:
                        self.dataset.codelists["obs_status"] = self.obs_status
                    if "obs_status" not in self.dataset.concepts:
                        self.dataset.concepts["obs_status"] = "Observation Status"

            observations.append(observation)

        if not has_value:
            raise errors.RejectEmptySeries(provider_name=self.provider_name,
                                           dataset_code=self.dataset_code)

        observations.sort(key=lambda obs: obs["ordinal"])
        series['values'] = observations

        series['provider_name'] = self.provider_name
        series['dataset_code'] = self.dataset_code

        series['start_date'] = observations[0]["ordinal"]
        series['end_date'] = observations[-1]["ordinal"]

        series['dimensions'] = {'country': self.current_country}
        series['attributes'] = None

        return series
Example #3
0
    def _process(self):
        """Yield ``(row, None)`` for each data row of every file in ``self.urls``.

        For each url the release date is parsed from the file name. Urls for
        which ``self._is_updated()`` is false are skipped. Otherwise the file
        is fetched through ``Downloader`` and read as a tab-separated,
        latin-1 encoded table. ``self.years``, ``self.start_date`` and
        ``self.end_date`` are refreshed from the header row of each file.

        Reading of a file stops at the first empty row or the first row with
        no ``'Country Group Name'`` value. A final ``(None, None)`` pair is
        yielded once all urls are processed (presumably an end-of-stream
        marker for the consumer -- confirm against the caller).
        """
        for url in self.urls:

            #TODO: if not url.endswith("alla.xls"):

            # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
            # Raw string: "\w" in a plain literal is an invalid escape sequence.
            date_str = match(r".*WEO(\w{7})", url).groups()[0]  # Sep2006
            self.release_date = datetime.strptime(date_str,
                                                  "%b%Y")  # 2006-09-01 00:00:00

            if not self._is_updated():
                msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
                logger.info(msg % (self.dataset_code, self.release_date))
                continue

            self.dataset.last_update = self.release_date

            logger.info("load url[%s]" % url)

            download = Downloader(
                url=url,
                store_filepath=self.store_path,
                filename=os.path.basename(url),
                use_existing_file=self.fetcher.use_existing_file)

            data_filepath = download.get_filepath()
            self.fetcher.for_delete.append(data_filepath)

            # newline='' lets the csv module do its own line splitting, so
            # newlines embedded in quoted fields are parsed correctly.
            with open(data_filepath, encoding='latin-1', newline='') as fp:

                self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
                # Period columns: everything from the 9th header up to, but
                # not including, the last one.
                self.years = self.sheet.fieldnames[8:-1]
                self.start_date = get_ordinal_from_period(self.years[0],
                                                          freq=self.frequency)
                self.end_date = get_ordinal_from_period(self.years[-1],
                                                        freq=self.frequency)

                for row in self.sheet:
                    if not row or not row.get('Country Group Name'):
                        break
                    yield row, None

        yield None, None
Example #4
0
    def _process(self):
        """Yield ``(row, None)`` for each data row of every file in ``self.urls``.

        For each url the release date is parsed from the file name. Urls for
        which ``self.is_updated()`` is false are skipped. Otherwise the file
        is fetched through ``Downloader`` and read as a tab-separated,
        latin-1 encoded table. ``self.years``, ``self.start_date`` and
        ``self.end_date`` are refreshed from the header row of each file.

        Reading of a file stops at the first empty row or the first row with
        no ``'Country'`` value. A final ``(None, None)`` pair is yielded once
        all urls are processed (presumably an end-of-stream marker for the
        consumer -- confirm against the caller).
        """
        for url in self.urls:

            #TODO: if not url.endswith("alla.xls"):

            # ex: http://www.imf.org/external/pubs/ft/weo/2006/02/data/WEOSep2006all.xls
            # Raw string: "\w" in a plain literal is an invalid escape sequence.
            date_str = match(r".*WEO(\w{7})", url).groups()[0]  # Sep2006
            self.release_date = datetime.strptime(date_str, "%b%Y")  # 2006-09-01 00:00:00

            if not self.is_updated():
                msg = "upsert dataset[%s] bypass because is updated from release_date[%s]"
                logger.info(msg % (self.dataset_code, self.release_date))
                continue

            self.dataset.last_update = self.release_date

            logger.info("load url[%s]" % url)

            download = Downloader(url=url,
                                  store_filepath=self.store_path,
                                  filename=os.path.basename(url),
                                  use_existing_file=self.fetcher.use_existing_file)

            data_filepath = download.get_filepath()
            self.fetcher.for_delete.append(data_filepath)

            # newline='' lets the csv module do its own line splitting, so
            # newlines embedded in quoted fields are parsed correctly.
            with open(data_filepath, encoding='latin-1', newline='') as fp:

                self.sheet = csv.DictReader(fp, dialect=csv.excel_tab)
                # Period columns: everything from the 10th header up to, but
                # not including, the last one.
                self.years = self.sheet.fieldnames[9:-1]
                self.start_date = get_ordinal_from_period(self.years[0],
                                                          freq=self.frequency)
                self.end_date = get_ordinal_from_period(self.years[-1],
                                                        freq=self.frequency)

                for row in self.sheet:
                    if not row or not row.get('Country'):
                        break
                    yield row, None

            #self.dataset.update_database(save_only=True)

        yield None, None
Example #5
0
    def _load_datas(self, datas=None):
        """Parse the dataset CSV and store the parsed state on ``self``.

        When ``datas`` is falsy, the zip archive at ``self.url`` is obtained
        through ``Downloader`` and extracted, and the extracted path is
        handed to ``local_read_csv``. Otherwise ``datas`` is wrapped in an
        ``io.StringIO`` buffer and parsed from memory.

        Sets ``_file``, ``_rows``, ``headers``, ``release_date``,
        ``dimension_keys``, ``periods``, ``start_date`` and ``end_date`` on
        ``self``, and ``dimension_keys`` / ``last_update`` on
        ``self.dataset``.
        """
        kwargs = {}

        if not datas:
            # TODO: timeout, replace
            download = Downloader(
                url=self.url,
                store_filepath=self.store_path,
                filename=self.filename,
                use_existing_file=self.fetcher.use_existing_file)

            zip_filepath = download.get_filepath()
            self.fetcher.for_delete.append(zip_filepath)
            filepath = extract_zip_file(zip_filepath)
            # Queue the extracted file as well; the archive itself is already
            # queued above. Assumes extract_zip_file returns a path -- TODO confirm.
            self.fetcher.for_delete.append(filepath)

            kwargs['filepath'] = filepath
        else:
            kwargs['fileobj'] = io.StringIO(datas, newline="\n")

        kwargs['date_format'] = "%a %b %d %H:%M:%S %Z %Y"
        kwargs['headers_line'] = DATASETS[
            self.dataset.dataset_code]['lines']['headers']
        (self._file,
         self._rows,
         self.headers,
         self.release_date,
         self.dimension_keys,
         self.periods) = local_read_csv(**kwargs)

        self.dataset.dimension_keys = self.dimension_keys

        #TODO: if "frequency" in self.dataset.dimension_keys:
        #    self.dataset.set_dimension_frequency("frequency")

        self.dataset.last_update = self.release_date

        # The first and last parsed periods bound the date range.
        self.start_date = get_ordinal_from_period(self.periods[0],
                                                  freq=self.frequency)
        self.end_date = get_ordinal_from_period(self.periods[-1],
                                                freq=self.frequency)
Example #6
0
    def build_series(self, datas):
        """Assemble the series document for the current indicator and country.

        ``datas["datas"]`` is the list of observation points. Each point is
        read through its ``"value"`` and ``"date"`` keys and, when present,
        its ``"obs_status"`` key. The indicator's ``"sourceNote"``, if any,
        becomes the series notes.

        Returns the series dict, with its values sorted by ordinal.

        Raises ``Exception`` when a point's frequency differs from the first
        point's.
        """
        points = datas["datas"]

        key = "%s.%s" % (self.current_indicator["id"], self.current_country)
        name = "%s - %s" % (self.current_indicator["name"], self.available_countries[self.current_country]["name"])
        # The first point decides the frequency every other point must match.
        freq = self._search_frequency(points[0])
        series = {'key': key, 'name': name, 'frequency': freq}

        source_note = self.current_indicator.get("sourceNote")
        if source_note:
            series["notes"] = source_note

        observations = []
        for point in points:
            point_freq = self._search_frequency(point)
            if point_freq != freq:
                raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (point_freq, freq, key))

            observation = {
                'attributes': None,
                'release_date': self.release_date,
                'value': str(point["value"]),
                'ordinal': get_ordinal_from_period(point["date"], freq=freq),
                'period': point["date"],
            }

            if "obs_status" in point:
                status = point.get("obs_status")
                if status and len(status) > 0:
                    observation["attributes"] = {"obs_status": status}

            observations.append(observation)

        observations.sort(key=lambda obs: obs["ordinal"])
        series['values'] = observations

        series['provider_name'] = self.provider_name
        series['dataset_code'] = self.dataset_code

        series['start_date'] = observations[0]["ordinal"]
        series['end_date'] = observations[-1]["ordinal"]

        series['dimensions'] = {'country': self.current_country}
        series['attributes'] = None

        return series
Example #7
0
    def build_series(self, row):
        """Build the series document for one parsed CSV ``row``.

        Every column named in ``self.dimension_keys`` holds a
        ``"<code>:<label>"`` string. The code becomes the dimension value,
        the label is recorded in ``self.dataset.codelists`` and the labels,
        joined with ``" - "``, form the series name. One value entry is
        produced for each period in ``self.periods``.
        """
        series_key = row['KEY']

        dimensions = OrderedDict()
        labels = []
        for dim in self.dimension_keys:
            parts = row[dim].split(":")
            code, label = parts[0], parts[1]
            dimensions[dim] = code
            if dim not in self.dataset.codelists:
                self.dataset.codelists[dim] = {}
            self.dataset.codelists[dim][code] = label
            labels.append(label)

        values = [
            {
                'attributes': None,
                'release_date': self.release_date,
                'ordinal': get_ordinal_from_period(period, freq=self.frequency),
                'period': period,
                'value': row[period],
            }
            for period in self.periods
        ]

        return {
            'provider_name': self.dataset.provider_name,
            'dataset_code': self.dataset.dataset_code,
            'name': " - ".join(labels),
            'key': series_key,
            'values': values,
            'attributes': None,
            'dimensions': dimensions,
            'last_update': self.release_date,
            'start_date': self.start_date,
            'end_date': self.end_date,
            'frequency': self.frequency,
        }
Example #8
0
    def update_sheet(self):
        """Move to the next worksheet and refresh the period state.

        Takes the next sheet from ``self.sheets``. When that iterator is
        exhausted, ``self.update_file()`` is called first and the sheet is
        taken again. The sheet name ('annual', 'quarterly' or 'monthly')
        selects the frequency and how first-column cells become period
        strings.

        Sets ``columns``, ``periods``, ``frequency``, ``start_date`` and
        ``end_date`` on ``self`` and registers the frequency on
        ``self.dataset``.

        Raises ``errors.RejectFrequency`` for any other sheet name.
        """
        try:
            self.sheet = next(self.sheets)
        except StopIteration:
            self.update_file()
            self.sheet = next(self.sheets)

        self.columns = iter(range(1, self.sheet.row_len(0)))
        cells = self.sheet.col_slice(0, start_rowx=2)
        first_raw = cells[0].value
        last_raw = cells[-1].value

        self.periods = []

        # sheet name -> (frequency code, converter from cell value to period string)
        # NOTE: 'daily' sheets are not handled yet (TODO) and are rejected below.
        layouts = {
            'annual': ('A', lambda raw: str(int(raw))),
            'quarterly': ('Q', lambda raw: raw),
            'monthly': ('M', lambda raw: raw.replace('M', '-')),
        }
        layout = layouts.get(self.sheet.name)
        if layout is None:
            raise errors.RejectFrequency(provider_name=self.provider_name,
                                         dataset_code=self.dataset_code,
                                         frequency=self.sheet.name)

        code, to_period = layout
        self.frequency = code
        self.start_date = get_ordinal_from_period(to_period(first_raw), freq=code)
        self.end_date = get_ordinal_from_period(to_period(last_raw), freq=code)
        self.periods = [to_period(cell.value) for cell in cells]

        self.dataset.add_frequency(self.frequency)
Example #9
0
    def _get_datas(self):
        """Yield ``(settings, None)`` for every data column found in the archive.

        Opens the zip archive at ``self.filepath`` and loads each member with
        ``xlrd``, except empty entries, directories and members whose name
        contains "Commodity Prices". Sheets with a default name
        (Sheet1..4 / Feuille1..4) are ignored. Every other sheet must be
        named 'annual', 'quarterly' or 'monthly'.

        Each ``settings`` dict carries the column index, the sheet, the
        normalised period strings, the series name (member name minus its
        last five characters) and a ``bson`` stub with the frequency and
        the start/end ordinals.

        Raises ``errors.RejectFrequency`` for an unrecognised sheet name.
        """
        ignored_sheets = (
            "Sheet1",
            "Sheet2",
            "Sheet3",
            "Sheet4",
            "Feuille1",
            "Feuille2",
            "Feuille3",
            "Feuille4",
        )

        # The context manager closes the archive once the generator is
        # exhausted or closed; it was previously left open.
        with zipfile.ZipFile(self.filepath) as _zipfile:

            for fname in _zipfile.namelist():
                info = _zipfile.getinfo(fname)

                # bypass directory
                if info.file_size == 0 or info.filename.endswith("/"):
                    continue

                if "Commodity Prices" in fname:
                    logger.warning("bypass %s" % fname)
                    continue

                # if not self.release_date:
                #    last_update = clean_datetime(datetime(*self.zipfile.getinfo(fname).date_time[0:6]))

                # Drops the last five characters (presumably ".xlsx" -- confirm).
                series_name = fname[:-5]
                logger.info("open excel file[%s] - series.name[%s]" % (fname, series_name))

                excel_book = xlrd.open_workbook(file_contents=_zipfile.read(fname))

                for sheet in excel_book.sheets():
                    if sheet.name in ignored_sheets:
                        continue

                    # First column, starting at the third row, holds the periods.
                    periods = sheet.col_slice(0, start_rowx=2)
                    start_period = periods[0].value
                    end_period = periods[-1].value

                    frequency = None
                    start_date = None
                    end_date = None

                    if sheet.name == "annual":
                        frequency = "A"
                        start_date = get_ordinal_from_period(str(int(start_period)), freq="A")
                        end_date = get_ordinal_from_period(str(int(end_period)), freq="A")
                        periods = [str(int(p.value)) for p in periods]
                    elif sheet.name == "quarterly":
                        frequency = "Q"
                        start_date = get_ordinal_from_period(start_period, freq="Q")
                        end_date = get_ordinal_from_period(end_period, freq="Q")
                        periods = [p.value for p in periods]
                    elif sheet.name == "monthly":
                        frequency = "M"
                        start_date = get_ordinal_from_period(start_period.replace("M", "-"), freq="M")
                        end_date = get_ordinal_from_period(end_period.replace("M", "-"), freq="M")
                        periods = [p.value.replace("M", "-") for p in periods]
                    # elif sheet.name == 'daily':
                    #    frequency = 'D'
                    #    start_date = self._translate_daily_dates(start_period)
                    #    end_date = self._translate_daily_dates(end_period)
                    #    TODO: periods = [p.value for p in periods]
                    else:
                        msg = {
                            "provider_name": self.provider_name,
                            "dataset_code": self.dataset_code,
                            "frequency": sheet.name,
                        }
                        raise errors.RejectFrequency(**msg)

                    self.dataset.add_frequency(frequency)

                    # Column 0 is the period column; data columns start at 1.
                    for column in range(1, sheet.row_len(0)):
                        settings = {
                            "column": column,
                            "sheet": sheet,
                            "periods": periods,
                            "series_name": series_name,
                            "bson": {"frequency": frequency, "start_date": start_date, "end_date": end_date},
                        }
                        yield settings, None
Example #10
0
    def test_get_ordinal_from_period(self):
        """Check ``utils.get_ordinal_from_period`` against known ordinals.

        The table is verified twice: once as-is and once more after
        ``cache.configure_cache()`` has been called. The pandas sessions
        below record the reference ordinals (1970 is period 0).

        >>> pd.Period("1970-Q1", freq="Q").ordinal
        0
        >>> pd.Period("1970-Q2", freq="Q").ordinal
        1
        >>> pd.Period("1970-Q3", freq="Q").ordinal
        2
        >>> pd.Period("1970-Q4", freq="Q").ordinal
        3
        >>> pd.Period("1971-Q1", freq="Q").ordinal
        4
        >>> pd.Period("1969-Q1", freq="Q").ordinal
        -4
        >>> pd.Period("1969-Q4", freq="Q").ordinal
        -1
        >>> pd.Period("1968-Q1", freq="Q").ordinal
        -8

        >>> pd.Period('1970', freq='A')
        Period('1970', 'A-DEC')
        >>> pd.Period('1970', freq='A').ordinal
        0
        >>> pd.Period('1970', freq='M').ordinal
        0
        >>> pd.Period('1970-01', freq='M').ordinal
        0
        >>> pd.Period('1970-02', freq='M').ordinal
        1
        >>> pd.Period('1969-12', freq='M').ordinal
        -1
        >>> pd.Period('1968-01', freq='M').ordinal
        -24
        >>> pd.Period('1971-01', freq='M').ordinal
        12
        >>> pd.Period('1969-01', freq='M').ordinal
        -12
        >>> pd.Period('1970-07', freq='M').ordinal
        6
        >>> pd.Period('1971-07', freq='M').ordinal
        18
        >>> pd.Period('1969-07', freq='M').ordinal
        -6
        """

        TEST_VALUES = [("1970", "A", 0), ("1969", "A", -1), ("1971", "A", 1),
                       ("1970-01-01", "A", 0), ("19700101", "A", 0),
                       ("1970-01", "M", 0), ("197001", "M", 0),
                       ("1970-02", "M", 1), ("1969-12", "M", -1),
                       ("1969-01", "M", -12), ("1971-01", "M", 12),
                       ("1970-07", "M", 6), ("1971-07", "M", 18),
                       ("1969-07", "M", -6), ("1970-Q1", "Q", 0),
                       ("1970Q1", "Q", 0), ("1968-Q1", "Q", -8)]

        # First pass as-is, second pass after configuring the cache.
        for configure_cache in (False, True):
            if configure_cache:
                cache.configure_cache()

            for date_str, freq, result in TEST_VALUES:
                _value = utils.get_ordinal_from_period(date_str, freq)
                msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (
                    date_str, freq, result, _value)
                # assertEquals is a deprecated alias, removed in Python 3.12.
                self.assertEqual(_value, result, msg)
Example #11
0
    def build_series(self, row):
        """Build the series document for one WEO data ``row``.

        Four dimensions ('WEO Subject Code', 'ISO', 'WEO Country Code',
        'Units') and one attribute ('Scale') are resolved through
        ``update_entry`` on ``self.dimension_list`` / ``self.attribute_list``.
        Any id not yet known is added to ``self.dataset.codelists`` with its
        label. One value entry is produced for each period in ``self.years``.
        Periods at or after 'Estimates Start After' are flagged ``'e'``.
        """
        dimensions = {}
        attributes = {}

        def register(entries, target, name, code, label_column):
            # Resolve the id for (code, label), store it in ``target`` and
            # make sure the dataset codelist of that name contains it.
            label = row[label_column]
            entry_id = entries.update_entry(name, code, label)
            target[name] = entry_id
            if name not in self.dataset.codelists:
                self.dataset.codelists[name] = {}
            if entry_id not in self.dataset.codelists[name]:
                self.dataset.codelists[name][entry_id] = label

        #'WEO Subject Code': (BCA, Current account balance)
        register(self.dimension_list, dimensions,
                 'WEO Subject Code', row['WEO Subject Code'], 'Subject Descriptor')
        #'ISO': (DEU, Germany)
        register(self.dimension_list, dimensions,
                 'ISO', row['ISO'], 'Country')
        #'WEO Country Code': (134, Germany)
        register(self.dimension_list, dimensions,
                 'WEO Country Code', row['WEO Country Code'], 'Country')
        #'Units': (2, U.S. dollars)
        register(self.dimension_list, dimensions,
                 'Units', '', 'Units')
        # Scale is an attribute, not a dimension.
        register(self.attribute_list, attributes,
                 'Scale', '', 'Scale')

        #'BCA.DEU.2'
        # TODO: <Series FREQ="A" WEO Country Code="122" INDICATOR="AIP_IX" SCALE="0" SERIESCODE="122AIP_IX.A" BASE_YEAR="2010" TIME_FORMAT="P1Y" xmlns="http://dataservices.imf.org/compact/IFS">
        series_key = "%s.%s.%s" % (dimensions['WEO Subject Code'],
                                   dimensions['ISO'],
                                   dimensions['Units'])

        #'Current account balance - Germany - U.S. dollars',
        series_name = "%s - %s - %s" % (row['Subject Descriptor'],
                                        row['Country'],
                                        row['Units'])

        raw_start = row['Estimates Start After']
        estimation_start = int(raw_start) if raw_start else None

        values = []
        for period in self.years:
            point = {
                'attributes': None,
                'release_date': self.release_date,
                'ordinal': get_ordinal_from_period(period, freq=self.frequency),
                'period': period,
                # Thousands separators are removed from the raw cell text.
                'value': row[period].replace(',', ''),
            }
            if estimation_start and int(period) >= estimation_start:
                point["attributes"] = {'flag': 'e'}
            values.append(point)

        bson = {
            'provider_name': self.dataset.provider_name,
            'dataset_code': self.dataset.dataset_code,
            'name': series_name,
            'key': series_key,
            'values': values,
            'attributes': attributes,
            'dimensions': dimensions,
            'last_update': self.release_date,
            'start_date': self.start_date,
            'end_date': self.end_date,
            'frequency': self.frequency,
        }

        # Only non-empty note columns contribute, one per line.
        notes = [row[column]
                 for column in ('Subject Notes', 'Country/Series-specific Notes')
                 if row[column]]
        if notes:
            bson["notes"] = "\n".join(notes)

        return bson
Example #12
0
    def build_series(self):
        """Build the series document for the next data column.

        Takes the next index from ``self.columns``. When that iterator is
        exhausted, ``self.update_sheet()`` is called first and the index is
        taken again. The column header (row 0) names either a commodity,
        when the series name is 'Commodity Prices', or a country. The
        matching codelist in ``self.dataset.codelists`` is extended when
        the entry is new.
        """
        try:
            column = next(self.columns)
        except StopIteration:
            self.update_sheet()
            column = next(self.columns)

        header = self.sheet.cell_value(0, column)
        dimensions = {}

        if self.series_name == 'Commodity Prices':
            dimensions['commodity'] = self.dimension_list.update_entry('Commodity', '', header)
            if header not in self.dataset.codelists["commodity"]:
                self.dataset.codelists["commodity"][header] = header
        else:
            # Known countries first, then the manual mapping, else register it.
            if header in self.available_countries:
                dimensions['country'] = self.available_countries[header]["id"]
            elif header in self.manual_countries:
                dimensions['country'] = self.manual_countries[header]
            else:
                logger.warning("country not found [%s]" % header)
                dimensions['country'] = self.dimension_list.update_entry('country', '', header)

            country = dimensions['country']
            if country not in self.dataset.codelists["country"]:
                self.dataset.codelists["country"][country] = header

        # Data cells start at the third row; each is paired by position with
        # the period of the same index.
        cell_texts = [str(cell) for cell in self.sheet.col_values(column, start_rowx=2)]
        values = [
            {
                'attributes': None,
                'release_date': self.last_update,
                'ordinal': get_ordinal_from_period(self.periods[index], freq=self.frequency),
                'period': self.periods[index],
                'value': text,
            }
            for index, text in enumerate(cell_texts)
        ]

        series_key = self.series_name.replace(' ', '_').replace(',', '')
        # don't add a period if there is already one
        if series_key[-1] != '.':
            series_key += '.'
        series_key += header + '.' + self.frequency

        return {
            'values': values,
            'provider_name': self.provider_name,
            'dataset_code': self.dataset_code,
            'name': self.series_name + ' - ' + header + ' - ' + self.freq_long_name[self.frequency],
            'key': series_key,
            'attributes': None,
            'dimensions': dimensions,
            'last_update': self.last_update,
            'start_date': self.start_date,
            'end_date': self.end_date,
            'frequency': self.frequency,
        }
Example #13
0
def parse_dates(column):
    """Locate the run of consecutive period labels in a spreadsheet column.

    The column is scanned for the first string cell matching
    ``REGEX_ANNUAL`` or ``REGEX_QUARTER``; that cell fixes the frequency and
    the first row.  The cells that follow are consumed for as long as they
    are strings matching the same pattern; the first cell that is not ends
    the run.

    Args:
        column: indexable sequence of cell values.  Non-string cells are
            skipped while looking for the first label.

    Returns:
        Tuple ``(freq, start_date, end_date, first_row, last_row)`` where
        ``freq`` is ``'A'`` or ``'Q'``, the two dates are the values
        returned by ``get_ordinal_from_period`` for the first and last
        label, and the rows are indexes into ``column``.

    Raises:
        Exception: when no label is found (including an empty column or a
            column holding no strings), when the quarterly look-ahead does
            not land on the following year, or when the year/quarter
            sequence is broken.
    """
    for row_nbr, c in enumerate(column):

        if not isinstance(c, str):
            continue

        matches = re.match(REGEX_ANNUAL, c)
        if matches:
            freq = 'A'
            start_year = int(matches.group(1))
            end_year = start_year
            first_row = row_nbr
            last_row = first_row
            break

        matches = re.match(REGEX_QUARTER, c)
        if matches:
            freq = 'Q'
            start_year = int(matches.group(1))
            start_quarter = parse_quarter(matches.group(2))
            # The label of the following year is expected exactly where an
            # unbroken quarterly sequence would put it.  A missing or
            # non-string cell there is reported like any other mismatch
            # instead of leaking an IndexError/TypeError.
            next_year_row = row_nbr + 5 - start_quarter
            next_year_cell = (column[next_year_row]
                              if next_year_row < len(column) else None)
            matches = (re.match(REGEX_QUARTER, next_year_cell)
                       if isinstance(next_year_cell, str) else None)
            if (not matches or not matches.group(1)
                    or int(matches.group(1)) != start_year + 1):
                raise Exception('start_date not recognized')
            end_year = start_year
            end_quarter = start_quarter
            first_row = row_nbr
            last_row = first_row
            break
    else:
        # Reached when the loop never hit ``break``: empty column, no string
        # cell at all, or no string cell matching either pattern.
        raise Exception('start_date not recognized')

    if freq == 'A':
        for c in column[first_row + 1:]:
            if not isinstance(c, str):
                break
            matches = re.match(REGEX_ANNUAL, c)
            if not matches:
                break
            next_year = int(matches.group(1))
            if next_year != end_year + 1:
                raise Exception('error in year sequence')
            end_year = next_year
            last_row += 1

        start_date = get_ordinal_from_period(str(start_year), freq='A')
        end_date = get_ordinal_from_period(str(end_year), freq='A')
    else:
        for c in column[first_row + 1:]:
            if not isinstance(c, str):
                break
            matches = re.match(REGEX_QUARTER, c)
            if not matches:
                break
            if matches.group(1):
                # A label carrying a year must open the next year with Q1.
                next_year = int(matches.group(1))
                if next_year != end_year + 1:
                    raise Exception('error in year sequence')
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != 1:
                    raise Exception('first quarter of the year is not 1')
                end_year = next_year
            else:
                # A label without a year must be the next quarter in line.
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != end_quarter + 1:
                    raise Exception('error in quarter sequence')
            end_quarter = next_quarter
            last_row += 1

        start_date = get_ordinal_from_period(
            '%sQ%s' % (start_year, start_quarter), freq='Q')
        end_date = get_ordinal_from_period(
            '%sQ%s' % (end_year, end_quarter), freq='Q')

    return (freq, start_date, end_date, first_row, last_row)
Example #14
0
    def _get_datas(self):
        """Yield one settings dict per data column of each workbook in the archive.

        Opens the zip archive at ``self.filepath`` and reads every member as
        an Excel workbook, except zero-size/directory entries and members
        whose name contains ``'Commodity Prices'``.  Sheets with default
        names (``Sheet1``..``Sheet4``, ``Feuille1``..``Feuille4``) are
        ignored; the remaining sheets must be named ``annual``,
        ``quarterly`` or ``monthly``.

        The archive stays open while the generator is being consumed and is
        closed when the generator is exhausted or closed.

        Yields:
            ``(settings, None)`` where ``settings`` holds ``column`` (column
            index, starting at 1), ``sheet``, ``periods`` (labels read from
            column 0 starting at row index 2), ``series_name`` and a
            ``bson`` dict with ``frequency``, ``start_date`` and
            ``end_date``.

        Raises:
            errors.RejectFrequency: for a sheet whose name is none of the
                supported frequencies.
        """
        # The context manager guarantees the archive handle is released,
        # which the previous bare ZipFile(...) call never did.
        with zipfile.ZipFile(self.filepath) as _zipfile:

            for fname in _zipfile.namelist():
                info = _zipfile.getinfo(fname)

                # bypass directories and empty members
                if info.file_size == 0 or info.filename.endswith('/'):
                    continue

                if 'Commodity Prices' in fname:
                    logger.warning("bypass %s" % fname)
                    continue

                # NOTE(review): drops the last five characters, i.e. assumes
                # a ".xlsx"-length extension - confirm for other extensions.
                series_name = fname[:-5]
                logger.info("open excel file[%s] - series.name[%s]" %
                            (fname, series_name))

                excel_book = xlrd.open_workbook(
                    file_contents=_zipfile.read(fname))

                for sheet in excel_book.sheets():
                    if sheet.name in [
                            'Sheet1', 'Sheet2', 'Sheet3', 'Sheet4',
                            'Feuille1', 'Feuille2', 'Feuille3', 'Feuille4'
                    ]:
                        continue

                    periods = sheet.col_slice(0, start_rowx=2)
                    start_period = periods[0].value
                    end_period = periods[-1].value

                    frequency = None
                    start_date = None
                    end_date = None

                    if sheet.name == 'annual':
                        frequency = 'A'
                        start_date = get_ordinal_from_period(
                            str(int(start_period)), freq='A')
                        end_date = get_ordinal_from_period(
                            str(int(end_period)), freq='A')
                        periods = [str(int(p.value)) for p in periods]
                    elif sheet.name == 'quarterly':
                        frequency = 'Q'
                        start_date = get_ordinal_from_period(start_period,
                                                             freq='Q')
                        end_date = get_ordinal_from_period(end_period,
                                                           freq='Q')
                        periods = [p.value for p in periods]
                    elif sheet.name == 'monthly':
                        frequency = 'M'
                        # Monthly labels have their 'M' replaced by '-'
                        # before being converted and stored.
                        start_date = get_ordinal_from_period(
                            start_period.replace('M', '-'), freq='M')
                        end_date = get_ordinal_from_period(
                            end_period.replace('M', '-'), freq='M')
                        periods = [p.value.replace('M', '-') for p in periods]
                    # TODO: 'daily' sheets are not supported yet.
                    else:
                        msg = {
                            "provider_name": self.provider_name,
                            "dataset_code": self.dataset_code,
                            "frequency": sheet.name
                        }
                        raise errors.RejectFrequency(**msg)

                    self.dataset.add_frequency(frequency)

                    # Column 0 holds the periods; every other column of the
                    # first row's width is emitted as one item.
                    for column in range(1, sheet.row_len(0)):
                        settings = {
                            "column": column,
                            "sheet": sheet,
                            "periods": periods,
                            "series_name": series_name,
                            "bson": {
                                "frequency": frequency,
                                "start_date": start_date,
                                "end_date": end_date,
                            }
                        }
                        yield settings, None
Example #15
0
    def test_get_ordinal_from_period(self):
        """Check ``utils.get_ordinal_from_period`` against known ordinals.

        The expected values mirror the pandas period ordinals, which count
        periods from 1970:

        >>> pd.Period("1970-Q1", freq="Q").ordinal
        0
        >>> pd.Period("1970-Q2", freq="Q").ordinal
        1
        >>> pd.Period("1970-Q3", freq="Q").ordinal
        2
        >>> pd.Period("1970-Q4", freq="Q").ordinal
        3
        >>> pd.Period("1971-Q1", freq="Q").ordinal
        4
        >>> pd.Period("1969-Q1", freq="Q").ordinal
        -4
        >>> pd.Period("1969-Q4", freq="Q").ordinal
        -1
        >>> pd.Period("1968-Q1", freq="Q").ordinal
        -8

        >>> pd.Period('1970', freq='A')
        Period('1970', 'A-DEC')
        >>> pd.Period('1970', freq='A').ordinal
        0
        >>> pd.Period('1970', freq='M').ordinal
        0
        >>> pd.Period('1970-01', freq='M').ordinal
        0
        >>> pd.Period('1970-02', freq='M').ordinal
        1
        >>> pd.Period('1969-12', freq='M').ordinal
        -1
        >>> pd.Period('1968-01', freq='M').ordinal
        -24
        >>> pd.Period('1971-01', freq='M').ordinal
        12
        >>> pd.Period('1969-01', freq='M').ordinal
        -12
        >>> pd.Period('1970-07', freq='M').ordinal
        6
        >>> pd.Period('1971-07', freq='M').ordinal
        18
        >>> pd.Period('1969-07', freq='M').ordinal
        -6
        """

        # (period string, frequency, expected ordinal)
        TEST_VALUES = [
            ("1970", "A", 0),
            ("1969", "A", -1),
            ("1971", "A", 1),
            ("1970-01-01", "A", 0),
            ("19700101", "A", 0),

            ("1970-01", "M", 0),
            ("197001", "M", 0),
            ("1970-02", "M", 1),
            ("1969-12", "M", -1),
            ("1969-01", "M", -12),
            ("1971-01", "M", 12),
            ("1970-07", "M", 6),
            ("1971-07", "M", 18),
            ("1969-07", "M", -6),

            ("1970-Q1", "Q", 0),
            ("1970Q1", "Q", 0),
            ("1968-Q1", "Q", -8),
        ]

        def check_all_values():
            # assertEqual replaces the deprecated assertEquals alias, which
            # no longer exists in Python 3.12.
            for date_str, freq, result in TEST_VALUES:
                _value = utils.get_ordinal_from_period(date_str, freq)
                msg = "DATE[%s] - FREQ[%s] - ATEMPT[%s] - RETURN[%s]" % (date_str, freq, result, _value)
                self.assertEqual(_value, result, msg)

        check_all_values()

        # Same checks again once the cache has been configured - presumably
        # to exercise the cached code path; confirm against utils/cache.
        cache.configure_cache()

        check_all_values()
Example #16
0
    def _build_series(self, group, p_series, obs):
        """Assemble the series document for one set of observations.

        ``group`` and ``p_series`` are merged (``p_series`` wins) into the
        mapping used for dimension lookups.  Every observation key other
        than ``'time-period'`` and ``'obs-value'`` is treated as an
        attribute and registered on ``self.dataset`` (``attribute_keys``,
        ``concepts``, ``codelists``) before the values are built.

        Returns:
            OrderedDict with the keys ``values``, ``provider_name``,
            ``dataset_code``, ``name``, ``key``, ``start_date``,
            ``end_date``, ``last_update``, ``dimensions``, ``frequency``
            and ``attributes``.
        """
        dim = group.copy()
        dim.update(p_series)

        frequency, start_date, end_date = get_dates(dim, obs)
        self.dataset.add_frequency(frequency)

        # Pass 1: make sure every attribute found on any observation is
        # known to the dataset before any value is built.
        reserved_fields = ('time-period', 'obs-value')
        for observation in obs:
            for key in observation.keys():
                if key in reserved_fields:
                    continue
                if key not in self.dataset.attribute_keys:
                    self.dataset.attribute_keys.append(key)
                    self.dataset.concepts[key] = key
                    self.dataset.codelists[key] = {}
                entry = observation.get(key)
                if entry not in self.dataset.codelists[key]:
                    self.dataset.codelists[key][entry] = entry

        # Pass 2: one value per observation; attributes missing on an
        # observation are filled with an empty string.
        values = []
        collected = defaultdict(list)
        for observation in obs:
            period = observation['time-period']
            obs_attributes = OrderedDict()
            for key in self.dataset.attribute_keys:
                try:
                    item = observation[key]
                except KeyError:
                    item = ''
                obs_attributes[key] = item
                collected[key].append(item)
            values.append({
                'attributes': obs_attributes,
                'release_date': self.release_date,
                'ordinal': get_ordinal_from_period(period, freq=frequency),
                'period': period,
                'value': observation['obs-value'],
            })

        dimensions = OrderedDict()
        for key in self.dataset.dimension_keys:
            dim_value = dim[key]
            codelist_entry = self.dataset.codelists[key][slugify(dim_value)]
            dimensions[key] = self.dimension_list.update_entry(
                key, dim_value, codelist_entry)

        attributes = OrderedDict()
        for key in self.dataset.attribute_keys:
            per_obs = collected[key]
            attributes[key] = self.attribute_list.update_entry(
                key, str(per_obs), per_obs)

        serie_key = self.fix_series_keys(dimensions)
        serie_name = self.fix_series_names(dim, serie_key)

        return OrderedDict([
            ('values', values),
            ('provider_name', self.provider_name),
            ('dataset_code', self.dataset_code),
            ('name', serie_name),
            ('key', str(serie_key)),
            ('start_date', start_date),
            ('end_date', end_date),
            ('last_update', self.release_date),
            ('dimensions', dimensions),
            ('frequency', frequency),
            ('attributes', attributes),
        ])
Example #17
0
def get_dates(dim, obs):
    """Return ``(frequency, start_date, end_date)`` for a list of observations.

    The frequency is read from ``dim['freq']``; the two dates are the
    results of ``get_ordinal_from_period`` applied to the ``'time-period'``
    of the first and of the last observation.
    """
    frequency = dim['freq']
    # Generator keeps the original evaluation order: first observation,
    # then last observation.
    start_date, end_date = (
        get_ordinal_from_period(obs[index]['time-period'], freq=frequency)
        for index in (0, -1)
    )
    return frequency, start_date, end_date
Example #18
0
    def build_series(self, datas):
        """Build the series document for the current indicator and country.

        Reads the points from ``datas["datas"]``; the frequency is taken
        from the first point and every other point must have the same one.
        Values are sorted by ordinal to derive ``start_date``/``end_date``
        and the ordinal is then removed from each value.  The indicator,
        frequency and (when present) ``obs_status`` entries are registered
        on ``self.dataset``.

        Returns:
            dict describing the series.

        Raises:
            Exception: a point has a different frequency than the first one.
            errors.RejectEmptySeries: every point has an empty value.
        """
        points = datas["datas"]

        series = {"last_update": self.release_date}
        frequency = series["frequency"] = self._search_frequency(points[0])

        indicator_id = self.current_indicator["id"]
        series["key"] = "%s.%s.%s" % (indicator_id, self.current_country, frequency)

        series["name"] = "%s - %s - %s" % (
            self.current_indicator["name"],
            self.available_countries[self.current_country]["name"],
            constants.FREQUENCIES_DICT[frequency],
        )

        values = []
        value_found = False
        for point in points:
            point_frequency = self._search_frequency(point)
            if point_frequency != frequency:
                raise Exception(
                    "Diff frequency [%s] != [%s] - series[%s]" % (point_frequency, frequency, series["key"])
                )

            value = {
                "attributes": None,
                "value": str(point["value"]).replace("None", ""),
                # temporary: only used for sorting and the date bounds
                "ordinal": get_ordinal_from_period(point["date"], freq=frequency),
                "period": point["date"],
            }
            value_found = value_found or value["value"] != ""

            obs_status = point.get("obs_status")
            if obs_status and len(obs_status) > 0:
                value["attributes"] = {"obs_status": obs_status}
                # Register the attribute on the dataset the first time it
                # is met.
                if "obs_status" not in self.dataset.codelists:
                    self.dataset.codelists["obs_status"] = self.obs_status
                if "obs_status" not in self.dataset.concepts:
                    self.dataset.concepts["obs_status"] = "Observation Status"
                if "obs_status" not in self.dataset.attribute_keys:
                    self.dataset.attribute_keys.append("obs_status")

            values.append(value)

        if not value_found:
            raise errors.RejectEmptySeries(
                provider_name=self.provider_name,
                dataset_code=self.dataset_code,
            )

        values.sort(key=lambda item: item["ordinal"])
        series["values"] = values

        series["provider_name"] = self.provider_name
        series["dataset_code"] = self.dataset_code

        series["start_date"] = values[0]["ordinal"]
        series["end_date"] = values[-1]["ordinal"]

        # PATCH: the ordinal must not stay in the stored values
        for item in values:
            del item["ordinal"]

        series["dimensions"] = {
            "country": self.current_country,
            "indicator": self.current_indicator["id"],
            "frequency": frequency,
        }

        indicator_codes = self.dataset.codelists["indicator"]
        if self.current_indicator["id"] not in indicator_codes:
            indicator_codes[self.current_indicator["id"]] = self.current_indicator["name"]

        frequency_codes = self.dataset.codelists["frequency"]
        if frequency not in frequency_codes:
            frequency_codes[frequency] = constants.FREQUENCIES_DICT[frequency]

        series["attributes"] = None

        self.dataset.add_frequency(frequency)

        return series
Example #19
0
def parse_dates(column):
    """Locate the run of consecutive period labels in a spreadsheet column.

    The first string cell matching ``REGEX_ANNUAL`` or ``REGEX_QUARTER``
    fixes the frequency and the first row; the following cells extend the
    run while they are strings matching the same pattern.

    Args:
        column: indexable sequence of cell values.  Non-string cells are
            skipped while looking for the first label.

    Returns:
        Tuple ``(freq, start_date, end_date, first_row, last_row)`` where
        ``freq`` is ``'A'`` or ``'Q'``, the two dates are the values
        returned by ``get_ordinal_from_period`` for the first and last
        label, and the rows are indexes into ``column``.

    Raises:
        Exception: when no label is found (including an empty column or a
            column holding no strings), when the quarterly look-ahead does
            not land on the following year, or when the year/quarter
            sequence is broken.
    """
    for row_nbr, c in enumerate(column):

        if not isinstance(c, str):
            continue

        matches = re.match(REGEX_ANNUAL, c)
        if matches:
            freq = 'A'
            start_year = int(matches.group(1))
            end_year = start_year
            first_row = row_nbr
            last_row = first_row
            break

        matches = re.match(REGEX_QUARTER, c)
        if matches:
            freq = 'Q'
            start_year = int(matches.group(1))
            start_quarter = parse_quarter(matches.group(2))
            # Check the beginning of the next year.  A missing or
            # non-string cell at that position is reported as an
            # unrecognized start date rather than as IndexError/TypeError.
            lookahead_row = row_nbr + 5 - start_quarter
            lookahead = (column[lookahead_row]
                         if lookahead_row < len(column) else None)
            matches = (re.match(REGEX_QUARTER, lookahead)
                       if isinstance(lookahead, str) else None)
            if (not matches or not matches.group(1)
                    or int(matches.group(1)) != start_year + 1):
                raise Exception('start_date not recognized')
            end_year = start_year
            end_quarter = start_quarter
            first_row = row_nbr
            last_row = first_row
            break
    else:
        # No ``break``: the column is empty or holds no recognizable label.
        # Without this branch ``freq`` would be unbound below.
        raise Exception('start_date not recognized')

    if freq == 'A':
        for c in column[first_row + 1:]:
            if not isinstance(c, str):
                break
            matches = re.match(REGEX_ANNUAL, c)
            if not matches:
                break
            next_year = int(matches.group(1))
            if next_year != end_year + 1:
                raise Exception('error in year sequence')
            end_year = next_year
            last_row += 1

        # Periods are passed as strings, like every other caller visible
        # in this file does.
        start_date = get_ordinal_from_period(str(start_year), freq='A')
        end_date = get_ordinal_from_period(str(end_year), freq='A')
    else:
        for c in column[first_row + 1:]:
            if not isinstance(c, str):
                break
            matches = re.match(REGEX_QUARTER, c)
            if not matches:
                break
            if matches.group(1):
                # A label carrying a year must open the next year with Q1.
                next_year = int(matches.group(1))
                if next_year != end_year + 1:
                    raise Exception('error in year sequence')
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != 1:
                    raise Exception('first quarter of the year is not 1')
                end_year = next_year
            else:
                # A label without a year must be the next quarter in line.
                next_quarter = parse_quarter(matches.group(2))
                if next_quarter != end_quarter + 1:
                    raise Exception('error in quarter sequence')
            end_quarter = next_quarter
            last_row += 1

        start_date = get_ordinal_from_period('%sQ%s' % (start_year, start_quarter), freq='Q')
        end_date = get_ordinal_from_period('%sQ%s' % (end_year, end_quarter), freq='Q')

    return (freq, start_date, end_date, first_row, last_row)
Example #20
0
    def _build_series(self, group, p_series, obs):
        """Assemble the series document for one set of observations.

        The lookup mapping is a copy of ``group`` updated with
        ``p_series``.  Observation keys other than ``'time-period'`` and
        ``'obs-value'`` are registered as dataset attributes
        (``attribute_keys``, ``concepts``, ``codelists``) before the values
        are generated.

        Returns:
            OrderedDict with the keys ``values``, ``provider_name``,
            ``dataset_code``, ``name``, ``key``, ``start_date``,
            ``end_date``, ``last_update``, ``dimensions``, ``frequency``
            and ``attributes``.
        """
        dim = group.copy()
        dim.update(p_series)

        frequency, start_date, end_date = get_dates(dim, obs)
        self.dataset.add_frequency(frequency)

        # Register every attribute key (and its observed entries) first, so
        # that all values below carry the full set of attributes.
        not_attributes = ('time-period', 'obs-value')
        for observation in obs:
            for key in observation.keys():
                if key in not_attributes:
                    continue
                if key not in self.dataset.attribute_keys:
                    self.dataset.attribute_keys.append(key)
                    self.dataset.concepts[key] = key
                    self.dataset.codelists[key] = {}
                entry = observation.get(key)
                if entry not in self.dataset.codelists[key]:
                    self.dataset.codelists[key][entry] = entry

        values = []
        by_attribute = defaultdict(list)
        for observation in obs:
            period = observation['time-period']
            current = OrderedDict()
            for key in self.dataset.attribute_keys:
                # An attribute absent from this observation is stored as ''.
                try:
                    item = observation[key]
                except KeyError:
                    item = ''
                current[key] = item
                by_attribute[key].append(item)
            values.append({
                'attributes': current,
                'release_date': self.release_date,
                'ordinal': get_ordinal_from_period(period, freq=frequency),
                'period': period,
                'value': observation['obs-value'],
            })

        dimensions = OrderedDict()
        for key in self.dataset.dimension_keys:
            dim_value = dim[key]
            codelist_entry = self.dataset.codelists[key][slugify(dim_value)]
            dimensions[key] = self.dimension_list.update_entry(
                key, dim_value, codelist_entry)

        attributes = OrderedDict()
        for key in self.dataset.attribute_keys:
            items = by_attribute[key]
            attributes[key] = self.attribute_list.update_entry(
                key, str(items), items)

        serie_key = self.fix_series_keys(dimensions)
        serie_name = self.fix_series_names(dim, serie_key)

        return OrderedDict([
            ('values', values),
            ('provider_name', self.provider_name),
            ('dataset_code', self.dataset_code),
            ('name', serie_name),
            ('key', str(serie_key)),
            ('start_date', start_date),
            ('end_date', end_date),
            ('last_update', self.release_date),
            ('dimensions', dimensions),
            ('frequency', frequency),
            ('attributes', attributes),
        ])
Example #21
0
    def build_series(self, datas):
        """Build the series document for the current indicator and country.

        The points come from ``datas["datas"]``.  The first point sets the
        frequency and all others must agree with it.  Values are ordered by
        ordinal to compute ``start_date`` and ``end_date``; the ordinal is
        dropped from each value afterwards.  Indicator, frequency and (when
        present) ``obs_status`` entries are registered on ``self.dataset``.

        Returns:
            dict describing the series.

        Raises:
            Exception: a point has a different frequency than the first one.
            errors.RejectEmptySeries: every point has an empty value.
        """
        points = datas["datas"]

        series = {"last_update": self.release_date}
        frequency = series['frequency'] = self._search_frequency(points[0])

        key_parts = (self.current_indicator["id"], self.current_country, frequency)
        series['key'] = "%s.%s.%s" % key_parts

        name_parts = (
            self.current_indicator["name"],
            self.available_countries[self.current_country]["name"],
            constants.FREQUENCIES_DICT[frequency],
        )
        series['name'] = "%s - %s - %s" % name_parts

        values = []
        value_found = False
        for point in points:
            point_frequency = self._search_frequency(point)
            if point_frequency != frequency:
                raise Exception(
                    "Diff frequency [%s] != [%s] - series[%s]" %
                    (point_frequency, frequency, series['key']))

            value = {
                'attributes': None,
                'value': str(point["value"]).replace("None", ""),
                # temporary: only used for sorting and the date bounds
                'ordinal': get_ordinal_from_period(point["date"],
                                                   freq=frequency),
                'period': point["date"],
            }
            value_found = value_found or value["value"] != ""

            obs_status = point.get("obs_status")
            if obs_status and len(obs_status) > 0:
                value["attributes"] = {"obs_status": obs_status}
                # First occurrence: declare the attribute on the dataset.
                if "obs_status" not in self.dataset.codelists:
                    self.dataset.codelists["obs_status"] = self.obs_status
                if "obs_status" not in self.dataset.concepts:
                    self.dataset.concepts["obs_status"] = "Observation Status"
                if "obs_status" not in self.dataset.attribute_keys:
                    self.dataset.attribute_keys.append("obs_status")

            values.append(value)

        if not value_found:
            raise errors.RejectEmptySeries(
                provider_name=self.provider_name,
                dataset_code=self.dataset_code)

        values.sort(key=lambda item: item["ordinal"])
        series['values'] = values

        series['provider_name'] = self.provider_name
        series['dataset_code'] = self.dataset_code

        series['start_date'] = values[0]["ordinal"]
        series['end_date'] = values[-1]["ordinal"]

        # PATCH: the ordinal must not stay in the stored values
        for item in values:
            del item["ordinal"]

        series['dimensions'] = {
            'country': self.current_country,
            'indicator': self.current_indicator["id"],
            'frequency': frequency,
        }

        indicator_codes = self.dataset.codelists['indicator']
        if self.current_indicator["id"] not in indicator_codes:
            indicator_codes[self.current_indicator["id"]] = self.current_indicator["name"]

        frequency_codes = self.dataset.codelists['frequency']
        if frequency not in frequency_codes:
            frequency_codes[frequency] = constants.FREQUENCIES_DICT[frequency]

        series['attributes'] = None

        self.dataset.add_frequency(frequency)

        return series
Example #22
0
def get_dates(dim, obs):
    """Compute the frequency and the date bounds of a list of observations.

    Returns a ``(frequency, start_date, end_date)`` tuple: ``frequency`` is
    ``dim['freq']`` and the dates are ``get_ordinal_from_period`` applied
    to the ``'time-period'`` of ``obs[0]`` and ``obs[-1]`` respectively.
    """
    frequency = dim['freq']
    # Evaluated lazily in order, so the first observation is converted
    # before the last one is even read.
    start_date, end_date = (
        get_ordinal_from_period(obs[position]['time-period'], freq=frequency)
        for position in (0, -1)
    )
    return frequency, start_date, end_date