Example #1
    def test_all_metadata_created(self, mock_hits, *_):
        self._update_popularity_metadata(mock_hits)

        self.assertTrue(meta_keys.get(self.field, meta_keys.HITS_30_DAYS))
        self.assertTrue(meta_keys.get(self.field, meta_keys.HITS_90_DAYS))
        self.assertTrue(meta_keys.get(self.field, meta_keys.HITS_180_DAYS))
        self.assertTrue(meta_keys.get(self.field, meta_keys.HITS_TOTAL))
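Every snippet on this page reads metadata back through meta_keys.get(model, key). For reference, here is a minimal sketch of what such a helper can look like, assuming model.enhanced_meta is a Django related manager over (key, value) rows, as the examples suggest (Example #2 creates one via enhanced_meta.create); the project's actual implementation may differ:

def get(model, key):
    """Hypothetical sketch: returns the stored value for `key` as a
    string, or None when the model has no metadata under that key."""
    meta = model.enhanced_meta.filter(key=key).first()
    return meta.value if meta is not None else None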
Example #2
    def test_update_metadata_for_many_series(self, mock_hits, *_):
        other_field = self.distribution.field_set.create(identifier='other_field')
        other_field.enhanced_meta.create(key=meta_keys.AVAILABLE, value='true')
        self._update_popularity_metadata(mock_hits)

        self.assertTrue(meta_keys.get(self.field, meta_keys.HITS_TOTAL))
        self.assertTrue(meta_keys.get(other_field, meta_keys.HITS_TOTAL))
Example #3
    def test_start_end_dates(self):
        df = self.init_df()

        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        self.assertEqual(str(df.index[0].date()), meta_keys.get(self.field, meta_keys.INDEX_START))
        self.assertEqual(str(df.index[-1].date()), meta_keys.get(self.field, meta_keys.INDEX_END))
Example #4
    def test_periodicity(self):
        df = self.init_df()
        update_enhanced_meta(df[df.columns[0]], self.catalog_id,
                             self.distribution_id)

        self.assertEqual(
            meta_keys.get(self.field, meta_keys.PERIODICITY),
            meta_keys.get(self.field.distribution, meta_keys.PERIODICITY))
Example #5
    def test_last_values(self):
        df = self.init_df()

        serie = df[df.columns[-1]]
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        self.assertEqual(meta_keys.get(self.field, meta_keys.LAST_VALUE), str(serie[-1]))
        self.assertEqual(meta_keys.get(self.field, meta_keys.SECOND_TO_LAST_VALUE), str(serie[-2]))
Example #6
    def test_metadata_csv_hits(self):
        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_METADATA,
                                          file_type=DumpFile.TYPE_CSV).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)

        field = Field.objects.get(identifier=row[3])
        self.assertEqual(row[25], meta_keys.get(field, meta_keys.HITS_TOTAL))
        self.assertEqual(row[26], meta_keys.get(field, meta_keys.HITS_30_DAYS))
        self.assertEqual(row[27], meta_keys.get(field, meta_keys.HITS_90_DAYS))
        self.assertEqual(row[28], meta_keys.get(field, meta_keys.HITS_180_DAYS))
Example #7
    def generate_row(self, serie_name, values):
        dataset = values['dataset']
        distribution = values['distribution']
        serie = values['serie']

        return {
            constants.CATALOG_ID: dataset.catalog.identifier,
            constants.DATASET_ID: dataset.identifier,
            constants.DISTRIBUTION_ID: distribution.identifier,
            constants.SERIE_ID: serie_name,
            constants.TIME_INDEX_FREQUENCY: meta_keys.get(distribution, meta_keys.PERIODICITY),
            constants.SERIES_TITLE: values[constants.SERIES_TITLE],
            constants.SERIES_UNITS: values[constants.SERIES_UNITS],
            constants.SERIES_DESCRIPTION: values[constants.SERIES_DESCRIPTION],
            constants.DISTRIBUTION_TITLE: values[constants.DISTRIBUTION_TITLE],
            constants.DISTRIBUTION_DESCRIPTION: values[constants.DISTRIBUTION_DESCRIPTION],
            constants.DISTRIBUTION_DOWNLOAD_URL: values[constants.DISTRIBUTION_DOWNLOAD_URL],
            constants.DATASET_PUBLISHER: values[constants.DATASET_PUBLISHER],
            constants.DATASET_SOURCE: values[constants.DATASET_SOURCE],
            constants.DATASET_TITLE: values[constants.DATASET_TITLE],
            constants.DATASET_DESCRIPTION: values[constants.DATASET_DESCRIPTION],
            constants.DATASET_THEME: values[constants.DATASET_THEME],
            constants.SERIES_INDEX_START: meta_keys.get(serie, meta_keys.INDEX_START),
            constants.SERIES_INDEX_END: meta_keys.get(serie, meta_keys.INDEX_END),
            constants.SERIES_VALUES_AMT: meta_keys.get(serie, meta_keys.INDEX_SIZE),
            constants.SERIES_DAYS_SINCE_LAST_UPDATE: meta_keys.get(serie, meta_keys.DAYS_SINCE_LAST_UPDATE),
            constants.SERIES_IS_UPDATED: meta_keys.get(serie, meta_keys.IS_UPDATED),
            constants.SERIES_LAST_VALUE: meta_keys.get(serie, meta_keys.LAST_VALUE),
            constants.SERIES_SECOND_LAST_VALUE: meta_keys.get(serie, meta_keys.SECOND_TO_LAST_VALUE),
            constants.SERIES_PCT_CHANGE: meta_keys.get(serie, meta_keys.LAST_PCT_CHANGE),
        }
Example #8
    def test_size(self):
        df = self.init_df()
        update_enhanced_meta(df[df.columns[0]], self.catalog_id,
                             self.distribution_id)

        self.assertEqual(meta_keys.get(self.field, meta_keys.INDEX_SIZE),
                         str(len(df)))
Example #9
    def test_last_pct_change(self):
        df = self.init_df()

        serie = df[df.columns[-1]]
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        self.assertEqual(meta_keys.get(self.field, meta_keys.LAST_PCT_CHANGE), str(serie[-1] / serie[-2] - 1))
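For example, if the series' last two values are 2.0 and 3.0, the stored change is 3.0 / 2.0 - 1 = 0.5, persisted as the string '0.5'.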
Example #10
def update_enhanced_meta(serie: pd.Series, catalog_id: str, distribution_id: str):
    """Crea o actualiza los metadatos enriquecidos de la serie pasada. El título de
    la misma DEBE ser el ID de la serie en la base de datos"""

    field = Field.objects.get(distribution__dataset__catalog__identifier=catalog_id,
                              distribution__identifier=distribution_id,
                              identifier=serie.name)
    periodicity = meta_keys.get(field.distribution, meta_keys.PERIODICITY)
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days

    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    last_pct_change = last / second_to_last - 1 if second_to_last is not None else None

    # Computed metadata values
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
    }

    for meta_key, value in meta.items():
        field.enhanced_meta.update_or_create(key=meta_key, defaults={'value': value})
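A minimal usage sketch for update_enhanced_meta, with made-up identifiers; per the docstring, the column name (and therefore the series title) must be the series ID stored in the database:

import pandas as pd

# Hypothetical IDs: the column name must match a Field identifier.
df = pd.DataFrame({'serie_id_1': [1.0, 2.0, 3.0]},
                  index=pd.date_range('2019-01-01', periods=3, freq='D'))
update_enhanced_meta(df['serie_id_1'], 'some_catalog', 'some_distribution')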
Example #11
    def test_sources_csv_columns(self):
        dataset = Field.objects.first().distribution.dataset
        meta = json.loads(dataset.metadata)

        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_SOURCES).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)
        series = Field.objects.exclude(title='indice_tiempo')
        self.assertEqual(row[0], meta['source'])  # source name
        self.assertEqual(int(row[1]), 3)  # number of series
        self.assertEqual(int(row[2]), sum([int(meta_keys.get(x, meta_keys.INDEX_SIZE))
                                           for x in series]))
        self.assertEqual(row[3], min(meta_keys.get(x, meta_keys.INDEX_START) for x in series))
        self.assertEqual(row[4], max(meta_keys.get(x, meta_keys.INDEX_END) for x in series))
Example #12
    def test_metadata_is_updated(self, mock_hits, *_):
        self._update_popularity_metadata(mock_hits)

        updated_value = self._update_popularity_metadata(mock_hits)

        hits = int(meta_keys.get(self.field, meta_keys.HITS_TOTAL))
        self.assertEqual(hits, updated_value)
Example #13
    def row_order(self, field: str):
        field_data = self.fields[field]
        return (field_data['dataset'].catalog.identifier,
                field_data['dataset'].identifier,
                field_data['distribution'].identifier, field,
                meta_keys.get(field_data['distribution'],
                              meta_keys.PERIODICITY))
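A hedged usage note: a tuple key like this one is typically handed to sorted() so rows come out grouped by catalog, dataset and distribution; the variable name below is hypothetical:

ordered_fields = sorted(self.fields, key=self.row_order)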
Example #14
    def test_is_updated(self):
        df = self.init_df()
        with mock.patch(
                'series_tiempo_ar_api.libs.indexing.indexer.metadata.datetime',
                self.MockDatetime(df.index[-1])):
            update_enhanced_meta(df[df.columns[0]], self.catalog_id,
                                 self.distribution_id)
        self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED),
                         str(True))
Example #15
    def test_full_metadata_periodicty_with_collapse(self):
        self.query.add_series(self.single_series, self.field)
        self.query.add_collapse('year')
        self.query.set_metadata_config('full')

        resp = self.query.run()

        self.assertEqual(resp['meta'][0]['frequency'], 'year')
        self.assertEqual(resp['meta'][1]['field'][meta_keys.PERIODICITY],
                         meta_keys.get(self.field, meta_keys.PERIODICITY))
Example #16
    def generate(self):
        sources = {}

        for field in filter(lambda x: self.fields[x]['dataset_fuente'],
                            self.fields):
            source = self.fields[field]['dataset_fuente']
            field_model: Field = self.fields[field]['serie']

            if source not in sources:
                sources[source] = {
                    constants.SOURCES_DATASET_SOURCE: source,
                    constants.SOURCE_SERIES_AMT: 0,
                    constants.SOURCE_VALUES_AMT: 0,
                    constants.SOURCE_FIRST_INDEX: None,
                    constants.SOURCE_LAST_INDEX: None,
                }

            sources[source][constants.SOURCE_SERIES_AMT] += 1
            index_start = meta_keys.get(field_model, meta_keys.INDEX_START)

            # Keep the earliest start date and latest end date seen
            # across this source's series
            if index_start:
                index_start = iso8601.parse_date(index_start).date()
                first_index = sources[source][constants.SOURCE_FIRST_INDEX]
                if first_index is None or first_index > index_start:
                    sources[source][constants.SOURCE_FIRST_INDEX] = index_start

            index_end = meta_keys.get(field_model, meta_keys.INDEX_END)
            if index_end:
                index_end = iso8601.parse_date(index_end).date()
                last_index = sources[source][constants.SOURCE_LAST_INDEX]
                if last_index is None or last_index < index_end:
                    sources[source][constants.SOURCE_LAST_INDEX] = index_end

            index_size = meta_keys.get(field_model, meta_keys.INDEX_SIZE) or 0

            if index_size:
                index_size = int(index_size)

            sources[source][constants.SOURCE_VALUES_AMT] += index_size

        self.write_tmp_file(sources)
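For illustration only, each aggregated entry of sources ends up shaped roughly like this (the source name and all values are made up; the dates are datetime.date objects produced by iso8601.parse_date(...).date()):

{
    constants.SOURCES_DATASET_SOURCE: 'some_source',
    constants.SOURCE_SERIES_AMT: 3,
    constants.SOURCE_VALUES_AMT: 1200,
    constants.SOURCE_FIRST_INDEX: datetime.date(2001, 1, 1),
    constants.SOURCE_LAST_INDEX: datetime.date(2019, 12, 31),
}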
Example #17
    def test_days_since_last_update(self):
        df = self.init_df()
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        last_date = df.index[-1]

        # Only valid because the series is daily! Any other periodicity
        # would require looking at the end of the period
        days = (datetime.datetime.today() - last_date).days

        self.assertEqual(meta_keys.get(self.field, meta_keys.DAYS_SINCE_LAST_UPDATE),
                         str(days))
Example #18
    def add_pagination(self, start, limit):
        start_dates = {
            serie.identifier: meta_keys.get(serie, meta_keys.INDEX_START)
            for serie in self.series_models
        }
        start_dates = {
            k: iso8601.parse_date(v) if v is not None else None
            for k, v in start_dates.items()
        }
        return self.es_query.add_pagination(start,
                                            limit,
                                            start_dates=start_dates)
Example #19
    def write_distribution(self, distribution: Distribution,
                           writer: csv.writer):
        # noinspection PyBroadException
        try:
            df = read_distribution_csv(distribution)
            fields = distribution.field_set.all()
            fields = {field.title: field.identifier for field in fields}

            periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
            df.apply(self.write_serie, args=(periodicity, fields, writer))
        except Exception as e:
            msg = f'[{self.tag}] Error in distribution {distribution.identifier}: {e.__class__}: {e}'
            GenerateDumpTask.info(self.task, msg)
            logger.error(msg)
Example #20
    def write_distribution(self, distribution: Distribution,
                           writer: csv.writer):
        # noinspection PyBroadException
        try:
            fields = distribution.field_set.all()
            fields = {field.title: field.identifier for field in fields}
            periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
            index_col = DistributionRepository(
                distribution).get_time_index_series().title
            df = DistributionCsvReader(distribution, index_col).read()
            df.apply(self.write_serie, args=(periodicity, fields, writer))
        except Exception as e:
            msg = f'[{self.tag}] Error in distribution {distribution.identifier}: {e.__class__}: {e}'
            GenerateDumpTask.info(self.task, msg)
            logger.warning(msg)
Example #21
    def _get_model(self, series_id):
        """Valida si el 'series_id' es válido, es decir, si la serie
        pedida es un ID contenido en la base de datos. De no
        encontrarse, llena la lista de errores según corresponda.
        """

        field_model = SeriesRepository.get_available_series(identifier=series_id).first()
        if field_model is None:
            self._append_error(SERIES_DOES_NOT_EXIST.format(series_id), series_id=series_id)
            return None

        index_start_metadata = meta_keys.get(field_model, meta_keys.INDEX_START)
        if index_start_metadata is None:
            self._append_error(SERIES_DOES_NOT_EXIST.format(series_id), series_id=series_id)
            return None

        return field_model
Example #22
    def test_min(self):
        df = self.init_df()
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        self.assertAlmostEqual(float(meta_keys.get(self.field, meta_keys.MIN)), df[df.columns[0]].min())
Example #23
    def serie_periodicity(self, field):
        return (meta_keys.get(field, meta_keys.PERIODICITY) or
                meta_keys.get(field.distribution, meta_keys.PERIODICITY))
Example #24
    def test_get_start_date(self):
        start_date = meta_keys.get(self.field, meta_keys.INDEX_START)
        start_date = iso8601.parse_date(start_date)
        self.assertEqual(self.serie.start_date(), start_date.date())
Example #25
    def _get_series_periodicity(self, serie_model):
        serie_periodicity = meta_keys.get(serie_model, meta_keys.PERIODICITY)
        distribution_periodicity = meta_keys.get(serie_model.distribution,
                                                 meta_keys.PERIODICITY)
        return get_periodicity_human_format(serie_periodicity
                                            or distribution_periodicity)
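Presumably the stored periodicity is an ISO-8601 repeating duration (e.g. 'R/P1Y'), which get_periodicity_human_format turns into a label such as the 'year' frequency seen in Example #15; treat that exact mapping as an assumption here.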
Example #26
    def test_is_updated(self):
        df = self.init_df()
        with freeze_time(df.index[-1]):
            update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
        self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED),
                         str(True))
Example #27
    def test_is_not_updated(self):
        df = self.init_df()
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)

        self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED),
                         str(False))
Example #28
    def _read_from_cache(self, key, model, meta_key):
        if key not in self.cache:
            self.cache[key] = meta_keys.get(model, meta_key)

        return self.cache[key]
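A hedged usage sketch for the helper above; the cache key format is hypothetical, and the point is that repeated reads of the same key only hit the database once:

# Hypothetical cache key; subsequent calls are served from memory.
periodicity = self._read_from_cache(
    f'{distribution.identifier}-periodicity', distribution, meta_keys.PERIODICITY)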