def test_all_metadata_created(self, mock_hits, *_):
    """After a popularity update, every hit-count meta key must be set."""
    self._update_popularity_metadata(mock_hits)
    for hits_key in (meta_keys.HITS_30_DAYS,
                     meta_keys.HITS_90_DAYS,
                     meta_keys.HITS_180_DAYS,
                     meta_keys.HITS_TOTAL):
        self.assertTrue(meta_keys.get(self.field, hits_key))
def test_update_metadata_for_many_series(self, mock_hits, *_):
    """The popularity update must reach every available serie, not just one."""
    extra_field = self.distribution.field_set.create(identifier='other_field')
    extra_field.enhanced_meta.create(key=meta_keys.AVAILABLE, value='true')
    self._update_popularity_metadata(mock_hits)
    for field in (self.field, extra_field):
        self.assertTrue(meta_keys.get(field, meta_keys.HITS_TOTAL))
def test_start_end_dates(self):
    """INDEX_START / INDEX_END must match the dataframe's first/last dates."""
    df = self.init_df()
    update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    expected_start = str(df.index[0].date())
    expected_end = str(df.index[-1].date())
    self.assertEqual(expected_start, meta_keys.get(self.field, meta_keys.INDEX_START))
    self.assertEqual(expected_end, meta_keys.get(self.field, meta_keys.INDEX_END))
def test_periodicity(self):
    """The serie's PERIODICITY metadata mirrors its distribution's."""
    df = self.init_df()
    update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    serie_periodicity = meta_keys.get(self.field, meta_keys.PERIODICITY)
    distribution_periodicity = meta_keys.get(self.field.distribution,
                                             meta_keys.PERIODICITY)
    self.assertEqual(serie_periodicity, distribution_periodicity)
def test_last_values(self):
    """LAST_VALUE / SECOND_TO_LAST_VALUE are the stringified serie tail."""
    df = self.init_df()
    values = df[df.columns[-1]]
    update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    self.assertEqual(meta_keys.get(self.field, meta_keys.LAST_VALUE),
                     str(values[-1]))
    self.assertEqual(meta_keys.get(self.field, meta_keys.SECOND_TO_LAST_VALUE),
                     str(values[-2]))
def test_metadata_csv_hits(self):
    """Columns 25-28 of the metadata CSV dump carry the hit counters."""
    dump = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_METADATA,
                                      file_type=DumpFile.TYPE_CSV)
    reader = read_file_as_csv(dump.file)
    next(reader)  # Header
    row = next(reader)
    field = Field.objects.get(identifier=row[3])
    # column index -> meta key expected in that column
    hit_columns = {
        25: meta_keys.HITS_TOTAL,
        26: meta_keys.HITS_30_DAYS,
        27: meta_keys.HITS_90_DAYS,
        28: meta_keys.HITS_180_DAYS,
    }
    for column, key in hit_columns.items():
        self.assertEqual(row[column], meta_keys.get(field, key))
def generate_row(self, serie_name, values):
    """Build one metadata-dump row (a dict keyed by dump constants) for a serie.

    ``values`` is expected to carry the ``dataset``/``distribution``/``serie``
    models plus raw metadata strings keyed by the same dump constants
    — assumed from usage here; confirm against the caller.
    """
    dataset = values['dataset']
    distribution = values['distribution']
    serie = values['serie']
    return {
        constants.CATALOG_ID: dataset.catalog.identifier,
        constants.DATASET_ID: dataset.identifier,
        constants.DISTRIBUTION_ID: distribution.identifier,
        constants.SERIE_ID: serie_name,
        constants.TIME_INDEX_FREQUENCY: meta_keys.get(distribution, meta_keys.PERIODICITY),
        constants.SERIES_TITLE: values[constants.SERIES_TITLE],
        constants.SERIES_UNITS: values[constants.SERIES_UNITS],
        constants.SERIES_DESCRIPTION: values[constants.SERIES_DESCRIPTION],
        constants.DISTRIBUTION_TITLE: values[constants.DISTRIBUTION_TITLE],
        # NOTE(review): the distribution-description column is filled from the
        # DATASET_PUBLISHER value — looks like a copy/paste slip; confirm
        # against the dump column spec before changing.
        constants.DISTRIBUTION_DESCRIPTION: values[constants.DATASET_PUBLISHER],
        constants.DISTRIBUTION_DOWNLOAD_URL: values[constants.DISTRIBUTION_DOWNLOAD_URL],
        # NOTE(review): the publisher column is filled from DATASET_TITLE (the
        # same value written to the DATASET_TITLE column below) — suspicious,
        # verify intent.
        constants.DATASET_PUBLISHER: values[constants.DATASET_TITLE],
        constants.DATASET_SOURCE: values[constants.DATASET_SOURCE],
        constants.DATASET_TITLE: values[constants.DATASET_TITLE],
        constants.DATASET_DESCRIPTION: values[constants.DATASET_DESCRIPTION],
        constants.DATASET_THEME: values[constants.DATASET_THEME],
        constants.SERIES_INDEX_START: meta_keys.get(serie, meta_keys.INDEX_START),
        constants.SERIES_INDEX_END: meta_keys.get(serie, meta_keys.INDEX_END),
        constants.SERIES_VALUES_AMT: meta_keys.get(serie, meta_keys.INDEX_SIZE),
        constants.SERIES_DAYS_SINCE_LAST_UPDATE: meta_keys.get(serie, meta_keys.DAYS_SINCE_LAST_UPDATE),
        constants.SERIES_IS_UPDATED: meta_keys.get(serie, meta_keys.IS_UPDATED),
        constants.SERIES_LAST_VALUE: meta_keys.get(serie, meta_keys.LAST_VALUE),
        constants.SERIES_SECOND_LAST_VALUE: meta_keys.get(serie, meta_keys.SECOND_TO_LAST_VALUE),
        constants.SERIES_PCT_CHANGE: meta_keys.get(serie, meta_keys.LAST_PCT_CHANGE),
    }
def test_size(self):
    """INDEX_SIZE equals the row count of the dataframe, as a string."""
    frame = self.init_df()
    update_enhanced_meta(frame[frame.columns[0]], self.catalog_id, self.distribution_id)
    stored_size = meta_keys.get(self.field, meta_keys.INDEX_SIZE)
    self.assertEqual(stored_size, str(len(frame)))
def test_last_pct_change(self):
    """LAST_PCT_CHANGE is last / second-to-last - 1, stringified."""
    df = self.init_df()
    values = df[df.columns[-1]]
    update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    expected = values[-1] / values[-2] - 1
    self.assertEqual(meta_keys.get(self.field, meta_keys.LAST_PCT_CHANGE),
                     str(expected))
def update_enhanced_meta(serie: pd.Series, catalog_id: str, distribution_id: str):
    """Create or update the enhanced metadata of the given serie.

    The serie's title (``serie.name``) MUST be the series ID in the database.

    Args:
        serie: time series with a datetime-like index; its name is the Field
            identifier.
        catalog_id: identifier of the catalog owning the distribution.
        distribution_id: identifier of the distribution owning the serie.
    """
    field = Field.objects.get(distribution__dataset__catalog__identifier=catalog_id,
                              distribution__identifier=distribution_id,
                              identifier=serie.name)

    periodicity = meta_keys.get(field.distribution, meta_keys.PERIODICITY)
    days_since_update = (datetime.now() - _get_last_day_of_period(serie, periodicity)).days

    last = serie[-1]
    second_to_last = serie[-2] if serie.index.size > 1 else None
    # Bug fix: with a single-value serie second_to_last is None and the
    # original `last / second_to_last - 1` raised TypeError. Store None
    # instead, mirroring how SECOND_TO_LAST_VALUE is already handled.
    last_pct_change = None if second_to_last is None else last / second_to_last - 1

    # Computed metadata values, persisted one key at a time below
    meta = {
        meta_keys.INDEX_START: serie.first_valid_index().date(),
        meta_keys.INDEX_END: serie.last_valid_index().date(),
        meta_keys.PERIODICITY: periodicity,
        meta_keys.INDEX_SIZE: _get_index_size(serie),
        meta_keys.DAYS_SINCE_LAST_UPDATE: days_since_update,
        meta_keys.LAST_VALUE: last,
        meta_keys.SECOND_TO_LAST_VALUE: second_to_last,
        meta_keys.LAST_PCT_CHANGE: last_pct_change,
        meta_keys.IS_UPDATED: _is_series_updated(days_since_update, periodicity),
        meta_keys.MAX: serie.max(),
        meta_keys.MIN: serie.min(),
        meta_keys.AVERAGE: serie.mean(),
    }

    for meta_key, value in meta.items():
        field.enhanced_meta.update_or_create(key=meta_key, defaults={'value': value})
def test_sources_csv_columns(self):
    """The sources CSV aggregates series count, value count and date range."""
    dataset = Field.objects.first().distribution.dataset
    meta = json.loads(dataset.metadata)
    dump = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_SOURCES)
    reader = read_file_as_csv(dump.file)
    next(reader)  # Header
    row = next(reader)
    series = Field.objects.exclude(title='indice_tiempo')
    sizes = [int(meta_keys.get(serie, meta_keys.INDEX_SIZE)) for serie in series]
    starts = [meta_keys.get(serie, meta_keys.INDEX_START) for serie in series]
    ends = [meta_keys.get(serie, meta_keys.INDEX_END) for serie in series]
    self.assertEqual(row[0], meta['source'])  # source name
    self.assertEqual(int(row[1]), 3)  # amount of series
    self.assertEqual(int(row[2]), sum(sizes))
    self.assertEqual(row[3], min(starts))
    self.assertEqual(row[4], max(ends))
def test_metadata_is_updated(self, mock_hits, *_):
    """Running the update twice must leave the latest value in HITS_TOTAL."""
    self._update_popularity_metadata(mock_hits)
    latest_value = self._update_popularity_metadata(mock_hits)
    stored_hits = int(meta_keys.get(self.field, meta_keys.HITS_TOTAL))
    self.assertEqual(stored_hits, latest_value)
def row_order(self, field: str):
    """Sort key for a dump row: catalog, dataset, distribution, serie, periodicity."""
    data = self.fields[field]
    dataset = data['dataset']
    distribution = data['distribution']
    return (
        dataset.catalog.identifier,
        dataset.identifier,
        distribution.identifier,
        field,
        meta_keys.get(distribution, meta_keys.PERIODICITY),
    )
def test_is_updated(self):
    """With 'now' patched to the serie's last date, IS_UPDATED is True."""
    df = self.init_df()
    datetime_patch = mock.patch(
        'series_tiempo_ar_api.libs.indexing.indexer.metadata.datetime',
        self.MockDatetime(df.index[-1]))
    with datetime_patch:
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED), str(True))
def test_full_metadata_periodicty_with_collapse(self):
    """A yearly collapse changes the response frequency but not the serie's
    own PERIODICITY metadata."""
    self.query.add_series(self.single_series, self.field)
    self.query.add_collapse('year')
    self.query.set_metadata_config('full')
    response = self.query.run()
    self.assertEqual(response['meta'][0]['frequency'], 'year')
    field_meta = response['meta'][1]['field']
    self.assertEqual(field_meta[meta_keys.PERIODICITY],
                     meta_keys.get(self.field, meta_keys.PERIODICITY))
def generate(self):
    """Aggregate per-source stats (series count, total values, date range)
    over every field that declares a dataset source, then dump them."""
    sources = {}
    for field_name in self.fields:
        source = self.fields[field_name]['dataset_fuente']
        if not source:
            continue
        serie_model: Field = self.fields[field_name]['serie']
        stats = sources.setdefault(source, {
            constants.SOURCES_DATASET_SOURCE: source,
            constants.SOURCE_SERIES_AMT: 0,
            constants.SOURCE_VALUES_AMT: 0,
            constants.SOURCE_FIRST_INDEX: None,
            constants.SOURCE_LAST_INDEX: None,
        })
        stats[constants.SOURCE_SERIES_AMT] += 1

        start = meta_keys.get(serie_model, meta_keys.INDEX_START)  # ☢☢☢
        if start:
            start = iso8601.parse_date(start).date()
            earliest = stats[constants.SOURCE_FIRST_INDEX]
            if earliest is None or earliest > start:
                stats[constants.SOURCE_FIRST_INDEX] = start

        end = meta_keys.get(serie_model, meta_keys.INDEX_END)
        if end:
            end = iso8601.parse_date(end).date()
            latest = stats[constants.SOURCE_LAST_INDEX]
            if latest is None or latest < end:
                stats[constants.SOURCE_LAST_INDEX] = end

        size = meta_keys.get(serie_model, meta_keys.INDEX_SIZE) or 0
        stats[constants.SOURCE_VALUES_AMT] += int(size) if size else 0

    self.write_tmp_file(sources)
def test_days_since_last_update(self):
    """DAYS_SINCE_LAST_UPDATE counts days from the last index entry.

    Only valid because the serie is daily! With any other periodicity the
    end of the period would have to be considered instead.
    """
    df = self.init_df()
    update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    expected_days = (datetime.datetime.today() - df.index[-1]).days
    self.assertEqual(meta_keys.get(self.field, meta_keys.DAYS_SINCE_LAST_UPDATE),
                     str(expected_days))
def add_pagination(self, start, limit):
    """Forward pagination to the ES query, supplying each serie's parsed
    INDEX_START date (or None when the metadata is missing)."""
    start_dates = {}
    for serie in self.series_models:
        raw_date = meta_keys.get(serie, meta_keys.INDEX_START)
        parsed = iso8601.parse_date(raw_date) if raw_date is not None else None
        start_dates[serie.identifier] = parsed
    return self.es_query.add_pagination(start, limit, start_dates=start_dates)
def write_distribution(self, distribution: Distribution, writer: csv.writer):
    """Write every serie of the distribution to ``writer``.

    Any failure is logged and reported on the task instead of propagating,
    so one broken distribution does not abort the whole dump.
    """
    # noinspection PyBroadException
    try:
        df = read_distribution_csv(distribution)
        fields = {field.title: field.identifier
                  for field in distribution.field_set.all()}
        periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
        df.apply(self.write_serie, args=(periodicity, fields, writer))
    except Exception as e:
        # Bug fix: the original message opened '[{self.tag}' without the
        # closing ']' bracket.
        msg = f'[{self.tag}] Error en la distribución {distribution.identifier}: {e.__class__}: {e}'
        GenerateDumpTask.info(self.task, msg)
        logger.error(msg)
def write_distribution(self, distribution: Distribution, writer: csv.writer):
    """Write every serie of the distribution to ``writer``, reading the data
    through DistributionCsvReader indexed by the distribution's time index.

    Any failure is logged (warning) and reported on the task instead of
    propagating, so one broken distribution does not abort the whole dump.
    """
    # noinspection PyBroadException
    try:
        fields = {field.title: field.identifier
                  for field in distribution.field_set.all()}
        periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
        index_col = DistributionRepository(
            distribution).get_time_index_series().title
        df = DistributionCsvReader(distribution, index_col).read()
        df.apply(self.write_serie, args=(periodicity, fields, writer))
    except Exception as e:
        # Bug fix: the original message opened '[{self.tag}' without the
        # closing ']' bracket.
        msg = f'[{self.tag}] Error en la distribución {distribution.identifier}: {e.__class__}: {e}'
        GenerateDumpTask.info(self.task, msg)
        logger.warning(msg)
def _get_model(self, series_id):
    """Return the Field model for ``series_id``, or None after recording an
    error.

    A serie is reported as nonexistent both when no available Field matches
    the identifier and when the Field lacks INDEX_START metadata (i.e. it
    was never indexed).
    """
    field_model = SeriesRepository.get_available_series(identifier=series_id).first()
    exists = (field_model is not None and
              meta_keys.get(field_model, meta_keys.INDEX_START) is not None)
    if not exists:
        self._append_error(SERIES_DOES_NOT_EXIST.format(series_id),
                           series_id=series_id)
        return None
    return field_model
def test_min(self):
    """MIN metadata equals the serie's minimum value."""
    df = self.init_df()
    serie = df[df.columns[0]]
    update_enhanced_meta(serie, self.catalog_id, self.distribution_id)
    stored_min = float(meta_keys.get(self.field, meta_keys.MIN))
    self.assertAlmostEqual(stored_min, serie.min())
def serie_periodicity(self, field):
    """Serie-level PERIODICITY, falling back to the distribution's when unset."""
    own_periodicity = meta_keys.get(field, meta_keys.PERIODICITY)
    if own_periodicity:
        return own_periodicity
    return meta_keys.get(field.distribution, meta_keys.PERIODICITY)
def test_get_start_date(self):
    """Serie.start_date() must match the parsed INDEX_START metadata."""
    raw_start = meta_keys.get(self.field, meta_keys.INDEX_START)
    expected = iso8601.parse_date(raw_start).date()
    self.assertEqual(self.serie.start_date(), expected)
def _get_series_periodicity(self, serie_model):
    """Human-readable periodicity of the serie, falling back to its
    distribution's value when the serie has none of its own."""
    own = meta_keys.get(serie_model, meta_keys.PERIODICITY)
    inherited = meta_keys.get(serie_model.distribution, meta_keys.PERIODICITY)
    return get_periodicity_human_format(own if own else inherited)
def test_is_updated(self):
    """Freezing time at the serie's last date marks it as updated."""
    df = self.init_df()
    last_date = df.index[-1]
    with freeze_time(last_date):
        update_enhanced_meta(df[df.columns[0]], self.catalog_id, self.distribution_id)
    self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED), str(True))
def test_is_not_updated(self):
    """Without freezing time, a historical serie is not flagged as updated."""
    frame = self.init_df()
    update_enhanced_meta(frame[frame.columns[0]], self.catalog_id, self.distribution_id)
    self.assertEqual(meta_keys.get(self.field, meta_keys.IS_UPDATED), str(False))
def _read_from_cache(self, key, model, meta_key): if key not in self.cache: self.cache[key] = meta_keys.get(model, meta_key) return self.cache[key]