def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=None, force=False):
    """Validate a distribution's data and reindex it if the data changed.

    Args:
        distribution_id: identifier of the distribution inside its catalog.
        node_id: pk of the ``Node`` that owns the catalog.
        task_id: pk of the ``IndexDataTask`` errors are reported against.
        read_local: forwarded to ``DistributionValidator``.
        index: Elasticsearch index name. Defaults to ``settings.TS_INDEX``,
            resolved lazily at call time so settings overrides (e.g. in
            tests) are honored instead of freezing the value at import time.
        force: reindex even if the data hash did not change.

    Any exception is caught and routed to ``_handle_exception`` so one bad
    distribution does not abort the surrounding indexing run.
    """
    if index is None:
        # Lazy loading: do not bind settings.TS_INDEX at import time
        index = settings.TS_INDEX

    node = Node.objects.get(id=node_id)
    task = IndexDataTask.objects.get(id=task_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id,
        present=True)
    try:
        DistributionValidator(read_local).run(distribution_model)

        # Assume changed unless a stored hash proves otherwise
        changed = True
        _hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).reindex(distribution_model)

        update_distribution_metadata(changed, distribution_model)
    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node, task)
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=None, force=False):
    """Scrape a distribution's data and reindex it if its hash changed.

    Args:
        distribution_id: identifier of the distribution inside its catalog.
        node_id: pk of the ``Node`` that owns the catalog.
        task_id: pk of the ``ReadDataJsonTask`` errors are reported against.
        read_local: forwarded to ``Scraper``.
        index: Elasticsearch index name. Defaults to ``settings.TS_INDEX``,
            resolved lazily at call time rather than at import time so
            settings overrides (e.g. in tests) take effect.
        force: reindex even if the data hash did not change.
    """
    if index is None:
        # Lazy loading: do not bind settings.TS_INDEX at import time
        index = settings.TS_INDEX

    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    catalog = DataJson(json.loads(node.catalog))
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id)
    try:
        Scraper(read_local).run(distribution_model, catalog)

        # Assume changed unless a stored hash proves otherwise
        changed = True
        _hash = distribution_model.enhanced_meta.filter(key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).run(distribution_model)

        # Persist the new hash and the change flag for the next run
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.LAST_HASH,
            defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.CHANGED,
            defaults={'value': str(changed)})
    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node, task)
def _index_catalog(self, catalog_path):
    """Read the catalog's data and reindex every distribution with the ES bulk call mocked out."""
    self.read_data(catalog_path)
    bulk_target = 'series_tiempo_ar_api.libs.indexing.indexer.distribution_indexer.parallel_bulk'
    with mock.patch(bulk_target):
        for dist in Distribution.objects.all():
            DistributionIndexer('some_index').reindex(dist)
def test_index_series_no_identifier(self):
    """A catalog whose series lack identifiers must produce no ES actions."""
    catalog_path = os.path.join(SAMPLES_DIR, 'series_metadata_no_identifier.json')
    self._index_catalog(catalog_path)

    first_distribution = Distribution.objects.first()
    es_actions = DistributionIndexer('some_index').generate_es_actions(first_distribution)
    self.assertFalse(es_actions)
def test_init_dataframe_columns(self):
    """init_df must create one dataframe column per field identifier."""
    self._index_catalog('full_ts_data.json')
    distribution = Distribution.objects.get(identifier="212.1")
    # Map each field title to its identifier, as init_df expects
    fields = {field.title: field.identifier
              for field in distribution.field_set.all()}

    df = DistributionIndexer(self.test_index).init_df(distribution, fields)

    for field in fields.values():
        # assertIn gives a clear "X not found in Y" message on failure,
        # unlike assertTrue(field in df.columns)
        self.assertIn(field, df.columns)
def test_reindex_distribution_no_time_index_identifier(self):
    """Reindexing still works when the time index field has no identifier."""
    self._index_catalog('distribution_time_index_no_identifier.json')
    distribution = Distribution.objects.first()
    DistributionIndexer(index=self.test_index).reindex(distribution)
    self.elastic.indices.forcemerge(index=self.test_index)

    series_id = '89.2_TS_INTEALL_0_D_18'
    hits = Search(index=self.test_index).filter('match', series_id=series_id).execute()
    self.assertTrue(list(hits))
def init_df(self):
    """Build the dataframe for the daily-periodicity sample via DistributionIndexer."""
    distribution = self.field.distribution
    # NOTE(review): the handle is left open intentionally — init_df reads
    # data_file after this assignment; closing it here would break the read.
    distribution.data_file = File(open(os.path.join(SAMPLES_DIR, 'daily_periodicity.csv')))
    distribution.field_set.create(identifier='indice_tiempo',
                                  metadata='{"specialTypeDetail": "R/P1D"}')
    columns = {
        'tasas_interes_call': self.field.identifier,
        'indice_tiempo': 'indice_tiempo',
    }
    return DistributionIndexer('test_index').init_df(distribution, columns)
def test_reindex_same_distribution(self):
    """Reindexing an unchanged distribution must leave the indexed docs intact."""
    self._index_catalog('single_data.json')
    self.assertEqual(Field.objects.count(), 2)

    series_id = '102.1_I2NG_ABRI_M_22'
    search = Search(index=self.test_index).filter('match', series_id=series_id)
    before = search.execute()

    DistributionIndexer(index=self.test_index).reindex(Distribution.objects.first())

    after = search.execute()
    # Nothing should have changed
    self.assertEqual(list(before), list(after))
def _index_catalog(self, catalog_path):
    """Create a node for the catalog, run the read task, then reindex with the ES bulk call mocked."""
    Node.objects.create(catalog_id='test_catalog',
                        catalog_url=catalog_path,
                        indexable=True)
    read_datajson(ReadDataJsonTask.objects.create(), whitelist=True)

    bulk_target = 'series_tiempo_ar_api.libs.indexing.indexer.distribution_indexer.parallel_bulk'
    with mock.patch(bulk_target):
        for dist in Distribution.objects.all():
            DistributionIndexer('some_index').reindex(dist)
def test_reindex_remove_value(self):
    """Reindexing with a shorter data file must drop the removed values from ES."""
    self._index_catalog('single_data_updated.json')
    self.assertEqual(Field.objects.count(), 2)

    series_id = '102.1_I2NG_ABRI_M_22'
    results = Search(index=self.test_index) \
        .filter('match', series_id=series_id).execute()
    self.assertEqual(len(list(results)), 3)

    distribution = Distribution.objects.get(identifier="102.1")
    # Keep the sample file open through save() and reindex() (both read it),
    # then let the context manager close it — the original leaked the handle.
    with open(os.path.join(SAMPLES_DIR, 'single_data.csv'), 'rb') as sample:
        distribution.data_file = File(sample)
        distribution.save()
        DistributionIndexer(index=self.test_index).reindex(distribution)

    self.elastic.indices.forcemerge(index=self.test_index)

    updated_results = Search(index=self.test_index) \
        .filter('match', series_id=series_id).execute()
    self.assertEqual(len(list(updated_results)), 2)
    # The two remaining docs are the first two of the original three
    self.assertEqual(list(results[:2]), list(updated_results))
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Index a whole catalog into Elasticsearch. Handy for tests.

    Creates a Node for the catalog if none is given, runs the read task,
    indexes every distribution of the catalog, and force-merges the index.
    """
    if not node:
        node = Node(catalog_id=catalog_id, catalog_url=catalog_path, indexable=True)

    catalog = DataJson(node.catalog_url)
    node.catalog = json.dumps(catalog)
    node.save()

    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, read_local=True, whitelist=True)

    distributions = Distribution.objects.filter(dataset__catalog__identifier=catalog_id)
    for distribution in distributions:
        DistributionIndexer(index=index).run(distribution)

    ElasticInstance.get().indices.forcemerge(index=index)
def _build_data_validator():
    """Build a DataValidator with thresholds taken from the singleton validator config."""
    config = DistributionValidatorConfig().get_solo()
    options = ValidationOptions.create_with_defaults(
        minimum_values=config.minimum_values,
        max_missing_proportion=config.max_missing_proportion,
        max_too_small_proportion=config.max_too_small_proportion,
        max_field_title_len=config.max_field_title_len,
        max_null_series_proportion=config.max_null_series_proportion,
    )
    return DataValidator(get_distribution_errors, options)


def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=None, force=False):
    """Validate a distribution with configured thresholds and reindex it if changed.

    Args:
        distribution_id: identifier of the distribution inside its catalog.
        node_id: pk of the ``Node`` that owns the catalog.
        task_id: pk of the ``IndexDataTask`` errors are reported against.
        read_local: forwarded to ``DistributionValidator``.
        index: Elasticsearch index name; defaults to ``settings.TS_INDEX``,
            resolved lazily so settings overrides are honored.
        force: reindex even if the data hash did not change.
    """
    if index is None:
        # Lazy loading: do not bind settings.TS_INDEX at import time
        index = settings.TS_INDEX

    node = Node.objects.get(id=node_id)
    task = IndexDataTask.objects.get(id=task_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id,
        present=True)
    try:
        validator = _build_data_validator()
        DistributionValidator(read_local, data_validator=validator).run(distribution_model)

        # Assume changed unless a stored hash proves otherwise
        changed = True
        _hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).reindex(distribution_model)

        update_distribution_metadata(changed, distribution_model)
    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node, task)
def reindex_distribution(distribution: Distribution):
    """Reindex a single distribution into the configured time series index."""
    indexer = DistributionIndexer(index=settings.TS_INDEX)
    indexer.reindex(distribution)