def setUp(self):
    self.task = ReadDataJsonTask()
    self.task.save()
    self.loader = DatabaseLoader(self.task,
                                 read_local=True,
                                 default_whitelist=False)
def handle(self, *args, **options):
    # Guard against concurrent runs: skip if an indexing task is already running
    if ReadDataJsonTask.objects.filter(
            status=ReadDataJsonTask.RUNNING).exists():
        logger.info(u'An indexing task is already running')
        return

    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, whitelist=options['whitelist'])
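# Minimal usage sketch for the command above; a hedged example, not part of
# the original module. Assumes Django is configured, the command is
# registered under the name 'read_datajson', and it defines a --whitelist
# option, as handle() implies. A second invocation while a task is RUNNING
# is a no-op, as the test further below verifies.
from django.core.management import call_command

call_command('read_datajson', whitelist=True)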
def log_exception(task, msg, model, field_kw):
    ReadDataJsonTask.info(task, msg)
    try:
        error_model = model.objects.get(**field_kw)
        error_model.error = True
        error_model.error_msg = msg
        error_model.save()
        return error_model
    except model.DoesNotExist:
        return None
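# Hedged usage sketch for log_exception() above: record an error message on
# the task and flag the matching Dataset row. The identifier
# 'hypothetical_dataset' is illustrative only; the call pattern mirrors the
# one in _catalog_model() further below.
task = ReadDataJsonTask()
task.save()
errored = log_exception(
    task,
    u"Exception in dataset hypothetical_dataset: download failed",
    Dataset,
    {'identifier': 'hypothetical_dataset'},
)
# log_exception returns the flagged model instance, or None if no row
# matched the lookup keywords.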
def test_read(self):
    identifier = 'test_id'
    Node(catalog_id=identifier,
         catalog_url=os.path.join(dir_path, 'sample_data.json'),
         indexable=True).save()
    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, whitelist=True)
    self.assertTrue(
        Field.objects.filter(
            distribution__dataset__catalog__identifier=identifier).exists())
def parse_catalog(catalog_id, catalog_path, node=None):
    if not node:
        node = Node.objects.create(catalog_id=catalog_id,
                                   catalog_url=catalog_path,
                                   indexable=True)
    catalog = DataJson(node.catalog_url)
    node.catalog = json.dumps(catalog)
    node.save()
    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, whitelist=True)
    return node
def _index_catalog(self, catalog, node, task):
    verify_ssl = self.indexing_config.verify_ssl or node.verify_ssl
    try:
        loader = DatabaseLoader(task,
                                read_local=self.read_local,
                                default_whitelist=self.whitelist,
                                verify_ssl=verify_ssl)
        ReadDataJsonTask.info(
            task, u"Running loader for catalog {}".format(node.catalog_id))
        loader.run(catalog, node.catalog_id)
    except Exception as e:
        msg = u"Exception in catalog {}: {}".format(node.catalog_id, e)
        log_exception(task, msg, Catalog, {'identifier': node.catalog_id})
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Indexes a catalog. Useful for tests."""
    if not node:
        node = Node(catalog_id=catalog_id,
                    catalog_url=catalog_path,
                    indexable=True)

    catalog = DataJson(node.catalog_url)
    node.catalog = json.dumps(catalog)
    node.save()
    task = ReadDataJsonTask()
    task.save()

    read_datajson(task, read_local=True, whitelist=True)
    for distribution in Distribution.objects.filter(
            dataset__catalog__identifier=catalog_id):
        DistributionIndexer(index=index).run(distribution)

    ElasticInstance.get().indices.forcemerge(index=index)
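# Hedged usage sketch for the test helper index_catalog() above: load a
# local sample catalog and index its distributions into a throwaway
# Elasticsearch index. The file name 'sample_data.json' is taken from the
# tests in this section; the index name 'test_index' is an assumption.
index_catalog(
    catalog_id='test_catalog',
    catalog_path=os.path.join(dir_path, 'sample_data.json'),
    index='test_index',
)
# The trailing forcemerge compacts the index into fewer segments before
# test queries run.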
def test_read_datajson_while_indexing(self):
    identifier = 'test_id'
    Node(catalog_id=identifier,
         catalog_url=os.path.join(dir_path, 'sample_data.json'),
         indexable=True).save()
    ReadDataJsonTask(status=ReadDataJsonTask.RUNNING).save()

    # Expected: no second task is created while one is already running
    call_command('read_datajson')
    self.assertEqual(ReadDataJsonTask.objects.all().count(), 1)
def index(self, node, task):
    self._reset_catalog_if_exists(node)
    try:
        catalog = DataJson(node.catalog_url,
                           catalog_format=node.catalog_format,
                           verify_ssl=self.indexing_config.verify_ssl)
        catalog.generate_distribution_ids()
        node.catalog = json.dumps(catalog)
        node.save()
    except NonParseableCatalog as e:
        self._set_catalog_as_errored(node)
        ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    self.reset_fields(node)
    self._index_catalog(catalog, node, task)
    file_generator = CatalogFileGenerator(node)
    file_generator.generate_files()
def _catalog_model(self, catalog, catalog_id):
    """Creates or updates the catalog model from a catalog's
    metadata dictionary.
    """
    trimmed_catalog = self._trim_dict_fields(
        catalog, settings.CATALOG_BLACKLIST, constants.DATASET)
    catalog_model, created = Catalog.objects.update_or_create(
        identifier=catalog_id,
        defaults={'title': trimmed_catalog.get('title', 'No Title')}
    )

    only_time_series = getattr(settings, 'DATAJSON_AR_TIME_SERIES_ONLY',
                               False)
    datasets = catalog.get_datasets(only_time_series=only_time_series)
    updated_datasets = False
    issued_dates = []
    for dataset in datasets:
        try:
            dataset_model = self._dataset_model(dataset, catalog_model)
            updated_datasets = updated_datasets or dataset_model.updated
            issued_dates.append(
                dataset_model.issued.strftime("%Y-%m-%dT%H:%M:%S"))
        except Exception as e:
            msg = u"Exception in dataset {}: {}" \
                .format(dataset.get('identifier'), e)
            log_exception(self.task, msg, Dataset,
                          {'identifier': dataset.get('identifier'),
                           'catalog': catalog_model})
            continue

    if not datasets and only_time_series:
        msg = u"No time series were found in catalog {}".format(catalog_id)
        ReadDataJsonTask.info(self.task, msg)

    # If the catalog has no 'issued' date, infer it as the oldest
    # dataset's issued date
    if not trimmed_catalog.get('issued') and issued_dates:
        trimmed_catalog['issued'] = min(issued_dates)

    update_model(trimmed_catalog, catalog_model,
                 updated_children=updated_datasets)
    return catalog_model
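# Why min() over strftime() output works in _catalog_model() above:
# ISO-8601 timestamps ("%Y-%m-%dT%H:%M:%S") sort lexicographically in
# chronological order, so the string minimum is the oldest dataset's issued
# date. A self-contained sketch with made-up dates:
from datetime import datetime

issued_dates = [
    datetime(2018, 5, 1).strftime("%Y-%m-%dT%H:%M:%S"),
    datetime(2016, 4, 14).strftime("%Y-%m-%dT%H:%M:%S"),
]
assert min(issued_dates) == "2016-04-14T00:00:00"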
class InferredMetadataTests(TestCase):
    catalog_id = 'test_catalog'

    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()
        self.loader = DatabaseLoader(self.task,
                                     read_local=True,
                                     default_whitelist=False)

    def test_catalog_issued_infers_as_oldest_dataset(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        dataset_issued = Dataset.objects.aggregate(
            Min('issued'))['issued__min']
        self.assertEqual(issued.date(), dataset_issued.date())

    def test_dataset_issued_infers_as_oldest_distribution(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        dataset = Dataset.objects.first()
        distribution_issued = Distribution.objects.filter(dataset=dataset) \
            .aggregate(Min('issued'))['issued__min']
        self.assertEqual(dataset.issued.date(), distribution_issued.date())

    @freeze_time("2019-01-01")
    def test_issued_dataset_metadata_inferred(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        del catalog["dataset"][0]['issued']
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_issued_distribution_metadata_inferred(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    def test_catalog_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        self.assertEqual(issued.date(),
                         iso8601.parse_date(catalog['issued']).date())

    def test_dataset_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(
            issued.date(),
            iso8601.parse_date(catalog.get_datasets()[0]['issued']).date())

    def test_distribution_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        catalog.get_datasets()[0]['distribution'][0]['issued'] = '2016-04-14'
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime(2016, 4, 14).date())

    @freeze_time("2019-01-01")
    def test_catalog_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_dataset_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_distribution_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())