Example #1
    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()

        self.loader = DatabaseLoader(self.task,
                                     read_local=True,
                                     default_whitelist=False)
Example #2
    def handle(self, *args, **options):
        if ReadDataJsonTask.objects.filter(status=ReadDataJsonTask.RUNNING):
            logger.info(u'An indexing task is already running')
            return

        task = ReadDataJsonTask()
        task.save()

        read_datajson(task, whitelist=options['whitelist'])
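
The command above reads options['whitelist'], which implies it also registers that flag; a minimal sketch of the corresponding add_arguments hook (the flag name and default are assumptions, not shown in the excerpt):

    def add_arguments(self, parser):
        # Hypothetical: registers the flag consumed as options['whitelist'].
        parser.add_argument('--whitelist', action='store_true', default=False)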
Example #3
def log_exception(task, msg, model, field_kw):
    """Logs msg on the task and marks the matching model row as errored."""
    ReadDataJsonTask.info(task, msg)
    try:
        error_model = model.objects.get(**field_kw)
        error_model.error = True
        error_model.error_msg = msg
        error_model.save()
        return error_model
    except model.DoesNotExist:
        return None
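
log_exception flags the failing row when one exists and returns it (or None); a hedged usage sketch mirroring the call in Example #10 below (the message and identifier are placeholders):

    # Hypothetical: mark a Dataset row as errored after a failed update.
    log_exception(task, u"Exception in dataset some_id: ...", Dataset,
                  {'identifier': 'some_id'})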
Example #4
    def test_read(self):
        identifier = 'test_id'
        Node(catalog_id=identifier,
             catalog_url=os.path.join(dir_path, 'sample_data.json'),
             indexable=True).save()
        task = ReadDataJsonTask()
        task.save()
        read_datajson(task, whitelist=True)
        self.assertTrue(
            Field.objects.filter(
                distribution__dataset__catalog__identifier=identifier))
Example #5
def parse_catalog(catalog_id, catalog_path, node=None):
    if not node:
        node = Node.objects.create(catalog_id=catalog_id,
                                   catalog_url=catalog_path,
                                   indexable=True)
    catalog = DataJson(node.catalog_url)
    node.catalog = json.dumps(catalog)
    node.save()
    task = ReadDataJsonTask()
    task.save()
    read_datajson(task, whitelist=True)
    return node
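
parse_catalog returns the saved Node, so a test can index a local sample in one call; a minimal usage sketch (the catalog id and path are placeholders):

    # Hypothetical usage: index a local sample catalog and keep the Node.
    node = parse_catalog('example_catalog',
                         os.path.join(dir_path, 'sample_data.json'))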
Example #6
    def _index_catalog(self, catalog, node, task):
        verify_ssl = self.indexing_config.verify_ssl or node.verify_ssl
        try:
            loader = DatabaseLoader(task,
                                    read_local=self.read_local,
                                    default_whitelist=self.whitelist,
                                    verify_ssl=verify_ssl)
            ReadDataJsonTask.info(
                task,
                u"Running loader for catalog {}".format(node.catalog_id))
            loader.run(catalog, node.catalog_id)
        except Exception as e:
            msg = u"Exception in catalog {}: {}".format(node.catalog_id, e)
            log_exception(task, msg, Catalog, {'identifier': node.catalog_id})
Example #7
def index_catalog(catalog_id, catalog_path, index, node=None):
    """Indexa un catálogo. Útil para tests"""
    if not node:
        node = Node(catalog_id=catalog_id,
                    catalog_url=catalog_path,
                    indexable=True)

    catalog = DataJson(node.catalog_url)
    node.catalog = json.dumps(catalog)
    node.save()
    task = ReadDataJsonTask()
    task.save()

    read_datajson(task, read_local=True, whitelist=True)
    for distribution in Distribution.objects.filter(
            dataset__catalog__identifier=catalog_id):
        DistributionIndexer(index=index).run(distribution)
    ElasticInstance.get().indices.forcemerge(index=index)
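
index_catalog above both loads the metadata and pushes every distribution to Elasticsearch; a minimal usage sketch (the catalog id, path, and index name are placeholders):

    # Hypothetical usage: index a sample catalog into a test ES index.
    index_catalog('example_catalog',
                  os.path.join(dir_path, 'sample_data.json'),
                  index='test-index')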
Example #8
    def test_read_datajson_while_indexing(self):
        identifier = 'test_id'
        Node(catalog_id=identifier,
             catalog_url=os.path.join(dir_path, 'sample_data.json'),
             indexable=True).save()

        ReadDataJsonTask(status=ReadDataJsonTask.RUNNING).save()

        # Expected: no second task is created
        call_command('read_datajson')
        self.assertEqual(ReadDataJsonTask.objects.all().count(), 1)
Example #9
    def index(self, node, task):
        self._reset_catalog_if_exists(node)

        try:
            catalog = DataJson(node.catalog_url,
                               catalog_format=node.catalog_format,
                               verify_ssl=self.indexing_config.verify_ssl)
            catalog.generate_distribution_ids()
            node.catalog = json.dumps(catalog)
            node.save()
        except NonParseableCatalog as e:
            self._set_catalog_as_errored(node)
            ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
            return

        self.reset_fields(node)

        self._index_catalog(catalog, node, task)

        file_generator = CatalogFileGenerator(node)
        file_generator.generate_files()
Example #10
    def _catalog_model(self, catalog, catalog_id):
        """Creates or updates the catalog model with the requested title,
        based on a catalog's metadata dictionary.
        """
        trimmed_catalog = self._trim_dict_fields(
            catalog, settings.CATALOG_BLACKLIST, constants.DATASET)

        catalog_model, created = Catalog.objects.update_or_create(
            identifier=catalog_id,
            defaults={'title': trimmed_catalog.get('title', 'No Title')}
        )

        only_time_series = getattr(settings, 'DATAJSON_AR_TIME_SERIES_ONLY', False)
        datasets = catalog.get_datasets(only_time_series=only_time_series)
        updated_datasets = False
        issued_dates = []
        for dataset in datasets:
            try:
                dataset_model = self._dataset_model(dataset, catalog_model)
                updated_datasets = updated_datasets or dataset_model.updated
                issued_dates.append(dataset_model.issued.strftime("%Y-%m-%dT%H:%M:%S"))
            except Exception as e:
                msg = u"Excepción en dataset {}: {}" \
                    .format(dataset.get('identifier'), e)
                log_exception(self.task, msg, Dataset,
                              {'identifier': dataset.get('identifier'),
                               'catalog': catalog_model}
                              )
                continue

        if not datasets and only_time_series:
            msg = u"No fueron encontrados series de tiempo en el catálogo {}".format(catalog_id)
            ReadDataJsonTask.info(self.task, msg)

        if not trimmed_catalog.get('issued') and issued_dates:
            trimmed_catalog['issued'] = min(issued_dates)

        update_model(trimmed_catalog, catalog_model, updated_children=updated_datasets)
        return catalog_model
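
The 'issued' inference above leans on the fixed "%Y-%m-%dT%H:%M:%S" format: with zero-padded fields, lexicographic order matches chronological order, so min() over the strings picks the oldest dataset. A quick illustration:

    # Lexicographic min equals chronological min for zero-padded ISO strings.
    issued_dates = ['2019-03-01T00:00:00', '2016-04-14T12:30:00']
    assert min(issued_dates) == '2016-04-14T12:30:00'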
Example #11
class InferredMetadataTests(TestCase):

    catalog_id = 'test_catalog'

    def setUp(self):
        self.task = ReadDataJsonTask()
        self.task.save()

        self.loader = DatabaseLoader(self.task,
                                     read_local=True,
                                     default_whitelist=False)

    def test_catalog_issued_infers_as_oldest_dataset(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        dataset_issued = Dataset.objects.aggregate(
            Min('issued'))['issued__min']
        self.assertEqual(issued.date(), dataset_issued.date())

    def test_dataset_issued_infers_as_oldest_distribution(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        dataset = Dataset.objects.first()
        distribution_issued = Distribution.objects.filter(
            dataset=dataset).aggregate(Min('issued'))['issued__min']
        self.assertEqual(dataset.issued.date(), distribution_issued.date())

    @freeze_time("2019-01-01")
    def test_issued_dataset_metadata_inferred(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        del catalog["dataset"][0]['issued']
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_issued_distributed_metadata_inferred(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    def test_catalog_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        self.assertEqual(issued.date(),
                         iso8601.parse_date(catalog['issued']).date())

    def test_dataset_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(
            issued.date(),
            iso8601.parse_date(catalog.get_datasets()[0]['issued']).date())

    def test_distribution_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        catalog.get_datasets()[0]['distribution'][0]['issued'] = '2016-04-14'
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime(2016, 4, 14).date())

    @freeze_time("2019-01-01")
    def test_catalog_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_dataset_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())

    @freeze_time("2019-01-01")
    def test_distribution_without_issued_dates(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'no_issued_dates.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Distribution.objects.first().issued
        self.assertEqual(issued.date(), datetime.now().date())
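
The @freeze_time tests above are deterministic because freezegun pins the clock; a minimal illustration of the mechanism, independent of the loader:

    from datetime import datetime

    from freezegun import freeze_time

    @freeze_time("2019-01-01")
    def check_frozen_clock():
        # Inside the frozen context, datetime.now() returns the pinned date.
        assert datetime.now().date() == datetime(2019, 1, 1).date()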