Example #1
    def write(self):
        try:
            self.csv_to_xlsx()
        except IOError as e:
            catalog = self.csv_dump_file.node or 'global'
            msg = f"Error writing XLSX dump from dump {catalog} {self.csv_dump_file.file_name}: {e.__class__}: {e}"
            GenerateDumpTask.info(self.task, msg)
Example #2
    def generate(self):
        if not self.fields:
            GenerateDumpTask.info(self.task, f"No hay series cargadas para el catálogo {self.catalog}")
            return

        FullCsvGenerator(self.task, self.fields, self.catalog).generate()
        ValuesCsvGenerator(self.task, self.fields, self.catalog).generate()
        SourcesCsvGenerator(self.task, self.fields, self.catalog).generate()
        MetadataCsvGenerator(self.task, self.fields, self.catalog).generate()
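Example #2 fans the dump out to four generator classes that share a (task, fields, catalog) constructor and a generate() method, so adding a new output file only means adding one more call. A minimal self-contained sketch of that shared-interface pattern (class and variable names here are illustrative, not the project's):

class CsvGenerator:
    """Shared interface: construct with (task, fields, catalog), then generate()."""

    def __init__(self, task, fields, catalog):
        self.task, self.fields, self.catalog = task, fields, catalog

    def generate(self):
        raise NotImplementedError

class FullCsv(CsvGenerator):
    def generate(self):
        print(f"full csv for {self.catalog}: {sorted(self.fields)}")

class ValuesCsv(CsvGenerator):
    def generate(self):
        print(f"values csv for {self.catalog}")

# Same shape as Example #2: one generator per output file, run in sequence.
for gen_cls in (FullCsv, ValuesCsv):
    gen_cls("task-1", {"serie_a": "a_id"}, "demo").generate()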
Example #3
def enqueue_dump_task(task: GenerateDumpTask):
    task.save()  # Avoid DoesNotExist issues when this is called asynchronously
    task_choices = {
        GenerateDumpTask.TYPE_CSV: write_csv,
        GenerateDumpTask.TYPE_XLSX: write_xlsx,
        GenerateDumpTask.TYPE_SQL: write_sql,
        GenerateDumpTask.TYPE_DTA: write_dta,
    }

    task_choices[task.file_type](task.id)
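enqueue_dump_task above selects the writer with a dict lookup on task.file_type instead of an if/elif chain. A minimal standalone sketch of that dispatch pattern (the handler functions below are hypothetical stand-ins for the project's write_csv/write_xlsx jobs):

from typing import Callable, Dict

def write_csv(task_id: int) -> None:
    print(f"task {task_id}: writing CSV dump")

def write_xlsx(task_id: int) -> None:
    print(f"task {task_id}: writing XLSX dump")

# One entry per supported file type; adding a format is one new line.
HANDLERS: Dict[str, Callable[[int], None]] = {
    "csv": write_csv,
    "xlsx": write_xlsx,
}

def enqueue(file_type: str, task_id: int) -> None:
    # An unknown file_type raises KeyError, the same failure mode
    # as the task_choices lookup in the snippet above.
    HANDLERS[file_type](task_id)

enqueue("csv", 42)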
Example #4
    def write_distribution(self, distribution: Distribution,
                           writer: csv.writer):
        # noinspection PyBroadException
        try:
            df = read_distribution_csv(distribution)
            fields = distribution.field_set.all()
            fields = {field.title: field.identifier for field in fields}

            periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
            df.apply(self.write_serie, args=(periodicity, fields, writer))
        except Exception as e:
            msg = f'[{self.tag}] Error in distribution {distribution.identifier}: {e.__class__}: {e}'
            GenerateDumpTask.info(self.task, msg)
            logger.error(msg)
Example #5
    def test_leading_nulls_distribution(self):
        path = os.path.join(samples_dir, 'leading_nulls_distribution.json')
        index_catalog('leading_null', path, self.index)
        self.task = GenerateDumpTask()
        self.task.save()
        gen = DumpGenerator(self.task, 'leading_null')
        gen.generate()

        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV,
                                          node__catalog_id='leading_null').file
        reader = read_file_as_csv(file)

        next(reader)  # skip header
        self.assertEqual(len(list(reader)), 2)
Example #6
    def write_distribution(self, distribution: Distribution,
                           writer: csv.writer):
        # noinspection PyBroadException
        try:
            fields = distribution.field_set.all()
            fields = {field.title: field.identifier for field in fields}
            periodicity = meta_keys.get(distribution, meta_keys.PERIODICITY)
            index_col = DistributionRepository(
                distribution).get_time_index_series().title
            df = DistributionCsvReader(distribution, index_col).read()
            df.apply(self.write_serie, args=(periodicity, fields, writer))
        except Exception as e:
            msg = f'[{self.tag}] Error in distribution {distribution.identifier}: {e.__class__}: {e}'
            GenerateDumpTask.info(self.task, msg)
            logger.warning(msg)
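Both write_distribution variants (Examples #4 and #6) walk the distribution's series column by column with DataFrame.apply, forwarding extra positional arguments through args. A freestanding illustration of that pandas call (the data and the write_serie stand-in are made up for the example):

import pandas as pd

def write_serie(serie: pd.Series, periodicity: str, fields: dict) -> None:
    # apply() passes each column in as a Series; the args tuple follows it.
    identifier = fields.get(serie.name, serie.name)
    print(identifier, periodicity, serie.dropna().tolist())

df = pd.DataFrame({"serie_a": [1.0, 2.0], "serie_b": [None, 3.0]})
fields = {"serie_a": "a_id", "serie_b": "b_id"}

# Same shape as the snippets above: one write_serie call per column.
df.apply(write_serie, args=("R/P1D", fields))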
Example #7
    @classmethod
    def setUpClass(cls):
        super(CSVTest, cls).setUpClass()
        cls.catalog_id = 'csv_dump_test_catalog'
        path = os.path.join(samples_dir, 'distribution_daily_periodicity.json')
        index_catalog(cls.catalog_id, path, cls.index)
        cls.task = GenerateDumpTask()
        cls.task.save()
        gen = DumpGenerator(cls.task)
        gen.generate()
Example #8
    def test_invalid_catalog(self):
        task = GenerateDumpTask()
        task.save()

        gen = DumpGenerator(task, "no_catalog")
        gen.generate()
        task.refresh_from_db()

        self.assertFalse(DumpFile.objects.filter(file_type=DumpFile.TYPE_CSV, task=task))
Example #9
    def test_dump_distribution_no_periodicity(self):
        task = GenerateDumpTask()
        task.save()
        Distribution.objects.first().enhanced_meta.get(key=meta_keys.PERIODICITY).delete()
        gen = DumpGenerator(task)
        gen.generate()
        task.refresh_from_db()

        self.assertTrue(DumpFile.objects.filter(file_type=DumpFile.TYPE_CSV, task=task))
Example #10
    @classmethod
    def setUpClass(cls):
        super(ViewTests, cls).setUpClass()
        es_client = ElasticInstance.get()
        if es_client.indices.exists(cls.index):
            es_client.indices.delete(cls.index)
        es_client.indices.create(cls.index, body=INDEX_CREATION_BODY)

        cls.catalog_id = 'csv_dump_test_catalog'
        path = os.path.join(samples_dir, 'distribution_daily_periodicity.json')
        index_catalog(cls.catalog_id, path, cls.index)
        cls.task = GenerateDumpTask()
        cls.task.save()
        gen = DumpGenerator(cls.task)
        gen.generate()

        DumpGenerator(cls.task, cls.catalog_id).generate()
Example #11
class CSVTest(TestCase):
    index = 'csv_dump_test_index'
    # noinspection PyUnresolvedReferences
    directory = os.path.join(settings.MEDIA_ROOT, 'test_dump')

    @classmethod
    def setUpClass(cls):
        super(CSVTest, cls).setUpClass()
        cls.catalog_id = 'csv_dump_test_catalog'
        path = os.path.join(samples_dir, 'distribution_daily_periodicity.json')
        index_catalog(cls.catalog_id, path, cls.index)
        cls.task = GenerateDumpTask()
        cls.task.save()
        gen = DumpGenerator(cls.task)
        gen.generate()

    def test_invalid_catalog(self):
        task = GenerateDumpTask()
        task.save()

        gen = DumpGenerator(task, "no_catalog")
        gen.generate()
        task.refresh_from_db()

        self.assertFalse(
            DumpFile.objects.filter(file_type=DumpFile.TYPE_CSV, task=task))

    def test_values_dump(self):
        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_VALUES).file
        reader = read_file_as_csv(file)
        next(reader)  # skip header
        row = next(reader)
        self.assertEqual(row[0], self.catalog_id)
        self.assertEqual(row[6], 'R/P1D')

    def test_values_length(self):
        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_VALUES).file
        reader = read_file_as_csv(file)
        header = next(reader)
        self.assertEqual(len(header), 7)

    def test_entity_identifiers(self):
        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_VALUES).file
        reader = read_file_as_csv(file)
        next(reader)

        row = next(reader)

        field_id = row[3]
        field = Field.objects.get(identifier=field_id)

        self.assertEqual(self.catalog_id, row[0])
        self.assertEqual(field.distribution.identifier, row[2])
        self.assertEqual(field.distribution.dataset.identifier, row[1])
        self.assertEqual(
            row[6],
            field.distribution.enhanced_meta.get(
                key=meta_keys.PERIODICITY).value)

    def test_full_csv_zipped(self):
        dump_file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_FULL, file_type=DumpFile.TYPE_CSV)
        zip_file = ZipDumpFile.objects.get(dump_file=dump_file).file
        csv_zipped = zipfile.ZipFile(zip_file)

        full_csv = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                              file_type=DumpFile.TYPE_CSV)
        # Needed to open zipped files in text mode (not bytes)
        src_file = io.TextIOWrapper(csv_zipped.open(full_csv.get_file_name()),
                                    encoding='utf8',
                                    newline='')
        reader = csv.reader(src_file)

        header = next(reader)

        self.assertEqual(len(header), 15)

    def test_values_csv_zipped(self):
        dump_file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_VALUES, file_type=DumpFile.TYPE_CSV)
        zip_file = ZipDumpFile.objects.get(dump_file=dump_file).file
        csv_zipped = zipfile.ZipFile(zip_file)

        # Needed to open zipped files in text mode (not bytes)
        src_file = io.TextIOWrapper(csv_zipped.open(dump_file.get_file_name()),
                                    encoding='utf8',
                                    newline='')
        reader = csv.reader(src_file)

        header = next(reader)

        self.assertEqual(len(header), len(VALUES_HEADER))

    def test_full_csv_identifier_fields(self):
        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)

        field = Field.objects.get(identifier=row[3])
        self.assertEqual(row[0], self.catalog_id)
        self.assertEqual(row[1], field.distribution.dataset.identifier)
        self.assertEqual(row[2], field.distribution.identifier)
        self.assertEqual(
            row[5],
            field.distribution.enhanced_meta.get(
                key=meta_keys.PERIODICITY).value)

    def test_full_csv_metadata_fields(self):
        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)

        field = Field.objects.get(identifier=row[3])

        field_meta = json.loads(field.metadata)
        distribution_meta = json.loads(field.distribution.metadata)
        self.assertEqual(row[7], field.title)
        self.assertEqual(row[8], field_meta['units'])
        self.assertEqual(row[9], field_meta['description'])
        self.assertEqual(row[10], distribution_meta['description'])

    def test_full_csv_dataset_metadata_fields(self):
        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)

        field = Field.objects.get(identifier=row[3])

        dataset_meta = json.loads(field.distribution.dataset.metadata)
        self.assertEqual(row[12], dataset_meta['publisher']['name'])
        self.assertEqual(row[13], dataset_meta['source'])
        self.assertEqual(row[14], field.distribution.dataset.title)

    def test_full_csv_dataset_theme_field(self):
        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV).file
        reader = read_file_as_csv(file)
        next(reader)  # Header
        row = next(reader)

        field = Field.objects.get(identifier=row[3])

        dataset_meta = json.loads(field.distribution.dataset.metadata)

        themes = json.loads(
            Node.objects.get(
                catalog_id=self.catalog_id).catalog)['themeTaxonomy']

        theme_label = ''
        for theme in themes:
            if theme['id'] == dataset_meta['theme'][0]:
                theme_label = theme['label']
                break

        self.assertEqual(theme_label, row[11])

    def test_metadata_csv(self):
        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_METADATA).file
        reader = read_file_as_csv(file)
        next(reader)

        # self.assertListEqual(header, constants.METADATA_ROWS)

        self.assertEqual(len(list(reader)), 3)  # One row per series

    def test_sources_csv(self):
        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_SOURCES).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        self.assertEqual(len(list(reader)), 1)  # One row per source

    def test_sources_csv_columns(self):
        dataset = Field.objects.first().distribution.dataset
        meta = json.loads(dataset.metadata)

        file = self.task.dumpfile_set.get(
            file_name=DumpFile.FILENAME_SOURCES).file
        reader = read_file_as_csv(file)
        next(reader)  # Header

        row = next(reader)
        series = Field.objects.exclude(title='indice_tiempo')
        self.assertEqual(row[0], meta['source'])  # source name
        self.assertEqual(int(row[1]), 3)  # number of series
        self.assertEqual(
            int(row[2]),
            sum([int(meta_keys.get(x, meta_keys.INDEX_SIZE)) for x in series]))
        self.assertEqual(
            row[3],
            min(meta_keys.get(x, meta_keys.INDEX_START) for x in series))
        self.assertEqual(
            row[4], max(meta_keys.get(x, meta_keys.INDEX_END) for x in series))

    def test_leading_nulls_distribution(self):
        path = os.path.join(samples_dir, 'leading_nulls_distribution.json')
        index_catalog('leading_null', path, self.index)
        self.task = GenerateDumpTask()
        self.task.save()
        gen = DumpGenerator(self.task, 'leading_null')
        gen.generate()

        file = self.task.dumpfile_set.get(file_name=DumpFile.FILENAME_FULL,
                                          file_type=DumpFile.TYPE_CSV,
                                          node__catalog_id='leading_null').file
        reader = read_file_as_csv(file)

        next(reader)  # skip header
        self.assertEqual(len(list(reader)),
                         1)  # A single row, for the CSV's single value

    @classmethod
    def tearDownClass(cls):
        super(CSVTest, cls).tearDownClass()
        ElasticInstance.get().indices.delete(cls.index)
        Node.objects.all().delete()
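The zipped-dump tests in Example #11 wrap ZipFile.open in io.TextIOWrapper because zip entries are opened in binary mode, while csv.reader wants text opened with newline=''. A self-contained sketch of that idiom, building and re-reading an in-memory zip (nothing here touches the project's models):

import csv
import io
import zipfile

# Build a small zipped CSV entirely in memory.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("dump.csv", "catalog_id,value\r\ndemo,1\r\n")

with zipfile.ZipFile(buf) as zf:
    # ZipFile.open() yields bytes; TextIOWrapper re-exposes them as text.
    # newline='' lets the csv module handle line endings itself.
    src_file = io.TextIOWrapper(zf.open("dump.csv"), encoding="utf8", newline="")
    reader = csv.reader(src_file)
    header = next(reader)
    assert header == ["catalog_id", "value"]
    assert next(reader) == ["demo", "1"]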