Code example #1
    def test_dataset_issued_no_inference(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Dataset.objects.first().issued
        self.assertEqual(
            issued.date(),
            iso8601.parse_date(catalog.get_datasets()[0]['issued']).date())
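
Across these examples, DataJson accepts a local file path, a URL, or an
already-parsed dict. A minimal sketch of the three constructor forms seen
here (the path, URL, and node_catalog_string are hypothetical):

# Hypothetical inputs: DataJson wraps a path, a URL, or a parsed dict
catalog_from_path = DataJson('samples/full_ts_data.json')
catalog_from_url = DataJson('https://example.org/data.json',
                            catalog_format='json')
catalog_from_dict = DataJson(json.loads(node_catalog_string))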
Code example #2
    def test_validate_all_null_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR,
                                        'ts_all_null_series.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        self.scrapper.run(distribution, catalog)
Code example #3
    def test_validate_all_zero_series(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR,
                                        'ts_all_zero_series.json'))
        valid = self.scrapper.run(
            catalog.get_distributions(only_time_series=True)[0], catalog)

        self.assertTrue(valid)
Code example #4
def get_distribution_metadata(resource_id):
    # 'datajson_actions' is imported inside the function to avoid a circular
    # dependency with 'config_controller' (import line reconstructed from
    # this comment; the exact module path may differ in the original project)
    from datajson_actions import get_data_json_contents
    json_dict = get_data_json_contents()
    html_parser = HTMLParser()
    json_dict = html_parser.unescape(json_dict)
    datajson = DataJson(json_dict)
    dist = datajson.get_distribution(resource_id)
    return dist
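
A hedged usage sketch (the resource id value is hypothetical):

# Hypothetical call: look up one distribution's metadata by resource id
metadata = get_distribution_metadata('125.1')
print(metadata.get('title'), metadata.get('downloadURL'))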
Code example #5
    def test_scrapper(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'full_ts_data.json'))
        distribution = catalog.get_distributions(only_time_series=True)[0]

        distribution = MockDistribution(distribution)
        result = self.scrapper.run(distribution, catalog)

        self.assertTrue(result)
Code example #6
    def test_missing_dataframe_column(self):
        """Si falta una columna indicada por los metadatos, no se
        scrapea la distribución
        """

        catalog = DataJson(
            os.path.join(SAMPLES_DIR, 'distribution_missing_column.json'))
        self.scrapper.run(
            catalog.get_distributions(only_time_series=True)[0], catalog)
Code example #7
    def test_missing_metadata_field(self):
        """No importa que un field no esté en metadatos, se scrapea
        igual, para obtener todas las series posibles
        """

        catalog = DataJson(os.path.join(SAMPLES_DIR, 'missing_field.json'))
        result = self.scrapper.run(
            catalog.get_distributions(only_time_series=True)[0], catalog)
        self.assertTrue(result)
Code example #8
    def test_validate(self):
        catalog = os.path.join(SAMPLES_DIR, "data.json")
        catalog = DataJson(catalog)
        distrib_meta = catalog.get_distribution(identifier="125.1")
        df = pd.read_csv(
            distrib_meta["downloadURL"],
            parse_dates=["indice_tiempo"]).set_index("indice_tiempo")
        dataset_meta = catalog.get_dataset(
            identifier=distrib_meta["dataset_identifier"])

        validate_distribution(df, catalog, dataset_meta, distrib_meta)
Code example #9
    def test_central_node_default(self, mock_indic, mock_load):
        # mock_indic and mock_load are injected by @patch decorators omitted
        # from this snippet
        mock_load.return_value = self.catalogs
        mock_indic.return_value = (self.indicators, self.network_indicators)
        task = IndicatorsGenerationTask.objects.create()
        generate_indicators(task)
        mock_indic.assert_any_call(DataJson(),
                                   self.catalogs,
                                   identifier_search=True)
        mock_indic.assert_any_call(DataJson(),
                                   self.catalogs,
                                   CENTRAL,
                                   identifier_search=True)
Code example #10
    def test_repeated_field_id(self):
        catalog = os.path.join(SAMPLES_DIR, "repeated_field_id.json")
        catalog = DataJson(catalog)
        identifier = "125.1"
        distribution = catalog.get_distribution(identifier=identifier)
        dataset = catalog.get_dataset(
            identifier=distribution["dataset_identifier"])

        df = pd.read_csv(
            distribution["downloadURL"],
            parse_dates=["indice_tiempo"]).set_index("indice_tiempo")

        validate_distribution(df, catalog, dataset, distribution)
Code example #11
def update_catalog():
    from pydatajson import writers, DataJson
    # Check that the data.json cache exists before passing its path along
    if not os.path.isfile(CACHE_FILENAME):
        # It does not exist yet, so generate it
        update_datajson_cache()
    catalog = DataJson(CACHE_FILENAME)
    catalog['themeTaxonomy'] = catalog.get('themeTaxonomy', [])
    new_catalog_filename = '%s/catalog.xlsx' % tempfile.mkdtemp(
        dir=CACHE_DIRECTORY)
    writers.write_xlsx_catalog(catalog, new_catalog_filename)
    os.rename(new_catalog_filename, XLSX_FILENAME)
    os.rmdir(new_catalog_filename.replace('/catalog.xlsx', ''))
Code example #12
    def get_or_init_catalog_themes(self, catalog_id):
        """Devuelve un dict ID: label de los themes del catálogo"""
        if catalog_id in self.catalog_themes:
            return self.catalog_themes[catalog_id]

        # Not cached yet: parse the data.json
        catalog = DataJson(
            json.loads(Node.objects.get(catalog_id=catalog_id).catalog))

        self.catalog_themes[catalog_id] = {}
        for theme in catalog.get_themes():
            self.catalog_themes[catalog_id][theme['id']] = theme['label']

        return self.catalog_themes[catalog_id]
Code example #13
    def test_undefined_central_node_uses_default(self, mock_indic, mock_load):
        mock_load.return_value = self.catalogs
        mock_indic.return_value = (self.indicators, self.network_indicators)
        CentralNode.objects.create()
        task = IndicatorsGenerationTask.objects.create()
        generate_indicators(task)
        mock_indic.assert_any_call(DataJson(), self.catalogs,
                                   identifier_search=True,
                                   broken_links=False,
                                   broken_links_threads=1)
        mock_indic.assert_any_call(DataJson(), self.catalogs, CENTRAL,
                                   identifier_search=True,
                                   broken_links=False,
                                   broken_links_threads=1)
Code example #14
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False, index=settings.TS_INDEX, force=False):

    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    catalog = DataJson(json.loads(node.catalog))
    distribution_model = Distribution.objects.get(identifier=distribution_id,
                                                  dataset__catalog__identifier=node.catalog_id)

    try:
        Scraper(read_local).run(distribution_model, catalog)

        changed = True
        _hash = distribution_model.enhanced_meta.filter(key=meta_keys.LAST_HASH)
        if _hash:
            changed = _hash[0].value != distribution_model.data_hash

        if changed or force:
            DistributionIndexer(index=index).run(distribution_model)

        distribution_model.enhanced_meta.update_or_create(key=meta_keys.LAST_HASH,
                                                          defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(key=meta_keys.CHANGED,
                                                          defaults={'value': str(changed)})

    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e, node, task)
Code example #15
def daily_routine():
    """Rutina a ser ejecutada cada mañana por cron."""

    logger.info('>>> COMIENZO DE LA RUTINA <<<')

    # Create a DataJson object to validate organisms
    logger.info('Instanciación DataJson')
    datajson = DataJson()

    logger.info('Creación de carpetas necesarias (de archivo y versionadas).')
    for org in ORGANISMS:
        ensure_dir_exists(org)
        ensure_dir_exists(os.path.join(TODAY_DIR, org))

    logger.info('Procesamiento de cada organismo:')
    os.chdir(TODAY_DIR)

    for org in ORGANISMS:
        process_catalog(org, datajson)

    os.chdir(ROOT_DIR)

    logger.info('Actualizo los archivos bajo control de versiones:')
    files_of_day = glob.glob('{}/*/*'.format(TODAY_DIR))
    for filename in files_of_day:
        logger.debug('- %s', filename)
        update_versioning(filename)

    logger.info('Push de los cambios encontrados.')
    GIT.push('origin', 'master')

    logger.info('>>> FIN DE LA RUTINA <<<')
Code example #16
    def test_catalog_issued_infers_as_oldest_dataset(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        issued = Catalog.objects.first().issued
        dataset_issued = Dataset.objects.aggregate(
            Min('issued'))['issued__min']
        self.assertEqual(issued.date(), dataset_issued.date())
Code example #17
def index_catalog(node: Node, task, read_local=False, force=False):
    """Ejecuta el pipeline de lectura, guardado e indexado de datos
    y metadatos sobre cada distribución del catálogo especificado
    """

    try:
        catalog = DataJson(node.catalog_url,
                           catalog_format=node.catalog_format)
        node.catalog = json.dumps(catalog)
        node.save()
    except Exception as e:
        IndexDataTask.info(task, READ_ERROR.format(node.catalog_id, e))
        return

    distributions = Distribution.objects.filter(
        present=True,
        dataset__indexable=True,
        dataset__catalog__identifier=node.catalog_id)
    for distribution in distributions:
        api_index_enqueue(index_distribution,
                          distribution.identifier,
                          node.id,
                          task.id,
                          read_local,
                          force=force)
Code example #18
    def test_dataset_issued_infers_as_oldest_distribution(self):
        catalog = DataJson(os.path.join(SAMPLES_DIR, 'two_datasets.json'))
        self.loader.run(catalog, self.catalog_id)
        dataset = Dataset.objects.first()
        distribution_issued = Distribution.objects.filter(
            dataset=dataset).aggregate(Min('issued'))['issued__min']
        self.assertEqual(dataset.issued.date(), distribution_issued.date())
Code example #19
    def __init__(self, node: Node, task: IndexMetadataTask, index: str):
        self.node = node
        self.task = task
        self.index_name = index
        self.elastic: Elasticsearch = connections.get_connection()

        if not self.elastic.indices.exists(self.index_name):
            init_index(self.index_name)

        self.fields_meta = {}
        self.init_fields_meta_cache()
        try:
            data_json = DataJson(node.catalog_url)
            themes = data_json.get('themeTaxonomy', [])
            self.themes = self.get_themes(themes)
        except Exception:
            raise ValueError("Error de lectura de los themes del catálogo")
Code example #20
    def get_catalog_errors(self):
        catalog = DataJson(catalog=self.catalog_url,
                           catalog_format=self.catalog_format)

        all_errors = catalog.validate_catalog(only_errors=True)
        error_messages = []

        catalog_validation = all_errors['error']['catalog']
        if catalog_validation['errors']:
            error_messages.append(f"En catálogo {catalog_validation['title']}:"
                                  f" {catalog_validation['errors']}")

        for dataset_validation in all_errors['error']['dataset']:
            for error in dataset_validation['errors']:
                error_messages.append(f"En dataset {dataset_validation['title']}:"
                                      f" {error['message']}")

        return error_messages
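
A short usage sketch, assuming the method lives on a node-like object that
exposes catalog_url and catalog_format as above:

# Hypothetical: report every validation error found in a node's catalog
for message in node.get_catalog_errors():
    print(message)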
Code example #21
    def test_defined_central_node_catalog(self, mock_indic, mock_load):
        mock_load.return_value = self.catalogs
        mock_indic.return_value = (self.indicators, self.network_indicators)
        harvesting = HarvestingNode.objects.create(name='aName',
                                                   url='harvest_url/',
                                                   apikey='apikey',
                                                   enabled=True)
        CentralNode.objects.create(node=harvesting)
        task = IndicatorsGenerationTask.objects.create()
        generate_indicators(task)
        mock_indic.assert_any_call(DataJson(),
                                   self.catalogs,
                                   identifier_search=True)
        mock_indic.assert_any_call(DataJson(),
                                   self.catalogs,
                                   'harvest_url/data.json',
                                   identifier_search=True)
Code example #22
    def generate(self):
        node = self.node

        try:
            data_json = DataJson(node.catalog_url)
            data_json.get_fields(only_time_series=True)
            catalog = Catalog.objects.get(identifier=node.catalog_id)
        except Exception as e:
            self.task.info(
                self.task,
                "Error en la lectura del data.json de {}: {}".format(
                    node.catalog_id, e))
            return

        self.calculate_catalog_indicators(node, catalog)
        self.calculate_series_indicators(node, data_json, catalog)
        self.calculate_distribution_indicators(node, data_json, catalog)
        self.calculate_dataset_indicators(node, data_json, catalog)
Code example #23
    @classmethod
    def setUpTestData(cls):
        cls.node = Node(catalog_id=cls.catalog_id,
                        catalog_url=os.path.join(dir_path, 'full_data.json'),
                        catalog_format='json',
                        indexable=True)
        cls.node.catalog = json.dumps(DataJson(cls.node.catalog_url))
        cls.node.save()
        cls.task = IndicatorsGenerationTask.objects.create()
        cls.catalogs = load_catalogs(cls.task, Node.objects.all())
Code example #24
    def validate_format(self, url, file, _format):
        path = file.temporary_file_path() if file else url
        try:
            DataJson(path, catalog_format=_format)
        except NonParseableCatalog:
            raise ValidationError("El catálogo ingresado no es válido")
        except Exception as e:
            logging.getLogger(__file__).error(e)
            raise ValidationError("El catálogo ingresado no es válido")
Code example #25
    @classmethod
    def setUpTestData(cls):
        HarvestingNode.objects.create(
            name='aName', url='harvest_url', apikey='apikey', enabled=True)
        Node.objects.create(catalog_id='id1',
                            catalog_url=cls.get_sample('full_data.json'),
                            indexable=True)
        Node.objects.create(catalog_id='id2',
                            catalog_url=cls.get_sample('minimum_data.json'),
                            indexable=True)
        HarvestingNode.objects.create(
            catalog_id='idx1',
            name='indexador1',
            url=cls.get_sample('catalogo_justicia.json'),
            apikey='apikey',
            enabled=True)
        HarvestingNode.objects.create(
            catalog_id='idx2',
            name='indexador2',
            url=cls.get_sample('full_data.json'),
            apikey='apikey',
            enabled=True)
        task = IndicatorsGenerationTask.objects.create()
        cls.catalogs = load_catalogs(task, Node.objects.all())
        # Load these by path, not as a URL, so harvesting=False is used
        cls.indexing_catalogs = load_catalogs(task,
                                              HarvestingNode.objects.all())
        central = DataJson(cls.get_sample('full_data.json'))
        cls.indicators, cls.network_indicators = \
            DataJson().generate_catalogs_indicators(cls.catalogs,
                                                    central_catalog=central,
                                                    identifier_search=True,
                                                    broken_links=True)
        cls.indexing_indicators, _ = \
            DataJson().generate_catalogs_indicators(cls.indexing_catalogs,
                                                    identifier_search=True,
                                                    broken_links=True)
        config = TasksConfig.get_solo()
        config.indicators_url_check = True
        config.save()

        cls.dj = DataJson()
        with patch('monitoreo.apps.dashboard.indicators_tasks.CENTRAL',
                   cls.get_sample('full_data.json')):
            call_command('indicadores')
Code example #26
def load_catalogs(task, nodes, harvesting=False):
    catalogs = []
    for node in nodes:
        try:
            if harvesting:
                url = urljoin(node.url, 'data.json')
                catalog = DataJson(url)
            else:
                catalog = DataJson(node.catalog_url,
                                   catalog_format=node.catalog_format,
                                   verify_ssl=node.verify_ssl)
        except Exception as e:
            msg = f'Error accediendo al catálogo {node.catalog_id}: {str(e)}'
            IndicatorsGenerationTask.info(task, msg)
            continue

        catalog['identifier'] = node.catalog_id
        catalogs.append(catalog)
    return catalogs
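
A minimal driver sketch, mirroring how the test fixtures above call
load_catalogs:

# Load every registered node's catalog under a fresh indicators task
task = IndicatorsGenerationTask.objects.create()
catalogs = load_catalogs(task, Node.objects.all())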
Code example #27
    def validate(self):
        error_messages = []
        file_field = self.json_file if self.json_file else self.xlsx_file
        file_path = os.path.join(settings.MEDIA_ROOT, file_field.name)

        try:
            data_json = DataJson(file_path)
        except KeyError:
            return ["No se puede validar el catálogo ingresado"]

        if not data_json.is_valid_catalog():
            error_report = data_json.validate_catalog()
            errors = error_report['error']['catalog']['errors']

            for dataset in error_report['error']['dataset']:
                errors += dataset['errors']

            error_messages = [error['message'] for error in errors]

        return error_messages
Code example #28
File: search.py  Project: devartis/series-tiempo-ar
def get_time_series_distributions(catalog):
    """
    Devuelve las distribuciones que tengan un campo de series de tiempo
    Args:
        catalog (str o dict): DataJson o string con ruta o URL a un data.json
    Returns:
        list: lista de identifiers de las distribuciones
    """

    dj = DataJson(catalog)

    distributions = dj.get_distributions()

    def has_time_index(distribution):
        for field in distribution.get("field", []):
            if field.get("specialType") == "time_index":
                return True
        return False

    return list(filter(has_time_index, distributions))
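
A short usage sketch (the data.json path is hypothetical):

# Hypothetical: print the identifier of each time series distribution
for distribution in get_time_series_distributions('data.json'):
    print(distribution['identifier'])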
Code example #29
    @classmethod
    def setUpClass(cls):
        super(AttachmentTests, cls).setUpClass()
        ReadDataJsonTask.objects.all().delete()
        Node.objects.all().delete()
        Catalog.objects.all().delete()
        cls.node = Node(catalog_id=cls.catalog_id,
                        catalog_url=cls.catalog,
                        indexable=True,
                        catalog=json.dumps(DataJson(cls.catalog)))

        cls.node.save()
        call_command('read_datajson', whitelist=True, read_local=True)
Code example #30
    def index(self, node, task):
        self._reset_catalog_if_exists(node)

        try:
            catalog = DataJson(node.catalog_url,
                               catalog_format=node.catalog_format,
                               verify_ssl=self.indexing_config.verify_ssl)
            catalog.generate_distribution_ids()
            node.catalog = json.dumps(catalog)
            node.save()
        except NonParseableCatalog as e:
            self._set_catalog_as_errored(node)
            ReadDataJsonTask.info(task, READ_ERROR.format(node.catalog_id, e))
            return

        self.reset_fields(node)

        self._index_catalog(catalog, node, task)

        file_generator = CatalogFileGenerator(node)
        file_generator.generate_files()