def get_distribution_metadata(resource_id):
    """Return the catalog metadata for a single distribution.

    Fetches the node's data.json contents, unescapes HTML entities in the
    raw JSON text, and looks the distribution up by its identifier.

    Args:
        resource_id: identifier of the distribution to look up.

    Returns:
        The distribution metadata dict as returned by
        ``DataJson.get_distribution``.
    """
    # NOTE: original code imported 'datajson_actions' inside the function to
    # avoid a circular dependency with 'config_controller'.
    # html.unescape replaces HTMLParser().unescape(), which was deprecated
    # in Python 3.4 and removed in Python 3.9.
    import html

    json_dict = get_data_json_contents()
    json_dict = html.unescape(json_dict)
    datajson = DataJson(json_dict)
    return datajson.get_distribution(resource_id)
Beispiel #2
0
    def test_validate(self):
        """Load a sample distribution's CSV and run it through the validator."""
        catalog = DataJson(os.path.join(SAMPLES_DIR, "data.json"))
        distrib_meta = catalog.get_distribution(identifier="125.1")

        # Read the series CSV with its time index parsed as dates.
        df = (
            pd.read_csv(distrib_meta["downloadURL"],
                        parse_dates=["indice_tiempo"])
            .set_index("indice_tiempo")
        )

        dataset_meta = catalog.get_dataset(
            identifier=distrib_meta["dataset_identifier"])
        validate_distribution(df, catalog, dataset_meta, distrib_meta)
Beispiel #3
0
    def test_repeated_field_id(self):
        """Validate a distribution from a catalog containing a repeated field id."""
        catalog = DataJson(os.path.join(SAMPLES_DIR, "repeated_field_id.json"))

        distribution = catalog.get_distribution(identifier="125.1")
        dataset = catalog.get_dataset(
            identifier=distribution["dataset_identifier"])

        # Read the series CSV with its time index parsed as dates.
        df = (
            pd.read_csv(distribution["downloadURL"],
                        parse_dates=["indice_tiempo"])
            .set_index("indice_tiempo")
        )

        validate_distribution(df, catalog, dataset, distribution)
Beispiel #4
0
    def run(self, distribution_model: Distribution, catalog: DataJson):
        """Validate one time-series distribution of a catalog, read from its
        URL or source file.

        Raises:
            ValueError: if the distribution's dataset has no identifier.

        Returns:
            bool: True when the distribution passes validation (validators are
            expected to raise on failure, so False is never returned).
        """
        df = self.init_df(distribution_model)

        # A distribution without a dataset identifier cannot be resolved
        # against the catalog metadata.
        dataset_id = distribution_model.dataset.identifier
        if dataset_id is None:
            raise ValueError(
                NO_DATASET_IDENTIFIER.format(distribution_model.identifier))

        validate_distribution(
            df,
            catalog,
            catalog.get_dataset(dataset_id),
            catalog.get_distribution(distribution_model.identifier),
        )
        return True
Beispiel #5
0
def index_distribution(distribution_id,
                       node_id,
                       task_id,
                       read_local=False,
                       index=settings.TS_INDEX,
                       force=False):
    """Scrape one distribution of a node's catalog and (re)index it.

    The distribution is re-indexed only when its data hash differs from the
    last recorded hash, or when ``force`` is set. The last hash and the
    changed-flag are persisted in the distribution's ``enhanced_meta``.
    Any exception during processing is delegated to ``_handle_exception``
    rather than propagated.
    """
    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)

    catalog = DataJson(json.loads(node.catalog))
    distribution = catalog.get_distribution(identifier=distribution_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id)

    try:
        Scraper(read_local).run(distribution, catalog)

        # No previous hash recorded means the data counts as changed.
        previous_hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH).first()
        changed = (previous_hash is None
                   or previous_hash.value != distribution_model.data_hash)

        if force or changed:
            DistributionIndexer(index=index).run(distribution_model)

        # Persist the new hash and whether this run saw a change.
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.LAST_HASH,
            defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.CHANGED, defaults={'value': str(changed)})

    except Exception as e:
        # Best-effort: record the failure against the dataset/task instead of
        # letting it propagate out of the indexing job.
        _handle_exception(distribution_model.dataset, distribution_id, e, node,
                          task)