def get_distribution_metadata(resource_id):
    """Return the catalog metadata dict for the distribution *resource_id*.

    Reads the node's data.json contents, unescapes HTML entities in the
    raw JSON text, then looks the distribution up through ``DataJson``.
    """
    # 'datajson_actions' is imported inside the function to avoid a
    # circular dependency with 'config_controller'.
    # Function-scope import to match that convention.
    import html

    json_dict = get_data_json_contents()
    # html.unescape() replaces HTMLParser().unescape(), which was
    # deprecated in Python 3.4 and removed in Python 3.9.
    json_dict = html.unescape(json_dict)
    datajson = DataJson(json_dict)
    dist = datajson.get_distribution(resource_id)
    return dist
def test_validate(self):
    """Distribution 125.1 of the data.json sample passes validation."""
    sample_path = os.path.join(SAMPLES_DIR, "data.json")
    data_json = DataJson(sample_path)

    distribution = data_json.get_distribution(identifier="125.1")

    # Load the series CSV with its time index parsed as dates.
    time_index_col = "indice_tiempo"
    df = pd.read_csv(
        distribution["downloadURL"],
        parse_dates=[time_index_col],
    ).set_index(time_index_col)

    dataset = data_json.get_dataset(
        identifier=distribution["dataset_identifier"])

    validate_distribution(df, data_json, dataset, distribution)
def test_repeated_field_id(self):
    """A catalog with a repeated field id still validates distribution 125.1."""
    sample_path = os.path.join(SAMPLES_DIR, "repeated_field_id.json")
    data_json = DataJson(sample_path)

    target_id = "125.1"
    distribution_meta = data_json.get_distribution(identifier=target_id)
    dataset_meta = data_json.get_dataset(
        identifier=distribution_meta["dataset_identifier"])

    # Load the series CSV with its time index parsed as dates.
    time_index_col = "indice_tiempo"
    df = pd.read_csv(
        distribution_meta["downloadURL"],
        parse_dates=[time_index_col],
    ).set_index(time_index_col)

    validate_distribution(df, data_json, dataset_meta, distribution_meta)
def run(self, distribution_model: Distribution, catalog: DataJson):
    """Validate one time-series distribution against its catalog metadata.

    The distribution's dataframe is loaded through ``self.init_df`` and
    checked with ``validate_distribution``.

    Args:
        distribution_model: distribution (DB model) to validate.
        catalog: parsed ``DataJson`` catalog the distribution belongs to.

    Returns:
        bool: ``True`` when the distribution passes every validation
        (``validate_distribution`` raises otherwise).

    Raises:
        ValueError: when the distribution's dataset has no identifier.
    """
    df = self.init_df(distribution_model)

    # Guard clause: a dataset without an identifier cannot be looked up
    # in the catalog, so fail early with a descriptive message.
    dataset_identifier = distribution_model.dataset.identifier
    if dataset_identifier is None:
        raise ValueError(
            NO_DATASET_IDENTIFIER.format(distribution_model.identifier))

    dataset_meta = catalog.get_dataset(dataset_identifier)
    distribution_meta = catalog.get_distribution(
        distribution_model.identifier)

    validate_distribution(df, catalog, dataset_meta, distribution_meta)
    return True
def index_distribution(distribution_id, node_id, task_id,
                       read_local=False,
                       index=settings.TS_INDEX,
                       force=False):
    """Scrape a single distribution and (re)index it when its data changed.

    Looks the distribution up both in the node's catalog (metadata) and in
    the database (model), runs the scraper over it, and re-indexes it when
    its data hash differs from the last recorded one (or when *force* is
    set). Any failure is reported through ``_handle_exception`` instead of
    propagating.

    Args:
        distribution_id: catalog identifier of the distribution.
        node_id: primary key of the ``Node`` owning the catalog.
        task_id: primary key of the ``ReadDataJsonTask`` being run.
        read_local: forwarded to ``Scraper``.
        index: target index name for ``DistributionIndexer``.
        force: re-index even when the data hash is unchanged.
    """
    node = Node.objects.get(id=node_id)
    task = ReadDataJsonTask.objects.get(id=task_id)
    catalog = DataJson(json.loads(node.catalog))
    distribution = catalog.get_distribution(identifier=distribution_id)
    distribution_model = Distribution.objects.get(
        identifier=distribution_id,
        dataset__catalog__identifier=node.catalog_id)

    try:
        Scraper(read_local).run(distribution, catalog)

        # The distribution is considered changed when there is no stored
        # hash yet, or when the stored hash differs from the current one.
        last_hash = distribution_model.enhanced_meta.filter(
            key=meta_keys.LAST_HASH)
        changed = (not last_hash
                   or last_hash[0].value != distribution_model.data_hash)

        if force or changed:
            DistributionIndexer(index=index).run(distribution_model)

        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.LAST_HASH,
            defaults={'value': distribution_model.data_hash})
        distribution_model.enhanced_meta.update_or_create(
            key=meta_keys.CHANGED,
            defaults={'value': str(changed)})
    except Exception as e:
        _handle_exception(distribution_model.dataset, distribution_id, e,
                          node, task)