Example #1
    def test_dataset_is_updated_correctly(self):
        catalog = self.full_catalog
        catalog_id = title_to_name(catalog['title'])
        dataset_id = catalog.datasets[0]['identifier']
        push_dataset_to_ckan(
            catalog,
            "oficina-de-muestra",
            dataset_id,
            self.portal_url,
            self.apikey,
            catalog_id=catalog_id,
        )

        catalog.datasets[0]['description'] = 'updated description'
        return_id = push_dataset_to_ckan(
            catalog,
            "oficina-de-muestra",
            dataset_id,
            self.portal_url,
            self.apikey,
            catalog_id=catalog_id,
        )

        data_dict = {'id': catalog_id + '_' + dataset_id}
        package = self.portal.call_action('package_show', data_dict=data_dict)
        self.assertEqual(return_id, catalog_id + '_' + dataset_id)
        self.assertEqual('updated description', package['notes'])
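Every example on this page hinges on title_to_name, which turns a human-readable title into a URL-safe identifier. As a minimal sketch (assuming it behaves like a typical slugifier; the real pydatajson helper may differ in edge cases such as length limits), it could look like this:

import re
import unicodedata

def slugify_title(title):
    # hypothetical stand-in for title_to_name: strip accents, lowercase,
    # and collapse runs of non-alphanumerics into single hyphens
    normalized = unicodedata.normalize('NFKD', title)
    ascii_title = normalized.encode('ascii', 'ignore').decode('ascii')
    slug = re.sub(r'[^a-z0-9]+', '-', ascii_title.lower())
    return slug.strip('-')

# slugify_title('Catálogo de muestra') -> 'catalogo-de-muestra'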
Example #2
    def test_dataset_array_attributes_are_correct(self):
        package = map_dataset_to_package(self.catalog,
                                         self.dataset,
                                         'owner',
                                         catalog_id=self.catalog_id)
        groups = [group['name'] for group in package.get('groups', [])]
        super_themes = [
            title_to_name(s_theme.lower())
            for s_theme in self.dataset.get('superTheme')
        ]
        # assertItemsEqual (Python 2) was renamed assertCountEqual in Python 3
        try:
            self.assertItemsEqual(super_themes, groups)
        except AttributeError:
            self.assertCountEqual(super_themes, groups)

        tags = [tag['name'] for tag in package['tags']]
        keywords = self.dataset.get('keyword', [])

        themes = self.dataset.get('theme', [])
        theme_labels = []
        for theme in themes:
            label = self.catalog.get_theme(identifier=theme)['label']
            label = re.sub(r'[^\w .-]+', '', label, flags=re.UNICODE)
            theme_labels.append(label)

        try:
            self.assertItemsEqual(keywords + theme_labels, tags)
        except AttributeError:
            self.assertCountEqual(keywords + theme_labels, tags)
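The re.sub call above keeps word characters, spaces, dots and hyphens, and strips everything else from each theme label. A quick illustration with a made-up label:

import re

label = 'Salud & Educación (2019)'
clean = re.sub(r'[^\w .-]+', '', label, flags=re.UNICODE)
# clean == 'Salud  Educación 2019'  (the '&', '(' and ')' are removed)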
Example #3
    @classmethod
    def setUpClass(cls):
        cls.catalog = pydatajson.DataJson(cls.get_sample('full_data.json'))
        cls.catalog_id = cls.catalog.get('identifier',
                                         title_to_name(cls.catalog['title']))
        cls.dataset = cls.catalog.datasets[0]
        cls.dataset_id = cls.dataset.get('identifier')
        cls.distributions = cls.dataset['distribution']
Example #4
    def tearDown(self):
        full_dataset = self.full_catalog.datasets[0]
        full_name = title_to_name(full_dataset['title'])
        justice_dataset = self.justice_catalog.datasets[0]
        justice_name = title_to_name(justice_dataset['title'])
        try:
            self.portal.call_action('dataset_purge',
                                    data_dict={'id': full_name})
        except NotFound:
            pass
        try:
            self.portal.call_action('dataset_purge',
                                    data_dict={'id': justice_name})
        except NotFound:
            pass

        self.portal.close()
Example #5
    def test_catalog_id_is_prepended_to_dataset_id_and_name_if_passed(self):
        package = map_dataset_to_package(
            self.catalog, self.dataset, 'owner', catalog_id=self.catalog_id)
        self.assertEqual(self.catalog_id + '_' + self.dataset_id,
                         package['id'])
        self.assertEqual(
            title_to_name(self.catalog_id + '-' + self.dataset['title']),
            package['name'])
Example #6
    def test_dataset_is_created_correctly(self):
        catalog = self.full_catalog
        catalog_id = title_to_name(catalog['title'])
        dataset = catalog.datasets[0]
        dataset_id = dataset['identifier']
        return_id = push_dataset_to_ckan(
            catalog,
            "oficina-de-muestra",
            dataset_id,
            self.portal_url,
            self.apikey,
            catalog_id=catalog_id,
        )
        self.assertEqual(return_id, catalog_id + '_' + dataset_id)
Example #7
    def test_themes_are_preserved_if_not_demoted(self):
        package = map_dataset_to_package(self.catalog,
                                         self.dataset,
                                         'owner',
                                         catalog_id=self.catalog_id,
                                         demote_themes=False)
        groups = [group['name'] for group in package.get('groups', [])]
        super_themes = [
            title_to_name(s_theme.lower())
            for s_theme in self.dataset.get('superTheme')
        ]
        themes = self.dataset.get('theme', [])
        tags = [tag['name'] for tag in package['tags']]
        keywords = self.dataset.get('keyword', [])

        try:
            self.assertItemsEqual(super_themes + themes, groups)
        except AttributeError:
            self.assertCountEqual(super_themes + themes, groups)
        try:
            self.assertItemsEqual(keywords, tags)
        except AttributeError:
            self.assertCountEqual(keywords, tags)
Example #8
def get_distribution_download_urls(distributions, catalog_id):
    # collects each downloadURL found, together with its catalog id
    urls = []

    for distribution in [
            dist for dist in distributions
            if 'downloadURL' in dist and dist['downloadURL']
    ]:

        if "fileName" in distribution:
            distribution_fileName = distribution["fileName"]
        else:
            distribution_fileName = "{}.{}".format(
                title_to_name(distribution["title"]),
                str(distribution["format"]).split("/")[-1].lower())

        urls.append("{} {} {} {} {}".format(catalog_id,
                                            distribution["dataset_identifier"],
                                            distribution["identifier"],
                                            distribution_fileName,
                                            distribution["downloadURL"]))

    return urls
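For reference, a sketch of the line format this helper emits; the input record and catalog id below are made up, and the exact file name depends on how title_to_name slugifies the title:

distributions = [{
    "dataset_identifier": "1",
    "identifier": "1.1",
    "title": "Índice de precios",
    "format": "text/csv",
    "downloadURL": "http://example.com/ipc.csv",
}]
get_distribution_download_urls(distributions, "sspm")
# -> ['sspm 1 1.1 indice-de-precios.csv http://example.com/ipc.csv']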
Example #9
    def test_dataset_id_and_name_are_preserved_if_catalog_id_is_not_passed(
            self):
        package = map_dataset_to_package(self.catalog, self.dataset, 'owner')
        self.assertEqual(self.dataset_id, package['id'])
        self.assertEqual(title_to_name(self.dataset['title']), package['name'])
Example #10
def analyze_dataset(catalog_id,
                    catalog,
                    dataset_identifier,
                    datasets_output_dir,
                    debug_mode=False,
                    replace=True,
                    debug_distribution_ids=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }

    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_output_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, only a subset of distribution ids may be processed
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)

            # use fileName if the distribution specifies one; otherwise build one
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_path = os.path.join(dataset_dir, "distribution",
                                     distribution_identifier, "download",
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, decide whether to replace it or
            # skip it
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                # the second return value is unused here: the generated file
                # is simply copied into place
                origin_dist_path, _ = analyze_distribution(
                    catalog_id, catalog, dataset_identifier,
                    distribution_identifier)

                helpers.ensure_dir_exists(os.path.dirname(dist_path))
                shutil.copyfile(origin_dist_path, dist_path)
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise
            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))

            trace_string = traceback.format_exc()
            logger.error(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            for line in trace_string.splitlines():
                logger.error(line)

            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

    return res
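A sketch of how the returned dict might be consumed; the catalog object and paths are assumptions (any pydatajson-style catalog exposing get_dataset() and get_distribution() as used above would do):

res = analyze_dataset("sspm", catalog, "1", "catalog/sspm/dataset")
if res["dataset_status"] == "OK":
    for distribution_id, status in res["distributions_ok"]:
        print(distribution_id, status)  # e.g. "1.1 Created"
else:
    print(res["dataset_status"], res["distributions_error"])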
Example #11
def scrape_dataset(xl,
                   catalog,
                   dataset_identifier,
                   datasets_dir,
                   debug_mode=False,
                   replace=True,
                   debug_distribution_ids=None,
                   catalog_id=None):
    res = {
        "dataset_status": None,
        "distributions_ok": [],
        "distributions_error": [],
    }

    dataset_meta = catalog.get_dataset(dataset_identifier)

    if dataset_meta:
        dataset_dir = os.path.join(datasets_dir, dataset_identifier)
        helpers.ensure_dir_exists(dataset_dir)
        res["dataset_status"] = "OK"
    else:
        res["dataset_status"] = "ERROR: metadata"
        return res

    # filter the parameters for this particular dataset
    distribution_ids = [
        distribution["identifier"]
        for distribution in dataset_meta["distribution"]
    ]

    # in debug mode, only a subset of distribution ids may be processed
    if debug_mode and debug_distribution_ids:
        distribution_ids = [
            distribution_id for distribution_id in distribution_ids
            if distribution_id in debug_distribution_ids
        ]

    # create each of the dataset's distributions
    for distribution_identifier in distribution_ids:
        msg = "Distribución {}: {} ({})"
        try:
            distrib_meta = catalog.get_distribution(distribution_identifier)
            distribution_name = title_to_name(distrib_meta["title"])
            distribution_file_name = distrib_meta.get(
                "fileName", "{}.csv".format(distribution_name))
            dist_download_dir = os.path.join(dataset_dir, "distribution",
                                             distribution_identifier,
                                             "download")
            dist_path = os.path.join(dist_download_dir,
                                     "{}".format(distribution_file_name))
            dist_url = get_distribution_url(dist_path)
            # print("esta es la URL QUE VA AL CATALOGO", dist_url)
            distrib_meta["downloadURL"] = dist_url

            # if the file already exists, decide whether to replace it or
            # skip it
            if not os.path.exists(dist_path) or replace:
                status = "Replaced" if os.path.exists(dist_path) else "Created"
                distribution = scrape_distribution(xl, catalog,
                                                   distribution_identifier)

                if isinstance(distribution, list):
                    distribution_complete = pd.concat(distribution)
                else:
                    distribution_complete = distribution

                helpers.remove_other_files(os.path.dirname(dist_path))
                distribution_complete.to_csv(
                    dist_path, encoding="utf-8", index_label="indice_tiempo")
            else:
                status = "Skipped"

            res["distributions_ok"].append((distribution_identifier, status))
            logger.info(msg.format(distribution_identifier, "OK", status))

        except Exception as e:
            if isinstance(e, KeyboardInterrupt):
                raise

            res["distributions_error"].append((distribution_identifier,
                                               repr(e).encode("utf8")))

            trace_string = traceback.format_exc()
            print(
                msg.format(distribution_identifier, "ERROR",
                           repr(e).encode("utf8")))
            print(trace_string)
            if debug_mode:
                raise
            res["dataset_status"] = "ERROR: scraping"

            # if there is no old version of the distribution, drop it from
            # the catalog
            try:
                get_distribution_path(catalog_id, dataset_identifier,
                                      distribution_identifier)
            except Exception:
                catalog.remove_distribution(distribution_identifier,
                                            dataset_identifier)

    return res