Example #1
0
def fetch_resource(dataset, ignore_hashes):
    '''
    Refreshes the first resource of `dataset` from the iatikit registry data.

    `last_fetch` is always set to the registry's `last_updated` value. If the
    dataset's raw XML can be read, the status code is recorded as 200 and
    `last_succ` is updated. The stored document is then replaced, and
    `last_parsed` / `last_parse_error` are reset to None so that an update
    will be triggered, when any of the following holds:

    * `ignore_hashes` is true,
    * the resource has no stored document yet,
    * the fetched content differs from the stored document.

    If reading the XML raises IOError, the status code is recorded as 404 and
    the stored document is left untouched.

    :param dataset: object exposing `name` and a `resources` sequence whose
        first element is the resource to refresh
    :param ignore_hashes: when true, replace the document and reset the parse
        state even if the content is unchanged
    :return: the refreshed resource (added to the DB session, not committed)
    '''
    # Build the registry once and read both values from the same instance.
    registry = iatikit.data()
    registry_dataset = registry.datasets.get(dataset.name)
    last_updated = registry.last_updated

    resource = dataset.resources[0]
    resource.last_fetch = last_updated

    try:
        # Only reading the XML is expected to raise IOError, so keep the
        # guarded region down to this single statement.
        content = registry_dataset.raw_xml
    except IOError:
        # TODO: this isn't true
        resource.last_status_code = 404
    else:
        resource.last_status_code = 200
        resource.last_succ = last_updated
        # Compare the content itself rather than hash() values: equal hashes
        # do not guarantee equal documents, so a collision would silently
        # skip a needed update.
        if ignore_hashes or \
                (not resource.document) or \
                (resource.document != content):
            resource.document = content
            resource.last_parsed = None
            resource.last_parse_error = None

    db.session.add(resource)
    return resource
Example #2
0
def fetch_dataset_list():
    '''
    Synchronises the stored Dataset rows with the dataset names that iatikit
    currently reports.

    Names that iatikit lists but the DB does not know yet get a new Dataset.
    Every previously stored or newly created Dataset has `last_seen` set to
    the registry's `last_updated` value, and the lot is committed. Names that
    are stored but no longer listed are then handed to `delete_datasets`.
    :return: a fresh `Dataset.query`
    '''
    stored = Dataset.query.all()
    stored_names = {row.name for row in stored}

    listed_names = {entry.name for entry in iatikit.data().datasets}

    # Create rows only for names we have not stored before.
    tracked = stored + [
        Dataset(name=name) for name in listed_names - stored_names
    ]

    seen_at = iatikit.data().last_updated
    for row in tracked:
        row.last_seen = seen_at

    db.session.add_all(tracked)
    db.session.commit()

    # Stored names that the registry no longer lists are removed afterwards.
    vanished_names = stored_names - listed_names
    if vanished_names:
        delete_datasets(vanished_names)

    return Dataset.query
Example #3
0
def get_registry(refresh=False):
    '''
    Returns `iatikit.data()`, running `iatikit.download.data()` first when the
    `__iatikitcache__` directory is missing from the current working
    directory or when `refresh` is true.
    :param refresh: force a download even if the cache directory exists
    :return: the result of `iatikit.data()`
    '''
    cache_dir = pathlib.Path("__iatikitcache__")
    cache_missing = not cache_dir.is_dir()

    if cache_missing or refresh:
        print("getting regisitry data")
        iatikit.download.data()

    return iatikit.data()
Example #4
0
def fetch_dataset_metadata(dataset):
    '''
    Copies registry metadata for `dataset` onto the Dataset object.

    Sets the publisher, last-modified date, licence and open flag, and brings
    `dataset.resource_urls` in line with the URLs listed in the metadata
    (unknown URLs are appended, URLs no longer listed are removed).
    :param dataset: object exposing `name` and a mutable `resource_urls` list
    :return: the same dataset (added to the DB session, not committed)
    '''
    metadata = iatikit.data().datasets.get(dataset.name).metadata
    dataset.publisher = metadata['organization']['name']

    # Today's date (ISO format) stands in when the metadata has no
    # modification timestamp.
    today_iso = datetime.datetime.now().date().isoformat()
    dataset.last_modified = date_parser(
        metadata.get('metadata_modified', today_iso))

    listed_urls = [entry['url'] for entry in metadata.get('resources', [])]

    # Materialise the additions before extending, so the membership test
    # only ever sees the URLs that were present beforehand.
    additions = [
        url for url in listed_urls if url not in dataset.resource_urls
    ]
    dataset.resource_urls.extend(additions)

    stale_urls = set(dataset.resource_urls) - set(listed_urls)
    for stale in stale_urls:
        dataset.resource_urls.remove(stale)

    dataset.license = metadata.get('license_id')
    dataset.is_open = metadata.get('isopen', False)
    db.session.add(dataset)
    return dataset
from os.path import dirname, join
from collections import namedtuple
import datetime

import mock
import iatikit

from . import AppTestCase, fixture_filename
from . import factories as fac

from iatilib import crawler, db, parse
from iatilib.model import Dataset, Log, Resource, Activity, DeletedActivity

# Module-level iatikit data object built from the `fixtures/registry`
# directory that sits next to this test module.
# NOTE(review): presumably shared by tests further down the file; it is not
# referenced in the lines visible here — confirm.
registry = iatikit.data(join(dirname(__file__), 'fixtures', 'registry'))


class TestCrawler(AppTestCase):
    @mock.patch('iatikit.data')
    def test_fetch_package_list(self, iatikit_mock):
        """fetch_dataset_list() returns the datasets the registry reports."""
        fake_registry = iatikit_mock.return_value
        fake_registry.last_updated = datetime.datetime.utcnow()
        fake_registry.datasets = [
            iatikit.Dataset(path) for path in ("tst-a.xml", "tst-b.xml")
        ]

        fetched = crawler.fetch_dataset_list()

        # Build the name list afresh for each check, as two separate
        # assertions would.
        for expected_name in ("tst-a", "tst-b"):
            self.assertIn(expected_name, [ds.name for ds in fetched])

    @mock.patch('iatikit.data')
    def test_update_adds_datasets(self, iatikit_mock):