Example 1
def main(**kwargs):
    selected_job_codes = kwargs.get('selected_job_codes', [])
    use_local_files = kwargs.get('use_local_files', False)
    clear_first = kwargs.get('clear_first', False)
    wipe_data = kwargs.get('wipe_data', False)
    migrate_schema = kwargs.get('migrate_schema', False)
    test_mode = kwargs.get('test_mode', False)
    if not selected_job_codes:
        selected_jobs = [Job(job_dict) for job_dict in job_dicts]
    else:
        selected_jobs = select_jobs_by_code(selected_job_codes, job_dicts)

    # [ ] Add in script-level post-processing here, allowing the data.json file of an ArcGIS
    # server to be searched for unharvested tables.
    for job in selected_jobs:
        kwparameters = dict(kwargs)
        package_id = get_package_id(
            job, test_mode)  # This stuff needs to be tested.
        resource_id = find_resource_id(package_id, job.resource_name)
        if migrate_schema and 'ckan' in job.destinations:
            # Delete the Data Table view to avoid new fields not getting added to an existing view.
            delete_datatable_views(resource_id)
            # Is this really necessary, though? In etl_util.py, migrate_schema being True already
            # forces clear_first to be True, which should delete all the views.
            # The scenario of concern is when the schema changes by eliminating a field: it's not
            # clear whether CKAN supports dropping a field from the schema and automatically dropping
            # it from the table while preserving the other values.
            print(
                "Note that setting migrate_schema = True is going to clear the associated datastore."
            )

        if (
                clear_first or migrate_schema
        ) and 'ckan' in job.destinations:  # if the target is a CKAN resource being cleared
            # [ ] Maybe add a check to see if an integrated data dictionary exists.
            data_dictionary = get_data_dictionary(
                resource_id)  # If so, obtain it.
            # Save it to a local file as a backup.
            data_dictionary_filepath = save_to_waiting_room(
                data_dictionary, resource_id, job.resource_name)

            # wipe_data should preserve the data dictionary when the schema stays the same, and
            # migrate_schema should be used to change the schema but try to preserve the data dictionary.

            # If migrate_schema == True: 1) back up the data dictionary, 2) delete the Data Table view,
            # 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
            # (A sketch of this sequence appears after this function.)

            # Or could we overload "wipe_data" to include schema migration?

            # [ ] Also, it really seems that always_clear_first should become always_wipe_data.

        locators_by_destination = job.process_job(**kwparameters)
        for destination, table_locator in locators_by_destination.items():
            if destination == 'ckan':
                post_process(locators_by_destination[destination])
                if clear_first or migrate_schema:  # [ ] Should the data dictionary definitely be restored if clear_first = True?
                    results = set_data_dictionary(resource_id, data_dictionary)
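The five-step migrate_schema sequence described in the comments above could be factored into a small helper. The following is only a sketch: it reuses the helper names that already appear in this example (get_data_dictionary, save_to_waiting_room, delete_datatable_views, set_data_dictionary) and assumes that process_job clears the datastore itself when migrate_schema is True.

def migrate_schema_with_dictionary_backup(job, resource_id, **kwparameters):
    """Sketch (hypothetical helper): back up the data dictionary, delete the
    Data Table view, let the job clear the datastore and reload the data,
    then try to restore the data dictionary."""
    data_dictionary = get_data_dictionary(resource_id)  # 1) back up the data dictionary
    save_to_waiting_room(data_dictionary, resource_id, job.resource_name)
    delete_datatable_views(resource_id)  # 2) delete the Data Table view
    locators_by_destination = job.process_job(**kwparameters)  # 3) clear the datastore and 4) run the job
    set_data_dictionary(resource_id, data_dictionary)  # 5) try to restore the data dictionary
    return locators_by_destination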
Example 2
def express_load_then_delete_file(job, **kwparameters):
    """The basic idea is that the job processes with a 'file' destination,
    so the ETL job loads the file into destination_file_path. Then as a
    custom post-processing step, that file is Express-Loaded. This is
    faster (particularly for large files) and avoids 504 errors and unneeded
    API requests."""
    # Eventually this function should be moved either to etl_util.py or
    # more likely the pipeline framework. In either case, this approach
    # can be formalized, either as a destination or upload method and
    # possibly implemented as a loader (CKANExpressLoader).
    if kwparameters['use_local_output_file']:
        return
    if kwparameters['test_mode']:
        job.package_id = TEST_PACKAGE_ID
    ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
    csv_file_path = job.destination_file_path
    resource_id = find_resource_id(job.package_id, job.resource_name)
    if resource_id is None:
        # If the resource does not already exist, create it.
        print(
            f"Unable to find a resource with name '{job.resource_name}' in package with ID {job.package_id}."
        )
        print(
            f"Creating new resource, and uploading CSV file {csv_file_path} to resource with name '{job.resource_name}' in package with ID {job.package_id}."
        )
        resource_as_dict = ckan.action.resource_create(
            package_id=job.package_id,
            name=job.resource_name,
            upload=open(csv_file_path, 'rb'))  # ckanapi file uploads should be opened in binary mode.
    else:
        print(
            f"Uploading CSV file {csv_file_path} to resource with name '{job.resource_name}' in package with ID {job.package_id}."
        )
        resource_as_dict = ckan.action.resource_patch(
            id=resource_id, upload=open(csv_file_path, 'rb'))
        # Running resource_update once sets the resource to the correct file and triggers
        # the datastore processing and the Express Loader, but for some reason it seems
        # to process the old file.

        # So instead, run resource_patch (which just sets the file) and then run resource_update.
        # (A sketch of this two-step upload appears after this function.)
        #resource_as_dict = ckan.action.resource_update(id = resource_id)
        resource_as_dict = ckan.action.resource_update(
            id=resource_id, upload=open(csv_file_path, 'rb'))

    print(f"Removing temp file at {csv_file_path}")
    os.remove(csv_file_path)

    # Since launchpad.py doesn't update the last_etl_update metadata value in this case
    # because this is a workaround, do it manually here:
    post_process(resource_id, job, **kwparameters)
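The patch-then-update workaround above can also be written with context-managed, binary-mode file handles, which keeps the handles from leaking. This is a minimal sketch, assuming the same ckan client as above; the helper name is hypothetical.

def patch_then_update_upload(ckan, resource_id, csv_file_path):
    """Sketch: resource_patch just swaps in the new file; resource_update then
    re-triggers the datastore/Express Loader processing against that file
    rather than the stale one."""
    with open(csv_file_path, 'rb') as f:
        ckan.action.resource_patch(id=resource_id, upload=f)
    with open(csv_file_path, 'rb') as f:
        return ckan.action.resource_update(id=resource_id, upload=f)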
def process_job(**kwparameters):
    job = kwparameters['job']
    use_local_files = kwparameters['use_local_files']
    clear_first = kwparameters['clear_first']
    test_mode = kwparameters['test_mode']
    job.default_setup(use_local_files)

    # [ ] Check whether this process_job function can be put into standard form.
    job.loader_config_string = 'production'
    if OVERRIDE_GEOCODING:  # This part may not convert well to the Job class approach.
        job.target = '/Users/drw/WPRDC/etl/rocket-etl/archives/previously-geocoded-restaurants.csv'
        job.source_connector = pl.FileConnector
        job.source_type = 'local'
        job.connector_config_string = ''
        print("Using local archive file: {}".format(target))
    elif use_local_files:
        job.target = SOURCE_DIR + job.source_file
    else:
        job.target = job.source_dir + "/" + job.source_file

    package_id = job.package if not test_mode else TEST_PACKAGE_ID
    print("==============\n {} in package {}".format(job.resource_name,
                                                     package_id))

    if clear_first:
        print("Clearing the datastore for {}".format(job.resource_name))
    # Upload data to datastore
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job.resource_name + ' pipeline', job.resource_name + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(job.source_connector, job.target, config_string=job.connector_config_string, encoding=job.encoding) \
        .extract(pl.CSVExtractor, firstline_headers=True) \
        .schema(job.schema) \
        .load(pl.CKANDatastoreLoader, job.loader_config_string,
              #fields=schema().serialize_to_ckan_fields(),
              fields=job.schema().serialize_to_ckan_fields(capitalize=False),
              key_fields=job.primary_key_fields,
              package_id=package_id,
              resource_name=job.resource_name,
              clear_first=clear_first,
              method=job.upload_method).run()

    resource_id = find_resource_id(package_id, job.resource_name)
    assert len(job.destinations) == 1
    locators_by_destination = {job.destinations[0]: resource_id}
    return locators_by_destination
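find_resource_id is called throughout these examples but not defined in them. A minimal sketch of what such a helper might look like, assuming a ckanapi RemoteCKAN client named ckan is available and that resources are matched by their name field:

def find_resource_id(package_id, resource_name):
    """Sketch: return the ID of the resource with the given name in the given
    package, or None if no such resource exists."""
    package = ckan.action.package_show(id=package_id)
    for resource in package['resources']:
        if resource['name'] == resource_name:
            return resource['id']
    return None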
Example 4
def get_extant_time_range(job, **kwparameters):
    #    if 'ckan' in job.destinations: # This is a strong argument for making each job_dict
    # have exactly one source and one destination and using job molecules or
    # chains to support multiple destinations (somehow).
    ## JOB CHAINS: 1) Support building up more complicated processes (often
    ## represented by "directed acyclic graphs") by chaining job atoms.
    ## 2) More coherently support multiple destinations at this level by
    ## calling the same job atom twice, with different destinations.
    ## I can imagine that a job atom could be the basis for a superjob
    ## which could replace multiple fields with lists, and some
    ## launchpad logic would iterate over these lists
    ## like
    ##      destination = ['file', 'ckan_filestore']
    ##      destination_filename = ['local_filename.csv', 'Official-Looking CKAN Filename']
    ## and apply the elements in sequence to make one job per list element
    ## (a sketch of this expansion appears after this function).

    ## 3) But try to make every parameter into a potential list this way
    ## (at least by allowing one parameter at a time to be changed).
    if job.destination == 'ckan':

        package = get_package_by_id(job.package_id)
        if 'extras' in package:
            extras_list = package['extras']
            # Keep definitions and uses of extras metadata updated here:
            # https://github.com/WPRDC/data-guide/blob/master/docs/metadata_extras.md
            # The format is like this:
            #       u'extras': [{u'key': u'dcat_issued', u'value': u'2014-01-07T15:27:45.000Z'}, ...
            # not a dict, but a list of dicts.
            extras = {d['key']: d['value'] for d in extras_list}
            resource_id = find_resource_id(
                job.package_id,
                job.resource_name)  # This adds a second call to get the
            # package when it's already been obtained a few lines above.
            if resource_id is None:  # The resource does not yet exist.
                return None, None
            time_field_lookup = json.loads(extras['time_field']) if 'time_field' in extras else {}
            if resource_id in time_field_lookup:
                first_date, last_date = find_extreme_dates(
                    resource_id, time_field_lookup)
                return first_date, last_date
            else:
                # getattr with a default never raises AttributeError, so check
                # explicitly for a job that has no time_field to fall back on.
                time_field = getattr(job, 'time_field', None)
                if time_field is None:
                    return None, None
                time_field_lookup = {resource_id: time_field}
                first_date, last_date = find_extreme_dates(
                    resource_id, time_field_lookup)
                return first_date, last_date
        else:
            return None, None
    else:  # Find the time range of a non-datastore CSV file at the local destination
        # OR wipe the existing file and rewrite it from scratch.
        if job.destination == 'file':
            try:
                f = open(job.destination_file_path, 'r')
            except FileNotFoundError:
                return None, None
            else:
                reader = csv.DictReader(f)
                first_date = datetime.max
                last_date = datetime.min
                for row in reader:
                    timestamp = row.get(job.time_field, None)
                    if timestamp is not None:
                        timestamp = parser.parse(timestamp)
                        if timestamp < first_date:
                            first_date = timestamp
                        if timestamp > last_date:
                            last_date = timestamp
                f.close()
                if first_date <= last_date:
                    return first_date, last_date
                else:
                    return None, None
        else:
            raise ValueError(
                f"Unable to determine the extant time range for the following destination: {job.destination}"
            )
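The job-chain idea in the comments at the top of get_extant_time_range could be prototyped by expanding any list-valued fields of a "superjob" dict into one ordinary job dict per list element. A minimal sketch, assuming all list-valued fields are parallel axes of the same length (a real version would need to exempt fields that are legitimately lists, such as primary_key_fields):

def expand_superjob(superjob_dict):
    """Sketch: turn {'destination': ['file', 'ckan_filestore'], ...} into a
    list of ordinary job dicts, one per list element, for launchpad logic
    to iterate over."""
    list_fields = {k: v for k, v in superjob_dict.items() if isinstance(v, list)}
    if not list_fields:
        return [dict(superjob_dict)]
    lengths = {len(v) for v in list_fields.values()}
    assert len(lengths) == 1, "All list-valued fields must have the same length."
    expanded = []
    for i in range(lengths.pop()):
        job_dict = dict(superjob_dict)
        for k, values in list_fields.items():
            job_dict[k] = values[i]
        expanded.append(job_dict)
    return expanded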