def process_job(**kwparameters):
    job = kwparameters['job']
    use_local_files = kwparameters['use_local_files']
    clear_first = kwparameters['clear_first']
    test_mode = kwparameters['test_mode']
    job.default_setup(use_local_files)

    # [ ] Check whether this process_job function can be put into standard form.
    job.loader_config_string = 'production'
    if OVERRIDE_GEOCODING:  # This part may not convert well to the Job class approach.
        job.target = '/Users/drw/WPRDC/etl/rocket-etl/archives/previously-geocoded-restaurants.csv'
        job.source_connector = pl.FileConnector
        job.source_type = 'local'
        job.connector_config_string = ''
        print("Using local archive file: {}".format(target))
    elif use_local_files:
        job.target = SOURCE_DIR + job.source_file
    else:
        job.target = job.source_dir + "/" + job.source_file

    package_id = job.package if not test_mode else TEST_PACKAGE_ID
    print("==============\n {} in package {}".format(job.resource_name,
                                                     package_id))

    if clear_first:
        print("Clearing the datastore for {}".format(job.resource_name))
    # Upload data to datastore
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job.resource_name + ' pipeline', job.resource_name + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(job.source_connector, job.target, config_string=job.connector_config_string, encoding=job.encoding) \
        .extract(pl.CSVExtractor, firstline_headers=True) \
        .schema(job.schema) \
        .load(pl.CKANDatastoreLoader, job.loader_config_string,
              #fields=schema().serialize_to_ckan_fields(),
              fields=job.schema().serialize_to_ckan_fields(capitalize=False),
              key_fields=job.primary_key_fields,
              package_id=package_id,
              resource_name=job.resource_name,
              clear_first=clear_first,
              method=job.upload_method).run()

    resource_id = find_resource_id(package_id, job.resource_name)
    locators_by_destination = {job.destinations[0]: resource_id}
    assert len(job.destinations) == 1
    return locators_by_destination
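
# A minimal, hypothetical sketch of how process_job() might be invoked from a launcher
# script. `restaurants_job` stands in for an already-constructed rocket-etl Job instance;
# its construction (source, schema, destinations, etc.) is not shown in this example.
locators_by_destination = process_job(job=restaurants_job,
                                      use_local_files=False,
                                      clear_first=False,
                                      test_mode=True)  # test_mode routes the upload to TEST_PACKAGE_ID
print(locators_by_destination)  # e.g. {'ckan': '<resource ID>'}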
# Example 2
def push_to_datastore(job,
                      file_connector,
                      target,
                      config_string,
                      encoding,
                      loader_config_string,
                      primary_key_fields,
                      test_mode,
                      clear_first,
                      upload_method='upsert'):
    # This is becoming a legacy function because all the new features are going into run_pipeline,
    # but note that this is still used at present by a parking ETL job.
    # (wipe_data support is not being added to push_to_datastore.)
    package_id = job['package'] if not test_mode else TEST_PACKAGE_ID
    resource_name = job['resource_name']
    schema = job['schema']
    extractor = select_extractor(job)
    # Upload data to datastore
    if clear_first:
        print("Clearing the datastore for {}".format(job['resource_name']))
    print('Uploading tabular data...')
    curr_pipeline = pl.Pipeline(job['resource_name'] + ' pipeline', job['resource_name'] + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE) \
        .connect(file_connector, target, config_string=config_string, encoding=encoding) \
        .extract(extractor, firstline_headers=True) \
        .schema(schema) \
        .load(pl.CKANDatastoreLoader, loader_config_string,
              fields=schema().serialize_to_ckan_fields(),
              key_fields=primary_key_fields,
              package_id=package_id,
              resource_name=resource_name,
              clear_first=clear_first,
              method=upload_method).run()

    resource_id = find_resource_id(
        package_id, resource_name
    )  # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
    return resource_id
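
# A minimal, hypothetical sketch of calling the legacy push_to_datastore() with the dict-style
# job description it expects ('package', 'resource_name', and 'schema' keys, plus whatever
# select_extractor() needs). All values below are invented placeholders; ParkingSchema stands
# in for a wprdc_etl schema class defined elsewhere.
sample_job = {
    'package': 'some-package-id',
    'resource_name': 'Parking transactions',
    'schema': ParkingSchema,
    'source_file': 'parking.csv',
}
resource_id = push_to_datastore(sample_job, pl.FileConnector,
                                target=SOURCE_DIR + sample_job['source_file'],
                                config_string='', encoding='utf-8-sig',
                                loader_config_string='production',
                                primary_key_fields=['zone', 'start'],
                                test_mode=True, clear_first=False)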
# Example 3
    def run_pipeline(self,
                     test_mode,
                     clear_first,
                     wipe_data,
                     migrate_schema,
                     use_local_files,
                     retry_without_last_line=False,
                     ignore_empty_rows=False):
        # This is a generalization of push_to_datastore() to optionally use
        # the new FileLoader (exporting data to a file rather than just CKAN).

        # target is a filepath which is actually the source filepath.

        # The retry_without_last_line option is a way of dealing with CSV files
        # that abruptly end mid-line.
        locators_by_destination = {}
        source_file_format = destination_file_format = self.source_file.split(
            '.')[-1].lower()
        # 1) The downside to extracting the file format from the source file name
        # is that it couples the source and destination a bit too tightly.
        # One can imagine a scenario where tabular data is obtained from an API
        # and it's supposed to be uploaded as a CSV file somewhere. In this case
        # a separate "destination_file_format" would need to be specified.
        #
        # 2) While wprdc_etl uses 'CSV' as a
        # format that it sends to CKAN, I'm inclined to switch to 'csv',
        # and to uniformly lowercase all file formats.

        # Though given a format of 'geojson', the CKAN API resource_create
        # response lists format as 'GeoJSON', so CKAN is doing some kind
        # of correction.

        for destination in self.destinations:
            package_id = get_package_id(
                self, test_mode)  # This is the effective package ID,
            # taking into account whether test mode is active.

            # [ ] Maybe the use_local_files and test_mode and any other parameters should be applied in a discrete stage between initialization and running.
            # This would allow the source and destination parameters to be prepared, leaving the pipeline running to just run the pipeline.
            # However, writing the CKANFilestoreLoader is a prerequisite for this.
            #if self.source_type == 'sftp' and destination == 'ckan_filestore':

            if destination == 'ckan_filestore' and self.source_type not in [
                    'sftp', 'local', 'http'
            ]:

                # [ ] Test local file uploads to the CKAN Filestore before deleting all this logic.

                # Maybe a pipeline is not necessary to just upload a file to a CKAN resource, if the file is local!
                ua = 'rocket-etl/1.0 (+https://tools.wprdc.org/)'
                ckan = ckanapi.RemoteCKAN(site, apikey=API_KEY, user_agent=ua)

                upload_kwargs = {
                    'package_id': package_id,
                    'format': destination_file_format,
                    'url': 'dummy-value',  # ignored but required by CKAN<2.6
                }
                if self.source_type in ['local']:
                    upload_kwargs['upload'] = open(
                        self.target, 'r')  # target is the source file path
                    # Specifying the target like this only works if the file is already local.
                    # This can look like this:
                    # <_io.TextIOWrapper name='/Users/drw/WPRDC/etl/rocket-etl/source_files/ac_hd/sourcesites.geojson' mode='r' encoding='UTF-8'>
                elif not use_local_files and self.source_type in [
                        'http'
                ]:  # [ ] Test this untested block.
                    # This could also happen in default_setup, but really everything under "if destination == 'ckan_filestore'"
                    # should be moved/eliminated once a CKANFileStoreLoader is written.
                    self.target = cached_source_file_path = download_file_to_path(
                        self.source_full_url, local_dir=WAITING_ROOM_DIR)
                    upload_kwargs['upload'] = open(
                        self.target, 'r')  # target is the source file path
                elif not use_local_files and self.source_type in ['sftp']:
                    #ftp_connector = pl.SFTPConnector(host =
                    #ftp_file = ftp_connector.connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath)
                    # Test the file-based pipeline by using FileExtractor with the regular SFTP connector (as configured in the
                    # air_quality.py script for the sourcesites.geojson file) and the new CKANFilestoreLoader (which still needs
                    # to be written). CKANFilestoreLoader will only work if paired with a file-based Extractor. Also, the schema
                    # still needs to be addressed somewhere.

                    loader = pl.CKANFilestoreLoader
                    raise FileNotFoundError(
                        "To get a file via FTP and upload to the CKAN filestore, it would be best to implement a proper CKANFileStoreLoader. This necessitates modifying how pipelines work a bit."
                    )

                if not resource_exists(package_id, self.resource_name):
                    upload_kwargs['name'] = self.resource_name
                    result = ckan.action.resource_create(**upload_kwargs)
                    print(
                        'Creating new resource and uploading file to filestore...'
                    )
                else:
                    upload_kwargs['id'] = find_resource_id(
                        package_id, self.resource_name)
                    result = ckan.action.resource_update(**upload_kwargs)
                    print('Uploading file to filestore...')
            else:
                self.select_extractor()

                # BEGIN Destination-specific configuration
                if destination == 'ckan':
                    loader = pl.CKANDatastoreLoader
                elif destination == 'file':
                    loader = pl.FileLoader
                    self.upload_method = 'insert'  # Note that this will always append records to an existing file
                    # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
                elif destination == 'ckan_filestore':
                    loader = pl.CKANFilestoreLoader
                else:
                    raise ValueError(
                        f"run_pipeline does not know how to handle destination = {destination}"
                    )

                clear_first = clear_first or self.always_clear_first or migrate_schema  # If migrate_schema == True, 1) backup the data dictionary,
                # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
                # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).

                wipe_data = wipe_data or self.always_wipe_data
                if clear_first and wipe_data:
                    raise ValueError(
                        "clear_first and wipe_data should not both be True simultaneously. To clear a datastore for a job that has always_wipe_data = True, add the command-line argument 'override_wipe_data'."
                    )
                elif clear_first:
                    if destination in ['ckan']:
                        if datastore_exists(package_id, self.resource_name):
                            # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                            print("Clearing the datastore for {}".format(
                                self.resource_name)
                                  )  # Actually done by the pipeline.
                        else:
                            print(
                                "Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False."
                            )
                            clear_first = False
                elif wipe_data:
                    if destination in ['ckan']:
                        if datastore_exists(package_id, self.resource_name):
                            print("Wiping records from the datastore for {}".
                                  format(self.resource_name))
                        else:
                            print(
                                "Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False."
                            )
                            wipe_data = False

                print(
                    f'Uploading {"tabular data" if loader.has_tabular_output else "file"}...'
                )
                # END Destination-specific configuration

                try:
                    curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE, retry_without_last_line = retry_without_last_line, ignore_empty_rows = ignore_empty_rows, filters = self.filters) \
                        .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath) \
                        .extract(self.extractor, firstline_headers=True, rows_to_skip=self.rows_to_skip) \
                        .schema(self.schema) \
                        .load(loader, self.loader_config_string,
                              filepath = self.destination_file_path,
                              file_format = destination_file_format,
                              fields = self.schema().serialize_to_ckan_fields(),
                              key_fields = self.primary_key_fields,
                              package_id = package_id,
                              resource_name = self.resource_name,
                              clear_first = clear_first,
                              wipe_data = wipe_data,
                              method = self.upload_method).run()
                except FileNotFoundError:
                    if self.ignore_if_source_is_missing:
                        print(
                            "The source file for this job wasn't found, but that's not surprising."
                        )
                    else:
                        raise

            if destination in [
                    'ckan', 'ckan_filestore', 'local_monthly_archive_zipped'
            ]:
                resource_id = find_resource_id(
                    package_id, self.resource_name
                )  # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
                locators_by_destination[destination] = resource_id
            elif destination in ['file']:
                locators_by_destination[
                    destination] = self.destination_file_path
        return locators_by_destination
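
# A sketch (with a hypothetical helper name) of the clear_first/wipe_data resolution used by
# the run_pipeline variants above: job-level always_* flags and migrate_schema can force
# clearing, and the two modes are mutually exclusive.
def resolve_clear_and_wipe(clear_first, wipe_data, always_clear_first, always_wipe_data, migrate_schema):
    clear_first = clear_first or always_clear_first or migrate_schema  # schema migration implies a cleared datastore
    wipe_data = wipe_data or always_wipe_data
    if clear_first and wipe_data:
        raise ValueError("clear_first and wipe_data should not both be True simultaneously.")
    return clear_first, wipe_data

# e.g. resolve_clear_and_wipe(False, False, False, False, migrate_schema=True) ==> (True, False)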
# Example 4
    def run_pipeline(self,
                     test_mode,
                     clear_first,
                     wipe_data,
                     migrate_schema,
                     file_format='csv',
                     retry_without_last_line=False):
        # This is a generalization of push_to_datastore() to optionally use
        # the new FileLoader (exporting data to a file rather than just CKAN).

        # target is a filepath which is actually the source filepath.

        # The retry_without_last_line option is a way of dealing with CSV files
        # that abruptly end mid-line.
        locators_by_destination = {}
        for destination in self.destinations:
            package_id = self.package if not test_mode else TEST_PACKAGE_ID  # Should this be done elsewhere?
            # [ ] Maybe the use_local_files and test_mode and any other parameters should be applied in a discrete stage between initialization and running.

            if destination == 'ckan_filestore':
                # Maybe a pipeline is not necessary to just upload a file to a CKAN resource.
                ua = 'rocket-etl/1.0 (+https://tools.wprdc.org/)'
                ckan = ckanapi.RemoteCKAN(site, apikey=API_KEY, user_agent=ua)

                source_file_format = self.source_file.split('.')[-1].lower()
                # While wprdc_etl uses 'CSV' as a
                # format that it sends to CKAN, I'm inclined to switch to 'csv',
                # and to uniformly lowercase all file formats.

                # Though given a format of 'geojson', the CKAN API resource_create
                # response lists format as 'GeoJSON', so CKAN is doing some kind
                # of correction.
                upload_kwargs = {
                    'package_id': package_id,
                    'format': source_file_format,
                    'url': 'dummy-value',  # ignored but required by CKAN<2.6
                    'upload': open(self.target, 'r')
                }  # target is the source file path
                if not resource_exists(package_id, self.resource_name):
                    upload_kwargs['name'] = self.resource_name
                    result = ckan.action.resource_create(**upload_kwargs)
                    print(
                        'Creating new resource and uploading file to filestore...'
                    )
                else:
                    upload_kwargs['id'] = find_resource_id(
                        package_id, self.resource_name)
                    result = ckan.action.resource_update(**upload_kwargs)
                    print('Uploading file to filestore...')
            elif destination == 'local_monthly_archive_zipped':
                # [ ] Break all of this LMAZ code off into one or more separate functions.
                loader = pl.FileLoader
                self.upload_method = 'insert'
                # Try using destination_directory and destination_file_path for the archives.
                # Append the year-month to the filename (before the extension).
                pathparts = self.destination_file_path.split('/')
                filenameparts = pathparts[-1].split('.')
                now = datetime.now()
                last_month_num = (now.month - 1) % 12
                year = now.year
                if last_month_num == 0:
                    last_month_num = 12
                if last_month_num == 12:  # If going back to December,
                    year -= 1  # set year to last year
                last_month_str = str(last_month_num)
                if len(last_month_str) == 1:
                    last_month_str = '0' + last_month_str
                regex_parts = list(filenameparts)
                filenameparts[-2] += "_{}-{}".format(year, last_month_str)
                timestamped_filename = '.'.join(filenameparts)

                regex_parts[-2] += "_{}".format(year)
                regex_pattern = '.'.join(
                    regex_parts[:-1]
                )  # This is for matching filenames that should be rolled up into the same year of data.
                zip_file_name = regex_pattern + '.zip'

                pathparts[-1] = timestamped_filename
                destination_file_path = '/'.join(
                    pathparts)  # This is the new timestamped filepath.
                ic(self.destination_file_path)
                # Store the file locally

                # Zip the files with matching year in filename.
                destination_directory = '/'.join(
                    self.destination_file_path.split('/')[:-1])
                all_files = os.listdir(destination_directory)
                list_of_files_to_compress = sorted(
                    [f for f in all_files if re.match(regex_pattern, f)])

                #cp synthesized-liens.csv zipped/liens-with-current-status-beta.csv
                zip_file_path = destination_directory + '/' + zip_file_name
                #zip zipped/liens-with-current-status-beta.zip zipped/liens-with-current-status-beta.csv
                import zipfile
                process_zip = zipfile.ZipFile(zip_file_path, 'w')
                for original_file_name in list_of_files_to_compress:
                    file_to_zip = destination_directory + '/' + original_file_name
                    process_zip.write(file_to_zip,
                                      original_file_name,
                                      compress_type=zipfile.ZIP_DEFLATED)
                process_zip.close()
                # Upload the file at zip_file_path to the appropriate resource.
                #####resource_id =  # [ ] This lmaz option needs to be finished.

                # Delete the file at zip_file_path.
                os.remove(zip_file_path)
                # Are the parameters that are being passed to curr_pipeline below correct for uploading the zipped archive? ########
            else:
                self.select_extractor()
                if destination == 'ckan':
                    loader = pl.CKANDatastoreLoader
                elif destination == 'file':
                    loader = pl.FileLoader
                    self.upload_method = 'insert'  # Note that this will always append records to an existing file
                    # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
                else:
                    raise ValueError(
                        "run_pipeline does not know how to handle destination = {}"
                        .format(destination))

                clear_first = clear_first or self.always_clear_first or migrate_schema  # If migrate_schema == True, 1) backup the data dictionary,
                # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
                # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).

                wipe_data = wipe_data or self.always_wipe_data
                if clear_first and wipe_data:
                    raise ValueError(
                        "clear_first and wipe_data should not both be True simultaneously."
                    )
                elif clear_first:
                    if destination in ['ckan']:
                        if datastore_exists(package_id, self.resource_name):
                            # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                            print("Clearing the datastore for {}".format(
                                self.resource_name)
                                  )  # Actually done by the pipeline.
                        else:
                            print(
                                "Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False."
                            )
                            clear_first = False
                elif wipe_data:
                    if destination in ['ckan']:
                        if datastore_exists(package_id, self.resource_name):

                            print("Wiping records from the datastore for {}".
                                  format(self.resource_name))
                        else:
                            print(
                                "Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False."
                            )
                            wipe_data = False

                # Upload data to datastore
                print('Uploading tabular data...')
                try:
                    curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline', log_status=False, chunk_size=4000, settings_file=SETTINGS_FILE, retry_without_last_line = retry_without_last_line) \
                        .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath) \
                        .extract(self.extractor, firstline_headers=True) \
                        .schema(self.schema) \
                        .load(loader, self.loader_config_string,
                              filepath = self.destination_file_path,
                              file_format = file_format,
                              fields = self.schema().serialize_to_ckan_fields(),
                              key_fields = self.primary_key_fields,
                              package_id = package_id,
                              resource_name = self.resource_name,
                              clear_first = clear_first,
                              wipe_data = wipe_data,
                              method = self.upload_method).run()
                except FileNotFoundError:
                    if self.ignore_if_source_is_missing:
                        print(
                            "The source file for this job wasn't found, but that's not surprising."
                        )
                    else:
                        raise

            if destination in [
                    'ckan', 'ckan_filestore', 'local_monthly_archive_zipped'
            ]:
                resource_id = find_resource_id(
                    package_id, self.resource_name
                )  # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
                locators_by_destination[destination] = resource_id
            elif destination in ['file']:
                locators_by_destination[
                    destination] = self.destination_file_path
        return locators_by_destination
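
# A standalone sketch of the timestamping logic inlined in the 'local_monthly_archive_zipped'
# branch above; the helper name is hypothetical. It appends last month's "YYYY-MM" to the
# filename stem and returns the per-year prefix and zip name used to roll matching files together.
from datetime import datetime

def timestamped_and_zip_names(destination_file_path, now=None):
    now = now or datetime.now()
    if now.month > 1:
        year, last_month = now.year, now.month - 1
    else:  # January rolls back to December of the previous year
        year, last_month = now.year - 1, 12
    pathparts = destination_file_path.split('/')
    filenameparts = pathparts[-1].split('.')
    regex_parts = list(filenameparts)
    filenameparts[-2] += "_{}-{:02d}".format(year, last_month)
    regex_parts[-2] += "_{}".format(year)
    regex_pattern = '.'.join(regex_parts[:-1])  # matches every archived file for that year
    timestamped_file_path = '/'.join(pathparts[:-1] + ['.'.join(filenameparts)])
    return timestamped_file_path, regex_pattern, regex_pattern + '.zip'

# e.g. for '/backups/liens.csv' run in February 2020, this yields
# ('/backups/liens_2020-01.csv', 'liens_2020', 'liens_2020.zip').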
# Example 5
    def run_pipeline(self,
                     clear_first,
                     wipe_data,
                     migrate_schema,
                     retry_without_last_line=False,
                     ignore_empty_rows=False):
        # target is a filepath which is actually the source filepath.

        # The retry_without_last_line option is a way of dealing with CSV files
        # that abruptly end mid-line.
        locators_by_destination = {}

        if self.destination == 'ckan_link':  # Handle the special case of a resource that is just a hyperlink,
            # which doesn't need a full pipeline at this point.
            from engine.credentials import site, API_key
            ckan = ckanapi.RemoteCKAN(site, apikey=API_key)
            resource_id = find_resource_id(self.package_id, self.resource_name)
            if resource_id is None:
                resource_as_dict = ckan.action.resource_create(
                    package_id=self.package_id,
                    url=self.source_full_url,
                    format='HTML',
                    name=self.resource_name)
                resource_id = resource_as_dict['id']
            else:
                resource_as_dict = ckan.action.resource_update(
                    id=resource_id,
                    url=self.source_full_url,
                    format='HTML',
                    name=self.resource_name)

            locators_by_destination[self.destination] = resource_id
            return locators_by_destination

        source_file_format = self.source_file.split('.')[-1].lower()
        if self.destination_file is not None:
            self.destination_file_format = self.destination_file.split(
                '.')[-1].lower()
        else:
            self.destination_file_format = source_file_format

        # 2) While wprdc_etl uses 'CSV' as a
        # format that it sends to CKAN, I'm inclined to switch to 'csv',
        # and uniformly lowercase all file formats.

        # Though given a format of 'geojson', the CKAN API resource_create
        # response lists format as 'GeoJSON', so CKAN is doing some kind
        # of correction.

        # [ ] Maybe test_mode and any other parameters should be applied in a discrete stage between initialization and running.
        # This would allow the source and destination parameters to be prepared, leaving the pipeline running to just run the pipeline.

        # BEGIN Destination-specific configuration
        # A) First configure the loader. It might make more sense to move this to configure_pipeline_with_options().
        if self.destination == 'ckan':
            self.loader = pl.CKANDatastoreLoader
        elif self.destination == 'file':
            # The tabularity of the data (that is, whether the loader is going
            # to be handed records (a list of dicts) or a file/file-like object)
            # should determine which kind of file loader will be used.
            if self.destination_file_format is None:
                raise ValueError(
                    "Destination == 'file' but self.destination_file_format is None!"
                )
            elif self.destination_file_format.lower() in [
                    'csv', 'json'
            ] and self.compressed_file_to_extract is None:
                self.loader = pl.TabularFileLoader  # Isn't this actually very CSV-specific, given the write_or_append_to_csv_file function it uses?
                self.upload_method = 'insert'  # Note that this will always append records to an existing file
                # unless 'always_clear_first' (or 'always_wipe_data') is set to True.
            else:
                self.loader = pl.NontabularFileLoader
        elif self.destination == 'ckan_filestore':
            self.loader = pl.CKANFilestoreLoader
        elif self.destination is None:
            return {}  # locators_by_destination should be empty and the pipeline should be skipped
            # for these cases where there is no destination (like in snow_plow_geojson.py when
            # no new files to upload are found).
        else:
            raise ValueError(
                f"run_pipeline does not know how to handle destination = {self.destination}"
            )

        # B) Then do some boolean operations on clear_first, self.always_clear_first, migrate_schema, and wipe_data.
        clear_first = clear_first or self.always_clear_first or migrate_schema  # If migrate_schema == True, 1) backup the data dictionary,
        # 2) delete the Data Table view, 3) clear the datastore, 4) run the job, and 5) try to restore the data dictionary.
        # It just seems cleaner to do most of that in launchpad.py (probably because there's so little in the main() function).

        wipe_data = wipe_data or self.always_wipe_data
        if clear_first and wipe_data:
            raise ValueError(
                "clear_first and wipe_data should not both be True simultaneously. To clear a datastore for a job that has always_wipe_data = True, add the command-line argument 'override_wipe_data'."
            )
        elif clear_first:
            if self.destination in ['ckan']:
                if datastore_exists(self.package_id, self.resource_name):
                    # It should be noted that this will wipe out any integrated data_dictionary (but it's being preserved at the launchpad.py level).
                    print("Clearing the datastore for {}".format(
                        self.resource_name))  # Actually done by the pipeline.
                else:
                    print(
                        "Since it makes no sense to try to clear a datastore that does not exist, clear_first is being toggled to False."
                    )
                    clear_first = False
        elif wipe_data:
            if self.destination in ['ckan']:
                if datastore_exists(self.package_id, self.resource_name):
                    print("Wiping records from the datastore for {}".format(
                        self.resource_name))
                else:
                    print(
                        "Since it makes no sense to try to wipe the records from a datastore that does not exist, wipe_data is being toggled to False."
                    )
                    wipe_data = False

        print(
            f'Loading {"tabular data" if self.loader.has_tabular_output else "file"}...'
        )
        # END Destination-specific configuration

        try:
            curr_pipeline = pl.Pipeline(self.job_code + ' pipeline', self.job_code + ' Pipeline', log_status=False, chunk_size=1000, settings_file=SETTINGS_FILE, retry_without_last_line = retry_without_last_line, ignore_empty_rows = ignore_empty_rows, filters = self.filters) \
                .connect(self.source_connector, self.target, config_string=self.connector_config_string, encoding=self.encoding, local_cache_filepath=self.local_cache_filepath, verify_requests=self.verify_requests, fallback_host=self.source_site) \
                .extract(self.extractor, firstline_headers=True, rows_to_skip=self.rows_to_skip, compressed_file_to_extract=self.compressed_file_to_extract) \
                .schema(self.schema) \
                .load(self.loader, self.loader_config_string,
                      filepath = self.destination_file_path,
                      file_format = self.destination_file_format,
                      fields = self.schema().serialize_to_ckan_fields(),
                      key_fields = self.primary_key_fields,
                      package_id = self.package_id,
                      resource_name = self.resource_name,
                      clear_first = clear_first,
                      wipe_data = wipe_data,
                      method = self.upload_method).run()
        except FileNotFoundError:
            if self.ignore_if_source_is_missing:
                print(
                    "The source file for this job wasn't found, but that's not surprising."
                )
            else:
                raise

        if self.destination in ['ckan', 'ckan_filestore']:
            resource_id = find_resource_id(
                self.package_id, self.resource_name
            )  # This IS determined in the pipeline, so it would be nice if the pipeline would return it.
            locators_by_destination[self.destination] = resource_id
        elif self.destination in ['file']:
            locators_by_destination[
                self.destination] = self.destination_file_path
        return locators_by_destination
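
# A sketch (with a hypothetical helper name) restating the loader-selection rule from the
# example above: CKAN datastore jobs get CKANDatastoreLoader, filestore jobs get
# CKANFilestoreLoader, and 'file' jobs get the tabular loader only for uncompressed csv/json.
def select_loader(destination, destination_file_format=None, compressed_file_to_extract=None):
    if destination == 'ckan':
        return pl.CKANDatastoreLoader
    if destination == 'ckan_filestore':
        return pl.CKANFilestoreLoader
    if destination == 'file':
        if destination_file_format is None:
            raise ValueError("Destination == 'file' but destination_file_format is None!")
        if destination_file_format.lower() in ['csv', 'json'] and compressed_file_to_extract is None:
            return pl.TabularFileLoader  # writes/appends records to a CSV-like file
        return pl.NontabularFileLoader   # for non-tabular output (whole files)
    if destination is None:
        return None  # no destination ==> skip the pipeline entirely
    raise ValueError(f"run_pipeline does not know how to handle destination = {destination}")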