def run(self):

        # Running this task doesn't delete anything from CKAN itself, so require the --force flag to run it
        if not self.force:
            raise Exception('Warning: this class does not delete CKAN records. Use --force to run it.')

        # Build a dict of all modules and collections
        # We then retrieve the appropriate collection from the records module name (AudTable)
        # Exclude the MongoDeleteTask though
        collections = {cls.module: cls(None).get_collection() for cls in MongoTask.__subclasses__()}

        ke_data = KEParser(self.input().open('r'), file_path=self.input().path, schema_file=self.keemu_schema_file)

        for record in self.iterate_data(ke_data):

            module = record.get('AudTable')
            irn = record.get('AudKey')

            try:
                collection = collections[module]
            except KeyError:
                log.debug('Skipping eaudit record for %s' % module)
                # We do not have a collection for this module - skip to next record
                continue
            else:
                log.info('Deleting record %s(%s)' % (module, irn))
                self.delete(collection, irn)

        self.mark_complete()
    def bulk_update(self, ke_data):

        bulk = self.collection.initialize_unordered_bulk_op()

        count = 0

        for record in self.iterate_data(ke_data):

            # Find and replace doc - inserting if it doesn't exist
            bulk.find({'_id': record['_id']}).upsert().replace_one(record)
            count += 1

            # Bulk ops can hit out-of-memory errors (I'm seeing them at ~400,000+ operations)
            # So execute the bulk op in stages, whenever bulk_op_size is reached
            if count % self.bulk_op_size == 0:
                log.info('Executing bulk op')
                bulk.execute()
                bulk = self.collection.initialize_unordered_bulk_op()

        try:
            bulk.execute()
        except InvalidOperation:
            # If there are no remaining operations to execute, ignore the error -
            # they have already been executed inside the loop above
            pass
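
The bulk API used above (initialize_unordered_bulk_op) comes from older pymongo releases. Below is a minimal sketch of the same staged upsert pattern with the newer bulk_write()/ReplaceOne API; the staged_upsert name and the bulk_op_size default are illustrative assumptions, not project code.

from pymongo import ReplaceOne


def staged_upsert(collection, records, bulk_op_size=100000):
    """Upsert records in stages so one huge bulk op doesn't exhaust memory."""
    ops = []
    for record in records:
        # Find and replace the doc, inserting it if it doesn't exist
        ops.append(ReplaceOne({'_id': record['_id']}, record, upsert=True))
        if len(ops) >= bulk_op_size:
            collection.bulk_write(ops, ordered=False)
            ops = []
    # Flush any remaining operations
    if ops:
        collection.bulk_write(ops, ordered=False)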
def main():

    update_markers = mongo_get_update_markers()

    # Make sure the update markers include all of the mongo bulk task classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate each task and get its family name, not just the class name,
        e.g. MongoDeleteTask => DeleteTask
        @param tasks:
        @return:
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [d for d in get_export_file_dates() if d not in update_markers.keys()]

    # Run setup_interface_logging to ensure luigi's logging is configured
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()

    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:

        log.info('Processing date %s', export_date)
        # We only need to call the mongo delete task, as all the other tasks are requirements of it
        # NB: This doesn't delete anything from CKAN - if that's needed, change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))
        w.run()
        w.stop()
    def run(self):

        mongo_db = mongo_client_db()
        collection = MongoCatalogueTask(date=None).collection_name
        cites_species = get_cites_species()

        # Set cites=true flag
        cites_records_cursor = mongo_db[collection].update({'DarScientificName': {'$in': cites_species}}, {'$set': {'cites': True}}, multi=True)
        log.info('Updated %s catalogue records as CITES', cites_records_cursor['nModified'])
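
On pymongo 3+, where update(..., multi=True) is deprecated, the same flag update could be written with update_many(); a sketch reusing the names from the snippet above (mongo_db, collection, cites_species, log):

        result = mongo_db[collection].update_many(
            {'DarScientificName': {'$in': cites_species}},
            {'$set': {'cites': True}})
        log.info('Updated %s catalogue records as CITES', result.modified_count)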
    def on_success(self):
        """
        On completion, add indexes
        @return: None
        """

        self.collection = self.get_collection()

        log.info("Adding exportFileDate index")

        self.collection.ensure_index('exportFileDate')
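
ensure_index() was deprecated in pymongo 3 and removed in pymongo 4; the equivalent hook using create_index(), which is already a no-op when the index exists, would look roughly like this sketch:

    def on_success(self):
        """
        On completion, add indexes
        @return: None
        """
        self.collection = self.get_collection()
        log.info("Adding exportFileDate index")
        # create_index does nothing if the index is already present
        self.collection.create_index('exportFileDate')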
Example #8
    def write(self, df):

        log.info("Saving records to CKAN resource %s", self.resource_id)

        # Convert all empty/null values to None - so they will be NULL values in postgres
        # Ensure any float fields with value 0.0 are actually None
        for col, np_type in self.columns.iteritems():

            if np_type.startswith('float'):
                df[col][df[col] == 0.0] = None
            else:
                # BUGFIX: Multimedia fields are being populated with empty string rather than NULL
                df[col][df[col].astype(str) == ''] = None

        # Loop through all the dataframe columns, removing internal ones (fields starting with _)
        for col in df:
            if col.startswith('_'):
                df.drop(col, axis=1, inplace=True)

        # Convert all NaN to None
        df = df.where(pd.notnull(df), None)

        # Convert records to dictionary
        records = df.to_dict(outtype='records')
        datastore_params = {
            'resource_id': self.resource_id,
            'records': records,
            'force': True
            # 'primary_key': '_id'
        }

        # Check that the data doesn't contain invalid chars
        try:
            json.dumps(datastore_params).encode('ascii')
        except UnicodeDecodeError:
            # At least one of the records contains invalid chars
            # Loop through, validating each of the records

            validated_records = []

            for i, record in enumerate(datastore_params['records']):
                try:
                    json.dumps(record).encode('ascii')
                except UnicodeDecodeError:
                    log.critical('Error encoding record: %s', ' '.join(['%s=%s' % (field, value) for field, value in record.iteritems() if value]))
                else:
                    validated_records.append(record)

            datastore_params['records'] = validated_records

        self.remote_ckan.action.datastore_upsert(**datastore_params)
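
The chained assignments above (df[col][mask] = None) can trigger pandas' SettingWithCopyWarning on newer versions; a sketch of the same null-normalisation using .loc, assuming the same self.columns mapping of column name to numpy type string:

        for col, np_type in self.columns.iteritems():
            if np_type.startswith('float'):
                # Float fields with value 0.0 become NULL in postgres
                df.loc[df[col] == 0.0, col] = None
            else:
                # Empty strings become NULL rather than ''
                df.loc[df[col].astype(str) == '', col] = None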
Example #9
def ckan_delete(remote_ckan, mongo_record):

    # To avoid circular imports, import the tasks we need to check here
    # Dataset tasks are dependent on the DeleteTask
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # By default, use SpecimenDatasetAPITask
    task_cls = SpecimenDatasetAPITask

    # Override default class if is Index Lot or Artefact
    for t in [IndexLotDatasetAPITask, ArtefactDatasetAPITask]:
        if t.record_type == mongo_record['ColRecordType']:
            task_cls = t
            break

    # Get the primary key
    for col in task_cls.columns:
        if col[1] == task_cls.datastore['primary_key']:
            primary_key_field = col
            break

    # Get the source primary key - this needs to be split on . as we have added the collection name
    ke_primary_key = primary_key_field[0].split('.')[1]

    # The name of the primary key field used in CKAN
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
    else:
        resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
        if resource_id:
            try:
                # And delete the record from the datastore
                log.info('Deleting record from CKAN where %s=%s' %
                         (ckan_primary_key, primary_key_value))
                remote_ckan.action.datastore_delete(
                    id=resource_id,
                    filters={ckan_primary_key: primary_key_value},
                    force=True)
            except ckanapi.CKANAPIError:
                # We don't care if the record isn't found
                log.error('Record not found')
        else:
            log.error('No resource ID')
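
A hypothetical usage sketch of ckan_delete(); the CKAN URL, API key, mongo database name and IRN are placeholders, not values from this project:

import ckanapi
from pymongo import MongoClient

remote_ckan = ckanapi.RemoteCKAN('http://data.example.org', apikey='my-api-key')
ecatalogue = MongoClient()['keemu']['ecatalogue']

mongo_record = ecatalogue.find_one({'_id': 1234567})
if mongo_record:
    ckan_delete(remote_ckan, mongo_record)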
Example #10
    def delete(self, collection, irn):

        # If this is an ecatalogue record, try and delete from CKAN
        if collection.name == 'ecatalogue':

            # Load the record from mongo
            mongo_record = collection.find_one({'_id': int(irn)})

            if mongo_record:
                ckan_delete(self.remote_ckan, mongo_record)
            else:
                log.info('Record %s does not exist. Skipping delete.' % irn)

        # And call the Mongo Delete task delete() method to remove the record from mongodb
        super(DeleteAPITask, self).delete(collection, irn)
    def run(self):

        mongo_db = mongo_client_db()
        collection = MongoCatalogueTask(date=None).collection_name
        cites_species = get_cites_species()

        # Set cites=true flag
        cites_records_cursor = mongo_db[collection].update(
            {'DarScientificName': {
                '$in': cites_species
            }}, {'$set': {
                'cites': True
            }},
            multi=True)
        log.info('Updated %s catalogue records as CITES',
                 cites_records_cursor['nModified'])
    def get_or_create_resource(self):
        """

        Either load a resource object
        Or if it doesn't exist, create the dataset package, and datastore

        @param package: params to create the package
        @param datastore: params to create the datastore
        @return: CKAN resource ID
        """

        resource_id = None

        try:
            # If the package exists, retrieve the resource
            ckan_package = self.remote_ckan.action.package_show(id=self.package['name'])

            # Does a resource of the same name already exist for this dataset?
            # If it does, assign to resource_id
            for resource in ckan_package['resources']:
                if resource['name'] == self.datastore['resource']['name']:
                    self.validate_resource(resource)
                    resource_id = resource['id']

        except ckanapi.NotFound:
            log.info("Package %s not found - creating", self.package['name'])

            # Create the package
            ckan_package = self.remote_ckan.action.package_create(**self.package)

        # If we don't have the resource ID, create
        if not resource_id:
            log.info("Resource %s not found - creating", self.datastore['resource']['name'])

            self.datastore['fields'] = [{'id': col, 'type': self.numpy_to_ckan_type(np_type)} for col, np_type in self.get_output_columns().iteritems()]
            self.datastore['resource']['package_id'] = ckan_package['id']

            if self.indexed_fields:
                # Create BTREE indexes for all specified indexed fields
                self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['id'] in self.indexed_fields]
            else:
                # Create BTREE indexes for all citext fields
                self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['type'] == 'citext']

            # API call to create the datastore
            resource_id = self.remote_ckan.action.datastore_create(**self.datastore)['resource_id']

            # If this has geospatial fields, create geom columns
            if self.geospatial_fields:
                log.info("Creating geometry columns for %s", resource_id)
                self.geospatial_fields['resource_id'] = resource_id
                self.remote_ckan.action.create_geom_columns(**self.geospatial_fields)

            log.info("Created datastore resource %s", resource_id)

        return resource_id
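
The numpy_to_ckan_type() helper referenced above is not shown in these snippets. A hypothetical version might map numpy type strings onto postgres column types accepted by datastore_create, along these lines (the project's real mapping may well differ):

    def numpy_to_ckan_type(self, np_type):
        """Map a numpy dtype string to a postgres/CKAN datastore column type."""
        if np_type.startswith('int'):
            return 'integer'
        if np_type.startswith('float'):
            return 'float8'
        if np_type == 'bool':
            return 'bool'
        # Default string fields to case-insensitive text, matching the citext indexes above
        return 'citext'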
Example #13
def ckan_delete(remote_ckan, mongo_record):

    # To avoid circular imports, import the tasks we need to check here
    # Dataset tasks are dependent on the DeleteTask
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # By default, use SpecimenDatasetAPITask
    task_cls = SpecimenDatasetAPITask

    # Override default class if is Index Lot or Artefact
    for t in [IndexLotDatasetAPITask, ArtefactDatasetAPITask]:
        if t.record_type == mongo_record['ColRecordType']:
            task_cls = t
            break

    # Get the primary key
    for col in task_cls.columns:
        if col[1] == task_cls.datastore['primary_key']:
            primary_key_field = col
            break

    # Get the source primary key - this needs to be split on . as we have added the collection name
    ke_primary_key = primary_key_field[0].split('.')[1]

    # The name of the primary key field used in CKAN
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
    else:
        resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
        if resource_id:
            try:
                # And delete the record from the datastore
                log.info('Deleting record from CKAN where %s=%s' % (ckan_primary_key, primary_key_value))
                remote_ckan.action.datastore_delete(id=resource_id, filters={ckan_primary_key: primary_key_value}, force=True)
            except ckanapi.CKANAPIError:
                # We don't care if the record isn't found
                log.error('Record not found')
        else:
            log.error('No resource ID')
Example #14
    def on_success(self):
        """
        On completion, add indexes
        @return: None
        """
        self.collection = self.get_collection()
        log.info("Adding ecatalogue indexes")
        self.collection.ensure_index('ColRecordType')
        # Only include active records - not stubs etc.
        self.collection.ensure_index('SecRecordStatus')
        # Add index on RegRegistrationParentRef - select records with the same parent
        self.collection.ensure_index('RegRegistrationParentRef')
        # Need to filter on web publishable
        self.collection.ensure_index('AdmPublishWebNoPasswordFlag')
        # Exclude records if they do not have a GUID
        self.collection.ensure_index('AdmGUIDPreferredValue')
        # Add embargo date index
        self.collection.ensure_index('RealEmbargoDate')
        super(MongoCatalogueTask, self).on_success()
    def run(self):
        # Do not run if this is a full export date - all non-publishable records
        # will already have been removed
        if int(self.full_export_date) == int(self.date):
            log.info("No records to unpublish for full exports")
            self.mark_complete()
            return
        collection = self.output().get_collection('ecatalogue')
        # We only care about records whose status has changed in the past week (6 days to be sure)
        date_object = datetime.strptime(str(self.date), '%Y%m%d')
        q = dict(AdmPublishWebNoPasswordFlag='N',
                 exportFileDate=self.date,
                 ISODateInserted={'$gte': date_object - timedelta(days=6)})
        cursor = collection.find(q)
        log.info('%s records to unpublish', cursor.count())

        for record in cursor:
            ckan_delete(self.remote_ckan, record)

        # And mark the object as complete
        self.mark_complete()
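
As a worked illustration of the filter above (the export date is a placeholder), a date of 20140815 resolves to the following query document:

from datetime import datetime, timedelta

date_object = datetime.strptime('20140815', '%Y%m%d')
q = {
    'AdmPublishWebNoPasswordFlag': 'N',
    'exportFileDate': 20140815,
    # i.e. records inserted on or after 2014-08-09 00:00:00
    'ISODateInserted': {'$gte': date_object - timedelta(days=6)},
}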
    def run(self):
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            if field_type.startswith('string'):
                field_arr = field_arr.astype(np.str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(np.str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.NaN)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(*self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db, self.collection_name, self.query, query_fields, field_types, block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # so the masked value doesn't get used. As the masked array is shared between
                # blocks, an empty field would otherwise be populated with values from the previous block
                catalogue_block = [_fill_field(arr, field_types[i]) for i, arr in enumerate(catalogue_block)]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()
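
A toy illustration (not project data) of the block-to-DataFrame step above: Monary returns one array per queried field, np.matrix(...).transpose() turns that into one row per record, and the mixed dtypes are upcast to a common (string) type, which is why integer columns have to be re-cast afterwards:

import numpy as np
import pandas as pd

# One array per field, for a block of three records
catalogue_block = [np.array([1, 2, 3]), np.array(['a', 'b', 'c'])]
df_cols = ('taxonomy_irn', 'name')

df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)
# Everything was upcast to strings, so cast the irn column back to an integer type
df['taxonomy_irn'] = df['taxonomy_irn'].astype('int32')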
    def on_success(self):

        log.info("Import CSV file with:")
        log.info("COPY \"{resource_id}\" (\"{cols}\") FROM '{path}' DELIMITER ',' CSV ENCODING 'UTF8';".format(
            resource_id=self.resource_id,
            cols='","'.join(col for col in self.get_output_columns()),
            path=self.path
        ))

        log.info("And update full text index:")
        log.info("paster update-fulltext -i \"{resource_id}\" -c /vagrant/etc/default/development.ini".format(
            resource_id=self.resource_id,
        ))

        return super(DatasetCSVTask, self).complete()
Example #18
    def on_success(self):

        log.info("Import CSV file with:")
        log.info(
            "COPY \"{resource_id}\" (\"{cols}\") FROM '{path}' DELIMITER ',' CSV ENCODING 'UTF8';"
            .format(resource_id=self.resource_id,
                    cols='","'.join(col for col in self.get_output_columns()),
                    path=self.path))

        log.info("And update full text index:")
        log.info(
            "paster update-fulltext -i \"{resource_id}\" -c /vagrant/etc/default/development.ini"
            .format(resource_id=self.resource_id, ))

        return super(DatasetCSVTask, self).complete()
Example #19
    def batch_insert(self, ke_data):
        def _insert(batch):

            try:
                self.collection.insert(batch)
            except DuplicateKeyError:
                # Duplicate key error - KE export does duplicate some records
                # So switch to bulk upsert for this operation

                log.error('Duplicate key error - switching to upsert')

                bulk = self.collection.initialize_unordered_bulk_op()
                for batch_record in batch:
                    bulk.find({
                        '_id': batch_record['_id']
                    }).upsert().replace_one(batch_record)

                bulk.execute()

        batch = []

        for record in self.iterate_data(ke_data):

            if self.batch_size:
                batch.append(record)

                # If the batch length equals the batch size, commit and clear the batch
                if len(batch) % self.batch_size == 0:
                    log.info('Submitting batch')
                    _insert(batch)
                    batch = []

            else:
                self.collection.insert(record)

        # Add any records remaining in the batch
        if batch:
            _insert(batch)
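
A sketch of the same duplicate-tolerant batch insert on pymongo 3+, where insert() and initialize_unordered_bulk_op() are replaced by insert_many() and bulk_write(); the standalone _insert helper mirrors the nested function above:

import logging

from pymongo import ReplaceOne
from pymongo.errors import BulkWriteError

log = logging.getLogger(__name__)


def _insert(collection, batch):
    try:
        collection.insert_many(batch, ordered=False)
    except BulkWriteError:
        # The KE export duplicates some records - fall back to upserts
        log.error('Duplicate key error - switching to upsert')
        collection.bulk_write(
            [ReplaceOne({'_id': r['_id']}, r, upsert=True) for r in batch],
            ordered=False)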
    def run(self):
        # Do not run if this is a full export date - all non-publishable records
        # will already have been removed
        if int(self.full_export_date) == int(self.date):
            log.info("No records to unpublish for full exports")
            self.mark_complete()
            return
        collection = self.output().get_collection('ecatalogue')
        # We only care about records whose status has changed in the past week (6 days to be sure)
        date_object = datetime.strptime(str(self.date), '%Y%m%d')
        q = dict(
            AdmPublishWebNoPasswordFlag='N',
            exportFileDate=self.date,
            ISODateInserted={'$gte': date_object - timedelta(days=6)}
        )
        cursor = collection.find(q)
        log.info('%s records to unpublish', cursor.count())

        for record in cursor:
            ckan_delete(self.remote_ckan, record)

        # And mark the object as complete
        self.mark_complete()
    def batch_insert(self, ke_data):

        def _insert(batch):

            try:
                self.collection.insert(batch)
            except DuplicateKeyError:
                # Duplicate key error - KE export does duplicate some records
                # So switch to bulk upsert for this operation

                log.error('Duplicate key error - switching to upsert')

                bulk = self.collection.initialize_unordered_bulk_op()
                for batch_record in batch:
                    bulk.find({'_id': batch_record['_id']}).upsert().replace_one(batch_record)

                bulk.execute()

        batch = []

        for record in self.iterate_data(ke_data):

            if self.batch_size:
                batch.append(record)

                # If the batch length equals the batch size, commit and clear the batch
                if len(batch) % self.batch_size == 0:
                    log.info('Submitting batch')
                    _insert(batch)
                    batch = []

            else:
                self.collection.insert(record)

        # Add any records remaining in the batch
        if batch:
            _insert(batch)
Example #22
    def iterate_data(self, ke_data):
        """
        Iterate through the data
        @return:
        """
        for record in ke_data:

            status = ke_data.get_status()

            if status:
                log.info(status)

            # Use the IRN as _id
            record['_id'] = record['irn']

            try:
                # Do not process if unprocessed flag is set
                if not self.unprocessed:
                    record = self.process_record(record)

            except InvalidRecordException:
                continue
            else:
                yield record
    def run(self):

        # Running this task doesn't delete anything from CKAN itself, so require the --force flag to run it
        if not self.force:
            raise Exception(
                'Warning: this class does not delete CKAN records. Use --force to run it.'
            )

        # Build a dict of all modules and collections
        # We then retrieve the appropriate collection from the records module name (AudTable)
        # Exclude the MongoDeleteTask though
        collections = {
            cls.module: cls(None).get_collection()
            for cls in MongoTask.__subclasses__()
        }

        ke_data = KEParser(self.input().open('r'),
                           file_path=self.input().path,
                           schema_file=self.keemu_schema_file)

        for record in self.iterate_data(ke_data):

            module = record.get('AudTable')
            irn = record.get('AudKey')

            try:
                collection = collections[module]
            except KeyError:
                log.debug('Skipping eaudit record for %s' % module)
                # We do not have a collection for this module - skip to next record
                continue
            else:
                log.info('Deleting record %s(%s)' % (module, irn))
                self.delete(collection, irn)

        self.mark_complete()
Example #25
    def run(self):
        if int(self.full_export_date) == int(self.date):
            log.info("No records to delete for full exports")
            self.mark_complete()
            return
        super(DeleteAPITask, self).run()
Example #26
    def get_or_create_resource(self):
        """

        Either load a resource object
        Or if it doesn't exist, create the dataset package, and datastore

        @param package: params to create the package
        @param datastore: params to create the datastore
        @return: CKAN resource ID
        """

        resource_id = None

        try:
            # If the package exists, retrieve the resource
            ckan_package = self.remote_ckan.action.package_show(
                id=self.package['name'])

            # Does a resource of the same name already exist for this dataset?
            # If it does, assign to resource_id
            for resource in ckan_package['resources']:
                if resource['name'] == self.datastore['resource']['name']:
                    self.validate_resource(resource)
                    resource_id = resource['id']

        except ckanapi.NotFound:
            log.info("Package %s not found - creating", self.package['name'])

            # Create the package
            ckan_package = self.remote_ckan.action.package_create(
                **self.package)

        # If we don't have the resource ID, create
        if not resource_id:
            log.info("Resource %s not found - creating",
                     self.datastore['resource']['name'])

            self.datastore['fields'] = [
                {'id': col, 'type': self.numpy_to_ckan_type(np_type)}
                for col, np_type in self.get_output_columns().iteritems()
            ]
            self.datastore['resource']['package_id'] = ckan_package['id']

            if self.indexed_fields:
                # Create BTREE indexes for all specified indexed fields
                self.datastore['indexes'] = [
                    col['id'] for col in self.datastore['fields']
                    if col['id'] in self.indexed_fields
                ]
            else:
                # Create BTREE indexes for all citext fields
                self.datastore['indexes'] = [
                    col['id'] for col in self.datastore['fields']
                    if col['type'] == 'citext'
                ]

            # API call to create the datastore
            resource_id = self.remote_ckan.action.datastore_create(
                **self.datastore)['resource_id']

            # If this has geospatial fields, create geom columns
            if self.geospatial_fields:
                log.info("Creating geometry columns for %s", resource_id)
                self.geospatial_fields['resource_id'] = resource_id
                self.remote_ckan.action.create_geom_columns(
                    **self.geospatial_fields)

            log.info("Created datastore resource %s", resource_id)

        return resource_id
Example #27
def main():

    update_markers = mongo_get_update_markers()

    # Make sure the update markers include all of the mongo bulk task classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate each task and get its family name, not just the class name,
        e.g. MongoDeleteTask => DeleteTask
        @param tasks:
        @return:
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (
            date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [
        d for d in get_export_file_dates() if d not in update_markers.keys()
    ]

    # Run setup_interface_logging to ensure luigi's logging is configured
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()

    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:

        log.info('Processing date %s', export_date)
        # We only need to call the mongo delete task, as all the other tasks are requirements of it
        # NB: This doesn't delete anything from CKAN - if that's needed, change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))
        w.run()
        w.stop()
Example #28
    def run(self):
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            if field_type.startswith('string'):
                field_arr = field_arr.astype(np.str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(np.str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.NaN)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(
                *self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db,
                                             self.collection_name,
                                             self.query,
                                             query_fields,
                                             field_types,
                                             block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # so the masked value doesn't get used. As the masked array is shared between
                # blocks, an empty field would otherwise be populated with values from the previous block
                catalogue_block = [
                    _fill_field(arr, field_types[i])
                    for i, arr in enumerate(catalogue_block)
                ]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(),
                                  columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()