Example #1
    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())
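
Examples 1 and 2 assume a module-level `config` object with a `[mongo]` section. Below is a minimal, self-contained sketch of what `config.get('mongo', ...)` expects, assuming a Python 2 ConfigParser; the host and database values are hypothetical.

from ConfigParser import ConfigParser
from StringIO import StringIO

# Hypothetical settings; the real project would read these from its own config file
config = ConfigParser()
config.readfp(StringIO("[mongo]\nhost = localhost\ndatabase = keemu\n"))

print(config.get('mongo', 'host'))      # localhost
print(config.get('mongo', 'database'))  # keemu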
Example #2
    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())
Example #3
    def output(self):
        return MongoTarget(database=self.database, update_id=self.task_id)
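
Example 3 returns a `MongoTarget` from `output()`; in luigi, a task counts as complete once the target returned by `output()` exists. The sketch below is a hypothetical task wired up this way, assuming `MongoTarget` exposes the `exists()`/`touch()` methods used in the larger examples; the task and parameter names are invented.

import luigi

class MarkerBackedTask(luigi.Task):  # hypothetical task name
    database = luigi.Parameter(default='keemu')  # hypothetical default

    def output(self):
        # MongoTarget as used in these examples: a marker record keyed by update_id
        return MongoTarget(database=self.database, update_id=self.task_id)

    def run(self):
        # ... do the actual work ...
        # Writing the marker makes output().exists() - and therefore complete() - return True
        self.output().touch()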
Example #4
class DatasetTask(APITask):
    """
    Class for processing data from MongoDB into a dataset
    If a date is set, this task requires all mongo export files for that date to have been imported into MongoDB
    """
    ### Parameters

    # MongoDB params
    collection_name = 'ecatalogue'

    # Default record type - used to select records in query
    record_type = None

    has_run = False

    @abc.abstractproperty
    def columns(self):
        """
        Columns to use from mongoDB
        @return: list
        """
        return None

    @abc.abstractmethod
    def output(self):
        """
        Output method
        This overrides luigi.task.output, to ensure it is set
        """
        return None

    @property
    def query(self):
        """
        Query object for selecting data from mongoDB
        @return: dict
        """

        query = OrderedDict()

        if self.record_type:
            query["ColRecordType"] = self.record_type

        # Exclude unwanted record statuses - this is much faster than querying for an active status or a missing field
        query["SecRecordStatus"] = {
            '$nin': [
                "DELETE", "DELETE-MERGED", "DUPLICATION", "Disposed of",
                "FROZEN ARK", "INVALID", "POSSIBLE TYPE", "PROBLEM",
                "Re-registered in error", "Reserved", "Retired",
                "Retired (see Notes)",
                "Retired (see Notes)Retired (see Notes)", "SCAN_cat",
                "See Notes", "Specimen missing - see notes", "Stub",
                "Stub Record", "Stub record"
            ]
        }

        # Make sure that only the five collection departments are represented, as others can break stats pages
        query["ColDepartment"] = {
            '$in':
            ["Botany", "Entomology", "Mineralogy", "Palaeontology", "Zoology"]
        }

        # Web publishable != No
        query['AdmPublishWebNoPasswordFlag'] = {'$ne': 'N'}

        # And ensure we have a GUID
        query['AdmGUIDPreferredValue'] = {'$exists': True}

        # If this is a full export date, we do not need to filter on date
        if int(self.full_export_date) != int(self.date):
            # Ensure we have processed all files for preceding dates
            self.ensure_export_date(self.date)
            query['exportFileDate'] = self.date
        return query
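
    # Illustration (hypothetical values): with record_type = 'Specimen' and a
    # non-full export date, the dict built above is roughly the following
    # MongoDB filter document:
    #   {
    #       'ColRecordType': 'Specimen',
    #       'SecRecordStatus': {'$nin': [...excluded statuses...]},
    #       'ColDepartment': {'$in': ['Botany', 'Entomology', 'Mineralogy', 'Palaeontology', 'Zoology']},
    #       'AdmPublishWebNoPasswordFlag': {'$ne': 'N'},
    #       'AdmGUIDPreferredValue': {'$exists': True},
    #       'exportFileDate': <date being processed>
    #   }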

    # CKAN Dataset params
    geospatial_fields = None

    # Fields that require indexing - if None, all citext fields will be indexed
    indexed_fields = None

    @abc.abstractproperty
    def package(self):
        """
        Package property
        @return: dict
        """
        return None

    @abc.abstractproperty
    def datastore(self):
        """
        Datastore property
        @return: dict
        """
        return None

    @abc.abstractproperty
    def block_size(self):
        """
        Number of records to retrieve
        """
        return None

    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())

    def update_id(self):
        """
        This update id will be a unique identifier for this insert on this collection.
        """
        return self.task_id

    def complete(self):
        """
        Is this task complete?
        :return:
        """
        return self.mongo_target.exists()

    def ensure_export_date(self, date):
        """
        If cron fails to run for whatever reason and then reruns the next week, an export could be missed
        So when running this dataset task, ensure that all preceding mongo exports have been processed
        @param date: date to check
        @return: None
        """
        def filter_dates(d):
            return d < date

        # Get a list of export files dates and marker dates, prior to the current date being processed
        export_file_dates = filter(filter_dates, get_export_file_dates())
        update_marker_dates = filter(filter_dates,
                                     mongo_get_update_markers().keys())
        assert export_file_dates == update_marker_dates, 'Outstanding previous export file dates need to be processed first: %s' % list(
            set(export_file_dates) - set(update_marker_dates))

    def requires(self):
        return [
            # DeleteTask depends upon all other mongo tasks, but let's add them in anyway so it's
            # obvious what's happening here
            MongoCatalogueTask(date=self.date),
            MongoTaxonomyTask(self.date),
            MongoMultimediaTask(self.date),
            MongoCollectionIndexTask(self.date),
            MongoCollectionEventTask(self.date),
            MongoSiteTask(self.date),
            DeleteAPITask(date=self.date),
            # Removed unpublished - once published, a record cannot be marked as hidden
            # UnpublishTask(date=self.date)
        ]

    def get_or_create_resource(self):
        """

        Either load a resource object
        Or if it doesn't exist, create the dataset package, and datastore

        @param package: params to create the package
        @param datastore: params to create the datastore
        @return: CKAN resource ID
        """

        resource_id = None

        try:
            # If the package exists, retrieve the resource
            ckan_package = self.remote_ckan.action.package_show(
                id=self.package['name'])

            # Does a resource of the same name already exist for this dataset?
            # If it does, assign to resource_id
            for resource in ckan_package['resources']:
                if resource['name'] == self.datastore['resource']['name']:
                    self.validate_resource(resource)
                    resource_id = resource['id']

        except ckanapi.NotFound:
            log.info("Package %s not found - creating", self.package['name'])

            # Create the package
            ckan_package = self.remote_ckan.action.package_create(
                **self.package)

        # If we don't have the resource ID, create
        if not resource_id:
            log.info("Resource %s not found - creating",
                     self.datastore['resource']['name'])

            self.datastore['fields'] = [
                {'id': col, 'type': self.numpy_to_ckan_type(np_type)}
                for col, np_type in self.get_output_columns().iteritems()
            ]
            self.datastore['resource']['package_id'] = ckan_package['id']

            if self.indexed_fields:
                # Create BTREE indexes for all specified indexed fields
                self.datastore['indexes'] = [
                    col['id'] for col in self.datastore['fields']
                    if col['id'] in self.indexed_fields
                ]
            else:
                # Create BTREE indexes for all citext fields
                self.datastore['indexes'] = [
                    col['id'] for col in self.datastore['fields']
                    if col['type'] == 'citext'
                ]

            # API call to create the datastore
            resource_id = self.remote_ckan.action.datastore_create(
                **self.datastore)['resource_id']

            # If this has geospatial fields, create geom columns
            if self.geospatial_fields:
                log.info("Creating geometry columns for %s", resource_id)
                self.geospatial_fields['resource_id'] = resource_id
                self.remote_ckan.action.create_geom_columns(
                    **self.geospatial_fields)

            log.info("Created datastore resource %s", resource_id)

        return resource_id

    def validate_resource(self, resource):
        # Validate the resource - see DatasetCSVTask
        # Raise Exception on failure
        pass  # default impl

    @staticmethod
    def numpy_to_ckan_type(pandas_type):
        """
        For a pandas field type, returns the corresponding CKAN data type, to be used when creating the datastore
        e.g. int32 => integer
        @param pandas_type: pandas data type
        @return: ckan data type
        """
        try:
            type_num, type_arg, numpy_type = get_monary_numpy_type(pandas_type)
        except ValueError:
            # There is no numpy type - just use original value (JSON)
            return pandas_type

        try:
            if issubclass(numpy_type, np.signedinteger):
                ckan_type = 'integer'
            elif issubclass(numpy_type, np.floating):
                ckan_type = 'float'
            elif numpy_type is bool:
                ckan_type = 'bool'
            else:
                ckan_type = 'citext'
        except TypeError:
            # Strings are not objects, so we'll get a TypeError
            ckan_type = 'citext'

        return ckan_type

    @staticmethod
    def ckan_to_numpy_type(ckan_type):
        """
        Convert CKAN field types to numpy types
        Essentially convert special types (UUID; JSON) to strings
        @param ckan_type: CKAN field type
        @return: numpy type
        """

        if ckan_type == 'uuid':
            # UUID fields should be retrieved as 36 byte strings
            numpy_type = 'string:36'
        elif ckan_type == 'json':
            # JSON fields should be retrieved as strings
            numpy_type = 'string:200'
        else:
            # Otherwise keep the original type
            numpy_type = ckan_type

        return numpy_type

    def get_collection_source_columns(self, collection=None):
        """
        Parse columns into dictionary keyed by collection name
        And return all fields for a particular collection
        @param collection:
        @return: list of fields
        """
        collection_columns = {}

        for (source_field, destination_field, field_type) in self.columns:
            field_collection, field_name = source_field.split('.')
            field_type = self.ckan_to_numpy_type(field_type)

            try:
                collection_columns[field_collection].append(
                    (field_name, destination_field, field_type))
            except KeyError:
                collection_columns[field_collection] = [
                    (field_name, destination_field, field_type)
                ]

        if collection:
            return collection_columns[collection]
        else:
            return collection_columns

    @timeit
    def run(self):
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            if field_type.startswith('string'):
                field_arr = field_arr.astype(np.str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(np.str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.NaN)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(
                *self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db,
                                             self.collection_name,
                                             self.query,
                                             query_fields,
                                             field_types,
                                             block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # so the masked value doesn't get used. As the masked array is shared between
                # blocks, an empty field would otherwise be populated by values from previous blocks
                catalogue_block = [
                    _fill_field(arr, field_types[i])
                    for i, arr in enumerate(catalogue_block)
                ]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(),
                                  columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()

    def process_dataframe(self, m, df):
        return df

    @staticmethod
    def _get_unique_irns(df, field_name):
        """
        Return a list of IRNs converted to integers, excluding 0 ('0' is treated like a string)
        @param df:
        @param field_name:
        @return:
        """
        return pd.unique(df[field_name][df[field_name] != 0].astype(
            'int32').values.ravel()).tolist()

    def ensure_multimedia(self, df, multimedia_field):

        mongo_client = mongo_client_db()

        # The multimedia field contains IRNS of all items - not just images
        # So we need to look up the IRNs against the multimedia record to get the mime type
        # And filter out non-image mimetypes we do not support

        # Convert associatedMedia field to a list
        df[multimedia_field] = df[multimedia_field].apply(
            lambda x: list(int(z.strip()) for z in x.split(';') if z.strip()))

        # Get a unique list of IRNS
        unique_multimedia_irns = list(
            set(itertools.chain(*[irn
                                  for irn in df[multimedia_field].values])))

        # Get a dictionary of valid multimedia records with valid mimetypes
        # It's not enough to just check for the derived image heights - some of these are TIFFs etc., and undeliverable
        cursor = mongo_client['emultimedia'].find(
            {
                '_id': {
                    '$in': unique_multimedia_irns
                },
                'AdmPublishWebNoPasswordFlag': 'Y',
                #'NhmSecEmbargoDate': 0,
                'GenDigitalMediaId': {
                    '$ne': 0
                }
            },
            {
                'GenDigitalMediaId': 1,
                'MulTitle': 1,
                'MulMimeFormat': 1,
                'NhmSecEmbargoDate': 1,
                'NhmSecEmbargoExtensionDate': 1
            })

        # Create a dictionary of multimedia records, keyed by _id
        multimedia_dict = {}

        for record in cursor:

            if record['GenDigitalMediaId'] == 'Pending':
                continue

            # If the embargo extension date exists and is in the future, then skip
            if 'NhmSecEmbargoExtensionDate' in record:
                if record['NhmSecEmbargoExtensionDate'] > 0 and record['NhmSecEmbargoExtensionDate'] > datetime.datetime.today().strftime("%Y-%m-%d"):
                    continue

            # For remaining records, if the original embargo date exists and is in the future then skip
            if record['NhmSecEmbargoDate'] > 0 and record['NhmSecEmbargoDate'] > datetime.datetime.today().strftime("%Y-%m-%d"):
                continue

            multimedia_dict[record['_id']] = {
                'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                    mam_id=record['GenDigitalMediaId']
                ),
                'format': 'image/%s' % record['MulMimeFormat'],
                "type": "StillImage",
                "license": "http://creativecommons.org/licenses/by/4.0/",
                "rightsHolder": "The Trustees of the Natural History Museum, London"
            }

            # Add the title if it exists
            if record.get('MulTitle', None):
                multimedia_dict[record['_id']]['title'] = record.get(
                    'MulTitle')

        def multimedia_to_json(irns):
            """
            Convert multimedia fields to json
            Loop through all the IRNs in the field and check the key exists in multimedia_dict
            (if it's not there, the image might not be publishable / in the correct format)
            @param irns:
            @return: json
            """

            multimedia_records = [
                multimedia_dict[irn] for irn in irns if irn in multimedia_dict
            ]
            return json.dumps(
                multimedia_records) if multimedia_records else np.nan

        # And finally update the associatedMedia field, replacing the list of IRNs with the JSON multimedia records found in multimedia_dict
        df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)

    @staticmethod
    def get_dataframe(m, collection, columns, irns, key):

        query_fields, df_cols, field_types = zip(*columns)
        assert key in df_cols, 'Merge dataframe key must be present in dataframe columns'

        q = {'_id': {'$in': irns}}

        query = m.query('keemu', collection, q, query_fields, field_types)
        df = pd.DataFrame(np.matrix(query).transpose(), columns=df_cols)

        # Convert to int
        df[key] = df[key].astype('int32')
        # And make index
        df.index = df[key]

        return df

    @staticmethod
    def _is_output_field(field):
        """
        Fields starting with _ are hidden and shouldn't be included in output
        @param field:
        @return: bool
        """
        return not field.startswith('_') and field != '_id'

    def get_output_columns(self):

        return OrderedDict((col[1], col[2]) for col in self.columns
                           if self._is_output_field(col[1]))
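
DatasetTask is abstract: a concrete dataset must supply `columns`, `package`, `datastore`, `block_size` and an `output()` target. The sketch below is a hypothetical minimal subclass, intended only to show the expected shape of those members; every name and value in it (including `ArtefactCSVTarget`) is invented for illustration and is not taken from the original project.

class ArtefactDatasetTask(DatasetTask):

    record_type = 'Artefact'  # hypothetical record type used to filter the query

    # (source field, destination field, type) tuples, as consumed by
    # get_collection_source_columns() / get_output_columns()
    columns = [
        ('ecatalogue._id', '_id', 'int32'),
        ('ecatalogue.AdmGUIDPreferredValue', 'occurrenceID', 'uuid'),
        ('ecatalogue.PalArtObjectName', 'artefactName', 'string:100'),
    ]

    # Hypothetical CKAN package and datastore definitions
    package = {'name': 'example-artefacts', 'title': 'Example artefacts'}
    datastore = {
        'resource': {'name': 'Artefacts'}
    }

    block_size = 5000

    def output(self):
        # Any target with a write(df) method works here; CSV / API targets are used elsewhere
        return ArtefactCSVTarget(date=self.date)  # hypothetical target class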
Example #5
class DatasetTask(APITask):
    """
    Class for processing data from MongoDB into a dataset
    If a date is set, this task requires all mongo export files for that date to have been imported into MongoDB
    """
    ### Parameters

    # MongoDB params
    collection_name = 'ecatalogue'

    # Default record type - used to select records in query
    record_type = None

    has_run = False

    @abc.abstractproperty
    def columns(self):
        """
        Columns to use from mongoDB
        @return: list
        """
        return None

    @abc.abstractmethod
    def output(self):
        """
        Output method
        This overrides luigi.task.output, to ensure it is set
        """
        return None

    @property
    def query(self):
        """
        Query object for selecting data from mongoDB
        @return: dict
        """

        query = OrderedDict()

        if self.record_type:
            query["ColRecordType"] = self.record_type

        # Exclude unwanted record statuses - this is much faster than querying for an active status or a missing field
        query["SecRecordStatus"] = {
            '$nin': [
                "DELETE",
                "DELETE-MERGED",
                "DUPLICATION",
                "Disposed of",
                "FROZEN ARK",
                "INVALID",
                "POSSIBLE TYPE",
                "PROBLEM",
                "Re-registered in error",
                "Reserved",
                "Retired",
                "Retired (see Notes)",
                "Retired (see Notes)Retired (see Notes)",
                "SCAN_cat",
                "See Notes",
                "Specimen missing - see notes",
                "Stub",
                "Stub Record",
                "Stub record"
                ]
            }

        # Make sure that only the five collection departments are represented, as others can break stats pages
        query["ColDepartment"] = {
            '$in': [
                "Botany",
                "Entomology",
                "Mineralogy",
                "Palaeontology",
                "Zoology"
            ]
        }

        # Web publishable != No
        query['AdmPublishWebNoPasswordFlag'] = {'$ne': 'N'}

        # And ensure we have a GUID
        query['AdmGUIDPreferredValue'] = {'$exists': True}

        # If this is a full export date, we do not need to filter on date
        if int(self.full_export_date) != int(self.date):
            # Ensure we have processed all files for preceding dates
            self.ensure_export_date(self.date)
            query['exportFileDate'] = self.date
        return query

    # CKAN Dataset params
    geospatial_fields = None

    # Fields that require indexing - if None, all citext fields will be indexed
    indexed_fields = None

    @abc.abstractproperty
    def package(self):
        """
        Package property
        @return: dict
        """
        return None

    @abc.abstractproperty
    def datastore(self):
        """
        Datastore property
        @return: dict
        """
        return None

    @abc.abstractproperty
    def block_size(self):
        """
        Number of records to retrieve
        """
        return None

    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())

    def update_id(self):
        """
        This update id will be a unique identifier for this insert on this collection.
        """
        return self.task_id

    def complete(self):
        """
        Is this task complete?
        :return:
        """
        return self.mongo_target.exists()

    def ensure_export_date(self, date):
        """
        If cron fails to run for whatever reason and then reruns the next week, an export could be missed
        So when running this dataset task, ensure that all preceding mongo exports have been processed
        @param date: date to check
        @return: None
        """

        def filter_dates(d):
            return d < date

        # Get a list of export files dates and marker dates, prior to the current date being processed
        export_file_dates = filter(filter_dates, get_export_file_dates())
        update_marker_dates = filter(filter_dates, mongo_get_update_markers().keys())
        assert export_file_dates == update_marker_dates, 'Outstanding previous export file dates need to be processed first: %s' % list(set(export_file_dates) - set(update_marker_dates))

    def requires(self):
        return [
            # DeleteTask depends upon all other mongo tasks, but let's add them in anyway so it's
            # obvious what's happening here
            MongoCatalogueTask(date=self.date),
            MongoTaxonomyTask(self.date),
            MongoMultimediaTask(self.date),
            MongoCollectionIndexTask(self.date),
            MongoCollectionEventTask(self.date),
            MongoSiteTask(self.date),
            DeleteAPITask(date=self.date),
            # Removed unpublished - once published, a record cannot be marked as hidden
            # UnpublishTask(date=self.date)
        ]


    def get_or_create_resource(self):
        """

        Either load a resource object
        Or if it doesn't exist, create the dataset package, and datastore

        @param package: params to create the package
        @param datastore: params to create the datastore
        @return: CKAN resource ID
        """

        resource_id = None

        try:
            # If the package exists, retrieve the resource
            ckan_package = self.remote_ckan.action.package_show(id=self.package['name'])

            # Does a resource of the same name already exist for this dataset?
            # If it does, assign to resource_id
            for resource in ckan_package['resources']:
                if resource['name'] == self.datastore['resource']['name']:
                    self.validate_resource(resource)
                    resource_id = resource['id']

        except ckanapi.NotFound:
            log.info("Package %s not found - creating", self.package['name'])

            # Create the package
            ckan_package = self.remote_ckan.action.package_create(**self.package)

        # If we don't have the resource ID, create
        if not resource_id:
            log.info("Resource %s not found - creating", self.datastore['resource']['name'])

            self.datastore['fields'] = [{'id': col, 'type': self.numpy_to_ckan_type(np_type)} for col, np_type in self.get_output_columns().iteritems()]
            self.datastore['resource']['package_id'] = ckan_package['id']

            if self.indexed_fields:
                # Create BTREE indexes for all specified indexed fields
                self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['id'] in self.indexed_fields]
            else:
                # Create BTREE indexes for all citext fields
                self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['type'] == 'citext']

            # API call to create the datastore
            resource_id = self.remote_ckan.action.datastore_create(**self.datastore)['resource_id']

            # If this has geospatial fields, create geom columns
            if self.geospatial_fields:
                log.info("Creating geometry columns for %s", resource_id)
                self.geospatial_fields['resource_id'] = resource_id
                self.remote_ckan.action.create_geom_columns(**self.geospatial_fields)

            log.info("Created datastore resource %s", resource_id)

        return resource_id

    def validate_resource(self, resource):
        # Validate the resource - see DatasetCSVTask
        # Raise Exception on failure
        pass  # default impl

    @staticmethod
    def numpy_to_ckan_type(pandas_type):
        """
        For a pandas field type, returns the corresponding CKAN data type, to be used when creating the datastore
        e.g. int32 => integer
        @param pandas_type: pandas data type
        @return: ckan data type
        """
        try:
            type_num, type_arg, numpy_type = get_monary_numpy_type(pandas_type)
        except ValueError:
            # There is no numpy type - just use original value (JSON)
            return pandas_type

        try:
            if issubclass(numpy_type, np.signedinteger):
                ckan_type = 'integer'
            elif issubclass(numpy_type, np.floating):
                ckan_type = 'float'
            elif numpy_type is bool:
                ckan_type = 'bool'
            else:
                ckan_type = 'citext'
        except TypeError:
            # Strings are not objects, so we'll get a TypeError
            ckan_type = 'citext'

        return ckan_type

    @staticmethod
    def ckan_to_numpy_type(ckan_type):
        """
        Convert CKAN field types to numpy types
        Essentially convert special types (UUID; JSON) to strings
        @param ckan_type: CKAN field type
        @return: numpy type
        """

        if ckan_type == 'uuid':
            # UUID fields should be retrieved as 36 byte strings
            numpy_type = 'string:36'
        elif ckan_type == 'json':
            # JSON fields should be retrieved as strings
            numpy_type = 'string:200'
        else:
            # Otherwise keep the original type
            numpy_type = ckan_type

        return numpy_type


    def get_collection_source_columns(self, collection=None):
        """
        Parse columns into dictionary keyed by collection name
        And return all fields for a particular collection
        @param collection:
        @return: list of fields
        """
        collection_columns = {}

        for (source_field, destination_field, field_type) in self.columns:
            field_collection, field_name = source_field.split('.')
            field_type = self.ckan_to_numpy_type(field_type)

            try:
                collection_columns[field_collection].append((field_name, destination_field, field_type))
            except KeyError:
                collection_columns[field_collection] = [(field_name, destination_field, field_type)]

        if collection:
            return collection_columns[collection]
        else:
            return collection_columns

    @timeit
    def run(self):
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            if field_type.startswith('string'):
                field_arr = field_arr.astype(np.str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(np.str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.NaN)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(*self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db, self.collection_name, self.query, query_fields, field_types, block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # so the masked value doesn't get used. As the masked array is shared between
                # blocks, an empty field would otherwise be populated by values from previous blocks
                catalogue_block = [_fill_field(arr, field_types[i]) for i, arr in enumerate(catalogue_block)]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()

    def process_dataframe(self, m, df):
        return df

    @staticmethod
    def _get_unique_irns(df, field_name):
        """
        Return a list of IRNs converted to integers, excluding 0 ('0' is treated like a string)
        @param df:
        @param field_name:
        @return:
        """
        return pd.unique(df[field_name][df[field_name] != 0].astype('int32').values.ravel()).tolist()

    def ensure_multimedia(self, df, multimedia_field):

        mongo_client = mongo_client_db()

        # The multimedia field contains IRNS of all items - not just images
        # So we need to look up the IRNs against the multimedia record to get the mime type
        # And filter out non-image mimetypes we do not support

        # Convert associatedMedia field to a list
        df[multimedia_field] = df[multimedia_field].apply(lambda x: list(int(z.strip()) for z in x.split(';') if z.strip()))

        # Get a unique list of IRNS
        unique_multimedia_irns = list(set(itertools.chain(*[irn for irn in df[multimedia_field].values])))

        # Get a dictionary of valid multimedia records with valid mimetypes
        # It's not enough to just check for the derived image heights - some of these are TIFFs etc., and undeliverable
        cursor = mongo_client['emultimedia'].find(
            {
                '_id': {'$in': unique_multimedia_irns},
                'AdmPublishWebNoPasswordFlag': 'Y',
                #'NhmSecEmbargoDate': 0,
                'GenDigitalMediaId': {'$ne': 0}
                },
            {
                'GenDigitalMediaId': 1,
                'MulTitle': 1,
                'MulMimeFormat': 1,
                'NhmSecEmbargoDate': 1,
                'NhmSecEmbargoExtensionDate': 1
            }
        )

        # Create a dictionary of multimedia records, keyed by _id
        multimedia_dict = {}

        for record in cursor:

            if record['GenDigitalMediaId'] == 'Pending':
                continue

            # If the embargo extension date exists and is in the future, then skip
            if 'NhmSecEmbargoExtensionDate' in record:
                if record['NhmSecEmbargoExtensionDate'] > 0 and record['NhmSecEmbargoExtensionDate'] > datetime.datetime.today().strftime("%Y-%m-%d"):
                    continue

            # For remaining records, if the original embargo date exists and is in the future then skip
            if record['NhmSecEmbargoDate'] > 0 and record['NhmSecEmbargoDate'] > datetime.datetime.today().strftime("%Y-%m-%d"):
                continue

            multimedia_dict[record['_id']] = {
                'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                    mam_id=record['GenDigitalMediaId'],
                ),
                'format': 'image/%s' % record['MulMimeFormat'],
                "type": "StillImage",
                "license": "http://creativecommons.org/licenses/by/4.0/",
                "rightsHolder": "The Trustees of the Natural History Museum, London"
            }

            # Add the title if it exists
            if record.get('MulTitle', None):
                multimedia_dict[record['_id']]['title'] = record.get('MulTitle')

        def multimedia_to_json(irns):
            """
            Convert multimedia fields to json
            Loop through all the IRNs in the field and check the key exists in multimedia_dict
            (if it's not there, the image might not be publishable / in the correct format)
            @param irns:
            @return: json
            """

            multimedia_records = [multimedia_dict[irn] for irn in irns if irn in multimedia_dict]
            return json.dumps(multimedia_records) if multimedia_records else np.nan

        # And finally update the associatedMedia field, replacing the list of IRNs with the JSON multimedia records found in multimedia_dict
        df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)

    @staticmethod
    def get_dataframe(m, collection, columns, irns, key):

        query_fields, df_cols, field_types = zip(*columns)
        assert key in df_cols, 'Merge dataframe key must be present in dataframe columns'

        q = {'_id': {'$in': irns}}

        query = m.query('keemu', collection, q, query_fields, field_types)
        df = pd.DataFrame(np.matrix(query).transpose(), columns=df_cols)

        # Convert to int
        df[key] = df[key].astype('int32')
        # And make index
        df.index = df[key]

        return df

    @staticmethod
    def _is_output_field(field):
        """
        Fields starting with _ are hidden and shouldn't be included in output
        @param field:
        @return: bool
        """
        return not field.startswith('_') and field != '_id'

    def get_output_columns(self):

        return OrderedDict((col[1], col[2]) for col in self.columns if self._is_output_field(col[1]))
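
The `_fill_field` helper in `run()` exists because Monary hands back numpy masked arrays whose buffers are reused between blocks; unless the masked entries are explicitly filled, a missing value can surface as data from the previous block. A small standalone illustration of that filling step (the values are made up):

import numpy as np

# Masked entries stand in for missing MongoDB fields
block = np.ma.masked_array([1.5, 0.0, 2.5], mask=[False, True, False])
print(block.filled(np.NaN))   # [ 1.5  nan  2.5] - float fields are filled with NaN

irns = np.ma.masked_array([101, 0, 102], mask=[False, True, False], dtype='int32')
print(irns.filled(0))         # [101   0 102] - int fields are filled with 0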