Example #1
    def run(self):
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            if field_type.startswith('string'):
                field_arr = field_arr.astype(np.str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(np.str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.NaN)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(*self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db, self.collection_name, self.query, query_fields, field_types, block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # so the masked value doesn't get used. As the mask is shared between
                # blocks, an empty field would otherwise be populated with values from the previous block
                catalogue_block = [_fill_field(arr, field_types[i]) for i, arr in enumerate(catalogue_block)]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()
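
A minimal sketch (not part of the original task) of the masked-array behaviour the _fill_field hack above relies on: Monary's block_query yields numpy masked arrays (as the .filled() calls assume), and filling each block explicitly stops masked slots leaking values between blocks.

import numpy as np

block = np.ma.masked_array([1, 2, 3], mask=[False, True, False])
print(block.filled(0))                          # [1 0 3] - masked int filled with 0
print(block.astype('float64').filled(np.NaN))   # [ 1. nan  3.] - float filled with NaN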
Example #2
def get_export_file_dates():
    """
    Gets all the dates of outstanding files
    @return: list of dates
    """

    export_dir = config.get('keemu', 'export_dir')

    try:
        full_export_date = int(config.get('keemu', 'full_export_date'))
    except NoOptionError:
        full_export_date = None

    files = [f for f in os.listdir(export_dir) if os.path.isfile(os.path.join(export_dir,f))]

    # Use a set so we don't have duplicate dates
    dates = set()

    for f in files:

        # So this will work with both .gz and uncompressed files
        f = f.replace('.gz', '')

        try:
            # Extract the date from the file name
            _, _, date = f.split('.')
        except ValueError:
            # file not in the correct format - hidden directory etc.,
            pass
        else:

            try:
                date = int(date)
            except ValueError:
                # First dump did not contain date stamp
                # ecatalogue.export.zip
                continue
            else:
                # If we have a full export date (the date the last full dump was produced)
                # we only want dates after the last full dump - so skip prior dates
                if full_export_date and date < full_export_date:
                    continue

                dates.add(date)

    # Make sure they are in the right order and convert to list
    dates = sorted(list(dates))

    return dates
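
A hedged worked example of the filename parsing above; 'ecatalogue.export.20140814.gz' is an invented filename, not taken from the source.

f = 'ecatalogue.export.20140814.gz'.replace('.gz', '')
_, _, date = f.split('.')   # -> ('ecatalogue', 'export', '20140814')
print(int(date))            # 20140814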
Example #3
    def path(self):
        """
        File name to output
        @return: str
        """
        file_name = self.__class__.__name__.replace('DatasetCSVTask', '').lower() + '-' + str(self.date)
        return os.path.join(config.get('csv', 'output_dir'), file_name + '.csv')
def main():

    update_markers = mongo_get_update_markers()

    # Make sure the update markers include all the mongo task classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate the task and get the family name, not just the class name
        MongoDeleteTask => DeleteTask
        @param tasks:
        @return:
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [d for d in get_export_file_dates() if d not in update_markers.keys()]

    # Run setup_interface_logging to ensure luigi command logging is set up
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()

    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:

        log.info('Processing date %s', export_date)
        # We only need to call the mongo delete task, as all other tasks are a requirement
        # NB: This doesn't delete anything from CKAN - if that's needed change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))
        w.run()
        w.stop()
Example #5
    def path(self):
        """
        File name to output
        @return: str
        """
        file_name = self.__class__.__name__.replace(
            'DatasetCSVTask', '').lower() + '-' + str(self.date)
        return os.path.join(config.get('csv', 'output_dir'),
                            file_name + '.csv')
class UnpublishTask(APITask):
    """

    Deprecated - once published, a record cannot be marked "do not publish to internet".

    If a KE EMu record has been marked non web publishable, it needs to be deleted from CKAN
    NB: This does not remove embargoed records which have already been published.
    You cannot embargo a record after its release.
    """
    database = config.get('mongo', 'database')
    keemu_schema_file = config.get('keemu', 'schema')

    def requires(self):
        # Mongo catalogue task for date must have run
        yield MongoCatalogueTask(self.date)

    @timeit
    def run(self):
        # Do not run if this is a full export date - all non-publishable records will
        # Already have been removed
        if int(self.full_export_date) == int(self.date):
            log.info("No records to unpublish for full exports")
            self.mark_complete()
            return
        collection = self.output().get_collection('ecatalogue')
        # We only care about records whose status has changed in the past week (6 days to be sure)
        date_object = datetime.strptime(str(self.date), '%Y%m%d')
        q = dict(AdmPublishWebNoPasswordFlag='N',
                 exportFileDate=self.date,
                 ISODateInserted={'$gte': date_object - timedelta(days=6)})
        cursor = collection.find(q)
        log.info('%s records to unpublish', cursor.count())

        for record in cursor:
            ckan_delete(self.remote_ckan, record)

        # And mark the object as complete
        self.mark_complete()

    def mark_complete(self):
        self.output().touch()

    def output(self):
        return MongoTarget(database=self.database, update_id=self.task_id)
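
A small illustration of the six-day query window built in run() above; the date 20140814 is invented for the example.

from datetime import datetime, timedelta

date_object = datetime.strptime('20140814', '%Y%m%d')
print(date_object - timedelta(days=6))   # 2014-08-08 00:00:00 - lower bound for ISODateInserted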
Example #7
    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())
Example #8
    def mark_complete(self):

        # Move the file to the archive directory (if specified)
        try:
            archive_dir = config.get('keemu', 'archive_dir')
            self.input().move(os.path.join(archive_dir, self.input().file_name))
        except NoOptionError:
            # Allow archive dir to be none
            pass

        # And mark the object as complete
        self.output().touch()
Example #9
    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(DatasetTask, self).__init__(*args, **kwargs)

        # Get or create the resource object
        self.resource_id = self.get_or_create_resource()

        # Set up a mongo target to be used to mark complete
        self.mongo_target = MongoTarget(database=config.get(
            'mongo', 'database'),
                                        update_id=self.update_id())
Example #10
    def mark_complete(self):

        # Move the file to the archive directory (if specified)
        try:
            archive_dir = config.get('keemu', 'archive_dir')
            self.input().move(os.path.join(archive_dir,
                                           self.input().file_name))
        except NoOptionError:
            # Allow archive dir to be none
            pass

        # And mark the object as complete
        self.output().touch()
Example #11
class APITask(luigi.Task):
    """
    Base CKAN API Task
    """

    # Date to process
    date = luigi.IntParameter()

    full_export_date = config.get('keemu', 'full_export_date')

    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(APITask, self).__init__(*args, **kwargs)
        self.remote_ckan = ckanapi.RemoteCKAN(config.get('ckan', 'site_url'), apikey=config.get('ckan', 'api_key'))
Example #12
class ArtefactDatasetTask(DatasetTask):

    # CKAN Dataset params
    package = {
        'name': 'collection-artefacts',
        'notes': u'Cultural and historical artefacts from The Natural History Museum',
        'title': "Artefacts",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'name': 'Artefacts',
            'description': 'Museum artefacts',
            'format': 'csv'
        },
        'primary_key': 'GUID'
    }

    columns = [
        ('ecatalogue.AdmGUIDPreferredValue', 'GUID', 'uuid'),
        ('ecatalogue.ArtName', 'Name', 'string:100'),
        ('ecatalogue.ArtKind', 'Kind', 'string:100'),
        ('ecatalogue.PalArtDescription', 'Description', 'string:100'),
        ('ecatalogue.IdeCurrentScientificName', 'Scientific name', 'string:100'),
        ('ecatalogue.MulMultiMediaRef', 'Multimedia', 'json')
    ]

    record_type = 'Artefact'

    def process_dataframe(self, m, df):
        """
        Process the dataframe, converting image IRNs to URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        # And update images to URLs

        df = super(ArtefactDatasetTask, self).process_dataframe(m, df)
        self.ensure_multimedia(df, 'Multimedia')
        return df
Example #13
def solr_reindex():
    indexes = config.get('solr', 'indexes').split(',')

    # Loop through the indexes, request a full import and wait until it completes before
    # requesting the next index - ensures there's always a stable index available for requests

    for index in indexes:
        solr_index = SolrIndex(index)
        print("Starting full import of index: %s" % index)
        solr_index.full_import()

        # Enter loop to keep checking status every SLEEP_INTERVAL
        while True:
            r = solr_index.status()
            if r['status'] == 'busy':
                print('Total Rows Fetched: %s' % r['statusMessages'].get('Total Rows Fetched'))
                print('Time elapsed: %s' % r['statusMessages'].get('Time Elapsed'))
                time.sleep(SLEEP_INTERVAL)
            else:
                print(r['statusMessages'].get(''))
                print('Time taken: %s' % r['statusMessages'].get('Time taken'))
                break
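
SolrIndex isn't shown in these examples, so below is a minimal sketch of a compatible helper, assuming the indexes are backed by Solr's DataImportHandler (whose status response exposes 'status' and 'statusMessages'); the base URL and core layout are assumptions.

import requests

class SolrIndex(object):
    """Hypothetical helper matching the calls used in solr_reindex()."""

    def __init__(self, core, base_url='http://localhost:8983/solr'):
        # Assumed layout: one DataImportHandler endpoint per core
        self.url = '%s/%s/dataimport' % (base_url, core)

    def full_import(self):
        # Kick off a full import; DIH runs it asynchronously
        requests.get(self.url, params={'command': 'full-import', 'wt': 'json'})

    def status(self):
        # Returns a dict with 'status' ('busy'/'idle') and 'statusMessages'
        return requests.get(self.url, params={'command': 'status', 'wt': 'json'}).json()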
Example #14
def solr_reindex():
    indexes = config.get('solr', 'indexes').split(',')

    # Loop through the indexes, request a full import and wait until it completes before
    # requesting the next index - ensures there's always a stable index available for requests

    for index in indexes:
        solr_index = SolrIndex(index)
        print("Starting full import of index: %s" % index)
        solr_index.full_import()

        # Enter loop to keep checking status every SLEEP_INTERVAL
        while True:
            r = solr_index.status()
            if r['status'] == 'busy':
                print('Total Rows Fetched: %s' %
                      r['statusMessages'].get('Total Rows Fetched'))
                print('Time elapsed: %s' %
                      r['statusMessages'].get('Time Elapsed'))
                time.sleep(SLEEP_INTERVAL)
            else:
                print(r['statusMessages'].get(''))
                print('Time taken: %s' % r['statusMessages'].get('Time taken'))
                break
Example #15
    def __init__(self, *args, **kwargs):

        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(APITask, self).__init__(*args, **kwargs)
        self.remote_ckan = ckanapi.RemoteCKAN(config.get('ckan', 'site_url'), apikey=config.get('ckan', 'api_key'))
Example #16
    def process_dataframe(self, m, df):
        """
        Process the dataframe, updating multimedia irns => URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        df = super(SpecimenDatasetTask, self).process_dataframe(m, df)

        # Added literal columns
        for (field_name, _, default_value) in self.literal_columns:
            df[field_name] = default_value

        # Convert collection code to PAL, MIN etc.,
        df['collectionCode'] = df['collectionCode'].str.upper().str[0:3]
        # Entom record collection code = BMNH(E)
        df['collectionCode'][df['collectionCode'] == 'ENT'] = "BMNH(E)"

        # Add the old stable identifier - IRN concatenated with catalogue name
        # etc.,
        df['otherCatalogNumbers'] = 'NHMUK:ecatalogue:' + \
                                    df['_id'].astype('str')

        # Ensure multimedia resources are suitable (jpeg rather than tiff
        # etc.,)
        self.ensure_multimedia(df, 'associatedMedia')

        # Assign determination name, type and filed-as values to determinations for
        # the determination history
        determination_fields = [
            ('name', '_determinationNames'),
            ('type', '_determinationTypes'),
            ('filedAs', '_determinationFiledAs')
        ]

        def determinations_json(row):
            """
            Convert determination fields to json
            Dictionary comprehension looping through each field, and if it exists adding to a dict
            @param row:
            @return:
            """
            return json.dumps({field_name: row[determination].split(';') for field_name, determination in determination_fields if row[determination]})

        df['determinations'] = df[df['_determinationNames']
                                  != ''].apply(determinations_json, axis=1)

        # There doesn't seem to be a good way to identify centroids in KE EMu
        # I was using esites.LatDeriveCentroid, but this always defaults to True
        # Trying to use the centroid lat/lon fields also includes pretty much every record
        # But matching against *entroid being added to georeferencing notes
        # produces much better results
        df['centroid'][df['_latLongComments'].str.contains("entroid")] = True

        # Convert all blank strings to NaN so we can use fillna &
        # combine_first() to replace NaNs with value from parent df
        df = df.applymap(lambda x: np.nan if isinstance(
            x, basestring) and x == '' else x)

        df['catalogNumber'].fillna(df['_regRegistrationNumber'], inplace=True)

        # If PalNearestNamedPlaceLocal is missing, use sumPreciseLocation
        # And then try MinNhmVerbatimLocalityLocal
        df['locality'].fillna(df['_preciseLocation'], inplace=True)
        df['locality'].fillna(df['_minLocalityLocal'], inplace=True)

        # Replace missing DarTypeStatus
        df['typeStatus'].fillna(df['_sumTypeStatus'], inplace=True)

        # Replace missing depth fields
        df['minimumDepthInMeters'].fillna(df['_collEventFromMetres'], inplace=True)
        df['maximumDepthInMeters'].fillna(df['_collEventToMetres'], inplace=True)

        # Replace missing CatPreservative
        df['preservative'].fillna(df['_entCatPreservation'], inplace=True)

        # Cultivated should only be set on Botany records - but is actually on
        # everything
        df['cultivated'][df['collectionCode'] != 'BOT'] = np.nan

        # Process part parents
        parent_irns = self._get_unique_irns(df, '_parentRef')

        if parent_irns:
            # We want to get all parts associated to one parent record, so we can provide them as associated records
            # So select all records matching the parent IRN
            q = dict(self.query)

            # Delete _id if it's set - need this for testing
            if '_id' in q:
                del q['_id']

            # Get all records with the same parent, so we can add them as
            # related records
            q['RegRegistrationParentRef'] = {'$in': parent_irns}
            monary_query = m.query(config.get('mongo', 'database'), 'ecatalogue', q, [
                'RegRegistrationParentRef', 'AdmGUIDPreferredValue'], ['int32', 'string:36'])
            part_df = pd.DataFrame(np.matrix(monary_query).transpose(), columns=[
                'RegRegistrationParentRef', 'AdmGUIDPreferredValue'])
            part_df['RegRegistrationParentRef'] = part_df[
                'RegRegistrationParentRef'].astype('int32')

            # Group by parent ref and concatenate all the GUIDs together
            # So we now have:
            # parent_irn   guid; guid
            parts = part_df.groupby('RegRegistrationParentRef')[
                'AdmGUIDPreferredValue'].apply(lambda x: "%s" % ';'.join(x))

            # And update the main data frame with the grouped parts, merged on
            # _parentRef
            df['relatedResourceID'] = df.apply(lambda row: parts[row['_parentRef']] if row[
                                                                                           '_parentRef'] in parts else np.NaN, axis=1)
            df['relationshipOfResource'][
                df['relatedResourceID'].notnull()] = 'Parts'

            parent_df = self.get_dataframe(m, 'ecatalogue', self.get_collection_source_columns(
                'ecatalogue'), parent_irns, '_id')

            # Ensure the parent multimedia images are usable
            self.ensure_multimedia(parent_df, 'associatedMedia')

            # Assign parentRef as the index to allow us to combine with
            # parent_df
            df.index = df['_parentRef']

            # There is an annoying bug that coerces string columns to integers in combine_first
            # Hack: ensure there's always a string value that cannot be coerced in every column
            # So we create a dummy row, which gets deleted after combine_first
            # is called
            dummy_index = len(df) + 1
            parent_df.loc[dummy_index] = ['-' for _ in parent_df]
            df = df.combine_first(parent_df)
            df = df.drop([dummy_index])

        # Ensure our geo fields are floats
        df['decimalLongitude'] = df['decimalLongitude'].astype('float64')
        df['decimalLatitude'] = df['decimalLatitude'].astype('float64')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # Load extra sites info (if there's an error radius + unit)
        site_irns = self._get_unique_irns(df, '_siteRef')

        sites_df = self.get_dataframe(m, 'esites', collection_columns[
            'esites'], site_irns, '_esitesIrn')

        df = pd.merge(df, sites_df, how='outer', left_on=[
            '_siteRef'], right_on=['_esitesIrn'])

        # For CITES species, we need to hide Lat/Lon and Locality data - and
        # label images
        for i in ['locality', 'labelLocality', 'decimalLongitude', 'decimalLatitude', 'verbatimLongitude', 'verbatimLatitude', 'centroid', 'maxError', 'higherGeography', 'associatedMedia']:
            df[i][df['_cites'] == 'True'] = np.NaN

        # Some records are being assigned a centroid even if they have no lat/lon fields.
        # Ensure it's False if latitude is null
        df['centroid'][df['decimalLatitude'].isnull()] = False

        # Load collection event data
        collection_event_irns = self._get_unique_irns(
            df, '_collectionEventRef')

        # if collection_event_irns:
        collection_event_df = self.get_dataframe(m, 'ecollectionevents', collection_columns[
            'ecollectionevents'], collection_event_irns, '_ecollectioneventsIrn')
        # print collection_event_df
        df = pd.merge(df, collection_event_df, how='outer', left_on=[
            '_collectionEventRef'], right_on=['_ecollectioneventsIrn'])

        # Add parasite life stage
        # Parasite cards use a different field for life stage
        df['lifeStage'].fillna(df['_parasiteStage'], inplace=True)

        # Add parasite card
        parasite_taxonomy_irns = self._get_unique_irns(df, '_cardParasiteRef')

        if parasite_taxonomy_irns:
            parasite_df = self.get_dataframe(
                m, 'etaxonomy', self.parasite_taxonomy_fields, parasite_taxonomy_irns, '_irn')
            df.index = df['_cardParasiteRef']
            df = df.combine_first(parasite_df)

        return df
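
A hedged worked example of determinations_json above; the row values are invented. Fields with an empty value (here filedAs) are skipped by the comprehension.

import json

determination_fields = [('name', '_determinationNames'),
                        ('type', '_determinationTypes'),
                        ('filedAs', '_determinationFiledAs')]
row = {'_determinationNames': 'Panthera leo;Felis leo',
       '_determinationTypes': 'accepted;synonym',
       '_determinationFiledAs': ''}
print(json.dumps({field: row[det].split(';')
                  for field, det in determination_fields if row[det]}))
# e.g. {"name": ["Panthera leo", "Felis leo"], "type": ["accepted", "synonym"]}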
Example #17
class SpecimenDatasetTask(DatasetTask):
    # CKAN Dataset params
    package = {
        'name': 'collection-specimens',
        'notes':
        u'Specimen records from the Natural History Museum\'s collection',
        'title': "Collection specimens",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'spatial':
        '{"type":"Polygon","coordinates":[[[-180,82],[180,82],[180,-82],[-180,-82],[-180,82]]]}',
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'id': config.get('dataset_id', 'specimen'),
            'name': 'Specimens',
            'description': 'Specimen records',
            'format': 'dwc'  # Darwin core
        },
        'primary_key': 'occurrenceID'
    }

    geospatial_fields = {
        'latitude_field': 'decimalLatitude',
        'longitude_field': 'decimalLongitude'
    }

    indexed_fields = ['collectionCode', 'catalogNumber', 'created', 'project']

    columns = [
        # List of columns
        # ([KE EMu field], [new field], [field type])
        # Used for logging, joins and the old stable identifier
        ('ecatalogue._id', '_id', 'int32'),
        ('ecatalogue.AdmGUIDPreferredValue', 'occurrenceID', 'uuid'),
        ('ecatalogue.DarCatalogNumber', 'catalogNumber', 'string:100'),
        # Taxonomy
        ('ecatalogue.DarScientificName', 'scientificName', 'string:100'),
        # Rather than using the two darwin core fields DarScientificNameAuthorYear and ScientificNameAuthor
        # It's easier to just use IdeFiledAsAuthors which has them both
        # concatenated
        ('ecatalogue.IdeFiledAsAuthors', 'scientificNameAuthorship',
         'string:100'),
        ('ecatalogue.DarTypeStatus', 'typeStatus', 'string:100'),
        # Use nearest name place rather than precise locality
        # https://github.com/NaturalHistoryMuseum/ke2mongo/issues/29
        ('ecatalogue.PalNearestNamedPlaceLocal', 'locality', 'string:100'),
        ('ecatalogue.DarCountry', 'country', 'string:100'),
        ('ecatalogue.DarWaterBody', 'waterBody', 'string:100'),
        ('ecatalogue.EntLocExpeditionNameLocal', 'expedition', 'string:100'),
        ('ecollectionevents.ColParticipantLocal', 'recordedBy', 'string:100'),
        ('ecatalogue.ColDepartment', 'collectionCode', 'string:100'),
        ('ecatalogue.DarKingdom', 'kingdom', 'string:100'),
        ('ecatalogue.DarPhylum', 'phylum', 'string:100'),
        ('ecatalogue.DarClass', 'class', 'string:100'),
        ('ecatalogue.DarOrder', 'order', 'string:100'),
        ('ecatalogue.DarFamily', 'family', 'string:100'),
        ('ecatalogue.DarGenus', 'genus', 'string:100'),
        ('ecatalogue.DarSubgenus', 'subgenus', 'string:100'),
        ('ecatalogue.DarSpecies', 'specificEpithet', 'string:100'),
        ('ecatalogue.DarSubspecies', 'infraspecificEpithet', 'string:100'),
        ('ecatalogue.DarHigherTaxon', 'higherClassification', 'string:100'),
        ('ecatalogue.DarInfraspecificRank', 'taxonRank', 'string:100'),

        # Location
        ('ecatalogue.DarStateProvince', 'stateProvince', 'string:100'),
        ('ecatalogue.DarContinent', 'continent', 'string:100'),
        ('ecatalogue.DarIsland', 'island', 'string:100'),
        ('ecatalogue.DarIslandGroup', 'islandGroup', 'string:100'),
        # Removed: continentOcean is not in current DwC standard, replaced by waterBody and continent
        # ('ecatalogue.DarContinentOcean', 'continentOcean', 'string:100'),
        ('ecatalogue.DarHigherGeography', 'higherGeography', 'string:100'),
        ('ecatalogue.ColHabitatVerbatim', 'habitat', 'string:100'),
        ('ecatalogue.DarLatLongComments', '_latLongComments', 'string:100'),
        ('ecatalogue.DarDecimalLongitude', 'decimalLongitude', 'float64'),
        ('ecatalogue.DarDecimalLatitude', 'decimalLatitude', 'float64'),
        ('ecatalogue.DarGeodeticDatum', 'geodeticDatum', 'string:100'),
        ('ecatalogue.DarGeorefMethod', 'georeferenceProtocol', 'string:100'),
        ('esites.LatLongitude', 'verbatimLongitude', 'string:100'),
        ('esites.LatLatitude', 'verbatimLatitude', 'string:100'),

        # Occurrence
        ('ecatalogue.DarMinimumElevationInMeters', 'minimumElevationInMeters',
         'string:100'),
        ('ecatalogue.DarMaximumElevationInMeters', 'maximumElevationInMeters',
         'string:100'),
        ('ecatalogue.DarMinimumDepthInMeters', 'minimumDepthInMeters',
         'string:100'),
        ('ecatalogue.DarMaximumDepthInMeters', 'maximumDepthInMeters',
         'string:100'),
        # DarCollector doesn't have multiple collectors NHMUK:ecatalogue:1751715 - Switched to using ecollectionevents.ColParticipantLocal
        # ('ecatalogue.DarCollector', 'Recorded by', 'string:100'),
        ('ecatalogue.DarCollectorNumber', 'recordNumber', 'string:100'),
        ('ecatalogue.DarIndividualCount', 'individualCount', 'string:100'),
        # According to docs, ageClass has been superseded by lifeStage. We have both, but ageClass duplicates lifeStage
        # And for the ~200 records where it has extra data, the data isn't good
        # ('ecatalogue.DarAgeClass', 'ageClass', 'string:100'),
        ('ecatalogue.DarLifeStage', 'lifeStage', 'string:100'),
        ('ecatalogue.DarSex', 'sex', 'string:100'),
        ('ecatalogue.DarPreparations', 'preparations', 'string:100'),

        # Identification
        ('ecatalogue.DarIdentifiedBy', 'identifiedBy', 'string:100'),
        # KE Emu has 3 fields for identification date: DarDayIdentified, DarMonthIdentified and DarYearIdentified
        # But EntIdeDateIdentified holds them all - which is what we want for
        # dateIdentified
        ('ecatalogue.EntIdeDateIdentified', 'dateIdentified', 'string:100'),
        ('ecatalogue.DarIdentificationQualifier', 'identificationQualifier',
         'string:100'),
        # ('ecatalogue.DarFieldNumber', 'Field number', 'string:100'),  Removed as mostly duplicates DarCollectorNumber (JW - feedback)
        ('ecatalogue.DarTimeOfDay', 'eventTime', 'string:100'),
        ('ecatalogue.DarDayCollected', 'day', 'string:100'),
        ('ecatalogue.DarMonthCollected', 'month', 'string:100'),
        ('ecatalogue.DarYearCollected', 'year', 'string:100'),

        # Geo
        ('ecatalogue.DarEarliestEon', 'earliestEonOrLowestEonothem',
         'string:100'),
        ('ecatalogue.DarLatestEon', 'latestEonOrHighestEonothem',
         'string:100'),
        ('ecatalogue.DarEarliestEra', 'earliestEraOrLowestErathem',
         'string:100'),
        ('ecatalogue.DarLatestEra', 'latestEraOrHighestErathem', 'string:100'),
        ('ecatalogue.DarEarliestPeriod', 'earliestPeriodOrLowestSystem',
         'string:100'),
        ('ecatalogue.DarLatestPeriod', 'latestPeriodOrHighestSystem',
         'string:100'),
        ('ecatalogue.DarEarliestEpoch', 'earliestEpochOrLowestSeries',
         'string:100'),
        ('ecatalogue.DarLatestEpoch', 'latestEpochOrHighestSeries',
         'string:100'),
        ('ecatalogue.DarEarliestAge', 'earliestAgeOrLowestStage',
         'string:100'),
        ('ecatalogue.DarLatestAge', 'latestAgeOrHighestStage', 'string:100'),
        ('ecatalogue.DarLowestBiostrat', 'lowestBiostratigraphicZone',
         'string:100'),
        ('ecatalogue.DarHighestBiostrat', 'highestBiostratigraphicZone',
         'string:100'),
        ('ecatalogue.DarGroup', 'group', 'string:100'),
        ('ecatalogue.DarFormation', 'formation', 'string:100'),
        ('ecatalogue.DarMember', 'member', 'string:100'),
        ('ecatalogue.DarBed', 'bed', 'string:100'),

        # Resource relationship
        # ('ecatalogue.DarRelatedCatalogItem', 'Related resource id', 'string:100'), Only 34 records have this field populated
        # So it's better to build automatically from part / parent records

        # Multimedia
        ('ecatalogue.MulMultiMediaRef', 'associatedMedia', 'json'),

        # Dynamic properties
        # These fields do not map to DwC, but are still very useful
        ('ecatalogue.ColRecordType', 'recordType', 'string:100'),
        ('ecatalogue.ColSubDepartment', 'subDepartment', 'string:100'),
        ('ecatalogue.PrtType', 'partType', 'string:100'),
        ('ecatalogue.RegCode', 'registrationCode', 'string:100'),
        ('ecatalogue.CatKindOfObject', 'kindOfObject', 'string:100'),
        ('ecatalogue.CatKindOfCollection', 'kindOfCollection', 'string:100'),
        ('ecatalogue.CatPreservative', 'preservative', 'string:100'),
        ('ecatalogue.ColKind', 'collectionKind', 'string:100'),
        ('ecatalogue.EntPriCollectionName', 'collectionName', 'string:100'),
        ('ecatalogue.PalAcqAccLotDonorFullName', 'donorName', 'string:100'),
        ('ecatalogue.DarPreparationType', 'preparationType', 'string:100'),
        ('ecatalogue.DarObservedWeight', 'observedWeight', 'string:100'),

        # Location
        # Data is stored in sumViceCountry field in ecatalogue data - but actually this
        # should be viceCountry (which it is in esites)
        ('ecatalogue.sumViceCountry', 'viceCounty', 'string:100'),
        ('ecatalogue.DnaExtractionMethod', 'extractionMethod', 'string:100'),
        ('ecatalogue.DnaReSuspendedIn', 'resuspendedIn', 'string:100'),
        ('ecatalogue.DnaTotalVolume', 'totalVolume', 'string:100'),
        # Parasite card
        ('ecatalogue.CardBarcode', 'barcode', 'string:100'),
        # Egg
        ('ecatalogue.EggClutchSize', 'clutchSize', 'string:100'),
        ('ecatalogue.EggSetMark', 'setMark', 'string:100'),
        # Nest
        ('ecatalogue.NesShape', 'nestShape', 'string:100'),
        ('ecatalogue.NesSite', 'nestSite', 'string:100'),
        # Silica gel
        ('ecatalogue.SilPopulationCode', 'populationCode', 'string:100'),
        # Botany
        ('ecatalogue.CollExsiccati', 'exsiccati', 'string:100'),
        ('ecatalogue.ColExsiccatiNumber', 'exsiccatiNumber', 'string:100'),
        # JW asked for this to be renamed from Site Description => Label
        # locality
        ('ecatalogue.ColSiteDescription', 'labelLocality', 'string:100'),
        ('ecatalogue.ColPlantDescription', 'plantDescription', 'string:100'),
        ('ecatalogue.FeaCultivated', 'cultivated', 'string:100'),

        # ('ecatalogue.FeaPlantForm', 'Plant form', 'string:100'),  # JW asked for this to be removed
        # Paleo
        ('ecatalogue.PalDesDescription', 'catalogueDescription', 'string:100'),
        ('ecatalogue.PalStrChronostratLocal', 'chronostratigraphy',
         'string:100'),
        ('ecatalogue.PalStrLithostratLocal', 'lithostratigraphy',
         'string:100'),
        # Mineralogy
        ('ecatalogue.MinDateRegistered', 'dateRegistered', 'string:100'),
        ('ecatalogue.MinIdentificationAsRegistered',
         'identificationAsRegistered', 'string:100'),
        ('ecatalogue.MinIdentificationDescription',
         'identificationDescription', 'string:500'),
        ('ecatalogue.MinPetOccurance', 'occurrence', 'string:100'),
        ('ecatalogue.MinOreCommodity', 'commodity', 'string:200'),
        ('ecatalogue.MinOreDepositType', 'depositType', 'string:100'),
        ('ecatalogue.MinTextureStructure', 'texture', 'string:100'),
        ('ecatalogue.MinIdentificationVariety', 'identificationVariety',
         'string:100'),
        ('ecatalogue.MinIdentificationOther', 'identificationOther',
         'string:100'),
        ('ecatalogue.MinHostRock', 'hostRock', 'string:100'),
        ('ecatalogue.MinAgeDataAge', 'age', 'string:100'),
        ('ecatalogue.MinAgeDataType', 'ageType', 'string:100'),
        # Mineralogy location
        ('ecatalogue.MinNhmTectonicProvinceLocal', 'tectonicProvince',
         'string:100'),
        ('ecatalogue.MinNhmStandardMineLocal', 'mine', 'string:100'),
        ('ecatalogue.MinNhmMiningDistrictLocal', 'miningDistrict',
         'string:100'),
        ('ecatalogue.MinNhmComplexLocal', 'mineralComplex', 'string:100'),
        ('ecatalogue.MinNhmRegionLocal', 'geologyRegion', 'string:100'),
        # Meteorite
        ('ecatalogue.MinMetType', 'meteoriteType', 'string:100'),
        ('ecatalogue.MinMetGroup', 'meteoriteGroup', 'string:100'),
        ('ecatalogue.MinMetChondriteAchondrite', 'chondriteAchondrite',
         'string:100'),
        ('ecatalogue.MinMetClass', 'meteoriteClass', 'string:100'),
        ('ecatalogue.MinMetPetType', 'petrologyType', 'string:100'),
        ('ecatalogue.MinMetPetSubtype', 'petrologySubtype', 'string:100'),
        ('ecatalogue.MinMetRecoveryFindFall', 'recovery', 'string:100'),
        ('ecatalogue.MinMetRecoveryDate', 'recoveryDate', 'string:100'),
        ('ecatalogue.MinMetRecoveryWeight', 'recoveryWeight', 'string:100'),
        ('ecatalogue.MinMetWeightAsRegistered', 'registeredWeight',
         'string:100'),
        ('ecatalogue.MinMetWeightAsRegisteredUnit', 'registeredWeightUnit',
         'string:100'),
        # Project
        ('ecatalogue.NhmSecProjectName', 'project', 'string:100'),
        # Project
        ('ecatalogue.EntCatBarcode', 'barcode', 'string:100'),

        # Record level
        ('ecatalogue.AdmDateModified', 'modified', 'string:100'),
        # This isn't actually in DwC - but I'm going to use dcterms:created
        ('ecatalogue.AdmDateInserted', 'created', 'string:100'),

        # Internal
        ('ecatalogue.RegRegistrationParentRef', '_parentRef', 'int32'),
        ('ecatalogue.sumSiteRef', '_siteRef', 'int32'),
        ('ecatalogue.sumCollectionEventRef', '_collectionEventRef', 'int32'),
        ('ecatalogue.CardParasiteRef', '_cardParasiteRef', 'int32'),
        # Used if DarCatalogueNumber is empty
        ('ecatalogue.RegRegistrationNumber', '_regRegistrationNumber',
         'string:100'),

        # Used if CatPreservative is empty
        ('ecatalogue.EntCatPreservation', '_entCatPreservation', 'string:100'),

        # Used to build previous determinations for Botany
        ('ecatalogue.IdeCitationTypeStatus', '_determinationTypes',
         'string:100'),
        ('ecatalogue.EntIdeScientificNameLocal', '_determinationNames',
         'string:250'),
        ('ecatalogue.EntIdeFiledAs', '_determinationFiledAs', 'string:100'),
        # If DarTypeStatus is empty, we'll use sumTypeStatus which has previous
        # determinations
        ('ecatalogue.sumTypeStatus', '_sumTypeStatus', 'string:100'),

        # If DarMinimumDepthInMeters is empty, use CollEventFromMetres - used for abyssline project
        ('ecatalogue.CollEventFromMetres', '_collEventFromMetres', 'string:100'
         ),
        ('ecatalogue.CollEventToMetres', '_collEventToMetres', 'string:100'),

        # Locality if nearest named place is empty
        # The encoding of DarLocality is buggered - see ecatalogue.1804973
        # So better to use the original field with the correct encoding
        ('ecatalogue.sumPreciseLocation', '_preciseLocation', 'string:100'),
        # Locality if precise and nearest named place is empty
        ('ecatalogue.MinNhmVerbatimLocalityLocal', '_minLocalityLocal',
         'string:100'),

        # CITES specimens
        ('ecatalogue.cites', '_cites', 'bool'),

        # Parasite cards use a different field for life stage
        ('ecatalogue.CardParasiteStage', '_parasiteStage', 'string:100'),

        # Join keys
        ('ecollectionevents._id', '_ecollectioneventsIrn', 'int32'),
        ('esites._id', '_esitesIrn', 'int32'),

        # Removed: We do not want notes, could contain anything
        # ('ecatalogue.DarNotes', 'DarNotes', 'string:100'),
        # ('ecatalogue.DarLatLongComments', 'latLongComments', 'string:100'),
    ]

    # Used to merge in data from parasite cards, which do not have taxonomic
    # data
    parasite_taxonomy_fields = [
        ('_id', '_irn', 'int32'),
        ('ClaScientificNameBuilt', 'scientificName', 'string:100'),
        ('ClaKingdom', 'kingdom', 'string:60'),
        ('ClaPhylum', 'phylum', 'string:100'),
        ('ClaClass', 'class', 'string:100'),
        ('ClaOrder', 'order', 'string:100'),
        ('ClaFamily', 'family', 'string:100'),
        ('ClaGenus', 'genus', 'string:100'),
        ('ClaSubgenus', 'subgenus', 'string:100'),
        ('ClaSpecies', 'specificEpithet', 'string:100'),
        ('ClaSubspecies', 'infraspecificEpithet', 'string:100'),
        ('ClaRank', 'taxonRank', 'string:10')  # NB: CKAN uses rank internally
    ]

    # Columns not selected from the database
    # In the format (field_name, field_type, default_value)
    literal_columns = [
        ('institutionCode', 'string:100', 'NHMUK'),
        ('basisOfRecord', 'string:100', 'Specimen'),
        ('determinations', 'json', np.NaN),
        # This is set dynamically if this is a part record (with parent Ref)
        ('relatedResourceID', 'string:100', np.NaN),
        ('relationshipOfResource', 'string:100', np.NaN),
        ('centroid', 'bool', False),
        ('otherCatalogNumbers', 'string:100', np.NaN)
    ]

    @property
    def query(self):
        """
        Query object for selecting data from mongoDB

        To test encoding, use query = {'_id': 42866}

        @return: dict
        """
        query = super(SpecimenDatasetTask, self).query

        # Override the default ColRecordType
        query['ColRecordType'] = {
            "$nin":
            PARENT_TYPES +
            [ArtefactDatasetTask.record_type, IndexLotDatasetTask.record_type]
        }

        # And exclude all with an embargo date (timestamp) in the future
        query['RealEmbargoDate'] = {"$lt": time.time()}

        return query

    def get_output_columns(self):
        """
        Override default get_output_columns and add in literal columns (not retrieved from mongo)
        @return:
        """
        output_columns = super(SpecimenDatasetTask, self).get_output_columns()

        # Add the literal columns
        for (field_name, field_type, _) in self.literal_columns:
            output_columns[field_name] = field_type

        return output_columns

    def process_dataframe(self, m, df):
        """
        Process the dataframe, updating multimedia irns => URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        df = super(SpecimenDatasetTask, self).process_dataframe(m, df)

        # Added literal columns
        for (field_name, _, default_value) in self.literal_columns:
            df[field_name] = default_value

        # Convert collection code to PAL, MIN etc.,
        df['collectionCode'] = df['collectionCode'].str.upper().str[0:3]
        # Entom record collection code = BMNH(E)
        df['collectionCode'][df['collectionCode'] == 'ENT'] = "BMNH(E)"

        # Add the old stable identifier - IRN concatenated with catalogue name
        # etc.,
        df['otherCatalogNumbers'] = 'NHMUK:ecatalogue:' + \
                                    df['_id'].astype('str')

        # Ensure multimedia resources are suitable (jpeg rather than tiff
        # etc.,)
        self.ensure_multimedia(df, 'associatedMedia')

        # Assign determination name, type and filed-as values to determinations for
        # the determination history
        determination_fields = [('name', '_determinationNames'),
                                ('type', '_determinationTypes'),
                                ('filedAs', '_determinationFiledAs')]

        def determinations_json(row):
            """
            Convert determination fields to json
            Dictionary comprehension looping through each field, and if it exists adding to a dict
            @param row:
            @return:
            """
            return json.dumps({
                field_name: row[determination].split(';')
                for field_name, determination in determination_fields
                if row[determination]
            })

        df['determinations'] = df[df['_determinationNames'] != ''].apply(
            determinations_json, axis=1)

        # There doesn't seem to be a good way to identify centroids in KE EMu
        # I was using esites.LatDeriveCentroid, but this always defaults to True
        # Trying to use the centroid lat/lon fields also includes pretty much every record
        # But matching against *entroid being added to georeferencing notes
        # produces much better results
        df['centroid'][df['_latLongComments'].str.contains("entroid")] = True

        # Convert all blank strings to NaN so we can use fillna &
        # combine_first() to replace NaNs with value from parent df
        df = df.applymap(lambda x: np.nan
                         if isinstance(x, basestring) and x == '' else x)

        df['catalogNumber'].fillna(df['_regRegistrationNumber'], inplace=True)

        # If PalNearestNamedPlaceLocal is missing, use sumPreciseLocation
        # And then try MinNhmVerbatimLocalityLocal
        df['locality'].fillna(df['_preciseLocation'], inplace=True)
        df['locality'].fillna(df['_minLocalityLocal'], inplace=True)

        # Replace missing DarTypeStatus
        df['typeStatus'].fillna(df['_sumTypeStatus'], inplace=True)

        # Replace missing depth fields
        df['minimumDepthInMeters'].fillna(df['_collEventFromMetres'],
                                          inplace=True)
        df['maximumDepthInMeters'].fillna(df['_collEventToMetres'],
                                          inplace=True)

        # Replace missing CatPreservative
        df['preservative'].fillna(df['_entCatPreservation'], inplace=True)

        # Cultivated should only be set on Botany records - but is actually on
        # everything
        df['cultivated'][df['collectionCode'] != 'BOT'] = np.nan

        # Process part parents
        parent_irns = self._get_unique_irns(df, '_parentRef')

        if parent_irns:
            # We want to get all parts associated to one parent record, so we can provide them as associated records
            # So select all records matching the parent IRN
            q = dict(self.query)

            # Delete _id if it's set - need this for testing
            if '_id' in q:
                del q['_id']

            # Get all records with the same parent, so we can add them as
            # related records
            q['RegRegistrationParentRef'] = {'$in': parent_irns}
            monary_query = m.query(
                config.get('mongo', 'database'), 'ecatalogue', q,
                ['RegRegistrationParentRef', 'AdmGUIDPreferredValue'],
                ['int32', 'string:36'])
            part_df = pd.DataFrame(
                np.matrix(monary_query).transpose(),
                columns=['RegRegistrationParentRef', 'AdmGUIDPreferredValue'])
            part_df['RegRegistrationParentRef'] = part_df[
                'RegRegistrationParentRef'].astype('int32')

            # Group by parent ref and concatenate all the GUIDs together
            # So we now have:
            # parent_irn   guid; guid
            parts = part_df.groupby('RegRegistrationParentRef')[
                'AdmGUIDPreferredValue'].apply(lambda x: "%s" % ';'.join(x))

            # And update the main data frame with the grouped parts, merged on
            # _parentRef
            df['relatedResourceID'] = df.apply(
                lambda row: parts[row['_parentRef']]
                if row['_parentRef'] in parts else np.NaN,
                axis=1)
            df['relationshipOfResource'][
                df['relatedResourceID'].notnull()] = 'Parts'

            parent_df = self.get_dataframe(
                m, 'ecatalogue',
                self.get_collection_source_columns('ecatalogue'), parent_irns,
                '_id')

            # Ensure the parent multimedia images are usable
            self.ensure_multimedia(parent_df, 'associatedMedia')

            # Assign parentRef as the index to allow us to combine with
            # parent_df
            df.index = df['_parentRef']

            # There is an annoying bug that coerces string columns to integers in combine_first
            # Hack: ensure there's always a string value that cannot be coerced in every column
            # So we create a dummy row, which gets deleted after combine_first
            # is called
            dummy_index = len(df) + 1
            parent_df.loc[dummy_index] = ['-' for _ in parent_df]
            df = df.combine_first(parent_df)
            df = df.drop([dummy_index])

        # Ensure our geo fields are floats
        df['decimalLongitude'] = df['decimalLongitude'].astype('float64')
        df['decimalLatitude'] = df['decimalLatitude'].astype('float64')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # Load extra sites info (if there's an error radius + unit)
        site_irns = self._get_unique_irns(df, '_siteRef')

        sites_df = self.get_dataframe(m, 'esites',
                                      collection_columns['esites'], site_irns,
                                      '_esitesIrn')

        df = pd.merge(df,
                      sites_df,
                      how='outer',
                      left_on=['_siteRef'],
                      right_on=['_esitesIrn'])

        # For CITES species, we need to hide Lat/Lon and Locality data - and
        # label images
        for i in [
                'locality', 'labelLocality', 'decimalLongitude',
                'decimalLatitude', 'verbatimLongitude', 'verbatimLatitude',
                'centroid', 'maxError', 'higherGeography', 'associatedMedia'
        ]:
            df[i][df['_cites'] == 'True'] = np.NaN

        # Some records are being assigned a centroid even if they have no lat/lon fields.
        # Ensure it's False if latitude is null
        df['centroid'][df['decimalLatitude'].isnull()] = False

        # Load collection event data
        collection_event_irns = self._get_unique_irns(df,
                                                      '_collectionEventRef')

        # if collection_event_irns:
        collection_event_df = self.get_dataframe(
            m, 'ecollectionevents', collection_columns['ecollectionevents'],
            collection_event_irns, '_ecollectioneventsIrn')
        # print collection_event_df
        df = pd.merge(df,
                      collection_event_df,
                      how='outer',
                      left_on=['_collectionEventRef'],
                      right_on=['_ecollectioneventsIrn'])

        # Add parasite life stage
        # Parasite cards use a different field for life stage
        df['lifeStage'].fillna(df['_parasiteStage'], inplace=True)

        # Add parasite card
        parasite_taxonomy_irns = self._get_unique_irns(df, '_cardParasiteRef')

        if parasite_taxonomy_irns:
            parasite_df = self.get_dataframe(m, 'etaxonomy',
                                             self.parasite_taxonomy_fields,
                                             parasite_taxonomy_irns, '_irn')
            df.index = df['_cardParasiteRef']
            df = df.combine_first(parasite_df)

        return df
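
A toy illustration (invented data) of the parent/part grouping used above: the GUIDs of all parts sharing a RegRegistrationParentRef are concatenated into one ';'-separated relatedResourceID string.

import pandas as pd

part_df = pd.DataFrame({'RegRegistrationParentRef': [100, 100, 200],
                        'AdmGUIDPreferredValue': ['guid-a', 'guid-b', 'guid-c']})
parts = part_df.groupby('RegRegistrationParentRef')['AdmGUIDPreferredValue'] \
               .apply(lambda x: ';'.join(x))
print(parts[100])   # 'guid-a;guid-b'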
Example #18
class IndexLotDatasetTask(DatasetTask):

    record_type = 'Index Lot'

    # CKAN Dataset params
    package = {
        'name': 'collection-indexlots',
        'notes':
        u'Index Lot records from the Natural History Museum\'s collection',
        'title': "Index Lot collection",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'name': 'Index Lots',
            'description':
            'Species level record denoting the presence of a taxon in the Museum collection',
            'format': 'csv'
        },
        'primary_key': 'GUID'
    }

    columns = [
        ('etaxonomy2._id', '_current_name_irn', 'int32'),
        ('etaxonomy2.ClaScientificNameBuilt', 'Currently accepted name',
         'string:100'),
        ('etaxonomy._id', '_taxonomy_irn', 'int32'),
        ('etaxonomy.ClaScientificNameBuilt', 'Original name', 'string:100'),
        ('etaxonomy.ClaKingdom', 'Kingdom', 'string:60'),
        ('etaxonomy.ClaPhylum', 'Phylum', 'string:100'),
        ('etaxonomy.ClaClass', 'Class', 'string:100'),
        ('etaxonomy.ClaOrder', 'Order', 'string:100'),
        ('etaxonomy.ClaSuborder', 'Suborder', 'string:100'),
        ('etaxonomy.ClaSuperfamily', 'Superfamily', 'string:100'),
        ('etaxonomy.ClaFamily', 'Family', 'string:100'),
        ('etaxonomy.ClaSubfamily', 'Subfamily', 'string:100'),
        ('etaxonomy.ClaGenus', 'Genus', 'string:100'),
        ('etaxonomy.ClaSubgenus', 'Subgenus', 'string:100'),
        ('etaxonomy.ClaSpecies', 'Species', 'string:100'),
        ('etaxonomy.ClaSubspecies', 'Subspecies', 'string:100'),
        ('etaxonomy.ClaRank', 'Taxonomic rank',
         'string:20'),  # NB: CKAN uses rank internally
        ('ecatalogue.AdmGUIDPreferredValue', 'GUID', 'uuid'),
        ('ecatalogue._id', 'IRN', 'int32'),
        ('ecatalogue.EntIndIndexLotNameRef', '_collection_index_irn', 'int32'),
        ('ecatalogue.EntIndMaterial', 'Material', 'bool'),
        ('ecatalogue.EntIndType', 'Type', 'bool'),
        ('ecatalogue.EntIndMedia', 'Media', 'bool'),
        ('ecatalogue.EntIndBritish', 'British', 'bool'),
        ('ecatalogue.EntIndKindOfMaterial', 'Kind of material', 'string:100'),
        ('ecatalogue.EntIndKindOfMedia', 'Kind of media', 'string:100'),

        # Material detail
        ('ecatalogue.EntIndCount', 'Material count', 'string:100'),
        ('ecatalogue.EntIndSex', 'Material sex', 'string:100'),
        ('ecatalogue.EntIndStage', 'Material stage', 'string:100'),
        ('ecatalogue.EntIndTypes', 'Material types', 'string:100'),
        ('ecatalogue.EntIndPrimaryTypeNo', 'Material primary type no',
         'string:100'),

        # Separate Botany and Entomology
        ('ecatalogue.ColDepartment', 'Department', 'string:100'),

        # Audit info
        ('ecatalogue.AdmDateModified', 'Modified', 'string:100'),
        ('ecatalogue.AdmDateInserted', 'Created', 'string:100'),
    ]

    def process_dataframe(self, m, df):
        """
        Process the dataframe, adding in the taxonomy fields
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """

        # Try and get taxonomy using the collection index
        # BS: 20140804 - Fix indexlots taxonomy bug
        # When the index lot record's taxonomy is updated (via collection index),
        # the index lot record's EntIndIndexLotTaxonNameLocalRef is not updated with the new taxonomy
        # So we need to use collection index to retrieve the record taxonomy

        df = super(IndexLotDatasetTask, self).process_dataframe(m, df)

        # Convert booleans to yes / no for all columns in the main collection
        for (_, field, field_type) in self.get_collection_source_columns(
                self.collection_name):
            if field_type == 'bool':
                df[field][df[field] == 'True'] = 'Yes'
                df[field][df[field] == 'False'] = 'No'
                df[field][df[field] == 'N/A'] = ''

        # BUG FIX BS 140811
        # ColCurrentNameRef is not being updated correctly - see record 899984
        # ColCurrentNameRef = 964105
        # Not a problem, as indexlots are using ColTaxonomicNameRef for summary data etc.,
        # So ColTaxonomicNameRef is the correct field to use.
        collection_index_columns = [
            ('_id', '_collection_index_irn', 'int32'),
            ('ColTaxonomicNameRef', '_taxonomy_irn', 'int32'),
            ('ColCurrentNameRef', '_current_name_irn', 'int32'),
        ]

        collection_index_irns = self._get_unique_irns(df,
                                                      '_collection_index_irn')
        collection_index_df = self.get_dataframe(m, 'ecollectionindex',
                                                 collection_index_columns,
                                                 collection_index_irns,
                                                 '_collection_index_irn')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # And get the taxonomy for these collection index records
        taxonomy_irns = self._get_unique_irns(collection_index_df,
                                              '_taxonomy_irn')

        # The query to pre-load all taxonomy objects takes ~96 seconds
        # It is much faster to load taxonomy objects on the fly, for the current block
        # collection_index_irns = pd.unique(df._collection_index_irn.values.ravel()).tolist()
        taxonomy_df = self.get_dataframe(m, 'etaxonomy',
                                         collection_columns['etaxonomy'],
                                         taxonomy_irns, '_taxonomy_irn')

        # Merge the taxonomy into the collection index dataframe - we need to do this so we can merge into
        # main dataframe keyed by collection index ID
        collection_index_df = pd.merge(collection_index_df,
                                       taxonomy_df,
                                       how='inner',
                                       left_on=['_taxonomy_irn'],
                                       right_on=['_taxonomy_irn'])

        # Add current name - same process as the main taxonomy but using _current_name_irn source fields
        current_name_irns = self._get_unique_irns(collection_index_df,
                                                  '_current_name_irn')
        current_name_df = self.get_dataframe(m, 'etaxonomy',
                                             collection_columns['etaxonomy2'],
                                             current_name_irns,
                                             '_current_name_irn')
        collection_index_df = pd.merge(collection_index_df,
                                       current_name_df,
                                       how='inner',
                                       left_on=['_current_name_irn'],
                                       right_on=['_current_name_irn'])

        # Merge results into main dataframe
        df = pd.merge(df,
                      collection_index_df,
                      how='outer',
                      left_on=['_collection_index_irn'],
                      right_on=['_collection_index_irn'])

        return df

    def get_output_columns(self):
        """
        Get a list of output columns, with bool converted to string:3 (so they can be converted to Yes/No)
        @return:
        """
        return OrderedDict((col[1], 'string:3' if col[2] == 'bool' else col[2])
                           for col in self.columns
                           if self._is_output_field(col[1]))
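
The get_output_columns() pattern above is easier to see with concrete values. A minimal standalone sketch - the column tuples and the underscore convention in _is_output_field are illustrative assumptions, not the project's actual definitions:

from collections import OrderedDict

# Hypothetical (source field, output name, type) tuples standing in for self.columns
columns = [
    ('ecatalogue.DarTypeStatus', 'typeStatus', 'string:100'),
    ('ecatalogue.EntIndTypeSpecimen', 'isTypeSpecimen', 'bool'),
    ('ecatalogue.ColRecordType', '_recordType', 'string:100'),
]

def _is_output_field(field):
    # Assumption: hidden fields are prefixed with an underscore
    return not field.startswith('_')

output_columns = OrderedDict(
    (name, 'string:3' if col_type == 'bool' else col_type)
    for _, name, col_type in columns
    if _is_output_field(name))
# OrderedDict([('typeStatus', 'string:100'), ('isTypeSpecimen', 'string:3')])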
Example #19
0
class MongoTask(luigi.Task):

    date = luigi.IntParameter()
    # Added parameter to allow skipping the processing of records - this is so MW can look at the raw data in mongo
    unprocessed = luigi.BooleanParameter(default=False, significant=False)
    flatten_mode = FlattenModeParameter(default=FLATTEN_ALL, significant=False)

    database = config.get('mongo', 'database')
    keemu_schema_file = config.get('keemu', 'schema')

    batch_size = 1000
    bulk_op_size = 100000
    collection = None
    file_extension = 'export'

    @abc.abstractproperty
    def module(self):
        return None

    @property
    def collection_name(self):
        return self.module  # By default, the collection name will be the same as the module

    def requires(self):
        return KEFileTask(module=self.module,
                          date=self.date,
                          file_extension=self.file_extension)

    def get_collection(self):
        """
        Get a reference to the mongo collection object
        @return:
        """
        return self.output().get_collection(self.collection_name)

    @timeit
    def run(self):

        ke_data = KEParser(self.input().open('r'),
                           file_path=self.input().path,
                           schema_file=self.keemu_schema_file,
                           flatten_mode=self.flatten_mode)
        self.collection = self.get_collection()

        # If we have any records in the collection, use bulk_update with mongo bulk upsert
        # Otherwise use batch insert (20% faster than using bulk insert())
        if self.collection.find_one():
            self.bulk_update(ke_data)
        else:
            self.batch_insert(ke_data)

        self.mark_complete()

    def mark_complete(self):

        # Move the file to the archive directory (if specified)
        try:
            archive_dir = config.get('keemu', 'archive_dir')
            self.input().move(os.path.join(archive_dir,
                                           self.input().file_name))
        except NoOptionError:
            # Allow archive dir to be none
            pass

        # And mark the object as complete
        self.output().touch()

    def bulk_update(self, ke_data):

        bulk = self.collection.initialize_unordered_bulk_op()

        count = 0

        for record in self.iterate_data(ke_data):

            # Find and replace doc - inserting if it doesn't exist
            bulk.find({'_id': record['_id']}).upsert().replace_one(record)
            count += 1

            # Bulk ops can hit out of memory errors (I'm getting them for ~400,000+ bulk ops)
            # So execute the bulk op in stages, each time bulk_op_size is reached
            if count % self.bulk_op_size == 0:
                log.info('Executing bulk op')
                bulk.execute()
                bulk = self.collection.initialize_unordered_bulk_op()

        try:
            bulk.execute()
        except InvalidOperation:
            # If there are no remaining records to execute, ignore the error -
            # they have already been executed inside the loop above
            pass

    def batch_insert(self, ke_data):
        def _insert(batch):

            try:
                self.collection.insert(batch)
            except DuplicateKeyError:
                # Duplicate key error - KE export does duplicate some records
                # So switch to bulk upsert for this operation

                log.error('Duplicate key error - switching to upsert')

                bulk = self.collection.initialize_unordered_bulk_op()
                for batch_record in batch:
                    bulk.find({
                        '_id': batch_record['_id']
                    }).upsert().replace_one(batch_record)

                bulk.execute()

        batch = []

        for record in self.iterate_data(ke_data):

            if self.batch_size:
                batch.append(record)

                # If the batch length equals the batch size, commit and clear the batch
                if len(batch) % self.batch_size == 0:
                    log.info('Submitting batch')
                    _insert(batch)
                    batch = []

            else:
                self.collection.insert(record)

        # Add any records remaining in the batch
        if batch:
            _insert(batch)

    def iterate_data(self, ke_data):
        """
        Iterate through the data
        @return:
        """
        for record in ke_data:

            status = ke_data.get_status()

            if status:
                log.info(status)

            # Use the IRN as _id
            record['_id'] = record['irn']

            try:
                # Do not process if unprocessed flag is set
                if not self.unprocessed:
                    record = self.process_record(record)

            except InvalidRecordException:
                continue
            else:
                yield record

    def process_record(self, record):

        # Keep the IRN but cast as string, so we can use it in $concat
        record['irn'] = str(record['irn'])

        # Add the date of the export file
        record['exportFileDate'] = self.date

        return record

    def output(self):
        return MongoTarget(database=self.database, update_id=self.update_id())

    def update_id(self):
        """This update id will be a unique identifier for this insert on this collection."""
        return self.task_id

    def on_success(self):
        """
        On completion, add indexes
        @return: None
        """

        self.collection = self.get_collection()

        log.info("Adding exportFileDate index")

        self.collection.ensure_index('exportFileDate')
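
MongoTask above is abstract: module must be supplied by a subclass, and everything else (the KE file requirement, bulk upsert vs batch insert, archiving) is inherited. A minimal sketch of what a concrete subclass might look like - the class name and the extra default field are illustrative, not the project's real task:

class MongoTaxonomySketchTask(MongoTask):
    """Hypothetical task loading the etaxonomy export into mongo."""

    @property
    def module(self):
        return 'etaxonomy'

    def process_record(self, record):
        # Reuse the base behaviour (stringified irn + exportFileDate)
        record = super(MongoTaxonomySketchTask, self).process_record(record)
        # Module-specific tweaks would go here; this default is purely illustrative
        record.setdefault('_hasSynonyms', False)
        return record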
Example #20
0
    def process_dataframe(self, m, df):
        """
        Process the dataframe, updating multimedia irns => URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        df = super(SpecimenDatasetTask, self).process_dataframe(m, df)

        # Add the literal columns
        for (field_name, _, default_value) in self.literal_columns:
            df[field_name] = default_value

        # Convert collection code to PAL, MIN etc.,
        df['collectionCode'] = df['collectionCode'].str.upper().str[0:3]
        # Entom record collection code = BMNH(E)
        df.loc[df['collectionCode'] == 'ENT', 'collectionCode'] = "BMNH(E)"

        # Add the old stable identifier - IRN concatenated with catalogue name
        # etc.,
        df['otherCatalogNumbers'] = 'NHMUK:ecatalogue:' + \
                                    df['_id'].astype('str')

        # Ensure multimedia resources are suitable (jpeg rather than tiff
        # etc.,)
        self.ensure_multimedia(df, 'associatedMedia')

        # Assign determination name, type and 'filed as' values to the
        # determinations field, for determination history
        determination_fields = [('name', '_determinationNames'),
                                ('type', '_determinationTypes'),
                                ('filedAs', '_determinationFiledAs')]

        def determinations_json(row):
            """
            Convert determination fields to json
            Dictionary comprehension looping through each field and, if it has a value, adding it to the dict
            @param row:
            @return:
            """
            return json.dumps({
                field_name: row[determination].split(';')
                for field_name, determination in determination_fields
                if row[determination]
            })

        df['determinations'] = df[df['_determinationNames'] != ''].apply(
            determinations_json, axis=1)

        # There doesn't seem to be a good way to identify centroids in KE EMu
        # esites.LatDeriveCentroid always defaults to True, and using the centroid
        # lat/lon fields also matches pretty much every record. Matching against
        # *entroid in the georeferencing notes produces much better results
        df.loc[df['_latLongComments'].str.contains("entroid"), 'centroid'] = True

        # Convert all blank strings to NaN so we can use fillna &
        # combine_first() to replace NaNs with value from parent df
        df = df.applymap(lambda x: np.nan
                         if isinstance(x, basestring) and x == '' else x)

        df['catalogNumber'].fillna(df['_regRegistrationNumber'], inplace=True)

        # If PalNearestNamedPlaceLocal is missing, use sumPreciseLocation
        # And then try MinNhmVerbatimLocalityLocal
        df['locality'].fillna(df['_preciseLocation'], inplace=True)
        df['locality'].fillna(df['_minLocalityLocal'], inplace=True)

        # Replace missing DarTypeStatus
        df['typeStatus'].fillna(df['_sumTypeStatus'], inplace=True)

        # Replace missing depth fields
        df['minimumDepthInMeters'].fillna(df['_collEventFromMetres'],
                                          inplace=True)
        df['maximumDepthInMeters'].fillna(df['_collEventToMetres'],
                                          inplace=True)

        # Replace missing CatPreservative
        df['preservative'].fillna(df['_entCatPreservation'], inplace=True)

        # Cultivated should only be set on Botany records - but is actually on
        # everything
        df.loc[df['collectionCode'] != 'BOT', 'cultivated'] = np.nan

        # Process part parents
        parent_irns = self._get_unique_irns(df, '_parentRef')

        if parent_irns:
            # We want all parts associated with a parent record, so we can provide
            # them as associated records - select all records matching the parent IRN
            q = dict(self.query)

            # Delete _id if it's set - need this for testing
            if '_id' in q:
                del q['_id']

            # Get all records with the same parent, so we can add them as
            # related records
            q['RegRegistrationParentRef'] = {'$in': parent_irns}
            monary_query = m.query(
                config.get('mongo', 'database'), 'ecatalogue', q,
                ['RegRegistrationParentRef', 'AdmGUIDPreferredValue'],
                ['int32', 'string:36'])
            part_df = pd.DataFrame(
                np.matrix(monary_query).transpose(),
                columns=['RegRegistrationParentRef', 'AdmGUIDPreferredValue'])
            part_df['RegRegistrationParentRef'] = part_df[
                'RegRegistrationParentRef'].astype('int32')

            # Group by parent ref and concatenate all the GUIDs together
            # So we now have:
            # parent_irn   guid; guid
            parts = part_df.groupby('RegRegistrationParentRef')[
                'AdmGUIDPreferredValue'].apply(lambda x: "%s" % ';'.join(x))

            # And update the main data frame with the grouped parts, merged on
            # _parentRef
            df['relatedResourceID'] = df.apply(
                lambda row: parts[row['_parentRef']]
                if row['_parentRef'] in parts else np.NaN,
                axis=1)
            df.loc[df['relatedResourceID'].notnull(),
                   'relationshipOfResource'] = 'Parts'

            parent_df = self.get_dataframe(
                m, 'ecatalogue',
                self.get_collection_source_columns('ecatalogue'), parent_irns,
                '_id')

            # Ensure the parent multimedia images are usable
            self.ensure_multimedia(parent_df, 'associatedMedia')

            # Assign parentRef as the index to allow us to combine with
            # parent_df
            df.index = df['_parentRef']

            # There is an annoying bug in combine_first that coerces string columns to integers
            # Hack: ensure every column always has a string value that cannot be coerced
            # So create a dummy row, which gets deleted after combine_first
            # is called
            dummy_index = len(df) + 1
            parent_df.loc[dummy_index] = ['-' for _ in parent_df]
            df = df.combine_first(parent_df)
            df = df.drop([dummy_index])

        # Ensure our geo fields are floats
        df['decimalLongitude'] = df['decimalLongitude'].astype('float64')
        df['decimalLatitude'] = df['decimalLatitude'].astype('float64')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # Load extra sites info (if there's an error radius + unit)
        site_irns = self._get_unique_irns(df, '_siteRef')

        sites_df = self.get_dataframe(m, 'esites',
                                      collection_columns['esites'], site_irns,
                                      '_esitesIrn')

        df = pd.merge(df,
                      sites_df,
                      how='outer',
                      left_on=['_siteRef'],
                      right_on=['_esitesIrn'])

        # For CITES species, we need to hide Lat/Lon and Locality data - and
        # label images
        for i in [
                'locality', 'labelLocality', 'decimalLongitude',
                'decimalLatitude', 'verbatimLongitude', 'verbatimLatitude',
                'centroid', 'maxError', 'higherGeography', 'associatedMedia'
        ]:
            df.loc[df['_cites'] == 'True', i] = np.NaN

        # Some records are being assigned a Centroid even if they have no lat/lon fields.
        # Ensure it's False if latitude is null
        df.loc[df['decimalLatitude'].isnull(), 'centroid'] = False

        # Load collection event data
        collection_event_irns = self._get_unique_irns(df,
                                                      '_collectionEventRef')

        # if collection_event_irns:
        collection_event_df = self.get_dataframe(
            m, 'ecollectionevents', collection_columns['ecollectionevents'],
            collection_event_irns, '_ecollectioneventsIrn')
        # print collection_event_df
        df = pd.merge(df,
                      collection_event_df,
                      how='outer',
                      left_on=['_collectionEventRef'],
                      right_on=['_ecollectioneventsIrn'])

        # Add parasite life stage
        # Parasite cards use a different field for life stage
        df['lifeStage'].fillna(df['_parasiteStage'], inplace=True)

        # Add parasite card
        parasite_taxonomy_irns = self._get_unique_irns(df, '_cardParasiteRef')

        if parasite_taxonomy_irns:
            parasite_df = self.get_dataframe(m, 'etaxonomy',
                                             self.parasite_taxonomy_fields,
                                             parasite_taxonomy_irns, '_irn')
            df.index = df['_cardParasiteRef']
            df = df.combine_first(parasite_df)

        return df
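
The dummy-row workaround above (protecting combine_first from dtype coercion) is easier to follow in isolation. A minimal sketch with made-up data:

import numpy as np
import pandas as pd

# Part records indexed by their parent IRN; catalogNumber is missing for 100
df = pd.DataFrame({'catalogNumber': [np.nan, 'B2']}, index=[100, 101])
# Parent records keyed by the same IRNs
parent_df = pd.DataFrame({'catalogNumber': ['P100', 'P101']}, index=[100, 101])

# Add an all-string dummy row so combine_first cannot coerce the column to a numeric dtype
dummy_index = len(df) + 1
parent_df.loc[dummy_index] = ['-' for _ in parent_df]

df = df.combine_first(parent_df).drop([dummy_index])
# Index 100 is filled from the parent ('P100'); index 101 keeps its own value ('B2')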
Example #21
0
    def output(self):
        export_dir = config.get('keemu', 'export_dir')
        return KEFileTarget(export_dir, self.module, self.date,
                            self.file_extension)
Example #22
0
def mongo_client_db(database=config.get('mongo', 'database'), host=config.get('mongo', 'host')):
    return MongoClient(host)[database]
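
A short usage sketch for the helper above (the collection name is assumed):

db = mongo_client_db()
ecatalogue = db['ecatalogue']  # assumed collection name
record = ecatalogue.find_one()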
Example #24
0
def main():

    update_markers = mongo_get_update_markers()

    # Make sure the update markers include all the mongo bulk tasks
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate the task and get the family name, not just the class name
        MongoDeleteTask => DeleteTask
        @param tasks:
        @return:
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        #  If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (
            date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [
        d for d in get_export_file_dates() if d not in update_markers.keys()
    ]

    # Run setup_interface_logging to ensure luigi logging is configured
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()

    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:

        log.info('Processing date %s', export_date)
        # We only need to call the mongo delete task, as all the other tasks are requirements of it
        # NB: This doesn't delete anything from CKAN - if that's needed, change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))
        w.run()
        w.stop()
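
For comparison, the per-date scheduling above could also be expressed with luigi's build() helper rather than driving BulkWorker by hand; a rough sketch, assuming the same MongoDeleteTask and that nothing BulkWorker-specific is needed:

import luigi

def run_outstanding_dates(export_dates):
    # One MongoDeleteTask per outstanding export date; its requirements
    # pull in the other bulk tasks for that date
    tasks = [MongoDeleteTask(date=export_date, force=True)
             for export_date in export_dates]
    luigi.build(tasks, local_scheduler=True)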