コード例 #1
    def __init__(self, database, update_id):
        """
        Remember the update id and open the Mongo handles used by this object.

        @param database: handed unchanged to mongo_client_db
        @param update_id: stored on the instance as self.update_id
        """
        self.update_id = update_id
        # Open the connection to the database
        connection = mongo_client_db(database)
        self.db = connection
        # The marker collection is named after the postgres table
        self.marker_collection = self.get_collection(
            self.marker_collection_name
        )
コード例 #3
    def run(self):
        """
        Flag catalogue records whose DarScientificName is a CITES species.

        Every matching document gets cites=True in a single multi-document
        update, and the number of modified documents is logged.
        """
        db = mongo_client_db()
        catalogue_name = MongoCatalogueTask(date=None).collection_name
        species_names = get_cites_species()

        # Set cites=true flag on every record whose name is in the CITES list
        selector = {'DarScientificName': {'$in': species_names}}
        change = {'$set': {'cites': True}}
        update_result = db[catalogue_name].update(selector, change, multi=True)

        log.info('Updated %s catalogue records as CITES', update_result['nModified'])
コード例 #4
def main():
    """
    Print how many ecatalogue documents contain each of the listed Dar* fields.

    For every field name in the list below, the ecatalogue collection is
    queried for documents where that field exists, and one line of the form
    "<field>:<TAB><count>" is written to stdout.
    """
    # Setup MongoDB
    mongo_db = mongo_client_db()

    fields = [
        'DarLocality', 'DarVerbatimElevation', 'DarInfraspecificRank',
        'DarDayIdentified', 'DarMinimumDepthInMeters', 'DarMonthIdentified',
        'DarMaximumDepthInMeters', 'DarIndividualCount', 'DarMaximumDepth',
        'DarVerbatimCollectingDate', 'DarTissues',
        'DarScientificNameAuthorYear', 'DarVerbatimLongitude', 'DarNotes',
        'DarCollectorNumber', 'DarGenBankNum', 'DarIdentificationModifier',
        'DarMinimumDepth', 'DarLatLongComments', 'DarIsland',
        'DarPreviousCatalogNumber', 'DarEndTimeOfDay', 'DarYearCollected',
        'DarVerbatimDepth', 'DarCatalogNumber', 'DarOriginalCoordinateSystem',
        'DarScientificNameAuthor', 'DarOtherCatalogNumbers', 'DarSubgenus',
        'DarFieldNumber', 'DarYearIdentified', 'DarRelationshipType',
        'DarEndMonthCollected', 'DarInfraspecificEpithet', 'DarAgeClass',
        'DarRemarks', 'DarGeodeticDatum', 'DarKingdom',
        'DarStart_EndCoordinatePrecision', 'DarCoordinatePrecision',
        'DarStartTimeOfDay', 'DarSpecificEpithet', 'DarDecimalLongitude',
        'DarLatitude', 'DarCitation', 'DarLifeStage', 'DarFamily',
        'DarStartYearCollected', 'DarEndLatitude', 'DarBasisOfRecord',
        'DarMaximumElevation', 'DarStartLatitude', 'DarCounty',
        'DarRelatedInformation', 'DarObservedIndividualCount', 'DarSource',
        'DarRecordURL', 'DarIslandGroup', 'DarWaterBody',
        'DarCoordinateUncertaintyInMeter', 'DarSex', 'DarStartDayCollected',
        'DarVerbatimLatitude', 'DarGenus', 'DarTimeOfDay', 'DarImageURL',
        'DarDecimalLatitude', 'DarTypeStatus', 'DarStateProvince',
        'DarBoundingBox', 'DarGeorefMethod', 'DarScientificName',
        'DarCollectionCode', 'DarLongitude', 'DarGlobalUniqueIdentifier',
        'DarInstitutionCode', 'DarRelatedCatalogItem', 'DarTimeCollected',
        'DarPreparations', 'DarContinent', 'DarEndJulianDay', 'DarGMLFeature',
        'DarCountry', 'DarJulianDay', 'DarSubspecies', 'DarFieldNotes',
        'DarMaximumElevationInMeters', 'DarContinentOcean',
        'DarIdentificationQualifier', 'DarTimeZone', 'DarEndLongitude',
        'DarHorizontalDatum', 'DarClass', 'DarRelatedCatalogItems',
        'DarPhylum', 'DarStartMonthCollected', 'DarHigherGeography',
        'DarDepthRange', 'DarDateLastModified', 'DarCollector',
        'DarObservedWeight', 'DarMinimumElevationInMeters', 'DarHigherTaxon',
        'DarStartJulianDay', 'DarDayCollected', 'DarTemperature',
        'DarEndDayCollected', 'DarStartLongitude', 'DarCatalogNumberNumeric',
        'DarOrder', 'DarMinimumElevation', 'DarPreparationType',
        'DarEndYearCollected', 'DarMonthCollected', 'DarIdentifiedBy',
        'DarCatalogNumberText', 'DarSpecies',
    ]

    for field in fields:
        results = mongo_db.ecatalogue.find({field: {'$exists': 1}})
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print('{0}:\t{1}\r'.format(field, results.count()))
コード例 #5
def get_cites_species():
    r"""
    Load cites species names from mongo

    These will already have been downloaded from http://checklist.cites.org/#/en in JSON
    And then loaded into the database with:

    mongoimport --db keemu --collection cites --type json --file /vagrant/exports/Index_of_CITES_Species_2014-10-17\ 17-34.json --jsonArray

    This should only be run if mongo is rebuilt - new records are marked as CITES on import

    @return: list of the UTF-8 encoded full_name of every document in the
             CITES collection whose full_name is not None
    """
    # Raw docstring above: the mongoimport example contains a backslash-space,
    # which would otherwise be an invalid escape sequence.
    mongo_db = mongo_client_db()
    # Only the full_name field is needed, so project everything else away
    cursor = mongo_db[CITES_COLLECTION].find({'full_name': {'$ne': None}}, {'full_name': 1})
    return [r['full_name'].encode('utf8') for r in cursor]
コード例 #6
コード例 #7
    def ensure_multimedia(self, df, multimedia_field):
        # Rewrite df[multimedia_field] in place: each cell starts as a
        # ';'-separated string of IRNs and ends up as a JSON string describing
        # the multimedia records found for those IRNs, or NaN when none of the
        # cell's IRNs has an entry in multimedia_dict.
        #
        # @param df: column-indexable table whose columns offer .apply and
        #            .values (presumably a pandas DataFrame - confirm)
        # @param multimedia_field: name of the column holding the IRN strings
        #
        # NOTE(review): this copy of the method looks truncated. Several
        # statements below have unclosed brackets, an `if` has no body and the
        # per-record dict literal has lost its keys. Restore the missing lines
        # from the original file before relying on this code.

        mongo_client = mongo_client_db()

        # The multimedia field contains IRNS of all items - not just images
        # So we need to look up the IRNs against the multimedia record to get the mime type
        # And filter out non-image mimetypes we do not support

        # Convert associatedMedia field to a list
        # (split on ';', strip whitespace, drop empty pieces, cast each to int)
        df[multimedia_field] = df[multimedia_field].apply(
            lambda x: list(int(z.strip()) for z in x.split(';') if z.strip()))

        # Get a unique list of IRNS
        # NOTE(review): the expression that flattens/de-duplicates the per-row
        # lists is incomplete here.
        unique_multimedia_irns = list(
                                  for irn in df[multimedia_field].values])))

        # Get a list of dictionary of valid multimedia valid mimetypes
        # It's not enough to just check for the derived image heights - some of these are tiffs etc., and undeliverable
        # NOTE(review): the braces separating the filter document from the
        # projection document are missing; the first group of keys below looks
        # like the filter and the trailing `: 1` keys look like the projection.
        cursor = mongo_client['emultimedia'].find(
                '_id': {
                    '$in': unique_multimedia_irns
                'AdmPublishWebNoPasswordFlag': 'Y',
                #'NhmSecEmbargoDate': 0,
                'GenDigitalMediaId': {
                    '$ne': 0
                'GenDigitalMediaId': 1,
                'MulTitle': 1,
                'MulMimeFormat': 1,
                'NhmSecEmbargoDate': 1,
                'NhmSecEmbargoExtensionDate': 1

        # Create a dictionary of multimedia records, keyed by _id
        multimedia_dict = {}

        for record in cursor:

            # NOTE(review): the body of this check is missing; presumably
            # records whose media id is still 'Pending' are skipped - confirm.
            if record['GenDigitalMediaId'] == 'Pending':

            # If the embargo extension date exists and is in the future, then skip
            if 'NhmSecEmbargoExtensionDate' in record:
                if record['NhmSecEmbargoExtensionDate'] > 0 and record[
                        'NhmSecEmbargoExtensionDate'] > datetime.datetime.today(

            # For remaining records, if the original embargo date exists and is in the future then skip
            if record['NhmSecEmbargoDate'] > 0 and record[
                    'NhmSecEmbargoDate'] > datetime.datetime.today().strftime(

            # Entry stored for this record, keyed by its _id.
            # NOTE(review): the dict keys and the URL template that .format()
            # is applied to are missing from this copy.
            multimedia_dict[record['_id']] = {
                .format(mam_id=record['GenDigitalMediaId'], ),
                'image/%s' % record['MulMimeFormat'],
                "The Trustees of the Natural History Museum, London"

            # Add the title if it exists
            if record.get('MulTitle', None):
                multimedia_dict[record['_id']]['title'] = record.get(

        # Helper applied to every cell of the column; it closes over
        # multimedia_dict built above.
        # NOTE(review): the five prose lines below are the helper's docstring
        # with its triple quotes lost.
        def multimedia_to_json(irns):
            Convert multimedia fields to json
            Loop through all the irns in the field, check they key exists in multimedia_dict
            (If it's not the image might not be publishable / be in the correct format)
            @param irns:
            @return: json

            # Keep only the IRNs that have an entry in multimedia_dict
            multimedia_records = [
                multimedia_dict[irn] for irn in irns if irn in multimedia_dict
            # NaN rather than an empty JSON list when nothing matched
            return json.dumps(
                multimedia_records) if multimedia_records else np.nan

        # And finally update the multimedia field: each cell's IRN list becomes
        # the JSON built from the IRNs present in multimedia_dict (NaN otherwise)
        df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)
コード例 #8
コード例 #9
コード例 #10
