def __init__(self, database, update_id):
    """
    Store the update id, connect to MongoDB and resolve the marker collection

    @param database: passed straight through to mongo_client_db
    @param update_id: kept on the instance as self.update_id
    """
    self.update_id = update_id

    # Set up a connection to the database
    connection = mongo_client_db(database)
    self.db = connection

    # The marker collection is named after the postgres table
    marker_collection = self.get_collection(self.marker_collection_name)
    self.marker_collection = marker_collection
def __init__(self, database, update_id):
    """
    Store the update id, connect to MongoDB and resolve the marker collection

    @param database: passed straight through to mongo_client_db
    @param update_id: kept on the instance as self.update_id
    """
    self.update_id = update_id

    # Set up a connection to the database
    connection = mongo_client_db(database)
    self.db = connection

    # The marker collection is named after the postgres table
    marker_collection = self.get_collection(self.marker_collection_name)
    self.marker_collection = marker_collection
def run(self):
    """
    Flag catalogue records whose scientific name is a CITES species

    Sets cites=True on every catalogue record whose DarScientificName is in
    the list returned by get_cites_species, then logs the number modified
    """
    db = mongo_client_db()
    catalogue_name = MongoCatalogueTask(date=None).collection_name
    species_names = get_cites_species()

    # Match on scientific name, and set the cites=true flag
    selector = {'DarScientificName': {'$in': species_names}}
    change = {'$set': {'cites': True}}

    # update() hands back a result document, not a cursor
    outcome = db[catalogue_name].update(selector, change, multi=True)
    log.info('Updated %s catalogue records as CITES', outcome['nModified'])
def main():
    """
    Print, for every Darwin Core field, how many ecatalogue records have it

    One line is printed per field, in the order the fields are listed below
    """
    # Setup MongoDB
    db = mongo_client_db()

    dar_fields = [
        'DarLocality', 'DarVerbatimElevation', 'DarInfraspecificRank',
        'DarDayIdentified', 'DarMinimumDepthInMeters', 'DarMonthIdentified',
        'DarMaximumDepthInMeters', 'DarIndividualCount', 'DarMaximumDepth',
        'DarVerbatimCollectingDate', 'DarTissues',
        'DarScientificNameAuthorYear', 'DarVerbatimLongitude', 'DarNotes',
        'DarCollectorNumber', 'DarGenBankNum', 'DarIdentificationModifier',
        'DarMinimumDepth', 'DarLatLongComments', 'DarIsland',
        'DarPreviousCatalogNumber', 'DarEndTimeOfDay', 'DarYearCollected',
        'DarVerbatimDepth', 'DarCatalogNumber', 'DarOriginalCoordinateSystem',
        'DarScientificNameAuthor', 'DarOtherCatalogNumbers', 'DarSubgenus',
        'DarFieldNumber', 'DarYearIdentified', 'DarRelationshipType',
        'DarEndMonthCollected', 'DarInfraspecificEpithet', 'DarAgeClass',
        'DarRemarks', 'DarGeodeticDatum', 'DarKingdom',
        'DarStart_EndCoordinatePrecision', 'DarCoordinatePrecision',
        'DarStartTimeOfDay', 'DarSpecificEpithet', 'DarDecimalLongitude',
        'DarLatitude', 'DarCitation', 'DarLifeStage', 'DarFamily',
        'DarStartYearCollected', 'DarEndLatitude', 'DarBasisOfRecord',
        'DarMaximumElevation', 'DarStartLatitude', 'DarCounty',
        'DarRelatedInformation', 'DarObservedIndividualCount', 'DarSource',
        'DarRecordURL', 'DarIslandGroup', 'DarWaterBody',
        'DarCoordinateUncertaintyInMeter', 'DarSex', 'DarStartDayCollected',
        'DarVerbatimLatitude', 'DarGenus', 'DarTimeOfDay', 'DarImageURL',
        'DarDecimalLatitude', 'DarTypeStatus', 'DarStateProvince',
        'DarBoundingBox', 'DarGeorefMethod', 'DarScientificName',
        'DarCollectionCode', 'DarLongitude', 'DarGlobalUniqueIdentifier',
        'DarInstitutionCode', 'DarRelatedCatalogItem', 'DarTimeCollected',
        'DarPreparations', 'DarContinent', 'DarEndJulianDay', 'DarGMLFeature',
        'DarCountry', 'DarJulianDay', 'DarSubspecies', 'DarFieldNotes',
        'DarMaximumElevationInMeters', 'DarContinentOcean',
        'DarIdentificationQualifier', 'DarTimeZone', 'DarEndLongitude',
        'DarHorizontalDatum', 'DarClass', 'DarRelatedCatalogItems',
        'DarPhylum', 'DarStartMonthCollected', 'DarHigherGeography',
        'DarDepthRange', 'DarDateLastModified', 'DarCollector',
        'DarObservedWeight', 'DarMinimumElevationInMeters', 'DarHigherTaxon',
        'DarStartJulianDay', 'DarDayCollected', 'DarTemperature',
        'DarEndDayCollected', 'DarStartLongitude', 'DarCatalogNumberNumeric',
        'DarOrder', 'DarMinimumElevation', 'DarPreparationType',
        'DarEndYearCollected', 'DarMonthCollected', 'DarIdentifiedBy',
        'DarCatalogNumberText', 'DarSpecies'
    ]

    for name in dar_fields:
        # Records in which this field is present at all
        present = db.ecatalogue.find({name: {'$exists': 1}})
        # Single parenthesised argument: prints the same under python 2
        print('{0}:\t{1}\r'.format(name, present.count()))
def get_cites_species():
    r"""
    Load the CITES species names held in mongo

    The names come from the JSON export downloaded from
    http://checklist.cites.org/#/en and loaded into the database with:

        mongoimport --db keemu --collection cites --type json --file /vagrant/exports/Index_of_CITES_Species_2014-10-17\ 17-34.json --jsonArray

    This should only be run if mongo is rebuilt - new records are marked as
    CITES on import

    @return: list of utf8-encoded full names
    """
    db = mongo_client_db()

    # Only records that have a name, and only the name field itself
    has_name = {'full_name': {'$ne': None}}
    name_only = {'full_name': 1}

    names = []
    for doc in db[CITES_COLLECTION].find(has_name, name_only):
        names.append(doc['full_name'].encode('utf8'))
    return names
def run(self):
    """
    Flag catalogue records whose scientific name is a CITES species

    Sets cites=True on every catalogue record whose DarScientificName is in
    the list returned by get_cites_species, then logs the number modified
    """
    db = mongo_client_db()
    catalogue_name = MongoCatalogueTask(date=None).collection_name
    species_names = get_cites_species()

    # Match on scientific name, and set the cites=true flag
    selector = {'DarScientificName': {'$in': species_names}}
    change = {'$set': {'cites': True}}

    # update() hands back a result document, not a cursor
    outcome = db[catalogue_name].update(selector, change, multi=True)
    log.info('Updated %s catalogue records as CITES', outcome['nModified'])
def ensure_multimedia(self, df, multimedia_field):
    """
    Replace a column of multimedia IRNs with JSON describing publishable images

    The column holds ';'-separated IRN strings covering all multimedia items,
    not just images. Each IRN is looked up in the emultimedia collection and
    only web-publishable records with a usable media id and no embargo date
    in the future are kept. The column is rewritten in place: each cell
    becomes a JSON list of image descriptions, or NaN if none survive.

    @param df: data frame holding the column; modified in place
    @param multimedia_field: name of the column containing the IRNs
    @return: None
    """
    mongo_client = mongo_client_db()

    def parse_irns(value):
        # 'irn; irn; ...' => [irn, irn, ...], ignoring empty segments
        return [int(part.strip()) for part in value.split(';') if part.strip()]

    # Convert associatedMedia field to a list
    df[multimedia_field] = df[multimedia_field].apply(parse_irns)

    # Get a unique list of IRNS
    unique_multimedia_irns = list(
        set(itertools.chain.from_iterable(df[multimedia_field].values)))

    # It's not enough to just check for the derived image heights - some of
    # these are tiffs etc., and undeliverable
    cursor = mongo_client['emultimedia'].find(
        {
            '_id': {'$in': unique_multimedia_irns},
            'AdmPublishWebNoPasswordFlag': 'Y',
            'GenDigitalMediaId': {'$ne': 0}
        },
        {
            'GenDigitalMediaId': 1,
            'MulTitle': 1,
            'MulMimeFormat': 1,
            'NhmSecEmbargoDate': 1,
            'NhmSecEmbargoExtensionDate': 1
        }
    )

    # Today's date is the same for every record, so format it once
    today = datetime.datetime.today().strftime("%Y-%m-%d")
    # str plus unicode on python 2; just str on python 3
    text_types = (str, type(u''))

    def is_embargoed(value):
        # Embargo dates are compared as YYYY-MM-DD strings. Anything that is
        # not a string (missing field, None, 0) means no embargo is set.
        return isinstance(value, text_types) and value > today

    # Create a dictionary of multimedia records, keyed by _id
    multimedia_dict = {}
    for record in cursor:
        # $ne: 0 also matches records with no GenDigitalMediaId at all, so
        # the field cannot be assumed to be present
        mam_id = record.get('GenDigitalMediaId')
        if not mam_id or mam_id == 'Pending':
            continue

        # If the embargo extension date exists and is in the future, then skip
        if is_embargoed(record.get('NhmSecEmbargoExtensionDate')):
            continue

        # Likewise for the original embargo date, which may also be absent
        if is_embargoed(record.get('NhmSecEmbargoDate')):
            continue

        entry = {
            'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                mam_id=mam_id
            ),
            'format': 'image/%s' % record['MulMimeFormat'],
            "type": "StillImage",
            "license": "http://creativecommons.org/licenses/by/4.0/",
            "rightsHolder": "The Trustees of the Natural History Museum, London"
        }

        # Add the title if it exists
        title = record.get('MulTitle')
        if title:
            entry['title'] = title

        multimedia_dict[record['_id']] = entry

    def multimedia_to_json(irns):
        """
        Convert the IRNs of one cell to json

        IRNs missing from multimedia_dict are dropped (the image might not be
        publishable / be in the correct format)
        @param irns: list of IRNs
        @return: json string, or NaN if no IRN is usable
        """
        multimedia_records = [multimedia_dict[irn] for irn in irns if irn in multimedia_dict]
        return json.dumps(multimedia_records) if multimedia_records else np.nan

    # And finally replace the IRN lists with the json for the valid multimedia
    df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)
def main():
    """
    Spot-check one published image per day and report the days that fail

    For every date from 3/1/2014 to today, a single multimedia record
    inserted on that date is requested from the image service. If the error
    image comes back, the day is marked as failed and all of that day's
    matching records are added to the running total.
    """
    # Setup MongoDB
    db = mongo_client_db()

    query = {
        'MulMimeFormat': {'$in': MULTIMEDIA_FORMATS},
        'DocHeight': {'$exists': True},
        'DocWidth': {'$exists': True},
        'AdmPublishWebNoPasswordFlag': 'Y',
        'MulMimeType': 'image'
    }
    url_template = 'http://www.nhm.ac.uk/emu-classes/class.EMuMedia.php?irn={_id}&image=yes&width={width}&height={height}'

    status = OrderedDict()
    total_failed = 0

    for day in pd.date_range(start='3/1/2014', end=pd.datetime.today()):
        date_str = str(day.date())
        query['AdmDateInserted'] = date_str
        print('Checking date %s' % query['AdmDateInserted'])
        status[date_str] = 0

        results = db.emultimedia.find(query).limit(1)

        # Nothing inserted on this date - record that and move on
        if not results.count():
            print('No images for %s' % date_str)
            status[date_str] = None
            continue

        for record in results:
            url = url_template.format(
                _id=record['_id'],
                width=get_max_dimension(record['DocWidth']),
                height=get_max_dimension(record['DocHeight'])
            )
            response = requests.head(url)

            # We only request jpeg images - but the error image is returned in png
            # So if image type == png, image request has failed
            if response.headers['content-type'] == 'image/png':
                print('Failed: %s' % date_str)
                # Count failures
                status[date_str] += 1
                total_failed += results.count()
                print('Total failed: %s' % total_failed)

            # Pause so we don't kill the server
            time.sleep(0.5)

    for date_str, failures in status.items():
        print('%s: %s' % (date_str, failures))

    print('----------------')
    print('Total failed: %s' % total_failed)
def ensure_multimedia(self, df, multimedia_field):
    """
    Replace a column of multimedia IRNs with JSON describing publishable images

    The column holds ';'-separated IRN strings covering all multimedia items,
    not just images. Each IRN is looked up in the emultimedia collection and
    only web-publishable records with a usable media id and no embargo date
    in the future are kept. The column is rewritten in place: each cell
    becomes a JSON list of image descriptions, or NaN if none survive.

    @param df: data frame holding the column; modified in place
    @param multimedia_field: name of the column containing the IRNs
    @return: None
    """
    mongo_client = mongo_client_db()

    def parse_irns(value):
        # 'irn; irn; ...' => [irn, irn, ...], ignoring empty segments
        return [int(part.strip()) for part in value.split(';') if part.strip()]

    # Convert associatedMedia field to a list
    df[multimedia_field] = df[multimedia_field].apply(parse_irns)

    # Get a unique list of IRNS
    unique_multimedia_irns = list(
        set(itertools.chain.from_iterable(df[multimedia_field].values)))

    # It's not enough to just check for the derived image heights - some of
    # these are tiffs etc., and undeliverable
    cursor = mongo_client['emultimedia'].find(
        {
            '_id': {'$in': unique_multimedia_irns},
            'AdmPublishWebNoPasswordFlag': 'Y',
            'GenDigitalMediaId': {'$ne': 0}
        },
        {
            'GenDigitalMediaId': 1,
            'MulTitle': 1,
            'MulMimeFormat': 1,
            'NhmSecEmbargoDate': 1,
            'NhmSecEmbargoExtensionDate': 1
        }
    )

    # Today's date is the same for every record, so format it once
    today = datetime.datetime.today().strftime("%Y-%m-%d")
    # str plus unicode on python 2; just str on python 3
    text_types = (str, type(u''))

    def is_embargoed(value):
        # Embargo dates are compared as YYYY-MM-DD strings. Anything that is
        # not a string (missing field, None, 0) means no embargo is set.
        return isinstance(value, text_types) and value > today

    # Create a dictionary of multimedia records, keyed by _id
    multimedia_dict = {}
    for record in cursor:
        # $ne: 0 also matches records with no GenDigitalMediaId at all, so
        # the field cannot be assumed to be present
        mam_id = record.get('GenDigitalMediaId')
        if not mam_id or mam_id == 'Pending':
            continue

        # If the embargo extension date exists and is in the future, then skip
        if is_embargoed(record.get('NhmSecEmbargoExtensionDate')):
            continue

        # Likewise for the original embargo date, which may also be absent
        if is_embargoed(record.get('NhmSecEmbargoDate')):
            continue

        entry = {
            'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                mam_id=mam_id
            ),
            'format': 'image/%s' % record['MulMimeFormat'],
            "type": "StillImage",
            "license": "http://creativecommons.org/licenses/by/4.0/",
            "rightsHolder": "The Trustees of the Natural History Museum, London"
        }

        # Add the title if it exists
        title = record.get('MulTitle')
        if title:
            entry['title'] = title

        multimedia_dict[record['_id']] = entry

    def multimedia_to_json(irns):
        """
        Convert the IRNs of one cell to json

        IRNs missing from multimedia_dict are dropped (the image might not be
        publishable / be in the correct format)
        @param irns: list of IRNs
        @return: json string, or NaN if no IRN is usable
        """
        multimedia_records = [multimedia_dict[irn] for irn in irns if irn in multimedia_dict]
        return json.dumps(multimedia_records) if multimedia_records else np.nan

    # And finally replace the IRN lists with the json for the valid multimedia
    df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)