def _insert_image(iterator, reader, chunk_size, skip_existence_check=False):
    """Bulk-create Image records from chunks of rows.

    `iterator(chunk_size, reader)` is called once and must yield chunks;
    each chunk is iterated for rows that support key lookup for 'ImageID',
    'OriginalURL', 'Thumbnail300KURL', 'OriginalLandingURL',
    'AuthorProfileURL', 'Author', 'Title' and 'OriginalSize'.

    Unless `skip_existence_check` is true, a row whose 'ImageID' already
    matches the `foreign_identifier` of a stored Image is skipped, so that
    re-running the import does not insert the same image twice. Each chunk
    is written with a single `bulk_create` call.
    """
    for chunk in iterator(chunk_size, reader):
        images = []
        for row in chunk:
            # Only hit the database for the existence check when asked to;
            # rows that are already stored are left alone.
            if not skip_existence_check and models.Image.objects.filter(
                    foreign_identifier=row['ImageID']).exists():
                continue

            image = models.Image()
            image.identifier = signals.create_identifier(row['OriginalURL'])
            image.foreign_identifier = row['ImageID']
            image.url = row['OriginalURL']
            image.thumbnail = row['Thumbnail300KURL']
            image.foreign_landing_url = row['OriginalLandingURL']
            # Every row handled here gets the same fixed licence and
            # provenance values.
            image.license = 'by'
            image.provider = 'flickr'
            image.source = 'openimages'
            image.license_version = '2.0'
            image.creator_url = row['AuthorProfileURL']
            image.creator = row['Author']
            image.title = row['Title']
            image.filesize = row['OriginalSize']
            images.append(image)

        if images:
            models.Image.objects.bulk_create(images)
            log.debug("*** Committing set of %d images", len(images))
def test_tag_image(self):
    """An ImageTags row should be able to link a saved tag to a saved image"""
    picture = models.Image(url='http://example.com', license="CC0")
    label = models.Tag(name='tagname', foreign_identifier='tagid')

    # Both sides are persisted before the association row is saved.
    label.save()
    picture.save()

    models.ImageTags(image=picture, tag=label).save()
def test_image_model(self):
    """Saving an Image with only a URL and a license should add one row"""
    assert models.Image.objects.count() == 0

    record = models.Image(url='http://example.com', license='CC0')
    record.save()

    assert models.Image.objects.count() == 1
def test_tags_list_image(self):
    """A list assigned to `Image.tags_list` should come back from the
    database as a list with the same leading value"""
    expected_tags = ['a', 'b']
    stored = models.Image(url='http://example.com', license="CC0")
    stored.tags_list = expected_tags
    stored.save()

    # Read the record back and check the field is still list-shaped
    fetched = models.Image.objects.all().first()
    assert len(fetched.tags_list) == 2
    assert fetched.tags_list[0] == "a"
def test_list(self):
    """A saved List should accept an image and expose it via `images`"""
    picture = models.Image(url='http://example.com', license="CC0")
    collection = models.List(title='test')

    collection.save()
    picture.save()
    collection.images.add(picture)

    assert models.List.objects.count() == 1

    # Re-read the list from the database rather than trusting `collection`
    stored_images = models.List.objects.first().images
    assert stored_images.count() == 1
    assert picture == stored_images.first()
def serialize(result):
    """For a given Met result, map that to our database

    Uses the first entry of result['ImageInfo'] whose 'PrimaryDisplay' is
    truthy for the image and thumbnail URLs, builds an Image from the
    'Tombstone' and 'CollectionObject' data and saves it.

    Returns the Image, or None when no image URL could be determined. An
    IntegrityError raised by the save is logged and the (unsaved) Image is
    still returned.
    """
    thumbnail = None
    url = None
    for info in result['ImageInfo']:
        if info['PrimaryDisplay']:
            # Use this one
            thumbnail = ENDPOINT_BASE_IMAGE_URL + info['Thumbnail']
            url = ENDPOINT_BASE_IMAGE_URL + info['LargeWebsite']
            break
    if not url:
        log.warning("Did not get an image URL for %s", result)
        return None

    image = models.Image(url=url)
    image.provider = PROVIDER_NAME
    image.source = SOURCE_NAME

    # Creator might be a few fields: every tombstone entry whose name is one
    # of CREATOR_LABELS contributes one name, run through CreatorParser first.
    creator_names = []
    for entry in result['Tombstone']:
        if entry['Name'] in CREATOR_LABELS:
            parser = CreatorParser()
            parser.feed(entry['Value'])
            creator_names.append(" ".join(parser.out))
    if creator_names:
        image.creator = ", ".join(creator_names)

    image.thumbnail = thumbnail
    image.license = "cc0"
    image.license_version = '1.0'
    image.foreign_identifier = result['CollectionObject']['CRDID']
    image.foreign_landing_url = FOREIGN_LANDING_BASE_URL + str(
        image.foreign_identifier)
    image.title = result['CollectionObject']['Title']
    image.identifier = signals.create_identifier(image.url)
    image.last_synced_with_source = timezone.now()

    try:
        image.save()
        log.info("Adding image %s-%s (%s) identifier %s", image.title,
                 image.creator, image.foreign_identifier, image.identifier)
    except IntegrityError as e:
        # Deliberately best-effort: record the failure and carry on.
        log.warning(e)
    return image
def serialize(result):
    """For a given Europeana result, map that to our database

    Returns an unsaved Image, or None when the result has no 'edmIsShownBy'
    entry or its 'id' is too long to store.
    """
    if 'edmIsShownBy' not in result:
        return None

    # Some Europeana identifiers are longer than we support (>80 chars!)
    # Skip these records for now or else the database will choke; we don't
    # want to truncate them or run an expensive db migration on our end
    # right now
    if len(result['id']) > 79:
        return None

    url = result['edmIsShownBy'][0]
    thumbnail_base = 'https://www.europeana.eu/api/v2/thumbnail-by-url.json?size=w200&type=IMAGE&'
    license_name, license_version = None, None

    image = models.Image(url=url)
    image.thumbnail = thumbnail_base + urllib.parse.urlencode({'uri': url})
    image.source = SOURCE_NAME
    image.provider = SOURCE_NAME
    if 'dcCreator' in result:
        image.creator = result['dcCreator'][0]
    else:
        image.creator = None
    license_name, license_version = licenses.url_to_license(
        result['rights'][0])
    image.license = license_name
    image.license_version = license_version
    image.foreign_landing_url = result['guid']
    image.foreign_identifier = result['id']
    image.title = result['title'][0]
    image.identifier = signals.create_identifier(image.url)
    image.last_synced_with_source = timezone.now()

    # Tags, if available: each English concept label is one tag
    collected_labels = []
    has_english_labels = (
        'edmConceptPrefLabelLangAware' in result
        and 'en' in result['edmConceptPrefLabelLangAware'])
    if has_english_labels:
        for label in result['edmConceptPrefLabelLangAware']['en']:
            models.Tag.objects.get_or_create(name=label.lower(),
                                             source=SOURCE_NAME)
            collected_labels.append(label)
    image.tags_list = collected_labels
    return image
def serialize(result):
    """For a given Rijks result, map that to our database

    Builds and returns an Image from result['webImage'], result['links'],
    'principalOrFirstMaker' and 'longTitle'. The Image is not saved here.
    """
    url = result['webImage']['url']

    # Thumbnails from Rijks are dynamic; when the URL carries the '=s0' size
    # suffix, swap it for one that is THUMBNAIL_WIDTH wide. Other URLs have
    # no known thumbnail form, so leave the thumbnail unset instead of
    # failing on an unassigned local.
    thumbnail = None
    if url.endswith('=s0'):
        thumbnail = url[:-3] + '=s' + str(THUMBNAIL_WIDTH)

    image = models.Image(url=url)
    image.provider = PROVIDER_NAME
    image.source = SOURCE_NAME
    image.creator = result['principalOrFirstMaker']
    image.thumbnail = thumbnail
    image.license = "cc0"
    image.license_version = '1.0'
    image.foreign_landing_url = result['links']['web']
    image.foreign_identifier = result['webImage']['guid']
    image.width = result['webImage']['width']
    image.height = result['webImage']['height']
    image.title = result['longTitle']
    image.identifier = signals.create_identifier(image.url)
    image.last_synced_with_source = timezone.now()
    return image
def import_from_file(from_file):
    """Import from an NDJSON file

    Reads `from_file` one JSON document per line. Every record whose
    'resourceType' contains 'still image' and which has at least one entry
    in 'captures' becomes an unsaved Image. A Tag (source 'nypl') is
    bulk-created for each distinct 'subjectName' text seen.

    Returns the list of unsaved Image objects.
    """
    results = []
    tags = {}
    # ndjson files are newline delimited; the context manager makes sure the
    # handle is closed even if a line fails to parse.
    with open(from_file, encoding="utf-8") as ndjson:
        for line in ndjson:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            result = json.loads(line)
            captures = result.get('captures')
            if 'still image' not in result['resourceType'] or not captures:
                continue

            url = captures[0]
            url = url[:-3] + 't=w'  # 760 jpg, but it's the largest we're guaranteed to find
            thumbnail = url[:-3] + 't=r'  # 300px thumbnail

            image = models.Image(url=url)
            image.provider = PROVIDER_NAME
            image.source = SOURCE_NAME
            contributors = result.get('contributor')
            if contributors and 'contributorName' in contributors[0]:
                image.creator = contributors[0]['contributorName']
            image.thumbnail = thumbnail
            image.license = "cc0"
            image.license_version = '1.0'
            image.foreign_landing_url = result['digitalCollectionsURL']
            image.foreign_identifier = result['UUID']
            image.title = result['title']
            image.identifier = signals.create_identifier(image.url)
            image.last_synced_with_source = timezone.now()

            # A record without 'subjectName' simply has no tags.
            tag_names = [topic['text']
                         for topic in result.get('subjectName') or []]
            for tag in tag_names:
                # Keyed by name so each distinct tag is created only once
                tags[tag] = models.Tag(name=tag, source='nypl')
            image.tags_list = tag_names
            results.append(image)

    # Create the tags objects
    log.debug("Bulk creating %d new tags", len(tags))
    models.Tag.objects.bulk_create(tags.values())
    return results