class SearchImages(APIView):
    """
    Search for images by a query string. Optionally, filter results by
    specific licenses, or license "types" (commercial use allowed,
    modification allowed, etc). Results are ranked in order of relevance.

    Refer to the Lucene syntax guide for information on structuring advanced
    searches. https://lucene.apache.org/core/2_9_4/queryparsersyntax.html

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As
    such, the caller should not try to access pages beyond `page_count`, or
    else the server will reject the query.
    """

    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Parse and validate query parameters
        params = ImageSearchQueryStringSerializer(data=request.query_params)
        if not params.is_valid():
            return Response(
                status=400,
                data={"validation_error": params.errors}
            )
        hashed_ip = hash(_get_user_ip(request))
        page_param = params.data[PAGE]
        page_size = params.data[PAGESIZE]
        qa = params.data[QA]
        filter_dead = params.data[FILTER_DEAD]
        search_index = 'search-qa' if qa else 'image'
        try:
            results, page_count, result_count = search_controller.search(
                params,
                search_index,
                page_size,
                hashed_ip,
                request,
                filter_dead,
                page=page_param
            )
        except ValueError as value_error_message:
            return Response(
                status=400,
                data={VALIDATION_ERROR: str(value_error_message)}
            )

        serialized_results = ImageSerializer(results, many=True).data

        # If dead-link filtering exhausted the results on the only page, the
        # true result count is the number of live results we actually have.
        if len(results) < page_size and page_count == 0:
            result_count = len(results)

        response_data = {
            RESULT_COUNT: result_count,
            PAGE_COUNT: page_count,
            RESULTS: serialized_results
        }
        serialized_response = ImageSearchResultsSerializer(data=response_data)
        return Response(status=200, data=serialized_response.initial_data)
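# Example (hedged sketch): the query-string parameter names follow
# ImageSearchQueryStringSerializer; ``q`` as the keyword/query field is an
# assumption and should be checked against that serializer. A Lucene-style
# search, per the docstring above, might look like:
#
#     GET /image/search?q=sunset%20AND%20beach&pagesize=20&page=1
#
# Requests for pages beyond ``page_count`` are rejected with a 400 response
# carrying a ``validation_error`` body.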
def get(self, request, provider, format=None):
    params = BrowseImageQueryStringSerializer(data=request.query_params)
    if not params.is_valid():
        return Response(
            status=400,
            data={"validation_error": params.errors}
        )
    page_param = params.data[PAGE]
    page_size = params.data[PAGESIZE]
    filter_dead = params.data[FILTER_DEAD]
    lt = None
    li = None
    if 'lt' in params.data:
        lt = params.data['lt']
    elif 'li' in params.data:
        li = params.data['li']
    try:
        results, page_count, result_count = \
            search_controller.browse_by_provider(
                provider,
                'image',
                page_size,
                hash(_get_user_ip(request)),
                request,
                filter_dead,
                page=page_param,
                lt=lt,
                li=li
            )
    except ValueError as value_error_message:
        return Response(
            status=400,
            data={VALIDATION_ERROR: str(value_error_message)}
        )
    except serializers.ValidationError:
        return Response(
            status=400,
            data={
                VALIDATION_ERROR:
                    'Provider \'{}\' does not exist.'.format(provider)
            }
        )
    serialized_results = ImageSerializer(results, many=True).data
    response_data = {
        'result_count': result_count,
        'page_count': page_count,
        RESULTS: serialized_results
    }
    serialized_response = ImageSearchResultsSerializer(data=response_data)
    return Response(status=200, data=serialized_response.initial_data)
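# Example (sketch): the ``lt`` and ``li`` parameters above narrow a browse
# either by license type (e.g. ``?lt=commercial``) or by a specific license
# (e.g. ``?li=by-sa``); only one of the two is applied (note the ``elif``).
# The accepted values are defined by BrowseImageQueryStringSerializer;
# ``commercial`` and ``by-sa`` are illustrative assumptions here.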
class SearchImages(APIView):
    """
    Search for images by keyword. Optionally, filter the results by specific
    licenses, or license "types" (commercial use allowed, modification
    allowed, etc). Results are ranked in order of relevance.

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As
    such, the caller should not try to access pages beyond `page_count`, or
    else the server will reject the query.
    """

    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Parse and validate query parameters
        params = ImageSearchQueryStringSerializer(data=request.query_params)
        if not params.is_valid():
            return Response(
                status=400,
                data={
                    "validation_error": params.errors
                }
            )
        hashed_ip = hash(_get_user_ip(request))
        page_param = params.data[PAGE]
        page_size = params.data[PAGESIZE]
        try:
            search_results = search_controller.search(params,
                                                      index='image',
                                                      page_size=page_size,
                                                      ip=hashed_ip,
                                                      page=page_param)
        except ValueError:
            return Response(
                status=400,
                data={
                    VALIDATION_ERROR: 'Deep pagination is not allowed.'
                }
            )

        # Resolve a link to the detail view for each result and collect the
        # image URLs for dead-link validation.
        results = []
        to_validate = []
        for result in search_results:
            url = request.build_absolute_uri(
                reverse('image-detail', [result.identifier])
            )
            result.detail = url
            to_validate.append(result.url)
            results.append(result)
        if params.data[FILTER_DEAD]:
            validate_images(results, to_validate)

        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        natural_page_count = int(search_results.hits.total / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        result_count = search_results.hits.total
        if len(results) < page_size and page_count == 0:
            result_count = len(results)
        response_data = {
            'result_count': result_count,
            'page_count': page_count,
            RESULTS: serialized_results
        }

        # Post-process the search results to fix malformed URLs and insecure
        # HTTP thumbnails.
        for idx, res in enumerate(serialized_results):
            if PROXY_THUMBS:
                provider = res[PROVIDER]
                # Proxy either the thumbnail or URL, depending on whether
                # a thumbnail was provided.
                if THUMBNAIL in res and provider not in PROXY_ALL:
                    to_proxy = THUMBNAIL
                else:
                    to_proxy = URL
                if 'http://' in res[to_proxy] or provider in PROXY_ALL:
                    original = res[to_proxy]
                    secure = '{proxy_url}/{width}/{original}'.format(
                        proxy_url=THUMBNAIL_PROXY_URL,
                        width=THUMBNAIL_WIDTH_PX,
                        original=original
                    )
                    response_data[RESULTS][idx][THUMBNAIL] = secure
            if FOREIGN_LANDING_URL in res:
                foreign = _add_protocol(res[FOREIGN_LANDING_URL])
                response_data[RESULTS][idx][FOREIGN_LANDING_URL] = foreign
            if CREATOR_URL in res:
                creator_url = _add_protocol(res[CREATOR_URL])
                response_data[RESULTS][idx][CREATOR_URL] = creator_url

        serialized_response = ImageSearchResultsSerializer(data=response_data)
        return Response(status=200, data=serialized_response.initial_data)
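# Worked example of the page-count cap in SearchImages.get above (numbers are
# illustrative): with page_size=20 and search_results.hits.total=1_000_000,
# natural_page_count is 50_000, but last_allowed_page is
# int((5000 + 20 / 2) / 20) == 250, so page_count is capped at 250 and page
# 251 onward is rejected as deep pagination.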
class BrowseImages(APIView):
    """
    Browse a collection of CC images by provider, such as the Metropolitan
    Museum of Art. See `/statistics/image` for a list of valid collections.
    The `provider_identifier` field should be used to select the provider.

    As with the `/image/search` endpoint, this is not intended to be used to
    bulk download our entire collection of images; only the first ~10,000
    images in each collection are accessible.
    """

    @swagger_auto_schema(operation_id='image_browse',
                         query_serializer=BrowseImageQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, provider, format=None):
        params = BrowseImageQueryStringSerializer(data=request.query_params)
        if not params.is_valid():
            return Response(
                status=400,
                data={"validation_error": params.errors}
            )
        page_param = params.data[PAGE]
        page_size = params.data[PAGESIZE]
        lt = None
        li = None
        if 'lt' in params.data:
            lt = params.data['lt']
        elif 'li' in params.data:
            li = params.data['li']
        try:
            browse_results = search_controller.browse_by_provider(
                provider,
                index='image',
                page_size=page_size,
                page=page_param,
                lt=lt,
                li=li,
                ip=hash(_get_user_ip(request))
            )
        except ValueError:
            return Response(
                status=400,
                data={VALIDATION_ERROR: DEEP_PAGINATION_ERROR}
            )
        except serializers.ValidationError:
            return Response(
                status=400,
                data={
                    VALIDATION_ERROR:
                        'Provider \'{}\' does not exist.'.format(provider)
                }
            )
        filter_dead = params.data[FILTER_DEAD]
        results = _post_process_results(browse_results, request, filter_dead)
        serialized_results = ImageSerializer(results, many=True).data
        page_count = _get_page_count(browse_results, page_size)
        response_data = {
            'result_count': browse_results.hits.total,
            'page_count': page_count,
            RESULTS: serialized_results
        }
        serialized_response = ImageSearchResultsSerializer(data=response_data)
        return Response(status=200, data=serialized_response.initial_data)
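# Example (hedged sketch): assuming the provider is routed as a path component,
# e.g. ``/image/browse/<provider_identifier>`` (an assumption -- check the URL
# configuration), a browse request for a museum collection might look like:
#
#     GET /image/browse/met?page=3&pagesize=20
#
# where ``met`` stands in for a provider_identifier listed by
# ``/statistics/image``.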
class SearchImages(APIView):
    """
    Search for images by keyword. Optionally, filter the results by specific
    licenses, or license "types" (commercial use allowed, modification
    allowed, etc). Results are ranked in order of relevance.

    Although there may be millions of relevant records, only the most relevant
    several thousand records can be viewed. This is by design: the search
    endpoint should be used to find the top N most relevant results, not for
    exhaustive search or bulk download of every barely relevant result. As
    such, the caller should not try to access pages beyond `page_count`, or
    else the server will reject the query.
    """

    @swagger_auto_schema(operation_id='image_search',
                         query_serializer=ImageSearchQueryStringSerializer,
                         responses={
                             200: ImageSearchResultsSerializer(many=True),
                             400: ValidationErrorSerializer,
                         })
    def get(self, request, format=None):
        # Parse and validate query parameters
        params = ImageSearchQueryStringSerializer(data=request.query_params)
        if not params.is_valid():
            return Response(
                status=400,
                data={"validation_error": params.errors}
            )
        page = params.data['page']
        page_size = params.data['pagesize']
        try:
            search_results = search_controller.search(params,
                                                      index='image',
                                                      page_size=page_size,
                                                      page=page)
        except ValueError:
            return Response(
                status=400,
                data={'validation_error': 'Deep pagination is not allowed.'}
            )

        # Resolve a link to the detail view for each result returned by
        # Elasticsearch.
        results = []
        for result in search_results:
            url = request.build_absolute_uri(
                reverse('image-detail', [result.id])
            )
            result.detail = url
            results.append(result)
        serialized_results = ImageSerializer(results, many=True).data

        # Elasticsearch does not allow deep pagination of ranked queries.
        # Adjust returned page count to reflect this.
        natural_page_count = int(search_results.hits.total / page_size)
        last_allowed_page = int((5000 + page_size / 2) / page_size)
        page_count = min(natural_page_count, last_allowed_page)

        response_data = {
            'result_count': search_results.hits.total,
            'page_count': page_count,
            'results': serialized_results
        }
        serialized_response = ImageSearchResultsSerializer(data=response_data)
        return Response(status=200, data=serialized_response.initial_data)
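# Example of the response body shape assembled by SearchImages.get above
# (values are illustrative only; the field names come straight from
# response_data):
#
#     {
#         "result_count": 10000,
#         "page_count": 250,
#         "results": [<serialized images>]
#     }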