class DatasetSearch(ModelSearchAdapter):
    model = Dataset
    fuzzy = True
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'}
                }
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'supplier': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed'
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                }
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            'temporal_coverage': {
                # Store dates as ordinals to handle pre-1900 dates
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                }
            },
            'territories': {
                'type': 'object',
                'index_name': 'territories',
                'properties': {
                    'id': {'type': 'string'},
                    'name': {'type': 'string'},
                    'code': {'type': 'string'},
                }
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # 'geom': {
            #     'type': 'geo_shape',
            #     'precision': '100m',
            # },
            'extras': {
                'type': 'object',
                'index_name': 'extra',
            },
        }
    }
    fields = (
        'title^6',
        'tags^3',
        'territories.name^3',
        'description',
        'code',
    )
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    facets = {
        'tag': TermFacet('tags'),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'supplier': ModelTermFacet('supplier', Organization),
        'license': ModelTermFacet('license', License),
        'territory': ModelTermFacet('territories.id', Territory),
        'granularity': TermFacet('granularity',
                                 lambda l, v: SPATIAL_GRANULARITIES[v]),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }
    boosters = [
        BoolBooster('featured', 1.1),
        BoolBooster('from_public_service', 1.3),
        GaussDecay('metrics.reuses', max_reuses, decay=0.8),
        GaussDecay('metrics.followers', max_followers, max_followers,
                   decay=0.8),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        return (dataset.deleted is None
                and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def serialize(cls, dataset):
        org_id = (str(dataset.organization.id)
                  if dataset.organization is not None else None)
        supplier_id = (str(dataset.supplier.id)
                       if dataset.supplier is not None else None)
        supplier_id = supplier_id if supplier_id != org_id else None
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)
        else:
            image_url = None

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': (dataset.license.id
                        if dataset.license is not None else None),
            'tags': dataset.tags,
            'tag_suggest': dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources if r.format],
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': str(dataset.owner.id) if dataset.owner else None,
            'supplier': supplier_id,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title),
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
            # TODO: extract this into a plugin
            'from_public_service': (dataset.organization.public_service
                                    if dataset.organization else False),
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            document.update({
                'temporal_coverage': {
                    'start': dataset.temporal_coverage.start.toordinal(),
                    'end': dataset.temporal_coverage.end.toordinal(),
                }
            })

        if dataset.spatial is not None:
            document.update({
                'territories': [{
                    'id': str(t.id),
                    'name': t.name,
                    'code': t.code,
                } for t in dataset.spatial.territories],
                # 'geom': dataset.spatial.geom,
                'granularity': dataset.spatial.granularity,
            })

        return document
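# Illustrative sketch (not part of the adapter above): 'temporal_coverage'
# is mapped as 'long' and serialized with toordinal() because a proleptic
# Gregorian ordinal is a plain integer, so pre-1900 coverage dates index
# cleanly even though strftime-based formatting cannot represent them under
# Python 2.
from datetime import date

start = date(1853, 1, 1).toordinal()
end = date(1860, 12, 31).toordinal()
assert date.fromordinal(start) == date(1853, 1, 1)  # lossless round-trip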
class DatasetSearch(ModelSearchAdapter):
    model = Dataset
    fuzzy = True
    exclude_fields = ['spatial.geom', 'spatial.zones.geom']

    class Meta:
        doc_type = 'Dataset'

    title = String(analyzer=i18n_analyzer, fields={
        'raw': String(index='not_analyzed')
    })
    description = String(analyzer=i18n_analyzer)
    license = String(index='not_analyzed')
    frequency = String(index='not_analyzed')
    organization = String(index='not_analyzed')
    owner = String(index='not_analyzed')
    tags = String(index='not_analyzed', fields={
        'i18n': String(index='not_analyzed')
    })
    badges = String(index='not_analyzed')
    tag_suggest = Completion(analyzer=simple,
                             search_analyzer=simple,
                             payloads=False)
    resources = Object(properties={
        'title': String(),
        'description': String(),
        'format': String(index='not_analyzed'),
    })
    format_suggest = Completion(analyzer=simple,
                                search_analyzer=simple,
                                payloads=False)
    dataset_suggest = Completion(analyzer=simple,
                                 search_analyzer=simple,
                                 payloads=True)
    created = Date(format='date_hour_minute_second')
    last_modified = Date(format='date_hour_minute_second')
    metrics = metrics_mapping_for(Dataset)
    featured = Boolean()
    temporal_coverage = Nested(multi=False, properties={
        'start': Long(),
        'end': Long(),
    })
    temporal_weight = Long()
    geozones = Object(properties={
        'id': String(index='not_analyzed'),
        'name': String(index='not_analyzed'),
        'keys': String(index='not_analyzed'),
    })
    granularity = String(index='not_analyzed')
    spatial_weight = Long()
    from_certified = Boolean()

    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'acronym^7',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    sorts = {
        'title': 'title.raw',
        'created': 'created',
        'last_modified': 'last_modified',
        'reuses': 'metrics.reuses',
        'followers': 'metrics.followers',
        'views': 'metrics.views',
    }
    facets = {
        'tag': TermsFacet(field='tags'),
        'badge': TermsFacet(field='badges',
                            labelizer=dataset_badge_labelizer),
        'organization': ModelTermsFacet(field='organization',
                                        model=Organization),
        'owner': ModelTermsFacet(field='owner', model=User),
        'license': ModelTermsFacet(field='license', model=License),
        'geozone': ModelTermsFacet(field='geozones.id', model=GeoZone,
                                   labelizer=zone_labelizer),
        'granularity': TermsFacet(field='granularity',
                                  labelizer=granularity_labelizer),
        'format': TermsFacet(field='resources.format'),
        'reuses': RangeFacet(field='metrics.reuses',
                             ranges=[('none', (None, 1)),
                                     ('few', (1, 5)),
                                     ('quite', (5, 10)),
                                     ('many', (10, None))],
                             labels={
                                 'none': _('Never reused'),
                                 'few': _('Little reused'),
                                 'quite': _('Quite reused'),
                                 'many': _('Heavily reused'),
                             }),
        'temporal_coverage': TemporalCoverageFacet(field='temporal_coverage'),
        'featured': BoolFacet(field='featured'),
    }
    boosters = [
        BoolBooster('featured', 1.5),
        BoolBooster('from_certified', 1.2),
        ValueFactor('spatial_weight', missing=1),
        ValueFactor('temporal_weight', missing=1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay('metrics.followers', max_followers, max_followers,
                   decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        return (dataset.deleted is None
                and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def get_suggest_weight(cls, temporal_weight, spatial_weight, featured):
        '''Compute the weight used by the suggest part of the indexation payload'''
        featured_weight = 1 if not featured else FEATURED_WEIGHT
        return temporal_weight * spatial_weight * featured_weight

    @classmethod
    def serialize(cls, dataset):
        organization = None
        owner = None
        image_url = None
        spatial_weight = DEFAULT_SPATIAL_WEIGHT
        temporal_weight = DEFAULT_TEMPORAL_WEIGHT

        if dataset.organization:
            organization = Organization.objects(
                id=dataset.organization.id).first()
            image_url = organization.logo(40, external=True)
        elif dataset.owner:
            owner = User.objects(id=dataset.owner.id).first()
            image_url = owner.avatar(40, external=True)

        certified = organization and organization.certified
        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': getattr(dataset.license, 'id', None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources if r.format],
            'frequency': dataset.frequency,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'acronym': dataset.acronym,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'featured': dataset.featured,
            'from_certified': certified,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            start = dataset.temporal_coverage.start.toordinal()
            end = dataset.temporal_coverage.end.toordinal()
            temporal_weight = min((end - start) / 365, MAX_TEMPORAL_WEIGHT)
            document.update({
                'temporal_coverage': {'start': start, 'end': end},
                'temporal_weight': temporal_weight,
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parent zone identifiers
            # to allow fast filtering.
            zone_ids = [z.id for z in dataset.spatial.zones]
            zones = GeoZone.objects(id__in=zone_ids).exclude('geom')
            parents = set()
            geozones = []
            coverage_level = ADMIN_LEVEL_MAX
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parents |= set(zone.parents)
                coverage_level = min(coverage_level, admin_levels[zone.level])

            geozones.extend([{'id': p} for p in parents])
            spatial_weight = ADMIN_LEVEL_MAX / coverage_level
            document.update({
                'geozones': geozones,
                'granularity': dataset.spatial.granularity,
                'spatial_weight': spatial_weight,
            })

        document['dataset_suggest']['weight'] = cls.get_suggest_weight(
            temporal_weight, spatial_weight, dataset.featured)

        if dataset.acronym:
            document['dataset_suggest']['input'].append(dataset.acronym)

        return document
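# Worked example (a sketch; FEATURED_WEIGHT, MAX_TEMPORAL_WEIGHT and
# ADMIN_LEVEL_MAX are module-level constants defined elsewhere, so the
# values below are assumptions for illustration only): how the suggest
# weight combines the temporal, spatial and featured signals that
# serialize() computes.
FEATURED_WEIGHT = 3       # assumed value
MAX_TEMPORAL_WEIGHT = 5   # assumed value
ADMIN_LEVEL_MAX = 110     # assumed value

# A featured dataset covering two years (730 days) whose finest zone sits
# at a hypothetical admin level of 55:
temporal_weight = min(730 / 365, MAX_TEMPORAL_WEIGHT)        # 2
spatial_weight = ADMIN_LEVEL_MAX / 55                        # 2
weight = temporal_weight * spatial_weight * FEATURED_WEIGHT  # 12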
class DatasetSearch(ModelSearchAdapter):
    model = Dataset
    fuzzy = True
    mapping = {
        'properties': {
            'title': {
                'type': 'string',
                'analyzer': i18n_analyzer,
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'}
                }
            },
            'description': {'type': 'string', 'analyzer': i18n_analyzer},
            'license': {'type': 'string', 'index': 'not_analyzed'},
            'frequency': {'type': 'string'},
            'organization': {'type': 'string'},
            'owner': {'type': 'string'},
            'tags': {
                'type': 'string',
                'index_name': 'tag',
                'index': 'not_analyzed',
                'fields': {
                    'i18n': {'type': 'string', 'analyzer': i18n_analyzer}
                }
            },
            'badges': {
                'type': 'string',
                'index_name': 'badges',
                'index': 'not_analyzed'
            },
            'tag_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'resources': {
                'type': 'object',
                'index_name': 'resource',
                'properties': {
                    'title': {'type': 'string'},
                    'description': {'type': 'string'},
                    'license': {'type': 'string'},
                }
            },
            'format_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': False,
            },
            'dataset_suggest': {
                'type': 'completion',
                'index_analyzer': 'simple',
                'search_analyzer': 'simple',
                'payloads': True,
            },
            'created': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'last_modified': {
                'type': 'date',
                'format': 'date_hour_minute_second'
            },
            'metrics': metrics_mapping(Dataset),
            'featured': {'type': 'boolean'},
            # Store dates as ordinals to handle pre-1900 dates.
            'temporal_coverage': {
                'type': 'object',
                'properties': {
                    'start': {'type': 'long'},
                    'end': {'type': 'long'},
                }
            },
            'geozones': {
                'type': 'object',
                'index_name': 'geozones',
                'properties': {
                    'id': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string', 'index': 'not_analyzed'},
                    'keys': {'type': 'string', 'index': 'not_analyzed'},
                }
            },
            'granularity': {'type': 'string', 'index': 'not_analyzed'},
            # 'geom': {
            #     'type': 'geo_shape',
            #     'precision': '100m',
            # },
            'extras': {
                'type': 'object',
                'index_name': 'extra',
            },
        }
    }
    fields = (
        'geozones.keys^9',
        'geozones.name^9',
        'title^6',
        'tags.i18n^3',
        'description',
    )
    sorts = {
        'title': Sort('title.raw'),
        'created': Sort('created'),
        'last_modified': Sort('last_modified'),
        'reuses': Sort('metrics.reuses'),
        'followers': Sort('metrics.followers'),
        'views': Sort('metrics.views'),
    }
    facets = {
        'tag': TermFacet('tags'),
        'badge': TermFacet('badges', labelizer=dataset_badge_labelizer),
        'organization': ModelTermFacet('organization', Organization),
        'owner': ModelTermFacet('owner', User),
        'license': ModelTermFacet('license', License),
        'geozone': ModelTermFacet('geozones.id', GeoZone, zone_labelizer),
        'granularity': TermFacet('granularity', granularity_labelizer),
        'format': TermFacet('resources.format'),
        'reuses': RangeFacet('metrics.reuses'),
        'temporal_coverage': TemporalCoverageFacet('temporal_coverage'),
        'featured': BoolFacet('featured'),
        'extra': ExtrasFacet('extras'),
    }
    boosters = [
        BoolBooster('featured', 1.1),
        GaussDecay('metrics.reuses', max_reuses, decay=0.1),
        GaussDecay('metrics.followers', max_followers, max_followers,
                   decay=0.1),
    ]

    @classmethod
    def is_indexable(cls, dataset):
        return (dataset.deleted is None
                and len(dataset.resources) > 0
                and not dataset.private)

    @classmethod
    def serialize(cls, dataset):
        org_id = (str(dataset.organization.id)
                  if dataset.organization is not None else None)
        if dataset.organization:
            image_url = dataset.organization.logo(40)
        elif dataset.owner:
            image_url = dataset.owner.avatar(40)
        else:
            image_url = None

        document = {
            'title': dataset.title,
            'description': dataset.description,
            'license': (dataset.license.id
                        if dataset.license is not None else None),
            'tags': dataset.tags,
            'badges': [badge.kind for badge in dataset.badges],
            'tag_suggest': dataset.tags,
            'resources': [{
                'title': r.title,
                'description': r.description,
                'format': r.format,
            } for r in dataset.resources],
            'format_suggest': [r.format.lower()
                               for r in dataset.resources if r.format],
            'frequency': dataset.frequency,
            'organization': org_id,
            'owner': str(dataset.owner.id) if dataset.owner else None,
            'dataset_suggest': {
                'input': cls.completer_tokenize(dataset.title) + [dataset.id],
                'output': dataset.title,
                'payload': {
                    'id': str(dataset.id),
                    'slug': dataset.slug,
                    'image_url': image_url,
                },
            },
            'created': dataset.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': dataset.last_modified.strftime(
                '%Y-%m-%dT%H:%M:%S'),
            'metrics': dataset.metrics,
            'extras': dataset.extras,
            'featured': dataset.featured,
        }
        if (dataset.temporal_coverage is not None
                and dataset.temporal_coverage.start
                and dataset.temporal_coverage.end):
            document.update({
                'temporal_coverage': {
                    'start': dataset.temporal_coverage.start.toordinal(),
                    'end': dataset.temporal_coverage.end.toordinal(),
                }
            })

        if dataset.spatial is not None:
            # Index precise zone labels and parent zone identifiers
            # to allow fast filtering.
            zones = GeoZone.objects(
                id__in=[z.id for z in dataset.spatial.zones])
            parents = set()
            geozones = []
            for zone in zones:
                geozones.append({
                    'id': zone.id,
                    'name': zone.name,
                    'keys': zone.keys_values,
                })
                parents |= set(zone.parents)

            geozones.extend([{'id': p} for p in parents])
            document.update({
                'geozones': geozones,
                # 'geom': dataset.spatial.geom,
                'granularity': dataset.spatial.granularity,
            })

        return document
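# Hedged illustration (the zone identifiers and keys below are hypothetical):
# for a dataset covering a single town, serialize() indexes the precise zone
# with its name and keys plus one id-only entry per parent zone, so filtering
# the 'geozones.id' facet by a parent (department, country, ...) still matches
# datasets that only cover one of its children.
geozones = [
    {'id': 'fr/town/12202', 'name': 'Rodez', 'keys': ['12202']},  # precise zone
    {'id': 'fr/county/12'},   # parent zone, id only
    {'id': 'country/fr'},     # parent zone, id only
]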