def test_object_constructor():
    expected = {'type': 'object', 'properties': {'inner_int': {'type': 'integer'}}}

    class Inner(InnerDoc):
        inner_int = field.Integer()

    obj_from_doc = field.Object(doc_class=Inner)
    assert obj_from_doc.to_dict() == expected

    obj_from_props = field.Object(properties={'inner_int': field.Integer()})
    assert obj_from_props.to_dict() == expected

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, properties={'inner_int': field.Integer()})

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, dynamic=False)
def test_object_constructor():
    expected = {"type": "object", "properties": {"inner_int": {"type": "integer"}}}

    class Inner(InnerDoc):
        inner_int = field.Integer()

    obj_from_doc = field.Object(doc_class=Inner)
    assert obj_from_doc.to_dict() == expected

    obj_from_props = field.Object(properties={"inner_int": field.Integer()})
    assert obj_from_props.to_dict() == expected

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, properties={"inner_int": field.Integer()})

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, dynamic=False)
class Mapping(Content.Mapping):
    data = field.Object()

    class Meta:
        # Necessary so that our data field is stored appropriately in Elasticsearch.
        # A potential alternative would be storing it as a string; we should assess the value.
        dynamic = False
class ProfileDocument(SumoDocument):
    username = field.Keyword(normalizer="lowercase")
    name = field.Text(fields={"keyword": field.Keyword()})
    email = field.Keyword()
    # store the avatar url so we don't need to hit the db when searching users,
    # but set enabled=False to ensure ES does no parsing of it
    avatar = field.Object(enabled=False)
    timezone = field.Keyword()
    country = field.Keyword()
    locale = field.Keyword()
    involved_from = field.Date()
    product_ids = field.Keyword(multi=True)
    group_ids = field.Keyword(multi=True)

    class Index:
        name = config.USER_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    def prepare_username(self, instance):
        return instance.user.username

    def prepare_email(self, instance):
        if instance.public_email:
            return instance.user.email

    def prepare_avatar(self, instance):
        if avatar := instance.fxa_avatar:
            return InnerDoc(url=avatar)
class Manga(Document):
    title = field.Text(analyzer=titles, multi=True, fields={
        'space': field.Text(analyzer=titles_space, multi=True),
        'keyword': field.Keyword(multi=True),
    })
    tags = field.Object(Tag)
    upload_at = field.Date()
    scan_at = field.Date()
    url = field.Keyword()
    cover_url = field.Keyword()
    images_urls = field.Keyword(multi=True)
    images_len = field.Integer()

    class Index:
        name = 'nhentai__mangas'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        logger.info(f"searching for manga {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False
class Ssn_trace(InnerDoc):
    is_valid = field.Boolean()
    is_deceased = field.Boolean()
    ssn = field.Keyword()
    human_message = field.Text()
    issued = field.Object(Ssn_issued)
class PositionDoc(field.InnerObjectWrapper):
    title = field.String()
    organization = field.Object(
        doc_class=OrganizationDoc,
        properties={
            'name': field.String(),
        }
    )
class Activity(InnerDoc):
    action = field.Text(analyzer=titles, fields={
        'space': field.Text(analyzer=titles_space),
        'keyword': field.Keyword(),
    })
    date = field.Date()
    user = field.Object(User)
class Mapping(Content.Mapping):
    # NOTE: parent is set to integer so DJES doesn't recurse
    parent = field.Integer()
    data = field.Object()

    class Meta:
        # Necessary so that our data field is stored appropriately in Elasticsearch.
        # A potential alternative would be storing it as a string; we should assess the value.
        dynamic = False
class Population(Document):
    name = field.Text(fields={
        'raw': field.Keyword(),
    })
    description = field.Text()
    dweller = field.Object(Dweller_inner)
    samples = field.Object(Dweller_inner, multi=True)

    class Index:
        name = "population"

    def add_sample(self, sample_class=None, index=None):
        result = {}
        if sample_class is None:
            sample_class = Sample
        result['klass'] = export(sample_class)
        if index is not None:
            result['index'] = index
        self.samples.append(result)
class ProjectDoc(SerializedDoc):
    identifier = field.String()
    name = field.String()
    alternate_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    status = field.String(fields={'raw': field.String(index='not_analyzed')})
    start_year = field.Integer()
    countries = field.Nested(
        doc_class=CountryDoc,
        # project_location aggregation/facet uses the raw multifield
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )
    infrastructure_type = field.Object(
        properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )
    # Providing a doc_class for initiatives produced errors, so keep it simple!
    initiatives = field.Nested(properties={'name': field.String()})
    funding = field.Object(
        multi=True,
        properties={
            'sources': field.Object(
                multi=True,
                properties={
                    'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
                }
            )
        }
    )
    regions = field.Nested(
        doc_class=RegionDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )

    def get_display_name(self):
        return self.name
class Dataset(Document):
    resources = field.Object(Data_set_resource, multi=True)
    tags = field.Text(analyzer=titles, multi=True, fields={
        'space': field.Text(analyzer=titles_space, multi=True),
        'keyword': field.Keyword(multi=True),
    })
    metadata = field.Object(Metadata)
    activity = field.Object(Activity, multi=True)
    url = field.Keyword()
    status = field.Keyword()
    created_at = field.Date()

    class Index:
        name = 'chibi_gob__open_data__dataset'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        logger.info(f"searching for dataset {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False

    @classmethod
    def get_by_url(cls, url):
        logger.info(f"get dataset {url}")
        result = cls.search().filter("term", url=url)[:1].execute()
        if result:
            return result[0]
        return None

    def save(self, *args, **kw):
        super().save(*args, **kw)
class EventDoc(SerializedDoc):
    name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    event_type = field.Object(
        properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )
    start_year = field.Integer()
    places = field.Nested(
        doc_class=PlaceDoc,
        properties={'location_display': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_display_name(self):
        return self.name
class SerializedDoc(DocType):
    _meta = field.Object(
        properties={'model': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_model_meta(self):
        return getattr(self, '_meta', None)

    def get_result_highlight(self):
        highlight = getattr(self.meta, 'highlight', None)
        if highlight:
            return getattr(highlight, '_d_', None)
        return None

    def get_display_name(self):
        return None
class InitiativeDoc(SerializedDoc):
    identifier = field.String()
    name = field.String()
    principal_agent = field.Nested(multi=False, properties={'name': field.String()})
    member_countries = field.Nested(doc_class=CountryDoc)
    geographic_scope = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )
    initiative_type = field.Object(
        properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )
    start_year = field.Integer()

    def get_display_name(self):
        return self.name
class ProfileDocument(SumoDocument):
    username = field.Keyword(normalizer="lowercase")
    name = field.Text(fields={"keyword": field.Keyword()})
    email = field.Keyword()
    # store the avatar url so we don't need to hit the db when searching users,
    # but set enabled=False to ensure ES does no parsing of it
    avatar = field.Object(enabled=False)
    timezone = field.Keyword()
    country = field.Keyword()
    locale = field.Keyword()
    involved_from = field.Date()
    product_ids = field.Keyword(multi=True)
    group_ids = field.Keyword(multi=True)

    class Index:
        name = config.USER_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    @classmethod
    def prepare(cls, instance):
        """Override super method to exclude docs from indexing."""
        # Add a discard field to the document if the following condition is met:
        # the user is not active.
        if not instance.user.is_active:
            instance.es_discard_doc = "unindex_me"
        return super(ProfileDocument, cls).prepare(instance)

    def prepare_username(self, instance):
        return instance.user.username

    def prepare_email(self, instance):
        if instance.public_email:
            return instance.user.email

    def prepare_avatar(self, instance):
        if avatar := instance.fxa_avatar:
            return InnerDoc(url=avatar)
class PersonDoc(SerializedDoc):
    identifier = field.String()
    given_name = field.String()
    additional_name = field.String()
    family_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    citizenships = field.Nested(doc_class=CountryDoc)
    position_set = field.Nested(
        doc_class=PositionDoc,
        properties={
            'title': field.String(),
            'organization': field.Object(properties={'name': field.String()})
        }
    )
    events = field.Nested(properties={'name': field.String()})

    def get_display_name(self):
        return " ".join((self.given_name, self.family_name))
class Resource(Document):
    title = field.Text(analyzer=titles, fields={
        'space': field.Text(analyzer=titles_space),
        'keyword': field.Keyword(),
    })
    description = field.Text(analyzer=titles, fields={
        'space': field.Text(analyzer=titles_space),
        'keyword': field.Keyword(),
    })
    kind = field.Keyword()
    url = field.Keyword()
    created_at = field.Date()
    tags = field.Text(analyzer=titles, multi=True, fields={
        'space': field.Text(analyzer=titles_space, multi=True),
        'keyword': field.Keyword(multi=True),
    })
    metadata = field.Object(Metadata)

    class Index:
        name = 'chibi_gob__open_data__dataset__resource'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        logger.info(f"searching for dataset {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False

    def save(self, *args, **kw):
        super().save(*args, **kw)
class MyDoc(document.DocType):
    title = field.String(index='not_analyzed')
    name = field.String()
    created_at = field.Date()
    inner = field.Object(properties={'old_field': field.String()}, doc_class=MyInner)
def test_object_dynamic_values():
    for dynamic in True, False, 'strict':
        f = field.Object(dynamic=dynamic)
        assert f.to_dict()['dynamic'] == dynamic
def test_object_disabled():
    f = field.Object(enabled=False)
    assert f.to_dict() == {"type": "object", "enabled": False}
class Mapping:
    name = field.String(analyzer="autocomplete", fields={"raw": field.String(index="not_analyzed")})
    slug = field.String(index="not_analyzed")
    section_logo = ElasticsearchImageField()
    query = field.Object(enabled=False)
class MyDoc(document.Document):
    title = field.Keyword()
    name = field.Text()
    created_at = field.Date()
    inner = field.Object(MyInner)
class PlaceDoc(field.InnerObjectWrapper):
    city = field.String(fields={'raw': field.String(index='not_analyzed')})
    country = field.Object(doc_class=CountryDoc)
    label = field.String(fields={'raw': field.String(index='not_analyzed')})
    location_display = field.String(fields={'raw': field.String(index='not_analyzed')})
class MyDoc(document.DocType):
    title = field.Keyword()
    name = field.Text()
    created_at = field.Date()
    inner = field.Object(properties={'old_field': field.Text()}, doc_class=MyInner)
class B(A):
    o = field.Object(dynamic="strict", properties={"b": field.Text()})
class A(document.Document):
    o = field.Object(dynamic=False, properties={"a": field.Text()})
class WikiDocumentType(document.Document):
    excerpt_fields = ["summary", "content"]
    exclude_slugs = [
        "Talk:",
        "User:",
        "User_talk:",
        "Template_talk:",
        "Project_talk:",
        EXPERIMENT_TITLE_PREFIX,
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer="kuma_content", term_vector="with_positions_offsets")
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            "id": field.Long(),
            "title": field.Text(analyzer="kuma_title"),
            "slug": field.Keyword(),
            "locale": field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer="kuma_content", term_vector="with_positions_offsets")
    tags = field.Keyword()
    title = field.Text(analyzer="kuma_title")

    class Meta(object):
        mapping = Mapping("wiki_document")
        mapping.meta("_all", enabled=False)

    @classmethod
    def get_connection(cls, alias="default"):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        """Create a unique list of lowercased keywords."""
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        is_root_document = obj.slug.count("/") == 1
        doc = {
            "id": obj.id,
            "boost": 4.0 if is_root_document else 1.0,
            "title": obj.title,
            "slug": obj.slug,
            "summary": obj.get_summary_text(),
            "locale": obj.locale,
            "modified": obj.modified,
            "content": strip_tags(obj.get_body_html() or ""),
            "tags": [o.name for o in obj.tags.all()],
            "kumascript_macros": cls.case_insensitive_keywords(obj.extract.macro_names()),
            "css_classnames": cls.case_insensitive_keywords(obj.extract.css_classnames()),
            "html_attributes": cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }
        if obj.parent:
            doc["parent"] = {
                "id": obj.parent.id,
                "title": obj.parent.title,
                "locale": obj.parent.locale,
                "slug": obj.parent.slug,
            }
        else:
            doc["parent"] = {}
        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            "filter": {
                "kuma_word_delimiter": {
                    "type": "word_delimiter",
                    "preserve_original": True,  # hi-fi -> hifi, hi-fi
                    "catenate_words": True,  # hi-fi -> hifi
                    "catenate_numbers": True,  # 90-210 -> 90210
                }
            },
            "analyzer": {
                "default": {
                    "tokenizer": "standard",
                    "filter": ["standard", "elision"],
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                "kuma_content": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "stop",
                        "snowball",
                    ],
                },
                "kuma_title": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "snowball",
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            "mappings": cls.get_mapping(),
            "settings": {
                "analysis": cls.get_analysis(),
                "number_of_replicas": settings.ES_DEFAULT_NUM_REPLICAS,
                "number_of_shards": settings.ES_DEFAULT_NUM_SHARDS,
            },
        }

    @classmethod
    def bulk_index(cls, documents, id_field="id", es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            "_index": index,
            "_type": type,
            "_id": d["id"],
            "_source": d,
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            "_op_type": "delete",
            "_index": index,
            "_type": type,
            "_id": _id,
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            "using": connections.get_connection(),
            "index": cls.get_index(),
            "doc_type": {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
        ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)
        qs = model.objects.filter(is_redirect=False).exclude(excludes)
        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list("id", flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance, return a boolean indicating whether
        the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
        method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        highlighted = getattr(self.meta, "highlight", None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return "…".join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            "Indexing %(total)d documents into %(total_chunks)d chunks of "
            "size %(size)d into index %(index)s." % {
                "total": total,
                "total_chunks": total_chunks,
                "size": chunk_size,
                "index": index.prefixed_name,
            })
        return message
class WikiDocumentType(document.DocType):
    excerpt_fields = ['summary', 'content']
    exclude_slugs = ['Talk:',
                     'User:',
                     'User_talk:',
                     'Template_talk:',
                     'Project_talk:',
                     EXPERIMENT_TITLE_PREFIX]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            'id': field.Long(),
            'title': field.Text(analyzer='kuma_title'),
            'slug': field.Keyword(),
            'locale': field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    tags = field.Keyword()
    title = field.Text(analyzer='kuma_title')

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        '''Create a unique list of lowercased keywords.'''
        return sorted(set([keyword.lower() for keyword in keywords]))

    @classmethod
    def from_django(cls, obj):
        is_root_document = (obj.slug.count('/') == 1)
        doc = {
            'id': obj.id,
            'boost': 4.0 if is_root_document else 1.0,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary_text(),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html or ''),
            'tags': list(obj.tags.names()),
            'kumascript_macros': cls.case_insensitive_keywords(obj.extract.macro_names()),
            'css_classnames': cls.case_insensitive_keywords(obj.extract.css_classnames()),
            'html_attributes': cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }
        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}
        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_index': index,
            '_type': type,
            '_id': d['id'],
            '_source': d
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
        ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = []
        for exclude in cls.exclude_slugs:
            excludes.append(Q(slug__icontains=exclude))
        qs = (model.objects.filter(is_redirect=False, deleted=False)
                           .exclude(reduce(operator.or_, excludes)))
        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance, return a boolean indicating whether
        the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
        method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        highlighted = getattr(self.meta, 'highlight', None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return u'…'.join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [index_documents.si(chunk, index.pk)
                           for chunk in chunked(indexable, chunk_size)]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing %(total)d documents into %(total_chunks)d chunks of '
            'size %(size)d into index %(index)s.' % {
                'total': total,
                'total_chunks': total_chunks,
                'size': chunk_size,
                'index': index.prefixed_name
            })
        return message
class B(A):
    o = field.Object(dynamic='strict', properties={'b': field.Text()})