class Mapping:
    # Manually managed doc type; the "garbage" field is excluded from
    # the stored _source.
    class Meta:
        doc_type = "super_manual_mapping"
        excludes = ("garbage", )

    # "bar" is indexed analyzed, with an unanalyzed "raw" sub-field.
    bar = field.String(fields={"raw": field.String(index="not_analyzed")})
    # "status" is exact-match only.
    status = field.String(index="not_analyzed")
class PositionDoc(field.InnerObjectWrapper):
    # Inner-object wrapper for a position entry.
    title = field.String()
    # Nested organization object, deserialized via OrganizationDoc;
    # only the organization name is mapped here.
    organization = field.Object(
        doc_class=OrganizationDoc,
        properties={
            'name': field.String(),
        }
    )
class Document(DocType):
    """Search document for an indexed file record.

    BUG FIX: the original declaration had trailing commas after six of
    the field assignments (``title = field.String(...),`` etc.), which
    made those class attributes one-element *tuples* instead of field
    objects and silently broke the generated mapping.  The stray commas
    are removed here; the set of fields is unchanged.
    """
    id = field.Integer()
    title = field.String(analyzer='snowball')
    author = field.String(analyzer='snowball')
    creation_date = field.Date()
    pages = field.Integer()
    content = field.String(analyzer='snowball')
    lang = field.String()
    size = field.Integer()
    tags = field.String(index='not_analyzed')
    # ngram_analyzer is defined elsewhere in this module.
    autocomplete = field.Text(analyzer=ngram_analyzer)
class SerializedDoc(DocType):
    """Base DocType for documents built from serialized model data."""

    # Serializer metadata; 'model' is indexed analyzed plus a raw copy.
    _meta = field.Object(
        properties={
            'model': field.String(
                fields={'raw': field.String(index='not_analyzed')}
            )
        }
    )

    def get_model_meta(self):
        """Return the serializer metadata object, or None if absent."""
        return getattr(self, '_meta', None)

    def get_result_highlight(self):
        """Return the raw highlight mapping for this hit, or None."""
        highlight = getattr(self.meta, 'highlight', None)
        if not highlight:
            return None
        return getattr(highlight, '_d_', None)

    def get_display_name(self):
        """Subclasses override this to supply a human-readable label."""
        return None
def test_field_supports_multiple_analyzers():
    """index_analyzer and search_analyzer can coexist on one field."""
    expected = {
        'index_analyzer': 'snowball',
        'search_analyzer': 'keyword',
        'type': 'string',
    }
    f = field.String(index_analyzer='snowball', search_analyzer='keyword')
    assert f.to_dict() == expected
class User(document.DocType):
    username = field.String()

    class Meta:
        # Disable the _all catch-all field; keep _index metadata enabled.
        all = document.MetaField(enabled=False)
        _index = document.MetaField(enabled=True)
        # 'strict' rejects documents containing unmapped fields.
        dynamic = document.MetaField('strict')
        dynamic_templates = document.MetaField([42])
def test_multifield_supports_multiple_analyzers():
    """Each sub-field keeps its own analyzer/search_analyzer settings."""
    sub_fields = {
        'f1': field.String(search_analyzer='keyword', analyzer='snowball'),
        'f2': field.String(analyzer='keyword'),
    }
    f = field.String(fields=sub_fields)
    expected = {
        'type': 'string',
        'fields': {
            'f1': {
                'type': 'string',
                'analyzer': 'snowball',
                'search_analyzer': 'keyword',
            },
            'f2': {
                'type': 'string',
                'analyzer': 'keyword',
            },
        },
    }
    assert f.to_dict() == expected
class User(document.DocType):
    # Stored MD5 hex digest of the user's password.
    # NOTE(review): unsalted MD5 is not a safe password hash for
    # production use; presumably this is fixture/demo code — confirm
    # before reusing this pattern.
    pwd_hash = field.String()

    def check_password(self, pwd):
        # Hash the candidate password and compare against the stored digest.
        return md5(pwd).hexdigest() == self.pwd_hash

    @property
    def password(self):
        # The plaintext password is never stored and cannot be read back.
        raise AttributeError('readonly')

    @password.setter
    def password(self, pwd):
        # Assigning .password stores only the MD5 digest.
        self.pwd_hash = md5(pwd).hexdigest()
class OrganizationDoc(SerializedDoc):
    """Search document for an organization."""
    name = field.String()
    # html_strip (defined elsewhere) removes markup; the raw sub-field
    # keeps the unanalyzed original.
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    mission = field.String()
    countries = field.Nested(doc_class=CountryDoc)
    headquarters_location = field.String(
        fields={'raw': field.String(index='not_analyzed')}
    )
    # Multi-valued; each value also has an unanalyzed raw copy.
    scope_of_operations = field.String(
        multi=True,
        fields={'raw': field.String(index='not_analyzed')}
    )
    start_year = field.Integer()

    def get_display_name(self):
        # Human-readable label for this document.
        return self.name
class PersonDoc(SerializedDoc):
    """Search document for a person."""
    identifier = field.String()
    given_name = field.String()
    additional_name = field.String()
    family_name = field.String()
    # html_strip (defined elsewhere) removes markup; raw keeps the
    # unanalyzed original.
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    citizenships = field.Nested(doc_class=CountryDoc)
    # Positions held, deserialized via PositionDoc.
    position_set = field.Nested(
        doc_class=PositionDoc,
        properties={
            'title': field.String(),
            'organization': field.Object(properties={'name': field.String()})
        }
    )
    events = field.Nested(properties={'name': field.String()})

    def get_display_name(self):
        # "Given Family" label for this document.
        return " ".join((self.given_name, self.family_name))
class EventDoc(SerializedDoc):
    """Search document for an event."""
    name = field.String()
    # html_strip (defined elsewhere) removes markup; raw keeps the
    # unanalyzed original.
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    event_type = field.Object(
        properties={'name': field.String(
            fields={'raw': field.String(index='not_analyzed')})}
    )
    start_year = field.Integer()
    # Venues, deserialized via PlaceDoc.
    places = field.Nested(
        doc_class=PlaceDoc,
        properties={'location_display': field.String(
            fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_display_name(self):
        # Human-readable label for this document.
        return self.name
def test_multi_fields_are_accepted_and_parsed():
    """construct_field accepts sub-fields given as dicts or field objects."""
    f = field.construct_field(
        'string',
        fields={
            'raw': {'type': 'string', 'index': 'not_analyzed'},
            'eng': field.String(analyzer='english'),
        },
    )
    assert isinstance(f, field.String)
    expected = {
        'type': 'string',
        'fields': {
            'raw': {'type': 'string', 'index': 'not_analyzed'},
            'eng': {'type': 'string', 'analyzer': 'english'},
        },
    }
    assert f.to_dict() == expected
class EntryDoc(SerializedDoc):
    """Search document for a blog/news entry."""
    title = field.String()
    author = field.String()
    # html_strip (defined elsewhere) removes markup; raw keeps the
    # unanalyzed original.
    content = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    publication_date = field.Date()
    # Categories, deserialized via CategoryDoc.
    categories = field.Nested(
        doc_class=CategoryDoc,
        properties={'name': field.String(
            fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_display_name(self):
        # Human-readable label for this document.
        return self.title
class InitiativeDoc(SerializedDoc):
    """Search document for an initiative."""
    identifier = field.String()
    name = field.String()
    # Single-valued nested object (multi=False).
    principal_agent = field.Nested(multi=False,
                                   properties={'name': field.String()})
    member_countries = field.Nested(doc_class=CountryDoc)
    geographic_scope = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')})
        }
    )
    initiative_type = field.Object(
        properties={'name': field.String(
            fields={'raw': field.String(index='not_analyzed')})}
    )
    start_year = field.Integer()

    def get_display_name(self):
        # Human-readable label for this document.
        return self.name
class MySubDoc(MyDoc):
    # Override: name is exact-match only in this subclass.
    name = field.String(index='not_analyzed')

    class Meta:
        doc_type = 'my_custom_doc'
        index = 'default-index'
class CountryDoc(field.InnerObjectWrapper):
    # Inner-object wrapper exposing a country's name.
    name = field.String()
class MyDoc(document.DocType):
    title = field.String(index='not_analyzed')
    name = field.String()
    created_at = field.Date()
    # Object field deserialized through the MyInner wrapper class
    # (defined elsewhere in the module).
    inner = field.Object(properties={'old_field': field.String()},
                         doc_class=MyInner)
class Blog(document.DocType):
    # Multi-valued, unanalyzed tag list.
    tags = field.String(multi=True, index='not_analyzed')
class User(document.DocType):
    username = field.String()

    class Meta:
        # Disable the _all catch-all field; keep _index metadata enabled.
        all = document.MetaField(enabled=False)
        _index = document.MetaField(enabled=True)
class RegionDoc(field.InnerObjectWrapper):
    # Region name, analyzed plus an unanalyzed raw copy.
    name = field.String(fields={'raw': field.String(index='not_analyzed')})
def builtin_type(self):
    """Return a plain ``field.String`` built from this field's stored
    parameters (``self._params``)."""
    return field.String(**self._params)
class Mapping:
    # "name" uses the autocomplete analyzer, with an unanalyzed raw copy.
    name = field.String(analyzer="autocomplete",
                        fields={"raw": field.String(index="not_analyzed")})
    # "slug" is exact-match only.
    slug = field.String(index="not_analyzed")
    # Project-defined custom field type.
    section_logo = ElasticsearchImageField()
    # Stored but not indexed or searchable (enabled=False).
    query = field.Object(enabled=False)
class WikiDocumentType(document.DocType):
    """Elasticsearch document type for wiki documents.

    Fixes applied in this revision:
    * ``Meta``: ``mapping.meta('_all', enalbed=False)`` had a typo
      (``enalbed``), so the ``_all`` field was never actually disabled.
    * ``exclude_slugs``: the literal was corrupted in the source
      (``'User:'******'User_talk:'``); reconstructed as comma-separated
      entries.
    * ``bulk_delete``: docstring was a copy-paste of ``bulk_index``'s.
    * ``get_indexable`` / ``reindex_all``: use explicit float division so
      percentage/chunk math also works under Python 2 (the file uses
      ``u'…'`` literals, so py2 compatibility appears intended); integer
      floor division made ``percent < 100`` produce an empty queryset.
    """

    excerpt_fields = ['summary', 'content']
    # Documents whose slug contains any of these fragments are never indexed.
    exclude_slugs = ['Talk:', 'User:', 'User_talk:', 'Template_talk:',
                     'Project_talk:']

    boost = field.Float(null_value=1.0)
    content = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    css_classnames = field.String(analyzer='case_insensitive_keyword')
    html_attributes = field.String(analyzer='case_insensitive_keyword')
    id = field.Long()
    kumascript_macros = field.String(analyzer='case_insensitive_keyword')
    locale = field.String(index='not_analyzed')
    modified = field.Date()
    parent = field.Nested(properties={
        'id': field.Long(),
        'title': field.String(analyzer='kuma_title'),
        'slug': field.String(index='not_analyzed'),
        'locale': field.String(index='not_analyzed'),
    })
    slug = field.String(index='not_analyzed')
    summary = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    tags = field.String(analyzer='case_sensitive')
    title = field.String(analyzer='kuma_title', boost=1.2)

    class Meta(object):
        mapping = Mapping('wiki_document')
        # BUG FIX: keyword was misspelled 'enalbed', so this call silently
        # did nothing and _all stayed enabled.
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        """Return the configured Elasticsearch connection for *alias*."""
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        """Return the Elasticsearch doc type name for this class."""
        return cls._doc_type.name

    @classmethod
    def from_django(cls, obj):
        """Build an indexable dict from a Django Document instance."""
        doc = {
            'id': obj.id,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary(strip_markup=True),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html),
            'tags': list(obj.tags.values_list('name', flat=True)),
            'kumascript_macros': obj.extract_kumascript_macro_names(),
            'css_classnames': obj.extract_css_classnames(),
            'html_attributes': obj.extract_html_attributes(),
        }

        # Check if the document has a document zone attached
        try:
            is_zone = bool(obj.zone)
        except ObjectDoesNotExist:
            is_zone = False

        if is_zone:
            # boost all documents that are a zone
            doc['boost'] = 8.0
        elif obj.slug.count('/') == 1:
            # a little boost if no zone but still first level
            doc['boost'] = 4.0
        else:
            doc['boost'] = 1.0

        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        """Return the mapping definition as a plain dict."""
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        """Return the index analysis settings (token filters + analyzers)."""
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
                'case_sensitive': {
                    'type': 'custom',
                    'tokenizer': 'keyword'
                },
                'case_insensitive_keyword': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': 'lowercase'
                }
            },
        }

    @classmethod
    def get_settings(cls):
        """Return full index settings: mappings plus analysis/shard config."""
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        doc_type = cls.get_doc_type()  # renamed from 'type' (shadowed builtin)
        actions = [{
            '_index': index,
            '_type': doc_type,
            '_id': d['id'],
            '_source': d,
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents by ID."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        doc_type = cls.get_doc_type()
        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': doc_type,
            '_id': _id,
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        """Return the prefixed name of the currently promoted index."""
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        """Return a Search object bound to this doc type and index."""
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        """Return the Django model this doc type indexes."""
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = [Q(slug__icontains=exclude)
                    for exclude in cls.exclude_slugs]
        qs = (model.objects
                   .filter(is_template=False, is_redirect=False,
                           deleted=False)
                   .exclude(reduce(operator.or_, excludes)))
        # BUG FIX: force float division — under Python 2, integer
        # ``percent / 100`` floor-divides to 0 for percent < 100 and the
        # slice below produced an empty queryset.
        fraction = percent / 100.0
        if fraction < 1:
            qs = qs[:int(qs.count() * fraction)]
        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_template and
                not obj.is_redirect and
                not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        """Return highlighted excerpt text, falling back to the summary."""
        if getattr(self, 'highlight', False):
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in self.highlight:
                    return u'…'.join(self.highlight[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        # Float division so the ceil is meaningful under Python 2 as well.
        total_chunks = int(ceil(total / float(chunk_size)))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [index_documents.si(chunk, index.pk)
                           for chunk in chunked(indexable, chunk_size)]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing {total} documents into {n} chunks of size {size} into '
            'index {index}.'.format(total=total, n=total_chunks,
                                    size=chunk_size,
                                    index=index.prefixed_name))
        return message
class ProjectDoc(SerializedDoc):
    """Search document for a project."""
    identifier = field.String()
    name = field.String()
    alternate_name = field.String()
    # html_strip (defined elsewhere) removes markup; raw keeps the
    # unanalyzed original.
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    status = field.String(fields={'raw': field.String(index='not_analyzed')})
    start_year = field.Integer()
    countries = field.Nested(
        doc_class=CountryDoc,
        # project_location aggregation/facet uses the raw multifield
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')})
        }
    )
    infrastructure_type = field.Object(
        properties={'name': field.String(
            fields={'raw': field.String(index='not_analyzed')})}
    )
    # Providing a doc_class for initiatives produced errors, so keep it simple!
    initiatives = field.Nested(properties={'name': field.String()})
    # Multi-valued funding object, each with multiple named sources.
    funding = field.Object(
        multi=True,
        properties={
            'sources': field.Object(
                multi=True,
                properties={
                    'name': field.String(
                        fields={'raw': field.String(index='not_analyzed')}),
                }
            )
        }
    )
    regions = field.Nested(
        doc_class=RegionDoc,
        properties={
            'name': field.String(
                fields={'raw': field.String(index='not_analyzed')})
        }
    )

    def get_display_name(self):
        # Human-readable label for this document.
        return self.name
class SimpleCommit(document.DocType):
    # File paths touched by the commit (multi-valued).
    files = field.String(multi=True)

    class Meta:
        index = 'test-git'
class PlaceDoc(field.InnerObjectWrapper):
    # Inner-object wrapper for a place; each text field also keeps an
    # unanalyzed raw copy.
    city = field.String(fields={'raw': field.String(index='not_analyzed')})
    country = field.Object(doc_class=CountryDoc)
    label = field.String(fields={'raw': field.String(index='not_analyzed')})
    location_display = field.String(
        fields={'raw': field.String(index='not_analyzed')}
    )
class MyD(document.DocType):
    title = field.String()

    class Meta:
        # Build the mapping by hand and disable the _all field.
        mapping = Mapping('my_d')
        mapping.meta('_all', enabled=False)
def test_nested_provides_direct_access_to_its_fields():
    """A Nested field exposes its sub-fields via mapping-style access."""
    nested = field.Nested()
    nested.field('name', 'string', index='not_analyzed')
    assert 'name' in nested
    assert nested['name'] == field.String(index='not_analyzed')
class DocWithNested(document.DocType):
    # Nested comment objects, each with its own title.
    comments = field.Nested(properties={'title': field.String()})
class CategoryDoc(field.InnerObjectWrapper):
    # Inner-object wrapper exposing a category's name.
    name = field.String()