Example #1
0
def test_object_constructor():
    """field.Object built from doc_class or from properties maps identically."""
    expected = {
        'type': 'object',
        'properties': {
            'inner_int': {
                'type': 'integer'
            }
        }
    }

    class Inner(InnerDoc):
        inner_int = field.Integer()

    # Both construction styles must serialize to the same mapping dict.
    for obj in (
        field.Object(doc_class=Inner),
        field.Object(properties={'inner_int': field.Integer()}),
    ):
        assert obj.to_dict() == expected

    # doc_class is mutually exclusive with explicit properties/dynamic.
    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, properties={'inner_int': field.Integer()})

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, dynamic=False)
Example #2
0
def test_object_constructor():
    """Object fields accept a doc_class or a properties dict, never both."""
    expected = {
        "type": "object",
        "properties": {"inner_int": {"type": "integer"}},
    }

    class Inner(InnerDoc):
        inner_int = field.Integer()

    assert field.Object(doc_class=Inner).to_dict() == expected
    assert field.Object(properties={"inner_int": field.Integer()}).to_dict() == expected

    # Combining doc_class with properties or dynamic must be rejected.
    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner,
                     properties={"inner_int": field.Integer()})

    with pytest.raises(ValidationException):
        field.Object(doc_class=Inner, dynamic=False)
Example #3
0
    class Mapping(Content.Mapping):
        """Mapping override adding a free-form `data` object field."""

        # Schema of `data` is intentionally not fixed here.
        data = field.Object()

        class Meta:
            # Necessary to allow for our data field to store appropriately in Elasticsearch.
            # A potential alternative could be storing it as a string; we should assess the value.
            dynamic = False
Example #4
0
class ProfileDocument(SumoDocument):
    """Search document for user profiles."""

    username = field.Keyword(normalizer="lowercase")
    name = field.Text(fields={"keyword": field.Keyword()})
    email = field.Keyword()
    # store avatar url so we don't need to hit the db when searching users
    # but set enabled=False to ensure ES does no parsing of it
    avatar = field.Object(enabled=False)

    timezone = field.Keyword()
    country = field.Keyword()
    locale = field.Keyword()

    involved_from = field.Date()

    product_ids = field.Keyword(multi=True)
    group_ids = field.Keyword(multi=True)

    class Index:
        name = config.USER_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    def prepare_username(self, instance):
        """Index the username from the related user record."""
        return instance.user.username

    def prepare_email(self, instance):
        """Index the email only when the profile opted into a public email."""
        if instance.public_email:
            return instance.user.email

    def prepare_avatar(self, instance):
        """Wrap the FxA avatar URL in an inner object; None when absent."""
        if avatar := instance.fxa_avatar:
            return InnerDoc(url=avatar)
class Manga(Document):
    """Search document for a scraped manga entry.

    Fix: `title` was assigned twice; the first bare `field.Text()` was dead
    code immediately shadowed by the analyzed definition, so it is removed.
    """

    title = field.Text(analyzer=titles,
                       multi=True,
                       fields={
                           'space': field.Text(analyzer=titles_space,
                                               multi=True),
                           'keyword': field.Keyword(multi=True),
                       })
    tags = field.Object(Tag)
    upload_at = field.Date()
    scan_at = field.Date()

    url = field.Keyword()
    cover_url = field.Keyword()
    images_urls = field.Keyword(multi=True)
    images_len = field.Integer()

    class Index:
        name = 'nhentai__mangas'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        """Return True when a manga with this exact url is already indexed."""
        logger.info(f"buscando manga {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False
Example #6
0
class Ssn_trace(InnerDoc):
    """Inner document holding the result of an SSN trace check."""

    is_valid = field.Boolean()
    is_deceased = field.Boolean()

    ssn = field.Keyword()
    human_message = field.Text()
    # Nested issuance details (see Ssn_issued for the schema).
    issued = field.Object(Ssn_issued)
Example #7
0
class PositionDoc(field.InnerObjectWrapper):
    """Inner document for a position: a title plus its organization."""

    title = field.String()
    # NOTE(review): properties appear to duplicate part of OrganizationDoc's
    # mapping — confirm both stay in sync.
    organization = field.Object(
        doc_class=OrganizationDoc,
        properties={
            'name': field.String(),
        }
    )
class Activity(InnerDoc):
    """Inner document describing a dated user action."""

    action = field.Text(analyzer=titles,
                        fields={
                            'space': field.Text(analyzer=titles_space),
                            'keyword': field.Keyword(),
                        })
    date = field.Date()
    user = field.Object(User)
Example #9
0
    class Mapping(Content.Mapping):
        """Mapping override with a non-recursing parent and free-form data."""

        # NOTE: parent is set to integer so DJES doesn't recurse
        parent = field.Integer()
        data = field.Object()

        class Meta:
            # Necessary to allow for our data field to store appropriately in Elasticsearch.
            # A potential alternative could be storing it as a string; we should assess the value.
            dynamic = False
Example #10
0
class Population(Document):
    """Search document for a population and the samples drawn from it."""

    name = field.Text(fields={
        'raw': field.Keyword(),
    })
    description = field.Text()
    dweller = field.Object(Dweller_inner)
    samples = field.Object(Dweller_inner, multi=True)

    class Index:
        name = "population"

    def add_sample(self, sample_class=None, index=None):
        """Append a sample entry, defaulting to the Sample class.

        The entry stores the exported class path under 'klass' and,
        when given, the sample's index under 'index'.
        """
        result = {}
        if sample_class is None:
            sample_class = Sample
        result['klass'] = export(sample_class)
        if index is not None:
            result['index'] = index
        self.samples.append(result)
Example #11
0
class ProjectDoc(SerializedDoc):
    """Search document for a project with nested geo/funding facets."""

    identifier = field.String()
    name = field.String()
    alternate_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    status = field.String(fields={'raw': field.String(index='not_analyzed')})
    start_year = field.Integer()
    countries = field.Nested(
        doc_class=CountryDoc,  # project_location aggregation/facet uses the raw multifield
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )
    infrastructure_type = field.Object(
        properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )
    # Providing a doc_class for initiatives produced errors, so keep it simple!
    initiatives = field.Nested(properties={'name': field.String()})
    # Funding sources are doubly nested objects; both levels are multi-valued.
    funding = field.Object(
        multi=True,
        properties={
            'sources': field.Object(
                multi=True,
                properties={
                    'name': field.String(fields={'raw': field.String(index='not_analyzed')}),
                }
            )
        }
    )
    regions = field.Nested(
        doc_class=RegionDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )

    def get_display_name(self):
        """Return the project name for display in search results."""
        return self.name
class Dataset(Document):
    """Search document for an open-data dataset and its resources."""

    resources = field.Object(Data_set_resource, multi=True)
    tags = field.Text(analyzer=titles,
                      multi=True,
                      fields={
                          'space': field.Text(analyzer=titles_space,
                                              multi=True),
                          'keyword': field.Keyword(multi=True),
                      })

    metadata = field.Object(Metadata)
    activity = field.Object(Activity, multi=True)
    url = field.Keyword()
    status = field.Keyword()
    created_at = field.Date()

    class Index:
        name = 'chibi_gob__open_data__dataset'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        """Return True when a dataset with this exact url is already indexed."""
        logger.info(f"buscando dataset {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False

    @classmethod
    def get_by_url(cls, url):
        """Return the first dataset matching the url, or None."""
        logger.info(f"get dataset {url}")
        result = cls.search().filter("term", url=url)[:1].execute()
        if result:
            return result[0]
        return None

    def save(self, *args, **kw):
        """Pass-through override of Document.save, kept as an extension hook."""
        super().save(*args, **kw)
Example #13
0
class EventDoc(SerializedDoc):
    """Search document for an event with nested places."""

    name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    event_type = field.Object(properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})})
    start_year = field.Integer()
    places = field.Nested(
        doc_class=PlaceDoc,
        properties={'location_display': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_display_name(self):
        """Return the event name for display in search results."""
        return self.name
Example #14
0
class SerializedDoc(DocType):
    """Base doc type for serialized models; exposes model meta and highlights."""

    _meta = field.Object(
        properties={'model': field.String(fields={'raw': field.String(index='not_analyzed')})}
    )

    def get_model_meta(self):
        """Return the indexed model metadata object, or None when absent."""
        return getattr(self, '_meta', None)

    def get_result_highlight(self):
        """Return the raw highlight dict from the search hit, or None."""
        highlight = getattr(self.meta, 'highlight', None)
        if highlight:
            return getattr(highlight, '_d_', None)
        return None

    def get_display_name(self):
        """Subclasses override to provide a human-readable name."""
        return None
Example #15
0
class InitiativeDoc(SerializedDoc):
    """Search document for an initiative and its geographic scope."""

    identifier = field.String()
    name = field.String()
    principal_agent = field.Nested(multi=False, properties={'name': field.String()})
    member_countries = field.Nested(doc_class=CountryDoc)
    geographic_scope = field.Nested(
        doc_class=CountryDoc,
        properties={
            'name': field.String(fields={'raw': field.String(index='not_analyzed')})
        }
    )
    initiative_type = field.Object(properties={'name': field.String(fields={'raw': field.String(index='not_analyzed')})})
    start_year = field.Integer()

    def get_display_name(self):
        """Return the initiative name for display in search results."""
        return self.name
Example #16
0
class ProfileDocument(SumoDocument):
    """Search document for user profiles; inactive users are discarded."""

    username = field.Keyword(normalizer="lowercase")
    name = field.Text(fields={"keyword": field.Keyword()})
    email = field.Keyword()
    # store avatar url so we don't need to hit the db when searching users
    # but set enabled=False to ensure ES does no parsing of it
    avatar = field.Object(enabled=False)

    timezone = field.Keyword()
    country = field.Keyword()
    locale = field.Keyword()

    involved_from = field.Date()

    product_ids = field.Keyword(multi=True)
    group_ids = field.Keyword(multi=True)

    class Index:
        name = config.USER_INDEX_NAME
        using = config.DEFAULT_ES7_CONNECTION

    @classmethod
    def prepare(cls, instance):
        """Override super method to exclude docs from indexing."""
        # Add a discard field in the document if the following conditions are met
        # User is not active
        if not instance.user.is_active:
            instance.es_discard_doc = "unindex_me"

        return super(ProfileDocument, cls).prepare(instance)

    def prepare_username(self, instance):
        """Index the username from the related user record."""
        return instance.user.username

    def prepare_email(self, instance):
        """Index the email only when the profile opted into a public email."""
        if instance.public_email:
            return instance.user.email

    def prepare_avatar(self, instance):
        """Wrap the FxA avatar URL in an inner object; None when absent."""
        if avatar := instance.fxa_avatar:
            return InnerDoc(url=avatar)
Example #17
0
class PersonDoc(SerializedDoc):
    """Search document for a person with citizenships and positions."""

    identifier = field.String()
    given_name = field.String()
    additional_name = field.String()
    family_name = field.String()
    description = field.String(
        analyzer=html_strip,
        fields={'raw': field.String(index='not_analyzed')}
    )
    citizenships = field.Nested(doc_class=CountryDoc)
    position_set = field.Nested(
        doc_class=PositionDoc,
        properties={
            'title': field.String(),
            'organization': field.Object(properties={'name': field.String()})
        }
    )
    events = field.Nested(properties={'name': field.String()})

    def get_display_name(self):
        """Return "given family" name for display in search results."""
        return " ".join((self.given_name, self.family_name))
class Resource(Document):
    """Search document for a single resource of an open-data dataset."""

    title = field.Text(analyzer=titles,
                       fields={
                           'space': field.Text(analyzer=titles_space),
                           'keyword': field.Keyword(),
                       })
    description = field.Text(analyzer=titles,
                             fields={
                                 'space': field.Text(analyzer=titles_space),
                                 'keyword': field.Keyword(),
                             })
    kind = field.Keyword()
    url = field.Keyword()
    created_at = field.Date()

    tags = field.Text(analyzer=titles,
                      multi=True,
                      fields={
                          'space': field.Text(analyzer=titles_space,
                                              multi=True),
                          'keyword': field.Keyword(multi=True),
                      })

    metadata = field.Object(Metadata)

    class Index:
        name = 'chibi_gob__open_data__dataset__resource'
        settings = {'number_of_shards': 2, 'number_of_replicas': 1}

    @classmethod
    def url_is_scaned(cls, url):
        """Return True when a resource with this exact url is already indexed."""
        logger.info(f"buscando dataset {url}")
        if cls.search().filter("term", url=url).count() > 0:
            return True
        return False

    def save(self, *args, **kw):
        """Pass-through override of Document.save, kept as an extension hook."""
        super().save(*args, **kw)
Example #19
0
class MyDoc(document.DocType):
    """Example doc type whose inner object combines properties and doc_class."""

    title = field.String(index='not_analyzed')
    name = field.String()
    created_at = field.Date()
    inner = field.Object(properties={'old_field': field.String()},
                         doc_class=MyInner)
Example #20
0
def test_object_dynamic_values():
    """The dynamic kwarg is serialized verbatim into the mapping dict."""
    for value in (True, False, 'strict'):
        assert field.Object(dynamic=value).to_dict()['dynamic'] == value
Example #21
0
def test_object_disabled():
    """enabled=False must appear in the serialized field mapping."""
    disabled = field.Object(enabled=False)
    mapping = disabled.to_dict()
    assert mapping == {"type": "object", "enabled": False}
Example #22
0
 class Mapping:
     """Mapping definition for a section page."""

     name = field.String(analyzer="autocomplete",
                         fields={"raw": field.String(index="not_analyzed")})
     slug = field.String(index="not_analyzed")
     section_logo = ElasticsearchImageField()
     # enabled=False: stored as-is, not parsed or indexed by ES
     query = field.Object(enabled=False)
class MyDoc(document.Document):
    """Example document whose inner object is mapped via the MyInner class."""

    title = field.Keyword()
    name = field.Text()
    created_at = field.Date()
    inner = field.Object(MyInner)
Example #24
0
class PlaceDoc(field.InnerObjectWrapper):
    """Inner document for a place: city, country and display labels."""

    city = field.String(fields={'raw': field.String(index='not_analyzed')})
    country = field.Object(doc_class=CountryDoc)
    label = field.String(fields={'raw': field.String(index='not_analyzed')})
    location_display = field.String(fields={'raw': field.String(index='not_analyzed')})
class MyDoc(document.DocType):
    """Example doc type whose inner object combines properties and doc_class."""

    title = field.Keyword()
    name = field.Text()
    created_at = field.Date()
    inner = field.Object(properties={'old_field': field.Text()},
                         doc_class=MyInner)
Example #26
0
 class B(A):
     # Overrides A.o with a strict dynamic mapping and a "b" text property.
     o = field.Object(dynamic="strict", properties={"b": field.Text()})
Example #27
0
 class A(document.Document):
     # dynamic=False: unmapped sub-fields of "o" are ignored rather than indexed.
     o = field.Object(dynamic=False, properties={"a": field.Text()})
Example #28
0
class WikiDocumentType(document.Document):
    """Elasticsearch document type for wiki documents.

    Holds the field mapping, the analysis configuration and the helpers
    used to bulk (re)index and query wiki documents.
    """

    excerpt_fields = ["summary", "content"]
    # Slug prefixes that must never be indexed.
    # Fix: the source contained a scrubbed artifact ("User:"******"User_talk:")
    # which was a syntax error; restored to two separate entries.
    exclude_slugs = [
        "Talk:",
        "User:",
        "User_talk:",
        "Template_talk:",
        "Project_talk:",
        EXPERIMENT_TITLE_PREFIX,
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            "id": field.Long(),
            "title": field.Text(analyzer="kuma_title"),
            "slug": field.Keyword(),
            "locale": field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    tags = field.Keyword()
    title = field.Text(analyzer="kuma_title")

    class Meta(object):
        mapping = Mapping("wiki_document")
        mapping.meta("_all", enabled=False)

    @classmethod
    def get_connection(cls, alias="default"):
        """Return the ES connection registered under *alias*."""
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        """Return the mapped doc type name."""
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        """Create a unique list of lowercased keywords."""
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        """Serialize a Django wiki Document into an ES source dict."""
        # Documents whose slug contains exactly one "/" are boosted.
        is_root_document = obj.slug.count("/") == 1
        doc = {
            "id": obj.id,
            "boost": 4.0 if is_root_document else 1.0,
            "title": obj.title,
            "slug": obj.slug,
            "summary": obj.get_summary_text(),
            "locale": obj.locale,
            "modified": obj.modified,
            "content": strip_tags(obj.get_body_html() or ""),
            "tags": [o.name for o in obj.tags.all()],
            "kumascript_macros":
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            "css_classnames":
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            "html_attributes":
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }

        if obj.parent:
            doc["parent"] = {
                "id": obj.parent.id,
                "title": obj.parent.title,
                "locale": obj.parent.locale,
                "slug": obj.parent.slug,
            }
        else:
            doc["parent"] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        """Return the ES mapping as a plain dict."""
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        """Return the custom analyzers/filters used by the mapping."""
        return {
            "filter": {
                "kuma_word_delimiter": {
                    "type": "word_delimiter",
                    "preserve_original": True,  # hi-fi -> hifi, hi-fi
                    "catenate_words": True,  # hi-fi -> hifi
                    "catenate_numbers": True,  # 90-210 -> 90210
                }
            },
            "analyzer": {
                "default": {
                    "tokenizer": "standard",
                    "filter": ["standard", "elision"]
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                "kuma_content": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "stop",
                        "snowball",
                    ],
                },
                "kuma_title": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "snowball",
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        """Return the full index settings (mappings + analysis + shards)."""
        return {
            "mappings": cls.get_mapping(),
            "settings": {
                "analysis": cls.get_analysis(),
                "number_of_replicas": settings.ES_DEFAULT_NUM_REPLICAS,
                "number_of_shards": settings.ES_DEFAULT_NUM_SHARDS,
            },
        }

    @classmethod
    def bulk_index(cls, documents, id_field="id", es=None, index=None):
        """Bulk-index a batch of document dicts."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            "_index": index,
            "_type": type,
            # Fix: honor the id_field parameter instead of hard-coding "id".
            "_id": d[id_field],
            "_source": d,
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Bulk-delete documents by ID."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            "_op_type": "delete",
            "_index": index,
            "_type": type,
            "_id": _id,
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        """Return the prefixed name of the currently promoted index."""
        from kuma.search.models import Index

        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        """Build a Search bound to our connection, index and doc type."""
        options = {
            "using": connections.get_connection(),
            "index": cls.get_index(),
            "doc_type": {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        """Return the Django model this document indexes (lazy import)."""
        from kuma.wiki.models import Document

        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        # NOTE(review): ``should_update`` below also checks ``obj.deleted``
        # and uses a substring match on the slug, while this queryset uses
        # ``startswith`` and does not filter deleted — confirm which is
        # intended; the WARNING above says they must mirror each other.
        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)

        qs = model.objects.filter(is_redirect=False).exclude(excludes)

        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list("id", flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        """Return a highlight-based excerpt, falling back to the summary."""
        highlighted = getattr(self.meta, "highlight", None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return "…".join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        # Fix: use cls so subclasses reindex their own indexable set.
        indexable = cls.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            "Indexing %(total)d documents into %(total_chunks)d chunks of "
            "size %(size)d into index %(index)s." % {
                "total": total,
                "total_chunks": total_chunks,
                "size": chunk_size,
                "index": index.prefixed_name,
            })
        return message
Example #29
0
class WikiDocumentType(document.DocType):
    """Elasticsearch doc type for wiki documents.

    Holds the field mapping, the analysis configuration and the helpers
    used to bulk (re)index and query wiki documents.
    """

    excerpt_fields = ['summary', 'content']
    # Slug prefixes that must never be indexed.
    # Fix: the source contained a scrubbed artifact ('User:'******'User_talk:')
    # which was a syntax error; restored to two separate entries.
    exclude_slugs = [
        'Talk:', 'User:', 'User_talk:', 'Template_talk:', 'Project_talk:',
        EXPERIMENT_TITLE_PREFIX
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            'id': field.Long(),
            'title': field.Text(analyzer='kuma_title'),
            'slug': field.Keyword(),
            'locale': field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    tags = field.Keyword()
    title = field.Text(analyzer='kuma_title')

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        '''Return the ES connection registered under *alias*.'''
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        '''Return the mapped doc type name.'''
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        '''Create a unique list of lowercased keywords.'''
        # Set comprehension instead of set([...]) — same result, no temp list.
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        '''Serialize a Django wiki Document into an ES source dict.'''
        # Documents whose slug contains exactly one '/' are boosted.
        is_root_document = (obj.slug.count('/') == 1)
        doc = {
            'id': obj.id,
            'boost': 4.0 if is_root_document else 1.0,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary_text(),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html or ''),
            'tags': list(obj.tags.names()),
            'kumascript_macros':
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            'css_classnames':
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            'html_attributes':
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }

        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        '''Return the ES mapping as a plain dict.'''
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        '''Return the custom analyzers/filters used by the mapping.'''
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        '''Return the full index settings (mappings + analysis + shards).'''
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Bulk-index a batch of document dicts."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_index': index,
            '_type': type,
            # Fix: honor the id_field parameter instead of hard-coding 'id'.
            '_id': d[id_field],
            '_source': d,
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Bulk-delete documents by ID."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id,
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        '''Return the prefixed name of the currently promoted index.'''
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        '''Build a Search bound to our connection, index and doc type.'''
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        '''Return the Django model this document indexes (lazy import).'''
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        # NOTE(review): the queryset excludes with icontains (case-insensitive
        # substring) while ``should_update`` uses a case-sensitive ``in`` —
        # confirm which is intended; the WARNING says they must mirror.
        excludes = []
        for exclude in cls.exclude_slugs:
            excludes.append(Q(slug__icontains=exclude))

        qs = (model.objects.filter(is_redirect=False, deleted=False).exclude(
            reduce(operator.or_, excludes)))

        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        '''Return a highlight-based excerpt, falling back to the summary.'''
        highlighted = getattr(self.meta, 'highlight', None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return u'…'.join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        # Fix: use cls so subclasses reindex their own indexable set.
        indexable = cls.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing %(total)d documents into %(total_chunks)d chunks of '
            'size %(size)d into index %(index)s.' % {
                'total': total,
                'total_chunks': total_chunks,
                'size': chunk_size,
                'index': index.prefixed_name
            })
        return message
Example #30
0
 class B(A):
     # Overrides A.o with a strict dynamic mapping and a 'b' text property.
     o = field.Object(dynamic='strict', properties={'b': field.Text()})