Example #1

    def prepare_doc(self):
        _fields, _map = {}, {}
        for idx, _f in enumerate(self.schema['fields'], 1):
            alias_name = _f['name']
            field_name = 'col{}'.format(idx)
            _field = self._schema2doc_map[_f['type']]
            _map[field_name] = alias_name
            _fields[field_name] = _field

        if self.has_geo_data:
            _fields['shape'] = dsl_field.GeoShape()
            _fields['point'] = dsl_field.GeoPoint()
            _fields['label'] = dsl_field.Text()
            _fields['shape_type'] = dsl_field.Integer()

        _fields['resource'] = dsl_field.Nested(
            properties={
                'id': dsl_field.Integer(),
                'title': dsl_field.Text(analyzer=polish_analyzer,
                                        fields={'raw': dsl_field.Keyword()}),
            })

        _fields['updated_at'] = dsl_field.Date()
        _fields['row_no'] = dsl_field.Long()
        # A plain inner class carries the index name (not a metaclass).
        _fields['Index'] = type('Index', (object,), {'name': self.idx_name})

        doc = type(self.idx_name, (Document, ), _fields)
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        return doc
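
For reference, a minimal self-contained sketch of the same dynamic-Document pattern, assuming elasticsearch-dsl 7.x; the schema, type map, and index name below are invented:

from elasticsearch_dsl import Document, field as dsl_field

schema = {'fields': [{'name': 'city', 'type': 'string'},
                     {'name': 'population', 'type': 'integer'}]}
type_map = {'string': dsl_field.Text(), 'integer': dsl_field.Long()}

attrs, headers = {}, {}
for i, f in enumerate(schema['fields'], 1):
    col = 'col{}'.format(i)
    headers[col] = f['name']          # remember the human-readable alias
    attrs[col] = type_map[f['type']]

# A plain inner class carries the index name in elasticsearch-dsl 7.x.
attrs['Index'] = type('Index', (object,), {'name': 'demo-idx'})
DemoDoc = type('DemoDoc', (Document,), attrs)
DemoDoc._doc_type.mapping._meta['_meta'] = {'headers': headers}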
Example #2
    def doc(self):
        if not self._doc_cache:
            _fields, _map = {}, {}
            for idx, _f in enumerate(self.schema['fields']):
                alias_name = _f['name']
                field_name = 'col{}'.format(idx + 1)
                _field = _schema2doc_map[_f['type']]
                _map[field_name] = alias_name
                _fields[field_name] = _field

            _fields['resource'] = dsl_field.Nested(
                properties={
                    'id': dsl_field.Integer(),
                    'title': dsl_field.Text(
                        analyzer=polish_analyzer,
                        fields={'raw': dsl_field.Keyword()})
                }
            )

            _fields['updated_at'] = dsl_field.Date()
            _fields['row_no'] = dsl_field.Long()

            doc = type(self.idx_name, (DocType,), _fields)
            doc._doc_type.index = self.idx_name
            doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
            self._doc_cache = doc
        return self._doc_cache
Example #3
    class Mapping:

        content = ContentField()
        contributor = ContributorField()
        pay = field.Long()

        class Meta:
            dynamic = False
            excludes = ('content', 'contributor')
Example #4

    @property
    def _schema2doc_map(self):
        _map = {
            'integer': dsl_field.Long(),
            'number': dsl_field.ScaledFloat(scaling_factor=100),
            'string': dsl_field.Text(
                analyzer=polish_analyzer,
                fields={
                    'raw': dsl_field.Text(),
                    'keyword': dsl_field.Keyword(),
                }),
            'any': dsl_field.Text(
                analyzer=polish_analyzer,
                fields={
                    'raw': dsl_field.Text(),
                    'keyword': dsl_field.Keyword(),
                }),
            'boolean': dsl_field.Boolean(),
            'time': dsl_field.Text(
                fields={
                    'text': dsl_field.Text(),
                    'time': dsl_field.Date(format=constance_config.TIME_FORMATS),
                }),
            'duration': dsl_field.DateRange(),
            'default': dsl_field.Text(),
            'date': dsl_field.Text(
                fields={
                    'text': dsl_field.Text(),
                    'date': dsl_field.Date(format=constance_config.DATE_FORMATS),
                }),
            'datetime': dsl_field.Text(
                fields={
                    'text': dsl_field.Text(),
                    'datetime': dsl_field.Date(format=constance_config.DATE_FORMATS),
                }),
        }
        # Wrap each field so documents store both the typed value and a
        # keyword representation.
        for key, val in _map.items():
            _map[key] = CustomObject(properties={
                'val': val,
                'repr': dsl_field.Keyword(),
            })
        return _map
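
The wrapping loop above means every schema column is stored as an object holding a typed value plus its raw representation. A rough sketch of the shape this yields for one 'integer' column, substituting a plain dsl Object field for the project-specific CustomObject (an assumption):

from elasticsearch_dsl import field as dsl_field

col1 = dsl_field.Object(properties={
    'val': dsl_field.Long(),       # the typed value
    'repr': dsl_field.Keyword(),   # raw string representation
})
# col1.to_dict() is roughly:
# {'type': 'object', 'properties': {'val': {'type': 'long'},
#                                   'repr': {'type': 'keyword'}}}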
Example #5

    def prepare_doc(self):
        _fields = {
            'shape': dsl_field.GeoShape(),
            'point': dsl_field.GeoPoint(),
            'shape_type': dsl_field.Integer(),
            'label': dsl_field.Text(),
            'resource': dsl_field.Nested(
                properties={
                    'id': dsl_field.Integer(),
                    'title': dsl_field.Text(analyzer=polish_analyzer,
                                            fields={'raw': dsl_field.Keyword()}),
                }),
            'updated_at': dsl_field.Date(),
            'row_no': dsl_field.Long(),
        }
        _map = {}

        for idx, _f in enumerate(self.schema, 1):
            if _f.type not in self._schema2doc_map:
                continue
            alias_name = _f.name
            field_name = f'col{idx}'
            _field = self._schema2doc_map[_f.type]
            _map[field_name] = alias_name
            _fields[field_name] = _field

        # Set once, outside the loop; a plain inner class carries the index name.
        _fields['Index'] = type('Index', (object,), {'name': self.idx_name})

        doc = type(self.idx_name, (Document, ), _fields)
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        return doc
Example #6
class MyDoc2(document.DocType):
    extra = field.Long()
Example #7
class WikiDocumentType(document.Document):
    excerpt_fields = ["summary", "content"]
    exclude_slugs = [
        "Talk:",
        "User:",
        "User_talk:",
        "Template_talk:",
        "Project_talk:",
        EXPERIMENT_TITLE_PREFIX,
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            "id": field.Long(),
            "title": field.Text(analyzer="kuma_title"),
            "slug": field.Keyword(),
            "locale": field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    tags = field.Keyword()
    title = field.Text(analyzer="kuma_title")

    class Meta(object):
        mapping = Mapping("wiki_document")
        mapping.meta("_all", enabled=False)

    @classmethod
    def get_connection(cls, alias="default"):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        """Create a unique list of lowercased keywords."""
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        is_root_document = obj.slug.count("/") == 1
        doc = {
            "id": obj.id,
            "boost": 4.0 if is_root_document else 1.0,
            "title": obj.title,
            "slug": obj.slug,
            "summary": obj.get_summary_text(),
            "locale": obj.locale,
            "modified": obj.modified,
            "content": strip_tags(obj.get_body_html() or ""),
            "tags": [o.name for o in obj.tags.all()],
            "kumascript_macros":
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            "css_classnames":
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            "html_attributes":
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }

        if obj.parent:
            doc["parent"] = {
                "id": obj.parent.id,
                "title": obj.parent.title,
                "locale": obj.parent.locale,
                "slug": obj.parent.slug,
            }
        else:
            doc["parent"] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            "filter": {
                "kuma_word_delimiter": {
                    "type": "word_delimiter",
                    "preserve_original": True,  # hi-fi -> hifi, hi-fi
                    "catenate_words": True,  # hi-fi -> hifi
                    "catenate_numbers": True,  # 90-210 -> 90210
                }
            },
            "analyzer": {
                "default": {
                    "tokenizer": "standard",
                    "filter": ["standard", "elision"]
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                "kuma_content": {
                    "type":
                    "custom",
                    "tokenizer":
                    "standard",
                    "char_filter": ["html_strip"],
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "stop",
                        "snowball",
                    ],
                },
                "kuma_title": {
                    "type":
                    "custom",
                    "tokenizer":
                    "standard",
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "snowball",
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            "mappings": cls.get_mapping(),
            "settings": {
                "analysis": cls.get_analysis(),
                "number_of_replicas": settings.ES_DEFAULT_NUM_REPLICAS,
                "number_of_shards": settings.ES_DEFAULT_NUM_SHARDS,
            },
        }

    @classmethod
    def bulk_index(cls, documents, id_field="id", es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            "_index": index,
            "_type": type,
            "_id": d["id"],
            "_source": d
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            "_op_type": "delete",
            "_index": index,
            "_type": type,
            "_id": _id
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index

        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            "using": connections.get_connection(),
            "index": cls.get_index(),
            "doc_type": {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document

        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)

        qs = model.objects.filter(is_redirect=False).exclude(excludes)

        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list("id", flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any(obj.slug.startswith(exclude)
                        for exclude in cls.exclude_slugs))

    def get_excerpt(self):
        highlighted = getattr(self.meta, "highlight", None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return "…".join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            "Indexing %(total)d documents into %(total_chunks)d chunks of "
            "size %(size)d into index %(index)s.") % {
                "total": total,
                "total_chunks": total_chunks,
                "size": chunk_size,
                "index": index.prefixed_name,
            }
        return message
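
The bulk_index and bulk_delete helpers above just assemble action dicts for elasticsearch.helpers.bulk. A hedged standalone sketch of the same pattern; the index name and documents are invented, and a reachable cluster is assumed:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es = Elasticsearch()  # assumes a local cluster on the default port
docs = [{'id': 1, 'title': 'Root'}, {'id': 2, 'title': 'Child'}]
actions = [{'_index': 'wiki_document', '_id': d['id'], '_source': d}
           for d in docs]
bulk(es, actions)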
Example #8
class WikiDocumentType(document.DocType):
    excerpt_fields = ['summary', 'content']
    exclude_slugs = [
        'Talk:', 'User:', 'User_talk:', 'Template_talk:', 'Project_talk:'
    ]

    boost = field.Float(null_value=1.0)
    content = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    css_classnames = field.String(analyzer='case_insensitive_keyword')
    html_attributes = field.String(analyzer='case_insensitive_keyword')
    id = field.Long()
    kumascript_macros = field.String(analyzer='case_insensitive_keyword')
    locale = field.String(index='not_analyzed')
    modified = field.Date()
    parent = field.Nested(
        properties={
            'id': field.Long(),
            'title': field.String(analyzer='kuma_title'),
            'slug': field.String(index='not_analyzed'),
            'locale': field.String(index='not_analyzed'),
        })
    slug = field.String(index='not_analyzed')
    summary = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    tags = field.String(analyzer='case_sensitive')
    title = field.String(analyzer='kuma_title', boost=1.2)

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def from_django(cls, obj):
        doc = {
            'id': obj.id,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary(strip_markup=True),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html),
            'tags': list(obj.tags.values_list('name', flat=True)),
            'kumascript_macros': obj.extract_kumascript_macro_names(),
            'css_classnames': obj.extract_css_classnames(),
            'html_attributes': obj.extract_html_attributes(),
        }

        # Check if the document has a document zone attached
        try:
            is_zone = bool(obj.zone)
        except ObjectDoesNotExist:
            is_zone = False

        if is_zone:
            # boost all documents that are a zone
            doc['boost'] = 8.0
        elif obj.slug.count('/') == 1:
            # a little boost if no zone but still first level
            doc['boost'] = 4.0
        else:
            doc['boost'] = 1.0
        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
                'case_sensitive': {
                    'type': 'custom',
                    'tokenizer': 'keyword'
                },
                'case_insensitive_keyword': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': 'lowercase'
                }
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_index': index,
            '_type': type,
            '_id': d[id_field],
            '_source': d
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        excludes = []
        for exclude in cls.exclude_slugs:
            excludes.append(Q(slug__icontains=exclude))

        qs = model.objects.filter(
            is_template=False, is_redirect=False, deleted=False,
        ).exclude(reduce(operator.or_, excludes))

        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_template and not obj.is_redirect and
                not obj.deleted and
                not any(exclude in obj.slug
                        for exclude in cls.exclude_slugs))

    def get_excerpt(self):
        if getattr(self, 'highlight', False):
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in self.highlight:
                    return u'…'.join(self.highlight[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing {total} documents into {n} chunks of size {size} into '
            'index {index}.').format(total=total,
                                     n=total_chunks,
                                     size=chunk_size,
                                     index=index.prefixed_name)
        return message
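
The case_insensitive_keyword analyzer defined inline above can also be declared with elasticsearch-dsl's analyzer helper; a small sketch, assuming only that the helper is available in the installed version:

from elasticsearch_dsl import analyzer

case_insensitive_keyword = analyzer(
    'case_insensitive_keyword',
    tokenizer='keyword',
    filter=['lowercase'],
)
# get_analysis_definition() returns roughly:
# {'analyzer': {'case_insensitive_keyword': {
#     'type': 'custom', 'tokenizer': 'keyword', 'filter': ['lowercase']}}}
print(case_insensitive_keyword.get_analysis_definition())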
Example #9
class WikiDocumentType(document.Document):
    excerpt_fields = ['summary', 'content']
    exclude_slugs = [
        'Talk:', 'User:', 'User_talk:', 'Template_talk:', 'Project_talk:',
        EXPERIMENT_TITLE_PREFIX
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            'id': field.Long(),
            'title': field.Text(analyzer='kuma_title'),
            'slug': field.Keyword(),
            'locale': field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    tags = field.Keyword()
    title = field.Text(analyzer='kuma_title')

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        '''Create a unique list of lowercased keywords.'''
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        is_root_document = obj.slug.count('/') == 1
        doc = {
            'id': obj.id,
            'boost': 4.0 if is_root_document else 1.0,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary_text(),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html or ''),
            'tags': [o.name for o in obj.tags.all()],
            'kumascript_macros':
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            'css_classnames':
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            'html_attributes':
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }

        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,  # hi-fi -> hifi
                    'catenate_numbers': True,  # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision']
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_index': index,
            '_type': type,
            '_id': d[id_field],
            '_source': d
        } for d in documents]

        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Index of a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()

        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id
        } for _id in ids]

        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {
                cls._doc_type.name: cls.from_es
            },
        }
        options.update(kwargs)
        sq = Search(**options)

        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!

        """
        model = cls.get_model()

        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)

        qs = model.objects.filter(is_redirect=False).exclude(excludes)

        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]

        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance should return boolean value
        whether the instance should be indexed or not.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any(obj.slug.startswith(exclude)
                        for exclude in cls.exclude_slugs))

    def get_excerpt(self):
        highlighted = getattr(self.meta, 'highlight', None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return '…'.join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.

        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing %(total)d documents into %(total_chunks)d chunks of '
            'size %(size)d into index %(index)s.') % {
                'total': total,
                'total_chunks': total_chunks,
                'size': chunk_size,
                'index': index.prefixed_name,
            }
        return message
Example #10

class ShpData(IndexedData):
    _type = 'geo'

    _schema2doc_map = {
        'C': dsl_field.Text(
            analyzer=polish_analyzer,
            fields={
                'raw': dsl_field.Text(),
                'keyword': dsl_field.Keyword(),
            },
        ),
        'D': dsl_field.Date(),
        'N': dsl_field.ScaledFloat(scaling_factor=100),
        'L': dsl_field.Boolean(),
        '@': dsl_field.Date(),
        'I': dsl_field.Long(),
        '+': dsl_field.Long(),
        'F': dsl_field.Float(),
        'O': dsl_field.Double(),
    }

    _schema_to_api_field = {
        'C': api_fields.String,
        'D': api_fields.DateTime,
        'N': api_fields.Number,
        'L': api_fields.Boolean,
        '@': api_fields.DateTime,
        'I': api_fields.Number,
        '+': api_fields.Number,
        'F': api_fields.Number,
        'O': api_fields.Number,
    }

    _schema_long_names = {
        'C': 'string',
        'D': 'datetime',
        'N': 'number',
        'L': 'boolean',
        '@': 'datetime',
        'I': 'integer',
        '+': 'integer',
        'F': 'number',
        'O': 'number',
    }

    _source = None
    _schema = None
    _transformer = None

    def __init__(self, resource, from_table_index=False):
        super().__init__(resource)
        self.from_table_index = from_table_index

    @property
    def has_geo_data(self):
        return True

    @property
    def is_chartable(self):
        fields = self.schema
        return len(fields) > 1 and any(
            (field.type in ('N', 'I', '+', 'F', 'O') for field in fields))

    @property
    def source(self):
        if not self._source:
            with ArchiveReader(self.resource.main_file.path) as extracted:
                shp_path = next(
                    iter(f for f in extracted if f.endswith('.shp')))
                self._source = shapefile.Reader(shp_path)
                self._transformer = ShapeTransformer(extracted)
        return self._source

    def get_schema(self, **kwargs):
        use_aliases = kwargs.get('use_aliases', False)
        headers = self.reversed_headers_map
        return {
            'fields': [{
                'name': headers[item.name] if use_aliases else item.name,
                'type': self._schema_long_names[item.type],
                'format': 'default'
            } for item in self.schema]
        }

    @property
    def schema(self):
        if not self._schema:
            self._schema = [
                DBSchemaField(*_f) for _f in self.source.fields[1:]
            ]
        return self._schema

    def prepare_doc(self):
        _fields = {
            'shape': dsl_field.GeoShape(),
            'point': dsl_field.GeoPoint(),
            'shape_type': dsl_field.Integer(),
            'label': dsl_field.Text(),
            'resource': dsl_field.Nested(
                properties={
                    'id': dsl_field.Integer(),
                    'title': dsl_field.Text(analyzer=polish_analyzer,
                                            fields={'raw': dsl_field.Keyword()}),
                }),
            'updated_at': dsl_field.Date(),
            'row_no': dsl_field.Long(),
        }
        _map = {}

        for idx, _f in enumerate(self.schema, 1):
            if _f.type not in self._schema2doc_map:
                continue
            alias_name = _f.name
            field_name = f'col{idx}'
            _field = self._schema2doc_map[_f.type]
            _map[field_name] = alias_name
            _fields[field_name] = _field

        # Set once, outside the loop; a plain inner class carries the index name.
        _fields['Index'] = type('Index', (object,), {'name': self.idx_name})

        doc = type(self.idx_name, (Document, ), _fields)
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        return doc

    def get_api_fields(self):
        record_fields = {}
        for f in self.schema:
            field_name = self.reversed_headers_map[f.name]
            field_cls = self._schema_to_api_field[f.type]
            record_fields[field_name] = field_cls(is_tabular_data_field=True)
        return record_fields

    @staticmethod
    def _get_row_id(row):
        return str(
            uuid.uuid5(uuid.NAMESPACE_DNS,
                       '+|+'.join(str(i)[:10000] for i in row)))

    def _docs_iter(self, doc):
        for row_no, sr in enumerate(self.source.shapeRecords(), 1):
            geojson = self._transformer.transform(sr.shape)
            v = {
                'shape': geojson,
                'updated_at': datetime.now(),
                'row_no': row_no,
                'resource': {
                    'id': self.resource.id,
                    'title': self.resource.title
                },
            }
            for i, val in enumerate(sr.record, 1):
                v[f'col{i}'] = val if val != b'' else None

            v['shape_type'] = sr.shape.shapeType
            v['point'] = median_point(geojson)
            tds = self.resource.tabular_data_schema
            if tds is not None and 'geo' in tds and 'label' in tds['geo']:
                v['label'] = sr.record[tds['geo']['label'].get('col_name')]
            d = doc(**v)
            d.meta.id = self._get_row_id(sr.record)
            yield d
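
_docs_iter drives everything from pyshp's shapeRecords(). A minimal hedged sketch of that iteration on its own; the .shp path is invented and the project-specific shape transformer is omitted:

import shapefile  # pyshp

reader = shapefile.Reader('example.shp')
for row_no, sr in enumerate(reader.shapeRecords(), 1):
    # sr.shape carries the geometry, sr.record the DBF attribute row
    print(row_no, sr.shape.shapeType, list(sr.record))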
Example #11
    def __init__(self, *args, **kwargs):
        super(ContributionField, self).__init__(*args, **kwargs)
        self.properties['id'] = field.Long()
        self.properties['contributor'] = ContributorField()
Example #12
    def __init__(self, *args, **kwargs):
        super(ContributorField, self).__init__(*args, **kwargs)
        self.properties['id'] = field.Long()
        self.properties['username'] = field.String(index='not_analyzed')
        self.properties['payroll_name'] = field.String(index='not_analyzed')
        self.properties['is_freelance'] = field.Boolean()