def prepare_doc(self):
    _fields, _map = {}, {}
    # Map each schema field to a generic column name (col1, col2, ...) and
    # remember the original header as an alias.
    for idx, _f in enumerate(self.schema['fields'], 1):
        alias_name = _f['name']
        field_name = 'col{}'.format(idx)
        _field = self._schema2doc_map[_f['type']]
        _map[field_name] = alias_name
        _fields[field_name] = _field

    if self.has_geo_data:
        _fields['shape'] = dsl_field.GeoShape()
        _fields['point'] = dsl_field.GeoPoint()
        _fields['label'] = dsl_field.Text()
        _fields['shape_type'] = dsl_field.Integer()

    _fields['resource'] = dsl_field.Nested(
        properties={
            'id': dsl_field.Integer(),
            'title': dsl_field.Text(analyzer=polish_analyzer,
                                    fields={'raw': dsl_field.Keyword()})
        })
    _fields['updated_at'] = dsl_field.Date()
    _fields['row_no'] = dsl_field.Long()
    _fields['Index'] = type('Index', (type,), {'name': self.idx_name})

    # Build the Document subclass dynamically and stash the column->alias
    # map in the mapping's _meta so the original headers can be recovered.
    doc = type(self.idx_name, (Document,), _fields)
    doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
    return doc
def doc(self):
    if not self._doc_cache:
        _fields, _map = {}, {}
        for idx, _f in enumerate(self.schema['fields']):
            alias_name = _f['name']
            field_name = 'col{}'.format(idx + 1)
            _field = _schema2doc_map[_f['type']]
            _map[field_name] = alias_name
            _fields[field_name] = _field

        _fields['resource'] = dsl_field.Nested(
            properties={
                'id': dsl_field.Integer(),
                'title': dsl_field.Text(
                    analyzer=polish_analyzer,
                    fields={'raw': dsl_field.Keyword()})
            }
        )
        _fields['updated_at'] = dsl_field.Date()
        _fields['row_no'] = dsl_field.Long()

        doc = type(self.idx_name, (DocType,), _fields)
        doc._doc_type.index = self.idx_name
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        self._doc_cache = doc
    return self._doc_cache
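The two factories above build an elasticsearch-dsl document class at runtime from a tabular schema. Roughly, once built, the generated class is used like a hand-written Document. A minimal sketch of that flow, assuming a reachable cluster and an already-constructed data wrapper (indexed_data, the host and the field value are illustrative, not part of the original code):

from elasticsearch_dsl import connections

# Assumption: an Elasticsearch node is reachable at this address.
connections.create_connection(hosts=['http://localhost:9200'])

doc_cls = indexed_data.prepare_doc()   # hypothetical IndexedData instance

# Create the index and mapping (assuming the generated Index metadata
# is picked up by elasticsearch-dsl), then save one row.
doc_cls.init()
row = doc_cls(col1={'val': 42, 'repr': '42'}, row_no=1)
row.save()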
class Mapping:
    content = ContentField()
    contributor = ContributorField()
    pay = field.Long()

    class Meta:
        dynamic = False
        excludes = ('content', 'contributor')
def _schema2doc_map(self):
    _map = {
        'integer': dsl_field.Long(),
        'number': dsl_field.ScaledFloat(scaling_factor=100),
        'string': dsl_field.Text(
            analyzer=polish_analyzer,
            fields={
                'raw': dsl_field.Text(),
                'keyword': dsl_field.Keyword(),
            }),
        'any': dsl_field.Text(
            analyzer=polish_analyzer,
            fields={
                'raw': dsl_field.Text(),
                'keyword': dsl_field.Keyword(),
            }),
        'boolean': dsl_field.Boolean(),
        'time': dsl_field.Text(
            fields={
                'text': dsl_field.Text(),
                'time': dsl_field.Date(format=constance_config.TIME_FORMATS),
            }),
        'duration': dsl_field.DateRange(),
        'default': dsl_field.Text(),
        'date': dsl_field.Text(
            fields={
                'text': dsl_field.Text(),
                'date': dsl_field.Date(format=constance_config.DATE_FORMATS),
            }),
        'datetime': dsl_field.Text(
            fields={
                'text': dsl_field.Text(),
                'datetime': dsl_field.Date(format=constance_config.DATE_FORMATS),
            }),
    }

    for key, val in _map.items():
        _map[key] = CustomObject(properties={
            'val': val,
            'repr': dsl_field.Keyword(),
        })
    return _map
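For context, each Table-Schema type above ends up as an object field with a typed 'val' sub-field and a keyword 'repr' sub-field. A standalone sketch of the same idea with stock elasticsearch-dsl types (polish_analyzer, constance_config and CustomObject belong to the surrounding project and are left out here):

from elasticsearch_dsl import field as dsl_field

# Simplified type map; every value is wrapped in an Object field with a
# typed 'val' sub-field and a 'repr' keyword sub-field.
simple_map = {
    'integer': dsl_field.Long(),
    'string': dsl_field.Text(fields={'keyword': dsl_field.Keyword()}),
    'boolean': dsl_field.Boolean(),
}
wrapped = {
    key: dsl_field.Object(properties={'val': val, 'repr': dsl_field.Keyword()})
    for key, val in simple_map.items()
}

# wrapped['integer'].to_dict() yields roughly:
# {'type': 'object', 'properties': {'val': {'type': 'long'},
#                                   'repr': {'type': 'keyword'}}}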
def prepare_doc(self):
    _fields = {
        'shape': dsl_field.GeoShape(),
        'point': dsl_field.GeoPoint(),
        'shape_type': dsl_field.Integer(),
        'label': dsl_field.Text(),
        'resource': dsl_field.Nested(
            properties={
                'id': dsl_field.Integer(),
                'title': dsl_field.Text(analyzer=polish_analyzer,
                                        fields={'raw': dsl_field.Keyword()})
            }),
        'updated_at': dsl_field.Date(),
        'row_no': dsl_field.Long()
    }
    _map = {}
    for idx, _f in enumerate(self.schema, 1):
        if _f.type not in self._schema2doc_map:
            continue
        alias_name = _f.name
        field_name = f'col{idx}'
        _field = self._schema2doc_map[_f.type]
        _map[field_name] = alias_name
        _fields[field_name] = _field

    _fields['Index'] = type('Index', (type,), {'name': self.idx_name})
    doc = type(self.idx_name, (Document,), _fields)
    doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
    return doc
class MyDoc2(document.DocType):
    extra = field.Long()
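MyDoc2 simply adds a numeric field to a DocType. For comparison, a minimal self-contained document with a Long field in the current elasticsearch-dsl API looks like the sketch below (class name and index name are illustrative):

from elasticsearch_dsl import Document, field

class CounterDoc(Document):
    extra = field.Long()

    class Index:
        name = 'counter-docs'   # illustrative index name

# With a configured connection this would create the mapping and save a doc:
# CounterDoc.init()
# CounterDoc(meta={'id': 1}, extra=7).save()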
class WikiDocumentType(document.Document):
    excerpt_fields = ["summary", "content"]
    exclude_slugs = [
        "Talk:",
        "User:",
        "User_talk:",
        "Template_talk:",
        "Project_talk:",
        EXPERIMENT_TITLE_PREFIX,
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            "id": field.Long(),
            "title": field.Text(analyzer="kuma_title"),
            "slug": field.Keyword(),
            "locale": field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer="kuma_content",
                         term_vector="with_positions_offsets")
    tags = field.Keyword()
    title = field.Text(analyzer="kuma_title")

    class Meta(object):
        mapping = Mapping("wiki_document")
        mapping.meta("_all", enabled=False)

    @classmethod
    def get_connection(cls, alias="default"):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        """Create a unique list of lowercased keywords."""
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        is_root_document = obj.slug.count("/") == 1
        doc = {
            "id": obj.id,
            "boost": 4.0 if is_root_document else 1.0,
            "title": obj.title,
            "slug": obj.slug,
            "summary": obj.get_summary_text(),
            "locale": obj.locale,
            "modified": obj.modified,
            "content": strip_tags(obj.get_body_html() or ""),
            "tags": [o.name for o in obj.tags.all()],
            "kumascript_macros":
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            "css_classnames":
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            "html_attributes":
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }
        if obj.parent:
            doc["parent"] = {
                "id": obj.parent.id,
                "title": obj.parent.title,
                "locale": obj.parent.locale,
                "slug": obj.parent.slug,
            }
        else:
            doc["parent"] = {}
        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            "filter": {
                "kuma_word_delimiter": {
                    "type": "word_delimiter",
                    "preserve_original": True,  # hi-fi -> hifi, hi-fi
                    "catenate_words": True,     # hi-fi -> hifi
                    "catenate_numbers": True,   # 90-210 -> 90210
                }
            },
            "analyzer": {
                "default": {
                    "tokenizer": "standard",
                    "filter": ["standard", "elision"],
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                "kuma_content": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"],
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "stop",
                        "snowball",
                    ],
                },
                "kuma_title": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "elision",
                        "kuma_word_delimiter",
                        "lowercase",
                        "standard",
                        "snowball",
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            "mappings": cls.get_mapping(),
            "settings": {
                "analysis": cls.get_analysis(),
                "number_of_replicas": settings.ES_DEFAULT_NUM_REPLICAS,
                "number_of_shards": settings.ES_DEFAULT_NUM_SHARDS,
            },
        }

    @classmethod
    def bulk_index(cls, documents, id_field="id", es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            "_index": index,
            "_type": type,
            "_id": d["id"],
            "_source": d,
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            "_op_type": "delete",
            "_index": index,
            "_type": type,
            "_id": _id,
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            "using": connections.get_connection(),
            "index": cls.get_index(),
            "doc_type": {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)
        qs = model.objects.filter(is_redirect=False).exclude(excludes)
        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list("id", flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance, return a boolean indicating whether the
        instance should be indexed.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        highlighted = getattr(self.meta, "highlight", None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return "…".join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            "Indexing %(total)d documents into %(total_chunks)d chunks of "
            "size %(size)d into index %(index)s." % {
                "total": total,
                "total_chunks": total_chunks,
                "size": chunk_size,
                "index": index.prefixed_name,
            })
        return message
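Putting the pieces of this class together, index creation and bulk indexing might be driven as in the sketch below. This is only an illustration: the index name is made up, docs stands in for real kuma wiki Document instances, and the body= form of indices.create matches older elasticsearch-py clients.

# Hypothetical wiring; error handling omitted.
es_client = WikiDocumentType.get_connection()

# Create the index with the generated mapping and analysis settings.
es_client.indices.create(index="wiki_document_v1",
                         body=WikiDocumentType.get_settings())

# Serialize Django model instances and bulk-index them.
docs = []  # placeholder: kuma wiki Document instances would go here
payload = [WikiDocumentType.from_django(obj) for obj in docs]
WikiDocumentType.bulk_index(payload, es=es_client, index="wiki_document_v1")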
class WikiDocumentType(document.DocType):
    excerpt_fields = ['summary', 'content']
    exclude_slugs = [
        'Talk:',
        'User:',
        'User_talk:',
        'Template_talk:',
        'Project_talk:',
    ]

    boost = field.Float(null_value=1.0)
    content = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    css_classnames = field.String(analyzer='case_insensitive_keyword')
    html_attributes = field.String(analyzer='case_insensitive_keyword')
    id = field.Long()
    kumascript_macros = field.String(analyzer='case_insensitive_keyword')
    locale = field.String(index='not_analyzed')
    modified = field.Date()
    parent = field.Nested(
        properties={
            'id': field.Long(),
            'title': field.String(analyzer='kuma_title'),
            'slug': field.String(index='not_analyzed'),
            'locale': field.String(index='not_analyzed'),
        })
    slug = field.String(index='not_analyzed')
    summary = field.String(analyzer='kuma_content',
                           term_vector='with_positions_offsets')
    tags = field.String(analyzer='case_sensitive')
    title = field.String(analyzer='kuma_title', boost=1.2)

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def from_django(cls, obj):
        doc = {
            'id': obj.id,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary(strip_markup=True),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html),
            'tags': list(obj.tags.values_list('name', flat=True)),
            'kumascript_macros': obj.extract_kumascript_macro_names(),
            'css_classnames': obj.extract_css_classnames(),
            'html_attributes': obj.extract_html_attributes(),
        }

        # Check if the document has a document zone attached
        try:
            is_zone = bool(obj.zone)
        except ObjectDoesNotExist:
            is_zone = False

        if is_zone:
            # boost all documents that are a zone
            doc['boost'] = 8.0
        elif obj.slug.count('/') == 1:
            # a little boost if no zone but still first level
            doc['boost'] = 4.0
        else:
            doc['boost'] = 1.0

        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}

        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,     # hi-fi -> hifi
                    'catenate_numbers': True,   # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision'],
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
                'case_sensitive': {
                    'type': 'custom',
                    'tokenizer': 'keyword'
                },
                'case_insensitive_keyword': {
                    'type': 'custom',
                    'tokenizer': 'keyword',
                    'filter': 'lowercase'
                }
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_index': index,
            '_type': type,
            '_id': d['id'],
            '_source': d,
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id,
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = []
        for exclude in cls.exclude_slugs:
            excludes.append(Q(slug__icontains=exclude))
        qs = (model.objects.filter(is_template=False,
                                   is_redirect=False,
                                   deleted=False)
                           .exclude(reduce(operator.or_, excludes)))
        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance, return a boolean indicating whether the
        instance should be indexed.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_template and not obj.is_redirect and
                not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        if getattr(self, 'highlight', False):
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in self.highlight:
                    return u'…'.join(self.highlight[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing {total} documents into {n} chunks of size {size} into '
            'index {index}.'.format(total=total,
                                    n=total_chunks,
                                    size=chunk_size,
                                    index=index.prefixed_name))
        return message
class WikiDocumentType(document.Document):
    excerpt_fields = ['summary', 'content']
    exclude_slugs = [
        'Talk:',
        'User:',
        'User_talk:',
        'Template_talk:',
        'Project_talk:',
        EXPERIMENT_TITLE_PREFIX,
    ]

    boost = field.Float(null_value=1.0)
    content = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    css_classnames = field.Keyword()
    html_attributes = field.Keyword()
    id = field.Long()
    kumascript_macros = field.Keyword()
    locale = field.Keyword()
    modified = field.Date()
    parent = field.Object(
        properties={
            'id': field.Long(),
            'title': field.Text(analyzer='kuma_title'),
            'slug': field.Keyword(),
            'locale': field.Keyword(),
        })
    slug = field.Keyword()
    summary = field.Text(analyzer='kuma_content',
                         term_vector='with_positions_offsets')
    tags = field.Keyword()
    title = field.Text(analyzer='kuma_title')

    class Meta(object):
        mapping = Mapping('wiki_document')
        mapping.meta('_all', enabled=False)

    @classmethod
    def get_connection(cls, alias='default'):
        return connections.get_connection(alias)

    @classmethod
    def get_doc_type(cls):
        return cls._doc_type.name

    @classmethod
    def case_insensitive_keywords(cls, keywords):
        """Create a unique list of lowercased keywords."""
        return sorted({keyword.lower() for keyword in keywords})

    @classmethod
    def from_django(cls, obj):
        is_root_document = obj.slug.count('/') == 1
        doc = {
            'id': obj.id,
            'boost': 4.0 if is_root_document else 1.0,
            'title': obj.title,
            'slug': obj.slug,
            'summary': obj.get_summary_text(),
            'locale': obj.locale,
            'modified': obj.modified,
            'content': strip_tags(obj.rendered_html or ''),
            'tags': [o.name for o in obj.tags.all()],
            'kumascript_macros':
                cls.case_insensitive_keywords(obj.extract.macro_names()),
            'css_classnames':
                cls.case_insensitive_keywords(obj.extract.css_classnames()),
            'html_attributes':
                cls.case_insensitive_keywords(obj.extract.html_attributes()),
        }
        if obj.parent:
            doc['parent'] = {
                'id': obj.parent.id,
                'title': obj.parent.title,
                'locale': obj.parent.locale,
                'slug': obj.parent.slug,
            }
        else:
            doc['parent'] = {}
        return doc

    @classmethod
    def get_mapping(cls):
        return cls._doc_type.mapping.to_dict()

    @classmethod
    def get_analysis(cls):
        return {
            'filter': {
                'kuma_word_delimiter': {
                    'type': 'word_delimiter',
                    'preserve_original': True,  # hi-fi -> hifi, hi-fi
                    'catenate_words': True,     # hi-fi -> hifi
                    'catenate_numbers': True,   # 90-210 -> 90210
                }
            },
            'analyzer': {
                'default': {
                    'tokenizer': 'standard',
                    'filter': ['standard', 'elision'],
                },
                # a custom analyzer that strips html and uses our own
                # word delimiter filter and the elision filter
                # (e.g. L'attribut -> attribut). The rest is the same as
                # the snowball analyzer
                'kuma_content': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'char_filter': ['html_strip'],
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'stop',
                        'snowball',
                    ],
                },
                'kuma_title': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'elision',
                        'kuma_word_delimiter',
                        'lowercase',
                        'standard',
                        'snowball',
                    ],
                },
            },
        }

    @classmethod
    def get_settings(cls):
        return {
            'mappings': cls.get_mapping(),
            'settings': {
                'analysis': cls.get_analysis(),
                'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
                'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
            }
        }

    @classmethod
    def bulk_index(cls, documents, id_field='id', es=None, index=None):
        """Index a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_index': index,
            '_type': type,
            '_id': d['id'],
            '_source': d,
        } for d in documents]
        bulk(es, actions)

    @classmethod
    def bulk_delete(cls, ids, es=None, index=None):
        """Delete a bunch of documents."""
        es = es or cls.get_connection()
        index = index or cls.get_index()
        type = cls.get_doc_type()
        actions = [{
            '_op_type': 'delete',
            '_index': index,
            '_type': type,
            '_id': _id,
        } for _id in ids]
        bulk(es, actions)

    @classmethod
    def get_index(cls):
        from kuma.search.models import Index
        return Index.objects.get_current().prefixed_name

    @classmethod
    def search(cls, **kwargs):
        options = {
            'using': connections.get_connection(),
            'index': cls.get_index(),
            'doc_type': {cls._doc_type.name: cls.from_es},
        }
        options.update(kwargs)
        sq = Search(**options)
        return sq

    @classmethod
    def get_model(cls):
        from kuma.wiki.models import Document
        return Document

    @classmethod
    def get_indexable(cls, percent=100):
        """
        For this mapping type return a list of model IDs that should be
        indexed with the management command, in a full reindex.

        WARNING: When changing this code make sure to update the
                 ``should_update`` method below, too!
        """
        model = cls.get_model()
        excludes = Q()
        for exclude in cls.exclude_slugs:
            excludes |= Q(slug__startswith=exclude)
        qs = model.objects.filter(is_redirect=False).exclude(excludes)
        percent = percent / 100
        if percent < 1:
            qs = qs[:int(qs.count() * percent)]
        return qs.values_list('id', flat=True)

    @classmethod
    def should_update(cls, obj):
        """
        Given a Document instance, return a boolean indicating whether the
        instance should be indexed.

        WARNING: This *must* mirror the logic of the ``get_indexable``
                 method above!
        """
        return (not obj.is_redirect and not obj.deleted and
                not any([exclude in obj.slug
                         for exclude in cls.exclude_slugs]))

    def get_excerpt(self):
        highlighted = getattr(self.meta, 'highlight', None)
        if highlighted:
            for excerpt_field in self.excerpt_fields:
                if excerpt_field in highlighted:
                    return '…'.join(highlighted[excerpt_field])
        return self.summary

    @classmethod
    def reindex_all(cls, chunk_size=500, index=None, percent=100):
        """Rebuild ElasticSearch indexes.

        :arg chunk_size: how many documents to bulk index as a single chunk.
        :arg index: the `Index` object to reindex into. Uses the current
            promoted index if none provided.
        :arg percent: 1 to 100--the percentage of the db to index.
        """
        from kuma.search.models import Index
        from kuma.search.tasks import prepare_index, finalize_index
        from kuma.wiki.tasks import index_documents

        index = index or Index.objects.get_current()

        # Get the list of document IDs to index.
        indexable = WikiDocumentType.get_indexable(percent)

        total = len(indexable)
        total_chunks = int(ceil(total / chunk_size))

        pre_task = prepare_index.si(index.pk)
        post_task = finalize_index.si(index.pk)

        if not total:
            # If there's no data we still create the index and finalize it.
            chain(pre_task, post_task).apply_async()
        else:
            index_tasks = [
                index_documents.si(chunk, index.pk)
                for chunk in chunked(indexable, chunk_size)
            ]
            chord_flow(pre_task, index_tasks, post_task).apply_async()

        message = _(
            'Indexing %(total)d documents into %(total_chunks)d chunks of '
            'size %(size)d into index %(index)s.' % {
                'total': total,
                'total_chunks': total_chunks,
                'size': chunk_size,
                'index': index.prefixed_name,
            })
        return message
class ShpData(IndexedData):
    _type = 'geo'

    _schema2doc_map = {
        'C': dsl_field.Text(
            analyzer=polish_analyzer,
            fields={
                'raw': dsl_field.Text(),
                'keyword': dsl_field.Keyword(),
            },
        ),
        'D': dsl_field.Date(),
        'N': dsl_field.ScaledFloat(scaling_factor=100),
        'L': dsl_field.Boolean(),
        '@': dsl_field.Date(),
        'I': dsl_field.Long(),
        '+': dsl_field.Long(),
        'F': dsl_field.Float(),
        'O': dsl_field.Double(),
    }

    _schema_to_api_field = {
        'C': api_fields.String,
        'D': api_fields.DateTime,
        'N': api_fields.Number,
        'L': api_fields.Boolean,
        '@': api_fields.DateTime,
        'I': api_fields.Number,
        '+': api_fields.Number,
        'F': api_fields.Number,
        'O': api_fields.Number,
    }

    _schema_long_names = {
        'C': 'string',
        'D': 'datetime',
        'N': 'number',
        'L': 'boolean',
        '@': 'datetime',
        'I': 'integer',
        '+': 'integer',
        'F': 'number',
        'O': 'number',
    }

    _source = None
    _schema = None
    _transformer = None

    def __init__(self, resource, from_table_index=False):
        super().__init__(resource)
        self.from_table_index = from_table_index

    @property
    def has_geo_data(self):
        return True

    @property
    def is_chartable(self):
        fields = self.schema
        return len(fields) > 1 and any(
            field.type in ('N', 'I', '+', 'F', 'O') for field in fields)

    @property
    def source(self):
        if not self._source:
            with ArchiveReader(self.resource.main_file.path) as extracted:
                shp_path = next(
                    iter(f for f in extracted if f.endswith('.shp')))
                self._source = shapefile.Reader(shp_path)
                self._transformer = ShapeTransformer(extracted)
        return self._source

    def get_schema(self, **kwargs):
        use_aliases = kwargs.get('use_aliases', False)
        headers = self.reversed_headers_map
        return {
            'fields': [{
                'name': headers[item.name] if use_aliases else item.name,
                'type': self._schema_long_names[item.type],
                'format': 'default'
            } for item in self.schema]
        }

    @property
    def schema(self):
        if not self._schema:
            self._schema = [
                DBSchemaField(*_f) for _f in self.source.fields[1:]
            ]
        return self._schema

    def prepare_doc(self):
        _fields = {
            'shape': dsl_field.GeoShape(),
            'point': dsl_field.GeoPoint(),
            'shape_type': dsl_field.Integer(),
            'label': dsl_field.Text(),
            'resource': dsl_field.Nested(
                properties={
                    'id': dsl_field.Integer(),
                    'title': dsl_field.Text(analyzer=polish_analyzer,
                                            fields={'raw': dsl_field.Keyword()})
                }),
            'updated_at': dsl_field.Date(),
            'row_no': dsl_field.Long()
        }
        _map = {}
        for idx, _f in enumerate(self.schema, 1):
            if _f.type not in self._schema2doc_map:
                continue
            alias_name = _f.name
            field_name = f'col{idx}'
            _field = self._schema2doc_map[_f.type]
            _map[field_name] = alias_name
            _fields[field_name] = _field

        _fields['Index'] = type('Index', (type,), {'name': self.idx_name})
        doc = type(self.idx_name, (Document,), _fields)
        doc._doc_type.mapping._meta['_meta'] = {'headers': _map}
        return doc

    def get_api_fields(self):
        record_fields = {}
        for f in self.schema:
            field_name = self.reversed_headers_map[f.name]
            field_cls = self._schema_to_api_field[f.type]
            record_fields[field_name] = field_cls(is_tabular_data_field=True)
        return record_fields

    @staticmethod
    def _get_row_id(row):
        return str(
            uuid.uuid5(uuid.NAMESPACE_DNS,
                       '+|+'.join(str(i)[:10000] for i in row)))

    def _docs_iter(self, doc):
        for row_no, sr in enumerate(self.source.shapeRecords(), 1):
            geojson = self._transformer.transform(sr.shape)
            v = {
                'shape': geojson,
                'updated_at': datetime.now(),
                'row_no': row_no,
                'resource': {
                    'id': self.resource.id,
                    'title': self.resource.title
                },
            }
            for i, val in enumerate(sr.record, 1):
                v[f'col{i}'] = val if val != b'' else None
            v['shape_type'] = sr.shape.shapeType
            v['point'] = median_point(geojson)

            tds = self.resource.tabular_data_schema
            if tds is not None and 'geo' in tds and 'label' in tds['geo']:
                v['label'] = sr.record[tds['geo']['label'].get('col_name')]

            d = doc(**v)
            d.meta.id = self._get_row_id(sr.record)
            yield d
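The _docs_iter generator above yields ready-to-save document instances. One way to feed them to the bulk helper, as a sketch assuming a configured default connection and an already-constructed ShpData instance (shp_data is illustrative), is:

from elasticsearch.helpers import bulk
from elasticsearch_dsl import connections

# shp_data is a hypothetical ShpData instance built from a resource.
doc_cls = shp_data.prepare_doc()
actions = (d.to_dict(include_meta=True) for d in shp_data._docs_iter(doc_cls))
bulk(connections.get_connection(), actions)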
def __init__(self, *args, **kwargs):
    super(ContributionField, self).__init__(*args, **kwargs)
    self.properties['id'] = field.Long()
    self.properties['contributor'] = ContributorField()
def __init__(self, *args, **kwargs):
    super(ContributorField, self).__init__(*args, **kwargs)
    self.properties['id'] = field.Long()
    self.properties['username'] = field.String(index='not_analyzed')
    self.properties['payroll_name'] = field.String(index='not_analyzed')
    self.properties['is_freelance'] = field.Boolean()