def configure_collections():
    """Assemble the collections index mapping and hand it to ``configure_index``.

    The index name comes from ``collections_index()`` and the settings from
    ``index_settings()`` with no arguments. Nothing is returned.
    """
    # Template applied to every sub-field matching "schemata.*".
    schemata_template = {
        "fields": {
            "match": "schemata.*",
            "mapping": {"type": "long"},
        }
    }
    label_field = {
        "type": "text",
        "analyzer": "icu_latin",
        "fields": {"kw": KEYWORD},
    }
    creator_field = {
        "type": "object",
        "properties": {
            "id": KEYWORD,
            "type": KEYWORD,
            "name": {"type": "text", "fields": {"kw": KEYWORD}},
        },
    }
    team_field = {
        "type": "object",
        "properties": {
            "id": KEYWORD,
            "type": KEYWORD,
            "name": KEYWORD,
        },
    }
    fields = {
        "label": label_field,
        "collection_id": KEYWORD,
        "foreign_id": KEYWORD,
        "languages": KEYWORD,
        "countries": KEYWORD,
        "category": KEYWORD,
        "summary": RAW_TEXT,
        "publisher": KEYWORD,
        "publisher_url": KEYWORD,
        "data_url": KEYWORD,
        "info_url": KEYWORD,
        "kind": KEYWORD,
        "text": LATIN_TEXT,
        "casefile": {"type": "boolean"},
        "secret": {"type": "boolean"},
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "count": {"type": "long"},
        "schemata": {"type": "object"},
        "creator": creator_field,
        "team": team_field,
    }
    mapping = {
        "dynamic_templates": [schemata_template],
        "properties": fields,
    }
    configure_index(collections_index(), mapping, index_settings())
def configure_records():
    """Set up the records write index via ``configure_index``.

    Settings are requested with 10 shards and a ``15s`` refresh interval.
    Nothing is returned.
    """
    record_fields = {
        "collection_id": KEYWORD,
        "document_id": KEYWORD,
        "index": {"type": "long"},
        "text": LATIN_TEXT,
    }
    mapping = {"properties": record_fields}
    record_settings = index_settings(shards=10, refresh_interval="15s")
    configure_index(records_write_index(), mapping, record_settings)
def configure_xref():
    """Configure the xref index and return the result of ``configure_index``.

    The mapping turns off date detection and dynamic fields; settings are
    requested with ``SHARDS_HEAVY`` shards.
    """
    xref_fields = {
        "score": {"type": "float"},
        "entity_id": KEYWORD,
        "collection_id": KEYWORD,
        "match_id": KEYWORD,
        "match_collection_id": KEYWORD,
        # The key for this field is taken from the registry at call time.
        registry.country.group: KEYWORD,
        "schema": KEYWORD,
        "text": {"type": "text", "analyzer": "latin_index"},
        "created_at": {"type": "date"},
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "properties": xref_fields,
    }
    heavy_settings = index_settings(shards=SHARDS_HEAVY)
    return configure_index(xref_index(), mapping, heavy_settings)
def configure_xref():
    """Build the xref mapping and pass it on to ``configure_index``.

    Returns:
        Whatever ``configure_index`` returns for the xref index.
    """
    mapping = {"date_detection": False, "dynamic": False}
    mapping["properties"] = {
        "score": {"type": "float"},
        "entity_id": KEYWORD,
        "collection_id": KEYWORD,
        "match_id": KEYWORD,
        "match_collection_id": KEYWORD,
        registry.country.group: KEYWORD,
        "schema": KEYWORD,
        "text": {
            "type": "text",
            "analyzer": "latin_index",
        },
        "created_at": {"type": "date"},
    }
    shard_settings = index_settings(shards=SHARDS_HEAVY)
    return configure_index(xref_index(), mapping, shard_settings)
def configure_collections():
    """Configure the collections index with a single shard.

    Date detection and dynamic fields are off at the top level, while the
    ``schemata`` object is declared dynamic and covered by a template that
    maps its sub-fields as longs. The ``text`` field is excluded from
    ``_source``.

    Returns:
        The result of ``configure_index``.
    """
    schemata_template = {
        "fields": {
            "match": "schemata.*",
            "mapping": {"type": "long"},
        }
    }
    label_field = {
        "type": "text",
        "copy_to": "text",
        "analyzer": "latin_index",
        "fields": {"kw": KEYWORD},
    }
    # Not indexed itself; its content is copied into "text".
    summary_field = {
        "type": "text",
        "copy_to": "text",
        "index": False,
    }
    text_field = {
        "type": "text",
        "analyzer": "latin_index",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    fields = {
        "label": label_field,
        "collection_id": KEYWORD,
        "foreign_id": KEYWORD_COPY,
        "languages": KEYWORD_COPY,
        "countries": KEYWORD_COPY,
        "category": KEYWORD_COPY,
        "frequency": KEYWORD_COPY,
        "summary": summary_field,
        "publisher": KEYWORD_COPY,
        "publisher_url": KEYWORD_COPY,
        "data_url": KEYWORD_COPY,
        "info_url": KEYWORD_COPY,
        "kind": KEYWORD,
        "creator_id": KEYWORD,
        "team_id": KEYWORD,
        "text": text_field,
        "casefile": {"type": "boolean"},
        "restricted": {"type": "boolean"},
        "secret": {"type": "boolean"},
        "xref": {"type": "boolean"},
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "count": {"type": "long"},
        "schemata": {"dynamic": True, "type": "object"},
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "dynamic_templates": [schemata_template],
        "_source": {"excludes": ["text"]},
        "properties": fields,
    }
    target_index = collections_index()
    single_shard = index_settings(shards=1)
    return configure_index(target_index, mapping, single_shard)
def configure_collections():
    """Configure the collections index (``icu_latin`` analyzer, one shard).

    The top-level mapping disables date detection and dynamic fields and
    excludes ``text`` from ``_source``; ``label`` and ``summary`` are copied
    into ``text``.

    Returns:
        The result of ``configure_index``.
    """
    schemata_template = {
        "fields": {
            "match": "schemata.*",
            "mapping": {"type": "long"},
        }
    }
    label_field = {
        "type": "text",
        "copy_to": "text",
        "analyzer": "icu_latin",
        "fields": {"kw": KEYWORD},
    }
    summary_field = {
        "type": "text",
        "copy_to": "text",
        "index": False,
    }
    text_field = {
        "type": "text",
        "analyzer": "icu_latin",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    fields = {
        "label": label_field,
        "collection_id": KEYWORD,
        "foreign_id": KEYWORD_COPY,
        "languages": KEYWORD_COPY,
        "countries": KEYWORD_COPY,
        "category": KEYWORD_COPY,
        "summary": summary_field,
        "publisher": KEYWORD_COPY,
        "publisher_url": KEYWORD_COPY,
        "data_url": KEYWORD_COPY,
        "info_url": KEYWORD_COPY,
        "kind": KEYWORD,
        "creator_id": KEYWORD,
        "team_id": KEYWORD,
        "text": text_field,
        "casefile": {"type": "boolean"},
        "secret": {"type": "boolean"},
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "count": {"type": "long"},
        "schemata": {"dynamic": True, "type": "object"},
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "dynamic_templates": [schemata_template],
        "_source": {"excludes": ["text"]},
        "properties": fields,
    }
    target_index = collections_index()
    single_shard = index_settings(shards=1)
    return configure_index(target_index, mapping, single_shard)
def configure_schema(schema, version):
    """Configure the versioned index for a single schema.

    Args:
        schema: Object whose ``properties`` mapping yields property objects
            with ``name`` and ``type``; it is also passed to ``model.get``
            and ``get_shard_weight``.
        version: Forwarded to ``schema_index``.

    Returns:
        The result of ``configure_index``.
    """
    # Each schema property gets a copy of its type mapping (falling back to
    # KEYWORD) that also copies into "text". Properties whose type is in
    # NUMERIC_TYPES are additionally registered under "numeric".
    per_property = {}
    numeric_props = {registry.date.group: NUMERIC}
    for prop in schema.properties.values():
        base = TYPE_MAPPINGS.get(prop.type, KEYWORD)
        per_property[prop.name] = dict(base, copy_to=["text"])
        if prop.type in NUMERIC_TYPES:
            numeric_props[prop.name] = NUMERIC

    fields = {
        "caption": KEYWORD,
        "schema": KEYWORD,
        "schemata": KEYWORD,
    }
    # Registry type groups mapped as plain keywords. Insertion order is kept
    # as-is, with the date group slotted in before address and name.
    leading_groups = (
        registry.entity,
        registry.language,
        registry.country,
        registry.checksum,
        registry.ip,
        registry.url,
        registry.iban,
        registry.email,
        registry.phone,
        registry.mimetype,
        registry.identifier,
    )
    fields.update((group_type.group, KEYWORD) for group_type in leading_groups)
    fields[registry.date.group] = PARTIAL_DATE
    fields[registry.address.group] = KEYWORD
    fields[registry.name.group] = KEYWORD
    fields.update(
        {
            "fingerprints": {
                "type": "keyword",
                "normalizer": "latin_index",
                "copy_to": "text",
                "fields": {"text": LATIN_TEXT},
            },
            "text": {
                "type": "text",
                "analyzer": "latin_index",
                "search_analyzer": "latin_query",
                "search_quote_analyzer": "latin_index",
                "term_vector": "with_positions_offsets",
            },
            "properties": {"type": "object", "properties": per_property},
            "numeric": {"type": "object", "properties": numeric_props},
            "role_id": KEYWORD,
            "collection_id": KEYWORD,
            "origin": KEYWORD,
            "created_at": {"type": "date"},
            "updated_at": {"type": "date"},
        }
    )

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": fields,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)
def configure_notifications():
    """Configure the notifications index with three shards.

    Only ``params`` is declared dynamic; the top-level mapping disables
    dynamic fields and date detection.

    Returns:
        The result of ``configure_index``.
    """
    notification_fields = {
        "event": KEYWORD,
        "actor_id": KEYWORD,
        "channels": KEYWORD,
        "created_at": {"type": "date"},
        "params": {"dynamic": True, "type": "object"},
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "properties": notification_fields,
    }
    target_index = notifications_index()
    shard_settings = index_settings(shards=3)
    return configure_index(target_index, mapping, shard_settings)
def configure_schema(schema, version):
    """Configure the versioned index for a single schema.

    Args:
        schema: Object whose ``properties`` mapping yields property objects
            with ``name`` and ``type``; it is also passed to ``model.get``
            and ``get_shard_weight``.
        version: Forwarded to ``schema_index``.

    Returns:
        The result of ``configure_index``.
    """
    # One mapping per schema property: a copy of the type's mapping (KEYWORD
    # when the type has none) with "copy_to" pointing at "text".
    per_property = {
        prop.name: dict(TYPE_MAPPINGS.get(prop.type, KEYWORD), copy_to=["text"])
        for prop in schema.properties.values()
    }

    fields = {
        "name": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
            "boost": 3.0,
            "copy_to": "text",
        },
    }
    # Contiguous run of plain keyword fields, in their original order.
    keyword_names = (
        "schema",
        "schemata",
        "foreign_id",
        "document_id",
        "collection_id",
        "uploader_id",
        "entities",
        "languages",
        "countries",
        "checksums",
        "keywords",
        "ips",
        "urls",
        "ibans",
        "emails",
        "phones",
        "mimetypes",
        "identifiers",
    )
    fields.update(dict.fromkeys(keyword_names, KEYWORD))
    fields.update(
        {
            "dates": PARTIAL_DATE,
            "addresses": {
                "type": "keyword",
                "fields": {"text": LATIN_TEXT},
            },
            "names": {
                "type": "keyword",
                "fields": {"text": LATIN_TEXT},
                "copy_to": "text",
            },
            "fingerprints": {
                "type": "keyword",
                "normalizer": "icu_latin",
                "copy_to": "text",
                "fields": {"text": LATIN_TEXT},
            },
            "text": {
                "type": "text",
                "analyzer": "icu_latin",
                "term_vector": "with_positions_offsets",
                "store": True,
            },
            "properties": {
                "type": "object",
                "properties": per_property,
            },
            "updated_at": {"type": "date"},
        }
    )

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": fields,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)
def configure_schema(schema):
    """Configure the entities write index for a schema.

    Per-property type mappings are generated only when the module-level
    ``settings.ENTITIES_INDEX_SPLIT`` is truthy; otherwise the nested
    ``properties`` object is left empty. Nothing is returned.

    Args:
        schema: Object whose ``properties`` mapping goes from property name
            to an object with a ``type``; it is also passed to
            ``entities_write_index``.
    """
    # NOTE: do not bind a local called ``settings`` in this function; the
    # flag below is read from the module-level ``settings`` object.
    per_property = (
        {
            prop_name: TYPE_MAPPINGS.get(prop.type, KEYWORD)
            for prop_name, prop in schema.properties.items()
        }
        if settings.ENTITIES_INDEX_SPLIT
        else {}
    )

    name_field = {
        "type": "text",
        "analyzer": "icu_latin",
        "fields": {"kw": KEYWORD},
    }
    names_field = {
        "type": "keyword",
        "fields": {"text": RAW_TEXT},
    }
    addresses_field = {
        "type": "keyword",
        "fields": {"text": RAW_TEXT},
    }
    parent_field = {
        "type": "object",
        "properties": {
            "id": KEYWORD,
            "type": KEYWORD,
            "title": KEYWORD,
        },
    }
    fields = {
        "title": RAW_TEXT,
        "name": name_field,
        "schema": KEYWORD,
        "schemata": KEYWORD,
        "bulk": {"type": "boolean"},
        "status": KEYWORD,
        "error_message": RAW_TEXT,
        "content_hash": KEYWORD,
        "foreign_id": KEYWORD,
        "file_name": KEYWORD,
        "collection_id": KEYWORD,
        "uploader_id": KEYWORD,
        "children": KEYWORD,
        "source_url": KEYWORD,
        "extension": KEYWORD,
        "mime_type": KEYWORD,
        "encoding": KEYWORD,
        "entities": KEYWORD,
        "languages": KEYWORD,
        "countries": KEYWORD,
        "keywords": KEYWORD,
        "fingerprints": KEYWORD,
        "names": names_field,
        "emails": KEYWORD,
        "phones": KEYWORD,
        "identifiers": KEYWORD,
        "addresses": addresses_field,
        "columns": KEYWORD,
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "date": PARTIAL_DATE,
        "authored_at": PARTIAL_DATE,
        "modified_at": PARTIAL_DATE,
        "published_at": PARTIAL_DATE,
        "retrieved_at": PARTIAL_DATE,
        "dates": PARTIAL_DATE,
        "author": KEYWORD,
        "generator": KEYWORD,
        "summary": RAW_TEXT,
        "text": LATIN_TEXT,
        "properties": {
            "type": "object",
            "properties": per_property,
        },
        "parent": parent_field,
        "ancestors": KEYWORD,
    }
    mapping = {
        "date_detection": False,
        "properties": fields,
    }
    target_index = entities_write_index(schema)
    configure_index(target_index, mapping, index_settings())
def configure_schema(schema, version):
    """Configure the versioned index for a single schema.

    Args:
        schema: Object whose ``properties`` mapping yields property objects
            with ``name`` and ``type``; it is also passed to ``model.get``
            and ``get_shard_weight``.
        version: Forwarded to ``schema_index``.

    Returns:
        The result of ``configure_index``.
    """
    # One mapping per schema property: a copy of the type's mapping (KEYWORD
    # when the type has none) with "copy_to" pointing at "text".
    per_property = {
        prop.name: dict(TYPE_MAPPINGS.get(prop.type, KEYWORD), copy_to=["text"])
        for prop in schema.properties.values()
    }

    name_field = {
        "type": "text",
        "analyzer": "icu_latin",
        "fields": {"kw": KEYWORD},
        "boost": 3.0,
        "copy_to": "text",
    }
    # Not indexed itself; its content is copied into "text".
    error_message_field = {
        "type": "text",
        "copy_to": "text",
        "index": False,
    }
    fingerprints_field = {
        "type": "keyword",
        "normalizer": "icu_latin",
        "copy_to": "text",
        "fields": {"text": LATIN_TEXT},
    }
    addresses_field = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
    }
    names_field = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
        "copy_to": "text",
    }
    text_field = {
        "type": "text",
        "analyzer": "icu_latin",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    fields = {
        "name": name_field,
        "schema": KEYWORD,
        "schemata": KEYWORD,
        "bulk": {"type": "boolean"},
        "status": KEYWORD,
        "error_message": error_message_field,
        "foreign_id": KEYWORD,
        "document_id": KEYWORD,
        "collection_id": KEYWORD,
        "uploader_id": KEYWORD,
        "fingerprints": fingerprints_field,
        "entities": KEYWORD,
        "languages": KEYWORD,
        "countries": KEYWORD,
        "checksums": KEYWORD,
        "keywords": KEYWORD,
        "ips": KEYWORD,
        "urls": KEYWORD,
        "ibans": KEYWORD,
        "emails": KEYWORD,
        "phones": KEYWORD,
        "mimetypes": KEYWORD,
        "identifiers": KEYWORD,
        "addresses": addresses_field,
        "dates": PARTIAL_DATE,
        "names": names_field,
        "created_at": {"type": "date"},
        "updated_at": {"type": "date"},
        "text": text_field,
        "properties": {
            "type": "object",
            "properties": per_property,
        },
    }
    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": fields,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)