Exemple #1
0
def configure_collections():
    """Assemble the collections index mapping and pass it to configure_index.

    The mapping consists of a single dynamic template targeting
    ``schemata.*`` and an explicit set of collection fields, including the
    nested ``creator`` and ``team`` objects. The index name comes from
    ``collections_index()`` and the settings from ``index_settings()``.
    Nothing is returned.
    """
    # NOTE(review): the template uses ``match`` with a dotted pattern;
    # confirm whether ``path_match`` was intended.
    schemata_template = {
        "fields": {"match": "schemata.*", "mapping": {"type": "long"}}
    }

    properties = {
        "label": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
        }
    }
    for field in (
        "collection_id",
        "foreign_id",
        "languages",
        "countries",
        "category",
    ):
        properties[field] = KEYWORD
    properties["summary"] = RAW_TEXT
    for field in (
        "publisher",
        "publisher_url",
        "data_url",
        "info_url",
        "kind",
    ):
        properties[field] = KEYWORD
    properties["text"] = LATIN_TEXT
    for field in ("casefile", "secret"):
        properties[field] = {"type": "boolean"}
    for field in ("created_at", "updated_at"):
        properties[field] = {"type": "date"}
    properties["count"] = {"type": "long"}
    properties["schemata"] = {"type": "object"}

    # Nested objects describing who created the collection and its team.
    properties["creator"] = {
        "type": "object",
        "properties": {
            "id": KEYWORD,
            "type": KEYWORD,
            "name": {"type": "text", "fields": {"kw": KEYWORD}},
        },
    }
    properties["team"] = {
        "type": "object",
        "properties": {"id": KEYWORD, "type": KEYWORD, "name": KEYWORD},
    }

    mapping = {
        "dynamic_templates": [schemata_template],
        "properties": properties,
    }
    configure_index(collections_index(), mapping, index_settings())
Exemple #2
0
def configure_records():
    """Configure the records write index.

    Builds a four-field mapping and passes it to ``configure_index``
    together with settings requesting 10 shards and a 15s refresh
    interval. Nothing is returned.
    """
    properties = dict(
        collection_id=KEYWORD,
        document_id=KEYWORD,
        index={"type": "long"},
        text=LATIN_TEXT,
    )
    records_settings = index_settings(shards=10, refresh_interval='15s')
    configure_index(
        records_write_index(),
        {"properties": properties},
        records_settings,
    )
Exemple #3
0
def configure_xref():
    """Configure the xref index and return the result of configure_index.

    The mapping switches off date detection and dynamic fields, and
    declares a float ``score``, several keyword fields (one keyed by
    ``registry.country.group``), an analyzed ``text`` field and a
    ``created_at`` date. Settings use ``SHARDS_HEAVY`` shards.
    """
    keyword_fields = (
        "entity_id",
        "collection_id",
        "match_id",
        "match_collection_id",
        registry.country.group,
        "schema",
    )

    properties = {"score": {"type": "float"}}
    for field in keyword_fields:
        properties[field] = KEYWORD
    properties["text"] = {"type": "text", "analyzer": "latin_index"}
    properties["created_at"] = {"type": "date"}

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "properties": properties,
    }
    heavy_settings = index_settings(shards=SHARDS_HEAVY)
    return configure_index(xref_index(), mapping, heavy_settings)
Exemple #4
0
def configure_xref():
    """Build the xref mapping and delegate index setup to configure_index.

    Date detection and dynamic mapping are both disabled. Besides the
    float ``score``, the analyzed ``text`` and the ``created_at`` date,
    every declared field is a keyword; one of them is named by
    ``registry.country.group``. Returns whatever ``configure_index``
    returns.
    """
    properties = dict(score=dict(type='float'))
    properties.update({
        name: KEYWORD
        for name in (
            'entity_id', 'collection_id', 'match_id', 'match_collection_id',
            registry.country.group, 'schema',
        )
    })
    properties.update(
        text=dict(type='text', analyzer='latin_index'),
        created_at=dict(type='date'),
    )

    mapping = dict(date_detection=False, dynamic=False, properties=properties)
    xref_settings = index_settings(shards=SHARDS_HEAVY)
    return configure_index(xref_index(), mapping, xref_settings)
Exemple #5
0
def configure_collections():
    """Configure the collections index and return configure_index's result.

    The mapping disables date detection and top-level dynamic fields,
    excludes ``text`` from ``_source``, and copies ``label``, ``summary``
    and the ``KEYWORD_COPY`` fields into the stored ``text`` field. The
    ``schemata`` object is left dynamic. A single shard is requested.
    """
    # NOTE(review): the template uses ``match`` with a dotted pattern;
    # confirm whether ``path_match`` was intended.
    schemata_template = {
        "fields": {"match": "schemata.*", "mapping": {"type": "long"}}
    }

    properties = {
        "label": {
            "type": "text",
            "copy_to": "text",
            "analyzer": "latin_index",
            "fields": {"kw": KEYWORD},
        },
        "collection_id": KEYWORD,
    }
    for field in (
        "foreign_id",
        "languages",
        "countries",
        "category",
        "frequency",
    ):
        properties[field] = KEYWORD_COPY
    # Summary is not indexed on its own; it only feeds the "text" field.
    properties["summary"] = {
        "type": "text",
        "copy_to": "text",
        "index": False,
    }
    for field in ("publisher", "publisher_url", "data_url", "info_url"):
        properties[field] = KEYWORD_COPY
    for field in ("kind", "creator_id", "team_id"):
        properties[field] = KEYWORD
    properties["text"] = {
        "type": "text",
        "analyzer": "latin_index",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    for field in ("casefile", "restricted", "secret", "xref"):
        properties[field] = {"type": "boolean"}
    for field in ("created_at", "updated_at"):
        properties[field] = {"type": "date"}
    properties["count"] = {"type": "long"}
    properties["schemata"] = {"dynamic": True, "type": "object"}

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "dynamic_templates": [schemata_template],
        "_source": {"excludes": ["text"]},
        "properties": properties,
    }
    target_index = collections_index()
    single_shard = index_settings(shards=1)
    return configure_index(target_index, mapping, single_shard)
Exemple #6
0
def configure_collections():
    """Set up the collections index; return what configure_index returns.

    Top-level dynamic mapping and date detection are off, ``text`` is
    excluded from ``_source``, and ``label``, ``summary`` plus the
    ``KEYWORD_COPY`` fields are copied into the stored ``text`` field
    (``icu_latin`` analyzer). ``schemata`` is a dynamic object. The index
    is configured with one shard.
    """
    # NOTE(review): the template uses ``match`` with a dotted pattern;
    # confirm whether ``path_match`` was intended.
    templates = [
        {"fields": {"match": "schemata.*", "mapping": {"type": "long"}}}
    ]

    properties = {}
    properties["label"] = {
        "type": "text",
        "copy_to": "text",
        "analyzer": "icu_latin",
        "fields": {"kw": KEYWORD},
    }
    properties["collection_id"] = KEYWORD
    properties.update(dict.fromkeys(
        ("foreign_id", "languages", "countries", "category"),
        KEYWORD_COPY,
    ))
    # Summary is not indexed on its own; it only feeds the "text" field.
    properties["summary"] = {
        "type": "text",
        "copy_to": "text",
        "index": False,
    }
    properties.update(dict.fromkeys(
        ("publisher", "publisher_url", "data_url", "info_url"),
        KEYWORD_COPY,
    ))
    properties.update(dict.fromkeys(
        ("kind", "creator_id", "team_id"),
        KEYWORD,
    ))
    properties["text"] = {
        "type": "text",
        "analyzer": "icu_latin",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    properties.update(
        {name: {"type": "boolean"} for name in ("casefile", "secret")}
    )
    properties.update(
        {name: {"type": "date"} for name in ("created_at", "updated_at")}
    )
    properties["count"] = {"type": "long"}
    properties["schemata"] = {"dynamic": True, "type": "object"}

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "dynamic_templates": templates,
        "_source": {"excludes": ["text"]},
        "properties": properties,
    }
    target_index = collections_index()
    single_shard = index_settings(shards=1)
    return configure_index(target_index, mapping, single_shard)
Exemple #7
0
def configure_schema(schema, version):
    """Configure the index for one schema version.

    Args:
        schema: object whose ``properties`` mapping yields items exposing
            ``type`` and ``name``; also passed to ``model.get`` and
            ``get_shard_weight``.
        version: forwarded to ``schema_index`` to pick the index.

    Returns:
        Whatever ``configure_index`` returns.
    """
    # Give every entity property a type-specific mapping so that searches
    # on it behave correctly; each one is also copied into "text".
    schema_mapping = {}
    numeric_mapping = {registry.date.group: NUMERIC}
    for prop in schema.properties.values():
        base = TYPE_MAPPINGS.get(prop.type, KEYWORD)
        # Copy before editing so the shared type mapping is left untouched.
        field_config = dict(base)
        field_config.update(copy_to=["text"])
        schema_mapping[prop.name] = field_config
        if prop.type in NUMERIC_TYPES:
            numeric_mapping[prop.name] = NUMERIC

    properties = {"caption": KEYWORD, "schema": KEYWORD, "schemata": KEYWORD}
    keyword_groups = (
        registry.entity.group,
        registry.language.group,
        registry.country.group,
        registry.checksum.group,
        registry.ip.group,
        registry.url.group,
        registry.iban.group,
        registry.email.group,
        registry.phone.group,
        registry.mimetype.group,
        registry.identifier.group,
    )
    for group in keyword_groups:
        properties[group] = KEYWORD
    properties[registry.date.group] = PARTIAL_DATE
    properties[registry.address.group] = KEYWORD
    properties[registry.name.group] = KEYWORD
    properties["fingerprints"] = {
        "type": "keyword",
        "normalizer": "latin_index",
        "copy_to": "text",
        "fields": {"text": LATIN_TEXT},
    }
    properties["text"] = {
        "type": "text",
        "analyzer": "latin_index",
        "search_analyzer": "latin_query",
        "search_quote_analyzer": "latin_index",
        "term_vector": "with_positions_offsets",
    }
    properties["properties"] = {
        "type": "object",
        "properties": schema_mapping,
    }
    properties["numeric"] = {
        "type": "object",
        "properties": numeric_mapping,
    }
    for field in ("role_id", "collection_id", "origin"):
        properties[field] = KEYWORD
    for field in ("created_at", "updated_at"):
        properties[field] = {"type": "date"}

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": properties,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)
Exemple #8
0
def configure_notifications():
    """Configure the notifications index; return configure_index's result.

    The mapping disables date detection and top-level dynamic fields.
    It declares three keyword fields, a ``created_at`` date and a dynamic
    ``params`` object. Three shards are requested.
    """
    properties = {
        name: KEYWORD for name in ("event", "actor_id", "channels")
    }
    properties["created_at"] = {"type": "date"}
    properties["params"] = {"dynamic": True, "type": "object"}

    mapping = dict(date_detection=False, dynamic=False, properties=properties)
    target_index = notifications_index()
    shard_settings = index_settings(shards=3)
    return configure_index(target_index, mapping, shard_settings)
Exemple #9
0
def configure_schema(schema, version):
    """Configure the index for one schema version.

    Args:
        schema: object whose ``properties`` mapping yields items exposing
            ``type`` and ``name``; also passed to ``model.get`` and
            ``get_shard_weight``.
        version: forwarded to ``schema_index`` to pick the index.

    Returns:
        Whatever ``configure_index`` returns.
    """
    # Give every entity property a type-specific mapping so that searches
    # on it behave correctly; each one is also copied into "text".
    schema_mapping = {}
    for prop in schema.properties.values():
        # Copy before editing so the shared type mapping is left untouched.
        field_config = dict(TYPE_MAPPINGS.get(prop.type, KEYWORD))
        field_config.update(copy_to=['text'])
        schema_mapping[prop.name] = field_config

    # NOTE(review): "boost" is set in the field mapping; confirm the
    # target search backend version still accepts index-time boosts.
    properties = {
        "name": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
            "boost": 3.0,
            "copy_to": "text",
        }
    }
    for field in (
        "schema",
        "schemata",
        "foreign_id",
        "document_id",
        "collection_id",
        "uploader_id",
        "entities",
        "languages",
        "countries",
        "checksums",
        "keywords",
        "ips",
        "urls",
        "ibans",
        "emails",
        "phones",
        "mimetypes",
        "identifiers",
    ):
        properties[field] = KEYWORD
    properties["dates"] = PARTIAL_DATE
    properties["addresses"] = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
    }
    properties["names"] = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
        "copy_to": "text",
    }
    properties["fingerprints"] = {
        "type": "keyword",
        "normalizer": "icu_latin",
        "copy_to": "text",
        "fields": {"text": LATIN_TEXT},
    }
    properties["text"] = {
        "type": "text",
        "analyzer": "icu_latin",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    properties["properties"] = {
        "type": "object",
        "properties": schema_mapping,
    }
    properties["updated_at"] = {"type": "date"}

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": properties,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)
Exemple #10
0
def configure_schema(schema):
    """Configure the entities write index for ``schema``.

    Args:
        schema: object whose ``properties`` mapping yields
            ``(name, prop)`` pairs where ``prop`` exposes ``type``; also
            passed to ``entities_write_index``.

    Per-property type mappings are only generated when
    ``settings.ENTITIES_INDEX_SPLIT`` is truthy; otherwise the nested
    ``properties`` object is declared without sub-fields. Nothing is
    returned.
    """
    # Give entity properties type-specific mappings so that searches on
    # each of them behave correctly.
    schema_mapping = {}
    if settings.ENTITIES_INDEX_SPLIT:
        schema_mapping = {
            name: TYPE_MAPPINGS.get(prop.type, KEYWORD)
            for name, prop in schema.properties.items()
        }

    properties = {
        "title": RAW_TEXT,
        "name": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
        },
        "schema": KEYWORD,
        "schemata": KEYWORD,
        "bulk": {"type": "boolean"},
        "status": KEYWORD,
        "error_message": RAW_TEXT,
    }
    for field in (
        "content_hash",
        "foreign_id",
        "file_name",
        "collection_id",
        "uploader_id",
        "children",
        "source_url",
        "extension",
        "mime_type",
        "encoding",
        "entities",
        "languages",
        "countries",
        "keywords",
        "fingerprints",
    ):
        properties[field] = KEYWORD
    properties["names"] = {"type": "keyword", "fields": {"text": RAW_TEXT}}
    for field in ("emails", "phones", "identifiers"):
        properties[field] = KEYWORD
    properties["addresses"] = {
        "type": "keyword",
        "fields": {"text": RAW_TEXT},
    }
    properties["columns"] = KEYWORD
    for field in ("created_at", "updated_at"):
        properties[field] = {"type": "date"}
    for field in (
        "date",
        "authored_at",
        "modified_at",
        "published_at",
        "retrieved_at",
        "dates",
    ):
        properties[field] = PARTIAL_DATE
    properties["author"] = KEYWORD
    properties["generator"] = KEYWORD
    properties["summary"] = RAW_TEXT
    properties["text"] = LATIN_TEXT
    properties["properties"] = {
        "type": "object",
        "properties": schema_mapping,
    }
    properties["parent"] = {
        "type": "object",
        "properties": {"id": KEYWORD, "type": KEYWORD, "title": KEYWORD},
    }
    properties["ancestors"] = KEYWORD

    mapping = {"date_detection": False, "properties": properties}
    target_index = entities_write_index(schema)
    configure_index(target_index, mapping, index_settings())
Exemple #11
0
def configure_schema(schema, version):
    """Configure the index for one schema version.

    Args:
        schema: object whose ``properties`` mapping yields items exposing
            ``type`` and ``name``; also passed to ``model.get`` and
            ``get_shard_weight``.
        version: forwarded to ``schema_index`` to pick the index.

    Returns:
        Whatever ``configure_index`` returns.
    """
    # Give every entity property a type-specific mapping so that searches
    # on it behave correctly; each one is also copied into "text".
    schema_mapping = {}
    for prop in schema.properties.values():
        # Copy before editing so the shared type mapping is left untouched.
        field_config = dict(TYPE_MAPPINGS.get(prop.type, KEYWORD))
        field_config.update(copy_to=['text'])
        schema_mapping[prop.name] = field_config

    # NOTE(review): "boost" is set in the field mapping; confirm the
    # target search backend version still accepts index-time boosts.
    properties = {
        "name": {
            "type": "text",
            "analyzer": "icu_latin",
            "fields": {"kw": KEYWORD},
            "boost": 3.0,
            "copy_to": "text",
        },
        "schema": KEYWORD,
        "schemata": KEYWORD,
        "bulk": {"type": "boolean"},
        "status": KEYWORD,
        # Not indexed on its own; it only feeds the "text" field.
        "error_message": {
            "type": "text",
            "copy_to": "text",
            "index": False,
        },
    }
    for field in (
        "foreign_id",
        "document_id",
        "collection_id",
        "uploader_id",
    ):
        properties[field] = KEYWORD
    properties["fingerprints"] = {
        "type": "keyword",
        "normalizer": "icu_latin",
        "copy_to": "text",
        "fields": {"text": LATIN_TEXT},
    }
    for field in (
        "entities",
        "languages",
        "countries",
        "checksums",
        "keywords",
        "ips",
        "urls",
        "ibans",
        "emails",
        "phones",
        "mimetypes",
        "identifiers",
    ):
        properties[field] = KEYWORD
    properties["addresses"] = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
    }
    properties["dates"] = PARTIAL_DATE
    properties["names"] = {
        "type": "keyword",
        "fields": {"text": LATIN_TEXT},
        "copy_to": "text",
    }
    for field in ("created_at", "updated_at"):
        properties[field] = {"type": "date"}
    properties["text"] = {
        "type": "text",
        "analyzer": "icu_latin",
        "term_vector": "with_positions_offsets",
        "store": True,
    }
    properties["properties"] = {
        "type": "object",
        "properties": schema_mapping,
    }

    mapping = {
        "date_detection": False,
        "dynamic": False,
        "_source": {"excludes": ["text", "fingerprints"]},
        "properties": properties,
    }
    target_index = schema_index(model.get(schema), version)
    shard_settings = index_settings(shards=get_shard_weight(schema))
    return configure_index(target_index, mapping, shard_settings)