Code Example #1
File: stats.py Project: GelLiNN/aleph
def get_instance_stats(authz):
    """Count the entities visible to the given user, broken down by schema."""
    query = {
        'size': 0,
        'query': {
            'terms': {
                'roles': list(authz.roles)
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            }
        }
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    return data
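For reference, a minimal sketch of the pre-7.x Elasticsearch response shape this function parses (hits.total is still a plain integer here; all sample values are invented):

# Hypothetical ES response fragment showing only the fields
# get_instance_stats() actually reads: hits.total and the schema buckets.
sample = {
    'hits': {'total': 42},
    'aggregations': {
        'schema': {
            'buckets': [
                {'key': 'Person', 'doc_count': 30},
                {'key': 'Company', 'doc_count': 12},
            ]
        }
    }
}
data = {'count': sample['hits']['total'], 'schemata': {}}
for bucket in sample['aggregations']['schema']['buckets']:
    data['schemata'][bucket['key']] = bucket['doc_count']
assert data == {'count': 42, 'schemata': {'Person': 30, 'Company': 12}}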
Code Example #2
def generate_sitemap(collection_id):
    """Generate entries for a collection-based sitemap.xml file."""
    # cf. https://www.sitemaps.org/protocol.html
    query = {
        'query': {
            'bool': {
                'filter': [
                    {'term': {'collection_id': collection_id}},
                    {'term': {'schemata': Entity.THING}},
                    authz_query(Authz.from_role(None))
                ]
            }
        },
        '_source': {'includes': ['schemata', 'updated_at']}
    }
    scanner = scan(es, index=entities_index(), query=query)
    # strictly, the limit for sitemap.xml is 50,000
    for res in islice(scanner, 49500):
        source = res.get('_source', {})
        updated_at = source.get('updated_at', '').split('T', 1)[0]
        if Document.SCHEMA in source.get('schemata', []):
            url = document_url(res.get('_id'))
        else:
            url = entity_url(res.get('_id'))
        yield (url, updated_at)
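The yielded (url, updated_at) pairs still need to be wrapped in sitemap XML. A minimal sketch of that step, following https://www.sitemaps.org/protocol.html (render_sitemap is a hypothetical helper, not part of aleph):

def render_sitemap(entries):
    # entries: iterable of (url, lastmod) pairs from generate_sitemap().
    # Real code should XML-escape the URL values.
    yield '<?xml version="1.0" encoding="UTF-8"?>'
    yield '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    for url, lastmod in entries:
        yield '  <url><loc>%s</loc><lastmod>%s</lastmod></url>' % (url, lastmod)
    yield '</urlset>'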
Code Example #3
File: statistics.py Project: kkrbalam/aleph
def get_instance_stats(authz):
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    {'term': {'schemata': Entity.THING}}
                ]
            }
        },
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}}
        }
    }
    result = es.search(index=entities_index(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {
        'count': result.get('hits').get('total'),
        'schemata': {}
    }
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    return data
Code Example #4
File: entities.py Project: arezola/aleph
def iter_entities(authz=None,
                  collection_id=None,
                  schemata=None,
                  includes=None,
                  excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    source = {}
    if ensure_list(includes):
        source['includes'] = ensure_list(includes)
    if ensure_list(excludes):
        source['excludes'] = ensure_list(excludes)
    query = {
        'query': {
            'bool': {
                'filter': filters
            }
        },
        'sort': ['_doc'],
        '_source': source
    }
    for res in scan(es, index=entities_index(), query=query, scroll='1410m'):
        yield unpack_result(res)
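A hypothetical call, assuming an authz object and a collection id are in scope in the caller; it scans only Person entities and pulls back just the schemata and name fields:

# Hypothetical usage of iter_entities(); authz and collection_id are
# assumed to exist in the surrounding context.
for entity in iter_entities(authz=authz,
                            collection_id=collection_id,
                            schemata=['Person'],
                            includes=['schemata', 'properties.name']):
    print(entity.get('id'))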
Code Example #5
File: __init__.py Project: GelLiNN/aleph
    def __init__(self, request, query, parser=None, schema=None):
        super(MatchQueryResult, self).__init__(request,
                                               query,
                                               parser=parser,
                                               schema=schema)
        ids = set()
        for match in self.results:
            ids.add(match.match_id)
            ids.add(match.entity_id)
        ids = {'ids': list(ids)}

        result = es.mget(index=entities_index(),
                         doc_type=entity_type(),
                         body=ids)
        for doc in result.get('docs', []):
            entity = unpack_result(doc)
            if entity is None:
                continue
            for match in self.results:
                if match.match_id == entity['id']:
                    match.match = entity
                if match.entity_id == entity['id']:
                    match.entity = entity

        # Do not return results if the entity has been removed in the
        # meantime. Not sure this is the ideal way of doing this, as it'll
        # mess with pagination counts etc.
        for match in list(self.results):
            if not hasattr(match, 'match') or not hasattr(match, 'entity'):
                self.results.remove(match)
Code Example #6
File: entities_api.py Project: arezola/aleph
def delete(id):
    entity = get_db_entity(id, request.authz.WRITE)
    delete_entity(entity)
    db.session.commit()
    update_collection(entity.collection)
    refresh_index(entities_index())
    return ('', 204)
Code Example #7
def expand_group(node):
    """Yield Link objects tying the given node to entities that carry the
    same group value (e.g. the same email address or phone number)."""
    if node.type.group is None or node.value is None:
        return
    value = str(node.value)
    query = {
        'query': {
            'term': {
                node.type.group: value
            }
        },
        '_source': {
            'includes': ['schema', 'properties']
        }
    }
    for res in scan(es, index=entities_index(), query=query):
        entity_id = res.get('_id')
        source = res.get('_source')
        properties = source.get('properties')
        schema = model.get(source.get('schema'))
        for prop in schema.properties.values():
            if prop.type != node.type:
                continue
            values = properties.get(prop.name)
            values = node.type.normalize_set(values)
            if value not in values:
                continue
            if prop.reverse:
                yield Link(node, prop.reverse, entity_id)
            else:
                yield Link(node, prop, entity_id, inverted=True)
Code Example #8
File: entities.py Project: public-people/aleph
def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=entities_index(),
                    doc_type='doc',
                    id=entity_id,
                    ignore=[404],
                    _source_exclude=['text'])
    return unpack_result(result)
Code Example #9
File: xref.py Project: roukdanus/aleph
def xref_collection(collection_id, other_id=None):
    """Cross-reference all the entities and documents in a collection."""
    query = {'term': {'collection_id': collection_id}}
    query = {'query': query, '_source': {'excludes': ['text', 'properties.*']}}
    scanner = scan(es, index=entities_index(), query=query, scroll='30m')
    for idx, res in enumerate(scanner):
        res = unpack_result(res)
        _xref_item(res, collection_id=other_id)
Code Example #10
File: collections.py Project: GelLiNN/aleph
def delete_collection(collection_id, wait=True):
    """Delete all documents from a particular collection."""
    query = {'term': {'collection_id': collection_id}}
    query_delete(records_index(), query, wait=wait)
    query_delete(entities_index(), query, wait=wait)
    es.delete(index=collections_index(),
              doc_type=collection_type(),
              id=collection_id,
              ignore=[404])
Code Example #11
File: entities_api.py Project: arezola/aleph
def create():
    data = parse_request(EntityCreateSchema)
    collection = get_db_collection(data['collection_id'], request.authz.WRITE)
    entity = Entity.create(data, collection)
    db.session.commit()
    data = update_entity(entity)
    update_collection(collection)
    refresh_index(entities_index())
    return serialize_data(data, CombinedSchema)
Code Example #12
File: documents.py Project: GelLiNN/aleph
def get_document(document_id):
    """Fetch a document from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=document_id,
                    ignore=[404])
    document = unpack_result(result)
    if document is not None:
        document.pop('text', None)
    return document
Code Example #13
File: entities.py Project: GelLiNN/aleph
def get_entity(entity_id):
    """Fetch an entity from the index."""
    result = es.get(index=entities_index(),
                    doc_type=entity_type(),
                    id=entity_id,
                    ignore=[404])
    entity = unpack_result(result)
    if entity is not None:
        entity.pop('text', None)
    return entity
Code Example #14
File: entities.py Project: SiloGit/aleph
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None:
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not len(queries):
        return []

    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            qvalue = quote(value.encode('utf-8'))
            key = ('filter:%s' % field, qvalue)
            results.append({
                'id': query_string([key]),
                'value': value,
                'field': field,
                'count': total
            })

    results.sort(key=lambda p: p['count'], reverse=True)
    return results
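The queries.append({}) lines above are not a mistake: the Multi Search API expects alternating header/body pairs, and an empty header object means "use the index given in the request itself". A sketch of how such a list serializes into the newline-delimited payload msearch sends (illustrative only; the elasticsearch client does this internally):

import json

def to_msearch_payload(queries):
    # Alternating {} header dicts and query body dicts become NDJSON lines.
    return '\n'.join(json.dumps(q) for q in queries) + '\n'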
Code Example #15
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not len(request.files):
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
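A hypothetical client-side call against this endpoint; the URL path and the auth header are invented for illustration and depend on how the route is mounted and the deployment is configured:

import requests

# Hypothetical route and credentials; adjust to the actual deployment.
resp = requests.post('https://aleph.example.org/api/2/collections/17/ingest',
                     headers={'Authorization': 'ApiKey 0123456789abcdef'},
                     files={'file': open('report.pdf', 'rb')})
print(resp.json()['documents'])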
Code Example #16
File: triples.py Project: Ro9ueAdmin/aleph
def query_collection_contents(collection_id):
    q = {
        'query': {
            'term': {
                'collection_id': collection_id
            }
        },
        '_source': {
            'excludes': ['text']
        }
    }
    res = scan(es, index=entities_index(), query=q)
    return res
Code Example #17
File: collections.py Project: mustafaascha/aleph
def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    key = cache.key('cstats', collection_id)
    data = cache.get_complex(key)
    if data is not None:
        return data

    log.info("Generating collection stats: %s", collection_id)
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{
                    'term': {
                        'collection_id': collection_id
                    }
                }]
            }
        },
        'aggs': {
            'schemata': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 500
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 10
                }
            },
        }
    }
    result = search_safe(index=entities_index(), body=query)
    aggregations = result.get('aggregations', {})
    data = {'count': result['hits']['total']}

    for facet in ['schemata', 'countries', 'languages']:
        data[facet] = {}
        for bucket in aggregations.get(facet, {}).get('buckets', []):
            data[facet][bucket['key']] = bucket['doc_count']
    expire = randint(3600 * 3, 3600 * 12)
    cache.set_complex(key, data, expire=expire)
    return data
Code Example #18
File: collections.py Project: gavinrozzi/aleph
def update_collection_roles(collection):
    """Update the role visibility of objects which are part of collections."""
    roles = ', '.join([str(r) for r in collection.roles])
    body = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        'script': {
            'inline': 'ctx._source.roles = [%s]' % roles
        }
    }
    query_update(entities_index(), body)
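query_update() presumably wraps the Update By Query API; a sketch of the roughly equivalent direct client call, under that assumption:

# Assumed equivalent of query_update(entities_index(), body):
es.update_by_query(index=entities_index(), body=body, conflicts='proceed')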
Code Example #19
File: entities.py Project: Ro9ueAdmin/aleph
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model[entity.get('schema')]

    # Generate all the possible mention locations.
    properties = []
    queries = []
    for prop in model.properties:
        if not prop.is_entity:
            continue
        if not schema.is_a(prop.range):
            continue

        field = 'properties.%s' % prop.name
        queries.append({})
        queries.append({
            'size': 0,
            'query': {
                'bool': {
                    'filter': [
                        authz_query(authz),
                        {
                            'term': {
                                'schemata': prop.schema.name
                            }
                        },
                        {
                            'term': {
                                field: entity.get('id')
                            }
                        },
                    ]
                }
            }
        })
        properties.append(prop)

    # Run a count search (with schema facet?)
    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for prop, resp in zip(properties, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            results.append({
                'count': total,
                'property': prop,
                'schema': prop.schema.name
            })
    return results
Code Example #20
File: alerts.py Project: renesugar/aleph
def check_alert(alert):
    authz = Authz(role=alert.role)
    query = alert_query(alert, authz)
    found = 0
    for result in scan(es, query=query, index=entities_index()):
        entity = unpack_result(result)
        found += 1
        params = {'alert': alert, 'role': authz.role, 'entity': entity}
        publish(Events.MATCH_ALERT,
                actor_id=entity.get('uploader_id'),
                params=params)

    alert.update()
    log.info('Found %d new results for: %s', found, alert.label)
    db.session.commit()
Code Example #21
def xref_collection(collection, other=None):
    """Cross-reference all the entities and documents in a collection."""
    log.info("Cross-reference collection: %r", collection)
    other_id = other.id if other is not None else None
    query = {
        'query': {
            'term': {
                'collection_id': collection.id
            }
        },
        '_source': FIELDS_XREF
    }
    scanner = scan(es, index=entities_index(), query=query, scroll='15m')

    for i, res in enumerate(scanner):
        xref_item.apply_async([unpack_result(res), other_id], priority=4)
Code Example #22
File: entities.py Project: GelLiNN/aleph
def _index_updates(collection, entities):
    """Look up existing index documents and generate an updated form.

    This is necessary to make the index accumulative: if an entity or link
    gets indexed twice with different field values, the values are merged
    into a single record instead of the later write overwriting the earlier
    one and losing data. An alternative would be to implement this in a
    Groovy script on the ES side.
    """
    common = {
        'collection_id': collection.id,
        'bulk': True,
        'roles': collection.roles,
        'updated_at': datetime.utcnow()
    }
    if not len(entities):
        return

    result = es.mget(index=entities_index(),
                     doc_type=entity_type(),
                     body={'ids': list(entities.keys())},
                     _source=['schema', 'properties', 'created_at'])
    for doc in result.get('docs', []):
        if not doc.get('found', False):
            continue
        entity_id = doc['_id']
        entity = entities.get(entity_id)
        existing = doc.get('_source')
        combined = model.merge(existing, entity)
        combined['created_at'] = existing.get('created_at')
        entities[entity_id] = combined

    for doc_id, entity in entities.items():
        entity.pop('id', None)
        entity.pop('data', None)
        entity.update(common)
        if 'created_at' not in entity:
            entity['created_at'] = entity.get('updated_at')
        schema = model.get(entity.get('schema'))
        entity = finalize_index(entity, schema)
        # pprint(entity)
        yield {
            '_id': doc_id,
            '_index': entity_index(),
            '_type': entity_type(),
            '_source': entity
        }
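The dicts yielded here have the standard bulk action shape (_id, _index, _type, _source), so the generator can presumably be fed straight into the elasticsearch bulk helper; a minimal sketch:

from elasticsearch.helpers import bulk

# Stream the accumulated entity documents into the index in chunks.
bulk(es, _index_updates(collection, entities), chunk_size=500)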
Code Example #23
File: collections.py Project: gavinrozzi/aleph
def delete_entities(collection_id):
    """Delete entities from a collection."""
    query = {
        'bool': {
            'must_not': {
                'term': {
                    'schemata': 'Document'
                }
            },
            'must': {
                'term': {
                    'collection_id': collection_id
                }
            }
        }
    }
    query_delete(entities_index(), query)
Code Example #24
def xref_collection(collection_id, other_id=None):
    """Cross-reference all the entities and documents in a collection."""
    query = {'term': {'collection_id': collection_id}}
    query = {
        'query': query,
        '_source': FIELDS_XREF
    }
    scanner = scan(es,
                   index=entities_index(),
                   query=query,
                   scroll='15m')

    for i, res in enumerate(scanner):
        res = unpack_result(res)
        xref_item.apply_async(args=[res],
                              kwargs={'collection_id': other_id},
                              priority=4)
Code Example #25
File: expand.py Project: renesugar/aleph
    def _resolve_index(self, cache):
        queries = OrderedDict()
        for (type_, id_) in cache.keys():
            if type_ in [Collection]:
                index = collections_index()
                queries[(type_, id_)] = {'_index': index, '_id': id_}
            elif type_ in [Document, Entity]:
                index = entities_index()
                queries[(type_, id_)] = {'_index': index, '_id': id_}

        if not len(queries):
            return

        results = es.mget(body={'docs': list(queries.values())},
                          _source_exclude=['text'])
        for key, doc in zip(queries.keys(), results['docs']):
            cache[key] = unpack_result(doc)
Code Example #26
def export_collection(collection):
    uri = collection_uri(collection.id)
    g = Graph()

    g.add((uri, RDF.type, DCMI.Collection))
    g.add((uri, RDFS.label, Literal(collection.label)))
    g.add((uri, DCMI.identifier, Literal(collection.foreign_id)))
    g.add((uri, ALEPH.category, ALEPH[collection.category]))

    for line in itergraph(g):
        yield line

    q = {'term': {'collection_id': collection.id}}
    q = {'query': q, '_source': {'excludes': ['text']}}
    for row in scan(es, index=entities_index(), query=q):
        g = export_entity(unpack_result(row), uri)
        for line in itergraph(g):
            yield line
Code Example #27
def delete_documents(collection_id):
    """Delete documents from a collection."""
    query = {
        'bool': {
            'must': [{
                'term': {
                    'schemata': 'Document'
                }
            }, {
                'term': {
                    'collection_id': collection_id
                }
            }]
        }
    }
    query_delete(entities_index(), query)
    records_query = {'term': {'collection_id': collection_id}}
    query_delete(records_index(), records_query)
Code Example #28
File: xref.py Project: mustafaascha/aleph
def xref_item(proxy):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(proxy)
    if query == none_query():
        return

    query = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    result = search_safe(index=entities_index(), body=query)
    for hit in result.get('hits').get('hits'):
        hit = unpack_result(hit)
        if hit is not None:
            other = model.get_proxy(hit)
            score = compare(model, proxy, other)
            yield score, hit.get('collection_id'), other
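A hedged sketch of consuming this generator, keeping only matches above a score cutoff (the threshold value is invented):

# Hypothetical consumer of xref_item(); 0.5 is an arbitrary cutoff.
strong_matches = [(score, coll_id, other)
                  for score, coll_id, other in xref_item(proxy)
                  if score > 0.5]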
Code Example #29
File: __init__.py Project: gavinrozzi/aleph
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None or not len(value):
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not len(queries):
        return

    res = es.msearch(index=entities_index(), body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
Code Example #30
File: xref.py Project: gavinrozzi/aleph
def _xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document."""
    name = item.get('name') or item.get('title')
    query = entity_query(item, collection_id=collection_id)
    if 'match_none' in query:
        return

    query = {
        'query': query,
        'size': 10,
        '_source': ['collection_id', 'name'],
    }
    result = search_safe(index=entities_index(), body=query)
    results = result.get('hits').get('hits')
    entity_id, document_id = None, None
    if Document.SCHEMA in item.get('schemata'):
        document_id = item.get('id')
    else:
        entity_id = item.get('id')

    dq = db.session.query(Match)
    dq = dq.filter(Match.entity_id == entity_id)
    dq = dq.filter(Match.document_id == document_id)
    if collection_id is not None:
        dq = dq.filter(Match.match_collection_id == collection_id)
    dq.delete()

    for result in results:
        source = result.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s", result.get('_score'), name,
                 source.get('name'))
        obj = Match()
        obj.entity_id = entity_id
        obj.document_id = document_id
        obj.collection_id = item.get('collection_id')
        obj.match_id = result.get('_id')
        obj.match_collection_id = source.get('collection_id')
        obj.score = result.get('_score')
        db.session.add(obj)
    db.session.commit()