Example 1
def get_collection(collection_id):
    """Fetch a collection from the index."""
    if collection_id is None:
        return
    key = cache.object_key(Collection, collection_id)
    data = cache.get_complex(key)
    if data is not None:
        return data

    collection = Collection.by_id(collection_id)
    if collection is None:
        return

    data = collection.to_dict()
    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']
    data['schemata'] = stats['schemata']

    # if no countries or langs are given, take the most common from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)

    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)
    cache.set_complex(key, data, expire=cache.EXPIRE)
    return data
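
Every snippet in this list leans on ensure_list from the banal helper library. A rough sketch of the behaviour these call sites assume (None becomes an empty list, sequences are copied, scalars are wrapped):

def ensure_list(obj):
    # None -> [], sequence -> list(obj), scalar -> [obj]
    if obj is None:
        return []
    if isinstance(obj, (list, tuple, set)):
        return list(obj)
    return [obj]

ensure_list(None)          # []
ensure_list('en')          # ['en']
ensure_list(('en', 'de'))  # ['en', 'de']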
Example 2
def entities(collection_id=None):
    require(request.authz.can_stream())
    log.debug("Stream entities [%r] begins... (coll: %s)",
              request.authz, collection_id)
    schemata = ensure_list(request.args.getlist('schema'))
    excludes = ['text', 'roles', 'fingerprints']
    includes = ensure_list(request.args.getlist('include'))
    includes = [f for f in includes if f not in excludes]
    if collection_id is not None:
        get_db_collection(collection_id, request.authz.READ)
        record_audit(Audit.ACT_COLLECTION, id=collection_id)
    entities = iter_entities(authz=request.authz,
                             collection_id=collection_id,
                             schemata=schemata,
                             excludes=excludes,
                             includes=includes)
    return stream_ijson(entities)
Example 3
File: authz.py Project: pudo/aleph
 def __init__(self, role_id, roles, is_admin=False):
     self.id = role_id
     self.logged_in = role_id is not None
     self.roles = set(ensure_list(roles))
     self.is_admin = is_admin
     self.in_maintenance = settings.MAINTENANCE
     self.session_write = not self.in_maintenance and self.logged_in
     self._collections = {}
Example 4
File: result.py Project: pudo/aleph
 def __init__(self, label, span, countries):
     super(LocationResult, self).__init__(label, span, countries)
     if self.key is not None:
         try:
             value = kv.lrange(place_key(self.key), 0, -1)
             self.countries = ensure_list(value)
         except KeyError:
             pass
Example 5
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    filters = []
    if authz is not None:
        filters.append(authz_query(authz))
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        filters.append({'terms': {'schemata': ensure_list(schemata)}})
    query = {
        'query': {'bool': {'filter': filters}},
        '_source': _source_spec(includes, excludes)
    }
    index = entities_read_index(schema=schemata)
    for res in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(res)
        if entity is not None:
            yield entity
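
For illustration (authorization omitted, hypothetical IDs), a call like the one made by the streaming endpoint in Example 2 issues a scan shaped roughly like this:

iter_entities(collection_id=17, schemata=['Person'],
              excludes=['text', 'roles', 'fingerprints'])
# scans the read index with a query like:
# {'query': {'bool': {'filter': [
#      {'term': {'collection_id': 17}},
#      {'terms': {'schemata': ['Person']}}]}},
#  '_source': {'includes': [], 'excludes': ['text', 'roles', 'fingerprints']}}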
Example 6
 def update(self, data, creator=None):
     self.updated_at = datetime.utcnow()
     self.label = data.get('label', self.label)
     self.summary = data.get('summary', self.summary)
     self.publisher = data.get('publisher', self.publisher)
     self.publisher_url = data.get('publisher_url', self.publisher_url)
     self.info_url = data.get('info_url', self.info_url)
     self.data_url = data.get('data_url', self.data_url)
     self.category = data.get('category', self.category)
     self.casefile = as_bool(data.get('casefile'), default=self.casefile)
     self.countries = ensure_list(data.get('countries', self.countries))
     self.languages = ensure_list(data.get('languages', self.languages))
     if creator is None:
         creator = Role.by_id(data.get('creator_id'))
     if creator is not None:
         self.creator = creator
     db.session.add(self)
     db.session.flush()
     if self.creator is not None:
         Permission.grant(self, self.creator, True, True)
Example 7
def schema_scope(schema, expand=True):
    schemata = set()
    names = ensure_list(schema) or model.schemata.values()
    for schema in names:
        schema = model.get(schema)
        if schema is not None:
            schemata.add(schema)
            if expand:
                schemata.update(schema.descendants)
    for schema in schemata:
        if not schema.abstract:
            yield schema
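
A hypothetical call against the followthemoney model: a single schema name expands into that schema plus its concrete descendants.

for schema in schema_scope('LegalEntity'):
    # yields e.g. LegalEntity, Person, Company, Organization, ...;
    # abstract schemata are skipped, and expand=False yields only LegalEntity
    print(schema.name)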
Example 8
File: result.py Project: pudo/aleph
    def update(self):
        """Apply the outcome of the result to the document."""
        doc = self.document
        if self.status == self.STATUS_SUCCESS:
            doc.status = Document.STATUS_SUCCESS
            doc.error_message = None
        else:
            doc.status = Document.STATUS_FAIL
            doc.error_message = stringify(self.error_message)

        schema = model['Document']
        for flag, name in self.SCHEMATA:
            if flag in self.flags:
                schema = model[name]

        doc.schema = schema.name
        doc.foreign_id = safe_string(self.id)
        doc.content_hash = self.checksum or doc.content_hash
        doc.pdf_version = self.pdf_checksum
        doc.title = self.title or doc.meta.get('title')
        doc.file_name = self.file_name or doc.meta.get('file_name')
        doc.file_size = self.size or doc.meta.get('file_size')
        doc.summary = self.summary or doc.meta.get('summary')
        doc.author = self.author or doc.meta.get('author')
        doc.generator = self.generator or doc.meta.get('generator')
        doc.mime_type = self.mime_type or doc.meta.get('mime_type')
        doc.encoding = self.encoding or doc.meta.get('encoding')
        doc.date = self.date or doc.meta.get('date')
        doc.authored_at = self.created_at or doc.meta.get('authored_at')
        doc.modified_at = self.modified_at or doc.meta.get('modified_at')
        doc.published_at = self.published_at or doc.meta.get('published_at')
        doc.message_id = self.message_id or doc.meta.get('message_id')
        doc.in_reply_to = ensure_list(self.in_reply_to)
        doc.columns = list(self.columns.keys())
        doc.body_raw = self.body_html
        doc.body_text = self.body_text
        doc.headers = self.headers

        for kw in self.keywords:
            doc.add_keyword(safe_string(kw))
        for lang in self.languages:
            doc.add_language(safe_string(lang))

        db.session.flush()

        collector = DocumentTagCollector(doc, 'ingestors')
        for entity in self.entities:
            collector.emit(entity, DocumentTag.TYPE_PERSON)
        for email in self.emails:
            collector.emit(email, DocumentTag.TYPE_EMAIL)
        collector.save()
Example 9
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)

    # This will disable certain security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin

    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
Example 10
File: core.py Project: pudo/aleph
def url_for(*a, **kw):
    """Overwrite Flask url_for to force external paths."""
    try:
        kw['_external'] = False
        query = kw.pop('_query', None)
        authorize = kw.pop('_authorize', False)
        relative = kw.pop('_relative', False)
        path = flask_url_for(*a, **kw)
        if authorize is True and hasattr(request, 'authz'):
            token = request.authz.to_token(scope=path)
            query = list(ensure_list(query))
            query.append(('api_key', token))
        return url_external(path, query, relative=relative)
    except RuntimeError:
        return None
Example 11
File: util.py Project: pudo/aleph
def field_filter_query(field, values):
    """Need to define work-around for full-text fields."""
    values = ensure_list(values)
    if not len(values):
        return {'match_all': {}}
    if field in ['_id', 'id']:
        return {'ids': {'values': values}}
    if field in ['names']:
        field = 'fingerprints'
    if len(values) == 1:
        # if field in ['addresses']:
        #     field = '%s.text' % field
        #     return {'match_phrase': {field: values[0]}}
        return {'term': {field: values[0]}}
    return {'terms': {field: values}}
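
Worked outputs for a few hypothetical inputs:

field_filter_query('schema', None)             # {'match_all': {}}
field_filter_query('id', ['a1', 'b2'])         # {'ids': {'values': ['a1', 'b2']}}
field_filter_query('names', 'John Doe')        # {'term': {'fingerprints': 'John Doe'}}
field_filter_query('countries', ['de', 'fr'])  # {'terms': {'countries': ['de', 'fr']}}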
Example 12
File: match.py Project: pudo/aleph
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    filters = []
    for (prop, value) in proxy.itervalues():
        specificity = prop.specificity(value)
        if specificity > 0:
            filters.append((prop, value, specificity))

    filters = sorted(filters, key=lambda p: p[2], reverse=True)
    required = []
    for (prop, value, specificity) in filters:
        if prop.type in REQUIRED and len(required) <= MAX_CLAUSES:
            required.extend(_make_queries(prop, value, specificity))

    scoring = []
    for (prop, value, specificity) in filters:
        clauses = len(required) + len(scoring)
        if prop.type not in REQUIRED and clauses <= MAX_CLAUSES:
            scoring.extend(_make_queries(prop, value, specificity))

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Example 13
def publish(event, actor_id=None, params=None, channels=None):
    """ Publish a notification to the given channels, while storing
    the parameters and initiating actor for the event. """
    assert isinstance(event, Event), event
    params = params or {}
    outparams = {}
    channels = ensure_list(channels)
    channels.append(channel(actor_id, clazz=Role))
    for name, clazz in event.params.items():
        obj = params.get(name)
        outparams[name] = get_entity_id(obj)
        channels.append(channel(obj, clazz=clazz))
    Notification.publish(event,
                         actor_id=actor_id,
                         params=outparams,
                         channels=channels)
    db.session.flush()
Example 14
def generate_document(document, shallow=False):
    """Generate bulk index actions for all records and the main document."""
    data = document.to_dict()
    data['text'] = ensure_list(data.get('text'))
    total_len = sum((len(t) for t in data['text']))
    if document.supports_records:
        q = db.session.query(DocumentRecord)
        q = q.filter(DocumentRecord.document_id == document.id)
        for idx, record in enumerate(q.yield_per(BULK_PAGE)):
            texts = list(record.texts)
            if total_len < INDEX_MAX_LEN:
                total_len += sum((len(t) for t in texts))
                data['text'].extend(texts)
            record = record.to_dict()
            record['collection_id'] = document.collection_id
            record['created_at'] = document.created_at
            record['updated_at'] = document.updated_at
            record['text'] = texts
            if not shallow:
                entity_id, index, body = index_operation(record)
                yield {
                    '_id': entity_id,
                    '_index': index,
                    '_source': body
                }
            if idx > 0 and idx % 1000 == 0:
                log.info("Indexed [%s]: %s records...", document.id, idx)

    # log.debug("Text length [%s]: %s", document.id, total_len)
    entity_id, index, body = index_operation(data)
    for other in entities_index_list(Document.SCHEMA):
        if other != index:
            yield {
                '_id': entity_id,
                '_index': other,
                '_op_type': 'delete'
            }

    yield {
        '_id': entity_id,
        '_index': index,
        '_source': body
    }
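
Such action generators are typically drained by the Elasticsearch bulk helper; a sketch of a plausible caller (es and BULK_PAGE assumed from the surrounding module):

from elasticsearch.helpers import bulk

bulk(es, generate_document(document), chunk_size=BULK_PAGE,
     raise_on_error=False)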
Example 15
File: triples.py Project: cxz/aleph
def export_entity_properties(g, uri, entity):
    properties = entity.get('properties', {})
    schema = model.get(entity.get('schema'))
    for name, prop in schema.properties.items():
        for value in ensure_list(properties.get(name)):
            if prop.type_name == 'date':
                obj = date_lit(value)
            elif prop.type_name == 'country':
                obj = country_uri(value)
            elif prop.type_name == 'email':
                obj = email_uri(value)
            elif prop.type_name == 'phone':
                obj = phone_uri(value)
            elif prop.type_name == 'url':
                obj = URIRef(value)
            elif prop.type_name == 'entity':
                obj = entity_uri(value)
            else:
                obj = Literal(value)
            g.add((uri, FTM[name], obj))
Example 16
def iter_tokens(limit=1000000000):
    """Go through all the names in the index."""
    query = {'_source': {'include': 'names'}}
    index = entities_read_index(schema=Entity.LEGAL_ENTITY)
    seen = 0
    try:
        for res in scan(es, index=index, query=query, scroll='1440m'):
            names = ensure_list(res.get('_source', {}).get('names'))
            tokens = set()
            for name in names:
                tokens.update(name_tokens(name))
            yield from tokens

            seen += 1
            if seen % 1000 == 0:
                log.info("Entities: %s", seen)
            if limit is not None and seen > limit:
                return
    except Exception as ex:
        log.warning("Token iterator aborted: %s", ex)
Example 17
def parse_for_metadata(context, data, html):
    meta = context.params.get('meta', {})
    meta_date = context.params.get('meta_date', {})

    meta_paths = meta
    meta_paths.update(meta_date)

    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            element = html.find(xpath)
            if element is None:
                continue
            value = collapse_spaces(element.text_content())
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
            break

    return data
Example 18
def convert_record(record, country=None):
    published_date = clean_date(record.pop('publishedDate', None))
    publisher = record.pop('publisher', {}).get('name')
    if record.get('tag'):
        for entity in convert_item(record, country):
            entity.add('publisher', publisher, quiet=True)
            entity.add('modifiedAt', published_date, quiet=True)
            yield entity

    compiled_release = record.get('compiledRelease', {})
    for entity in convert_item(compiled_release, country):
        entity.add('publisher', publisher, quiet=True)
        entity.add('modifiedAt', published_date, quiet=True)
        yield entity

    for release in ensure_list(record.get('releases', [])):
        for entity in convert_item(release, country):
            entity.add('publisher', publisher, quiet=True)
            entity.add('modifiedAt', published_date, quiet=True)
            yield entity
Example 19
def format_proxy(proxy, collection, job_id=None):
    """Apply final denormalisations to the index."""
    proxy.context = {}
    proxy = collection.ns.apply(proxy)
    data = proxy.to_full_dict()
    data['collection_id'] = collection.id
    data['job_id'] = job_id
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    properties = data.get('properties')
    text = properties.pop('indexText', [])
    text.extend(fps)
    text.append(collection.label)
    data['text'] = text

    data['updated_at'] = collection.updated_at
    for updated_at in properties.pop('indexUpdatedAt', []):
        data['updated_at'] = updated_at

    # integer casting
    numeric = {}
    for prop, values in properties.items():
        prop = proxy.schema.get(prop)
        if prop.type in NUMERIC_TYPES:
            numeric[prop.name] = _numeric_values(prop.type, values)
    # also cast group field for dates
    numeric['dates'] = _numeric_values(registry.date, data.get('dates'))
    data['numeric'] = numeric

    # pprint(data)
    entity_id = data.pop('id')
    return {
        '_id': entity_id,
        '_index': entities_write_index(data.get('schema')),
        '_source': data
    }
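
The fingerprint step exists to collapse name variants into one searchable token set. Roughly, and depending on the version of the fingerprints library:

fps = set(fingerprints.generate(n) for n in ['Siemens AG', 'SIEMENS A.G.'])
# both variants normalise to a single fingerprint, e.g. 'ag siemens'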
Example 20
def generate_document(document, shallow=False):
    """Generate bulk index actions for all records and the main document."""
    data = document.to_dict()
    data['text'] = ensure_list(data.get('text'))
    total_len = sum((len(t) for t in data['text']))
    if document.supports_records:
        q = db.session.query(DocumentRecord)
        q = q.filter(DocumentRecord.document_id == document.id)
        for idx, record in enumerate(q.yield_per(BULK_PAGE)):
            texts = list(record.texts)
            if total_len < INDEX_MAX_LEN:
                total_len += sum((len(t) for t in texts))
                data['text'].extend(texts)
            record = record.to_dict()
            record['collection_id'] = document.collection_id
            record['created_at'] = document.created_at
            record['updated_at'] = document.updated_at
            record['text'] = texts
            if not shallow:
                entity_id, index, body = index_operation(record)
                yield {
                    '_id': entity_id,
                    '_index': index,
                    '_type': 'doc',
                    '_source': body
                }
            if idx > 0 and idx % 1000 == 0:
                log.info("Indexed [%s]: %s records...", document.id, idx)

    # log.debug("Text length [%s]: %s", document.id, total_len)
    entity_id, index, body = index_operation(data)
    for other in entities_index_list(Document.SCHEMA):
        if other != index:
            yield {
                '_id': entity_id,
                '_index': other,
                '_type': 'doc',
                '_op_type': 'delete'
            }

    yield {'_id': entity_id, '_index': index, '_type': 'doc', '_source': body}
Example 21
    def update_many(self,
                    rows,
                    keys,
                    chunk_size=1000,
                    ensure=None,
                    types=None):
        """Update many rows in the table at a time.

        This is significantly faster than updating them one by one. By default
        the rows are processed in chunks of 1000 per commit, unless you specify
        a different ``chunk_size``.

        See :py:meth:`update() <dataset.Table.update>` for details on
        the other parameters.
        """
        keys = ensure_list(keys)

        chunk = []
        columns = []
        for index, row in enumerate(rows):
            chunk.append(row)
            for col in row.keys():
                if col not in columns:
                    columns.append(col)

            # bindparam requires names to not conflict (cannot be "id" for id)
            for key in keys:
                row["_%s" % key] = row[key]

            # Update when chunk_size is fulfilled or this is the last row
            if len(chunk) == chunk_size or index == len(rows) - 1:
                cl = [self.table.c[k] == bindparam("_%s" % k) for k in keys]
                stmt = self.table.update(
                    whereclause=and_(*cl),
                    values={
                        col: bindparam(col, required=False)
                        for col in columns
                    },
                )
                self.db.executable.execute(stmt, chunk)
                chunk = []
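
A hypothetical call against a dataset table, updating two rows matched on their id column; thanks to ensure_list(), a single key can be passed bare:

rows = [
    {'id': 1, 'name': 'Alice'},
    {'id': 2, 'name': 'Bob'},
]
table.update_many(rows, keys='id')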
Example 22
 def _serialize(self, obj):
     pk = obj.get('id')
     obj['links'] = {
         'self': url_for('collections_api.view', collection_id=pk),
         'xref': url_for('xref_api.index', collection_id=pk),
         'xref_export': url_for('xref_api.export', collection_id=pk,
                                _authorize=obj.get('secret')),
         'reconcile': url_for('reconcile_api.reconcile',
                              collection_id=pk,
                              _authorize=obj.get('secret')),
         'ui': collection_url(pk)
     }
     obj['writeable'] = request.authz.can(pk, request.authz.WRITE)
     creator_id = obj.pop('creator_id', None)
     obj['creator'] = self.resolve(Role, creator_id, RoleSerializer)
     obj['team'] = []
     for role_id in ensure_list(obj.pop('team_id', [])):
         if request.authz.can_read_role(role_id):
             role = self.resolve(Role, role_id, RoleSerializer)
             obj['team'].append(role)
     return obj
Example 23
    def match_company(self, proxy):
        MatchCriteria = self.client.get_type('ns0:MatchCriteria')
        # SelectionResult = self.client.get_type('ns0:SelectionResult')
        countries = list(proxy.countries)
        if len(countries) == 1:
            ct = MatchCriteria(Name=proxy.caption, Country=countries[0])
        else:
            ct = MatchCriteria(Name=proxy.caption)

        data = self.cache.get(ct)
        if data is None:
            try:
                res = self.service.Match(self.session, ct, ['None'])
                data = zeep.helpers.serialize_object(res)
                # pprint(data)
                data = json.loads(json.dumps(data))
                self.cache.store(ct, data)
            except Exception:
                log.exception("Orbis match call error.")
                self.close()
        return ensure_list(data)
Example 24
def export_entity(entity, collection_uri):
    g = Graph()
    uri = registry.entity.rdf(entity.get('id'))
    g.add((uri, DCTERMS.isPartOf, collection_uri))
    g.add((collection_uri, DCTERMS.hasPart, uri))

    if 'properties' not in entity:
        entity.update(Document.doc_data_to_schema(entity))
    schema = model.get(entity.get('schema'))
    for schema_ in schema.schemata:
        g.add((uri, RDF.type, schema_.uri))

    properties = entity.get('properties', {})
    for name, prop in schema.properties.items():
        for value in ensure_list(properties.get(name)):
            obj = prop.type.rdf(value)
            g.add((uri, prop.uri, obj))

    if entity.get('name'):
        g.add((uri, SKOS.prefLabel, Literal(entity.get('name'))))
    return g
Example 25
def _index_form(collection, matches):
    now = datetime.utcnow().isoformat()
    for (score, entity, match_collection_id, match) in matches:
        xref_id = hash_data((entity.id, collection.id, match.id))
        text = ensure_list(entity.get_type_values(registry.name))
        text.extend(match.get_type_values(registry.name))
        yield {
            "_id": xref_id,
            "_index": xref_index(),
            "_source": {
                "score": score,
                "entity_id": entity.id,
                "collection_id": collection.id,
                "match_id": match.id,
                "match_collection_id": match_collection_id,
                "countries": match.get_type_values(registry.country),
                "schema": match.schema.name,
                "text": text,
                "created_at": now,
            },
        }
Example 26
    def generate(self):
        for parent in ensure_list(self.data.get('extends')):
            parent = self.model.get(parent)
            parent.generate()

            for name, prop in parent.properties.items():
                if name not in self.properties:
                    self.properties[name] = prop

            self.extends.add(parent)
            for ancestor in parent.schemata:
                self.schemata.add(ancestor)
                self.names.add(ancestor.name)
                ancestor.descendants.add(self)

        for prop in self.properties.values():
            prop.generate()

        for featured in self.featured:
            if self.get(featured) is None:
                raise InvalidModel("Missing featured property: %s" % featured)
Example 27
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field, func in (("created_at", min), ("updated_at", max)):
            ts = func(ensure_list(data.get(field)), default=None)
            dt = registry.date.to_datetime(ts)
            if dt is not None:
                entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        yield entity.id
    writer.flush()
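
Note that bulk_write is a generator: entities are written, and the final flush runs, only as the caller iterates. A minimal consumption sketch with hypothetical arguments:

entity_ids = list(bulk_write(collection, request_entities, role_id=role.id))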
Example 28
    def generate(self):
        """While loading the schema, this function will validate and fully
        load the hierarchy, properties and flags of the definition."""
        for parent in ensure_list(self.data.get("extends")):
            parent = self.model.get(parent)
            parent.generate()

            for name, prop in parent.properties.items():
                if name not in self.properties:
                    self.properties[name] = prop

            self.extends.add(parent)
            for ancestor in parent.schemata:
                self.schemata.add(ancestor)
                self.names.add(ancestor.name)
                ancestor.descendants.add(self)

        for prop in list(self.properties.values()):
            prop.generate()

        for featured in self.featured:
            if self.get(featured) is None:
                raise InvalidModel("Missing featured property: %s" % featured)

        for caption in self.caption:
            if self.get(caption) is None:
                raise InvalidModel("Missing caption property: %s" % caption)

        for required in self.required:
            if self.get(required) is None:
                raise InvalidModel("Missing required property: %s" % required)

        if self.edge:
            if self.source_prop is None:
                msg = "Missing edge source: %s" % self.edge_source
                raise InvalidModel(msg)

            if self.target_prop is None:
                msg = "Missing edge target: %s" % self.edge_target
                raise InvalidModel(msg)
Example 29
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            if cached:
                _cache_entity(entity)
            yield entity
Example 30
def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/XrefGenerate'
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    data = parse_request('XrefGenerate')
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)
Example 31
def match_query(proxy, collection_ids=None, query=None):
    """Given a document or entity in indexed form, build a query that
    will find similar entities based on a variety of criteria."""
    if query is None:
        query = bool_query()

    # Don't match the query entity:
    if proxy.id is not None:
        sq = {"ids": {"values": [proxy.id]}}
        query['bool']['must_not'].append(sq)

    collection_ids = ensure_list(collection_ids)
    if len(collection_ids):
        query['bool']['filter'].append({
            'terms': {'collection_id': collection_ids}
        })

    required = []
    scoring = []
    for (prop, value) in proxy.itervalues():
        queries = list(_make_queries(prop, value))
        if prop.type in REQUIRED:
            required.extend(queries)
        else:
            scoring.extend(queries)

    if not len(required):
        # e.g. a document from which no features have been extracted.
        return none_query()

    # make it mandatory to have at least one match
    query['bool']['must'].append({
        'bool': {
            'should': required,
            'minimum_should_match': 1
        }
    })
    query['bool']['should'].extend(scoring)
    return query
Example 32
 def search(
     self,
     query: str,
     schema: Optional[str] = None,
     schemata: Optional[str] = None,
     filters: Optional[List] = None,
     publisher: bool = False,
     params: Optional[Mapping[str, Any]] = None,
 ) -> "EntityResultSet":
     """Conduct a search and return the search results."""
     filters_list: List = ensure_list(filters)
     if schema is not None:
         filters_list.append(("schema", schema))
     if schemata is not None:
         filters_list.append(("schemata", schemata))
     if schema is None and schemata is None:
         filters_list.append(("schemata", "Thing"))
     url = self._make_url("entities",
                          query=query,
                          filters=filters_list,
                          params=params)
     return EntityResultSet(self, url, publisher)
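
Illustrative usage, assuming api is an instance of the client class above:

results = api.search('siemens', schema='Company')
for entity in results:
    print(entity.get('id'), entity.get('schema'))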
Example 33
def index_operation(data):
    """Apply final denormalisations to the index."""
    data['bulk'] = data.get('bulk', False)
    names = ensure_list(data.get('names'))
    fps = set([fingerprints.generate(name) for name in names])
    fps.update(names)
    data['fingerprints'] = [fp for fp in fps if fp is not None]

    # Slight hack: a magic property in followthemoney that gets taken out
    # of the properties and added straight to the index text.
    texts = data.pop('text', [])
    texts.extend(data.get('properties', {}).pop('indexText', []))
    texts.extend(fps)
    data['text'] = texts

    if not data.get('created_at'):
        data['created_at'] = data.get('updated_at')

    entity_id = str(data.pop('id'))
    data.pop('_index', None)
    index = entities_write_index(data.get('schema'))
    return entity_id, index, data
Example 34
def set_entity_properties(g, uri, entity):
    if entity.get('name'):
        g.add((uri, SKOS.prefLabel, Literal(entity.get('name'))))
    for name in entity.get('names', []):
        g.add((uri, RDFS.label, Literal(name)))

    for country in entity.get('countries', []):
        if len(country) != 2:
            continue
        g.add((uri, ALEPH.country, country_uri(country)))

    for phone in entity.get('phones', []):
        g.add((uri, ALEPH.phone, tel_uri(phone)))

    for email in entity.get('emails', []):
        g.add((uri, ALEPH.email, email_uri(email)))

    properties = entity.get('properties', {})
    for name, values in properties.items():
        pred = FTM[name]
        for value in ensure_list(values):
            obj = typed_object(name, value)
            g.add((uri, pred, obj))
Example 35
 def _serialize(self, obj):
     pk = obj.get("id")
     authz = request.authz if obj.get("secret") else None
     obj["links"] = {
         "self":
         url_for("collections_api.view", collection_id=pk),
         "xref_export":
         url_for("xref_api.export", collection_id=pk, _authz=authz),
         "reconcile":
         url_for("reconcile_api.reconcile", collection_id=pk),
         "ui":
         collection_url(pk),
     }
     obj["shallow"] = obj.get("shallow", True)
     obj["writeable"] = request.authz.can(pk, request.authz.WRITE)
     creator_id = obj.pop("creator_id", None)
     obj["creator"] = self.resolve(Role, creator_id, RoleSerializer)
     obj["team"] = []
     for role_id in ensure_list(obj.pop("team_id", [])):
         if request.authz.can_read_role(role_id):
             role = self.resolve(Role, role_id, RoleSerializer)
             obj["team"].append(role)
     return obj
Example 36
def _create_meta_object(context, data) -> dict:
    source_url = data.get("source_url", data.get("url"))
    foreign_id = data.get("foreign_id", data.get("request_id", source_url))

    meta = {
        "crawler": context.crawler.name,
        "foreign_id": foreign_id,
        "source_url": source_url,
        "title": data.get("title"),
        "author": data.get("author"),
        "publisher": data.get("publisher"),
        "file_name": data.get("file_name"),
        "retrieved_at": data.get("retrieved_at"),
        "modified_at": data.get("modified_at"),
        "published_at": data.get("published_at"),
        "headers": ensure_dict(data.get("headers")),
        "keywords": ensure_list(data.get("keywords")),
    }

    if data.get("aleph_folder_id"):
        meta["parent"] = {"id": data.get("aleph_folder_id")}

    return meta
Example 37
def entities_read_index(schema=None, descendants=True, exclude=None):
    """Combined index to run all queries against."""
    if not settings.ENTITIES_INDEX_SPLIT:
        indexes = set(settings.ENTITIES_INDEX_SET)
        indexes.add(settings.ENTITIES_INDEX)
        return ','.join(indexes)

    schemata = set()
    names = ensure_list(schema) or model.schemata.values()
    for schema in names:
        schema = model.get(schema)
        if schema is None:
            continue
        schemata.add(schema)
        if descendants:
            schemata.update(schema.descendants)
    exclude = model.get(exclude)
    indexes = list(settings.ENTITIES_INDEX_SET)
    for schema in schemata:
        if not schema.abstract and schema != exclude:
            indexes.append(schema_index(schema))
    # log.info("Read index: %r", indexes)
    return ','.join(indexes)
Example 38
 def _serialize(self, obj):
     pk = obj.get('id')
     obj['id'] = str(pk)
     schema = model.get(obj.get('schema'))
     if schema is None:
         return None
     properties = obj.get('properties', {})
     for prop in schema.properties.values():
         if prop.type != registry.entity:
             continue
         values = ensure_list(properties.get(prop.name))
         if values:
             properties[prop.name] = []
             for value in values:
                 entity = self.resolve(Entity, value,
                                       DiagramEntitySerializer)  # noqa
                 if entity is None:
                     entity = value
                 properties[prop.name].append(entity)
     obj.pop('_index', None)
     collection_id = obj.pop('collection_id', None)
     obj['collection_id'] = str(collection_id)
     return self._clean_response(obj)
Example 39
def entities_by_ids(
    ids, schemata=None, cached=False, includes=PROXY_INCLUDES, excludes=None
):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    cached = cached and excludes is None and includes == PROXY_INCLUDES
    entities = {}
    if cached:
        keys = [cache.object_key(Entity, i) for i in ids]
        for _, entity in cache.get_many_complex(keys):
            if entity is not None:
                entities[entity.get("id")] = entity

    missing = [i for i in ids if entities.get(i) is None]
    index = entities_read_index(schema=schemata)
    query = {
        "query": {"ids": {"values": missing}},
        "_source": _source_spec(includes, excludes),
        "size": MAX_PAGE,
    }
    result = es.search(index=index, body=query)
    for doc in result.get("hits", {}).get("hits", []):
        entity = unpack_result(doc)
        if entity is not None:
            entity_id = entity.get("id")
            entities[entity_id] = entity
            if cached:
                key = cache.object_key(Entity, entity_id)
                cache.set_complex(key, entity, expires=60 * 60 * 2)

    for i in ids:
        entity = entities.get(i)
        if entity is not None:
            yield entity
Example 40
def index():
    """Returns a list of diagrams for the role
    ---
    get:
      summary: List diagrams
      parameters:
      - description: The collection id.
        in: query
        name: 'filter:collection_id'
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '200':
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/Diagram'
          description: OK
      tags:
        - Diagram
    """
    parser = QueryParser(request.args, request.authz)
    q = Diagram.by_authz(request.authz)
    collection_ids = ensure_list(parser.filters.get('collection_id'))
    if len(collection_ids):
        q = q.filter(Diagram.collection_id.in_(collection_ids))
    result = DatabaseQueryResult(request, q)
    return DiagramSerializer.jsonify_result(result)
Example 41
 def ingest_component(self, entity, idx, comp):
     if comp.name == "VCALENDAR":
         entity.add("generator", comp.get("PRODID"))
     if comp.name == "VEVENT":
         event = self.manager.make_entity("Event")
         self.manager.apply_context(event, entity)
         uid = sanitize_text(comp.get("UID"))
         if uid is not None:
             event.make_id(uid)
         else:
             event.make_id(entity.id, idx)
         event.add("proof", entity)
         event.add("name", comp.get("SUMMARY"))
         event.add("description", comp.get("DESCRIPTION"))
         event.add("location", comp.get("LOCATION"))
         event.add("sourceUrl", comp.get("URL"))
         event.add("startDate", cal_date(comp.get("DTSTART")))
         event.add("endDate", cal_date(comp.get("DTEND")))
         event.add("date", cal_date(comp.get("CREATED")))
         event.add("modifiedAt", cal_date(comp.get("LAST-MODIFIED")))
         event.add("organizer", self.address_entity(comp.get("ORGANIZER")))
         for attendee in ensure_list(comp.get("ATTENDEE")):
             event.add("involved", self.address_entity(attendee))
         self.manager.emit_entity(event, fragment=idx)
Example 42
    def extract_text(self, data, languages=None):
        key = sha1(data).hexdigest()
        text = Cache.get_cache(key)
        if text is not None:
            log.info('%s chars cached', len(text))
            return text

        data = self.ensure_size(data)
        if data is None:
            return

        for attempt in range(1000):
            try:
                service = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                image = Image(data=data, languages=languages)
                response = service.Recognize(image)
                log.info('%s chars recognized', len(response.text))
                if response.text is not None:
                    Cache.set_cache(key, response.text)
                return response.text
            except self.Error as e:
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                backoff(failures=attempt)
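
The backoff helper is assumed from the surrounding service code; a minimal stand-in with capped, jittered exponential delay might look like:

import random
import time

def backoff(failures=0, base=1.0, cap=30.0):
    # sleep between 0.5x and 1x of min(cap, base * 2^failures) seconds
    delay = min(cap, base * (2 ** failures))
    time.sleep(delay * random.uniform(0.5, 1.0))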
Example 43
def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity
Example 44
def parse_for_metadata(context, data, html):
    meta = context.params.get('meta', {})
    meta_date = context.params.get('meta_date', {})

    meta_paths = meta
    meta_paths.update(meta_date)

    for key, xpaths in meta_paths.items():
        for xpath in ensure_list(xpaths):
            elements = html.xpath(xpath)
            if not elements:
                continue
            element = elements[0]
            try:
                value = collapse_spaces(element.text_content())
            except AttributeError:
                # useful when element is an attribute
                value = collapse_spaces(str(element))
            if key in meta_date:
                value = iso_date(value)
            if value is not None:
                data[key] = value
            break

    return data
Example 45
File: authz.py Project: pudo/aleph
 def match(self, roles):
     """See if there's overlap in roles."""
     roles = ensure_list(roles)
     if not len(roles):
         return False
     return len(self.roles.intersection(roles)) > 0
Example 46
File: result.py Project: pudo/aleph
 def __init__(self, label, span, countries):
     args = dict(countries=countries)
     label = self.type.clean(label, **args)
     super(TypedResult, self).__init__(label, span, countries)
     self.countries = ensure_list(self.type.country_hint(label))
Example 47
def _source_spec(includes, excludes):
    includes = ensure_list(includes)
    excludes = ensure_list(excludes)
    if not len(excludes):
        excludes = EXCLUDE_DEFAULT
    return {'includes': includes, 'excludes': excludes}
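
Worked outputs, assuming EXCLUDE_DEFAULT lists the large text fields:

_source_spec(['schema', 'properties'], None)
# {'includes': ['schema', 'properties'], 'excludes': EXCLUDE_DEFAULT}
_source_spec(None, ['text'])
# {'includes': [], 'excludes': ['text']}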