コード例 #1
0
ファイル: mapping.py プロジェクト: rmallof/aleph
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {"csv_url": _get_table_csv_link(table), "entities": mapping.query}
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    idx = 0
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            if entity.schema.is_a("Thing"):
                entity.add("proof", mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
            if mapping.entityset is not None:
                EntitySetItem.save(
                    mapping.entityset,
                    entity.id,
                    collection_id=collection.id,
                    added_by_id=mapping.role_id,
                )
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
コード例 #2
0
def bulk_write(collection,
               entities,
               safe=False,
               role_id=None,
               mutable=True,
               index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
コード例 #3
0
ファイル: processing.py プロジェクト: djoffrey/aleph
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
コード例 #4
0
def map_to_aggregator(collection, mapping, aggregator):
    table = get_entity(mapping.table_id)
    if table is None:
        table = aggregator.get(mapping.table_id)
    if table is None:
        raise RuntimeError("Table cannot be found: %s" % mapping.table_id)
    config = {
        'csv_url': _get_table_csv_link(table),
        'entities': mapping.query
    }
    mapper = model.make_mapping(config, key_prefix=collection.foreign_id)
    origin = mapping_origin(mapping.id)
    aggregator.delete(origin=origin)
    writer = aggregator.bulk()
    for idx, record in enumerate(mapper.source.records, 1):
        if idx > 0 and idx % 1000 == 0:
            log.info("[%s] Mapped %s rows ...", mapping.id, idx)
        for entity in mapper.map(record).values():
            entity.context = mapping.get_proxy_context()
            entity.context['mutable'] = True
            if entity.schema.is_a('Thing'):
                entity.add('proof', mapping.table_id)
            entity = collection.ns.apply(entity)
            entity = remove_checksums(entity)
            writer.put(entity, fragment=idx, origin=origin)
    writer.flush()
    log.info("[%s] Mapping done (%s rows)", mapping.id, idx)
コード例 #5
0
ファイル: processing.py プロジェクト: x0rzkov/aleph
 def _generate():
     for data in entities:
         if not is_mapping(data):
             raise InvalidData("Failed to read input data", errors=data)
         entity = model.get_proxy(data)
         if entity.id is None:
             raise InvalidData("No ID for entity", errors=entity.to_dict())
         if not unsafe:
             entity = remove_checksums(entity)
         yield _process_entity(entity)
コード例 #6
0
 def test_remove_checksums(self):
     proxy = model.get_proxy(
         {
             "id": "banana",
             "schema": "Document",
             "properties": {"contentHash": ["banana"], "title": ["foo"]},
         }
     )
     proxy = remove_checksums(proxy)
     assert not proxy.has("contentHash")
     assert proxy.has("title")
コード例 #7
0
 def test_remove_checksums(self):
     proxy = model.get_proxy({
         'id': 'banana',
         'schema': 'Document',
         'properties': {
             'contentHash': ['banana'],
             'title': ['foo']
         }
     })
     proxy = remove_checksums(proxy)
     assert not proxy.has('contentHash')
     assert proxy.has('title')
コード例 #8
0
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field, func in (("created_at", min), ("updated_at", max)):
            ts = func(ensure_list(data.get(field)), default=None)
            dt = registry.date.to_datetime(ts)
            if dt is not None:
                entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        yield entity.id
    writer.flush()