Example #1
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index, records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    # Update collection stats
    index_collection(collection)
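
The comment in this example points out that a tabular mapping often emits the same entity several times in quick succession, which is why duplicates are merged under one id before indexing. A minimal sketch of that merge step using followthemoney directly; the schema, ids and property values are made up for illustration.

from followthemoney import model

# Two emissions of the same logical entity, as a tabular source might
# produce them row by row (illustrative data only).
first = model.make_entity('Directorship')
first.id = 'directorship-1'
first.add('role', 'director')

second = model.make_entity('Directorship')
second.id = 'directorship-1'
second.add('startDate', '2019-01-01')

# Same id twice: fold the later emission into the earlier one, as the
# loop above does with entities[entity.id].merge(entity).
first.merge(second)
print(first.to_dict())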
Example #2
def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")

        entity = model.get_proxy(item)
        if entity.id is None:
            raise InvalidData("No ID for entity")

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities)
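
The items handed to bulk_write are plain followthemoney entity dicts that model.get_proxy can read. A small usage sketch, assuming a collection object is already available; ids and property values are placeholders.

# Hypothetical input for bulk_write(); `collection` is assumed to exist.
items = [
    {
        'id': 'person-1',
        'schema': 'Person',
        'properties': {'name': ['Jane Doe']},
    },
    {
        'id': 'company-1',
        'schema': 'Company',
        'properties': {'name': ['Acme Ltd'], 'jurisdiction': ['de']},
    },
]
bulk_write(collection, items)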
Example #3
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    entities = {}
    total = 0
    for idx, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity_id = entity.get('id')
            if entity_id is None:
                continue
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            base = entities.get(entity_id, {})
            entities[entity_id] = merge_data(entity, base)
            total += 1

        if idx % 1000 == 0:
            log.info("[%s] Loaded %s records, %s entities...",
                     collection.foreign_id, idx, total)

        if len(entities) >= BULK_PAGE:
            index_bulk(collection, entities, chunk_size=BULK_PAGE)
            entities = {}

    if len(entities):
        index_bulk(collection, entities, chunk_size=BULK_PAGE)

    # Update collection stats
    index_collection(collection)
Example #4
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
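
Unlike Example #2, this variant first signs every entity id into the collection's namespace via Namespace.apply, so identical input ids from different collections cannot collide in the index. A rough sketch of that signing step in isolation; the namespace string stands in for collection.foreign_id, and the exact shape of the signed id may vary between followthemoney versions.

from followthemoney import model
from followthemoney.namespace import Namespace

ns = Namespace('my-collection')  # stands in for collection.foreign_id
entity = model.make_entity('Person')
entity.id = 'person-1'
entity.add('name', 'Jane Doe')

signed = ns.apply(entity)
# The id now carries a namespace-specific signature suffix,
# roughly 'person-1.<signature>'.
print(signed.id)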
Example #5
def index_aggregate(stage,
                    collection,
                    sync=False,
                    entity_ids=None,
                    mapping_id=None):
    """Project the contents of the collections aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_ids=entity_ids)
    entities = (_process_entity(e, sync=sync) for e in entities)
    extra = {'job_id': stage.job.id, 'mapping_id': mapping_id}
    index_bulk(collection, entities, extra, sync=sync)
    refresh_collection(collection.id, sync=sync)
Example #6
def index_aggregator(collection, aggregator, entity_ids=None, sync=False):
    def _generate():
        idx = 0
        entities = aggregator.iterate(entity_id=entity_ids)
        for idx, proxy in enumerate(entities, 1):
            if idx % 1000 == 0:
                log.debug("[%s] Index: %s...", collection, idx)
            yield proxy
        log.debug("[%s] Indexed %s entities", collection, idx)

    entities_index.index_bulk(collection, _generate(), sync=sync)
    aggregator.close()
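
Here the _generate wrapper streams entities straight into index_bulk instead of building batches in memory, logging progress every 1000 items. The same pass-through pattern as a standalone generator, with hypothetical names, usable for any iterable:

import logging

log = logging.getLogger(__name__)

def log_progress(entities, label, every=1000):
    # Pass entities through untouched, emitting a progress line every
    # `every` items and a final count at the end (names are illustrative).
    count = 0
    for count, entity in enumerate(entities, 1):
        if count % every == 0:
            log.debug("[%s] Index: %s...", label, count)
        yield entity
    log.debug("[%s] Indexed %s entities", label, count)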
Example #7
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)

    index_bulk(collection, _generate(), job_id=job_id)
    refresh_collection(collection.id)
Example #8
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""

    # This is called mainly by the /api/2/collections/X/_bulk API.
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if entity.id is None:
                raise InvalidData("No ID for entity", errors=entity.to_dict())
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)

    index_bulk(collection, _generate(), {'job_id': job_id})
    refresh_collection(collection.id)
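
The comment notes that this code path is mostly driven by the /api/2/collections/X/_bulk API. A rough client-side sketch of pushing entities through that endpoint; host, collection id, API key and payload are placeholders, and the exact request contract should be checked against the Aleph API documentation.

import requests

host = 'https://aleph.example.org'  # placeholder host
collection_id = '1'                 # placeholder collection id
entities = [
    {'id': 'person-1', 'schema': 'Person', 'properties': {'name': ['Jane Doe']}},
]
resp = requests.post(
    f'{host}/api/2/collections/{collection_id}/_bulk',
    json=entities,
    headers={'Authorization': 'ApiKey <api-key>'},  # placeholder credential
)
resp.raise_for_status()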
Example #9
def index_entities(stage, collection, iterable, sync=False):
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            stage.report_finished(len(entities))
            index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
            entities = []

    if len(entities):
        stage.report_finished(len(entities))
        index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
    refresh_collection(collection)
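
This example (and Example #10 below) accumulates entities until BULK_PAGE is reached and then flushes the batch to index_bulk. The same accumulate-and-flush pattern as a generic helper, a sketch with hypothetical names:

def chunked(iterable, size):
    # Collect up to `size` items at a time and hand each full batch to
    # the caller; the final, possibly smaller batch is yielded last.
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:
        yield batch

# Hypothetical usage mirroring index_entities:
# for batch in chunked(iterable, BULK_PAGE):
#     index_bulk(collection, batch, job_id=stage.job.id, sync=sync)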
Example #10
def index_entities(collection, iterable, sync=False):
    queue = get_queue(collection, OP_INDEX)
    queue.progress.mark_pending(len(iterable))
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            entities = []

    if len(entities):
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)
Example #11
def load_rows(query, rows):
    """Load a single batch of QUEUE_PAGE rows from the given query."""
    entities = {}
    links = []
    for row in rows:
        entity_map = {}
        for entity in query.entities:
            data = entity.to_index(row)
            if data is not None:
                entity_map[entity.name] = data
                entities[data['id']] = data

        for link in query.links:
            for inverted in [False, True]:
                data = link.to_index(row, entity_map, inverted=inverted)
                if data is not None:
                    links.append(data)

    index_bulk(entities, links)
    log.info("[%s] Indexed %s rows as %s entities, %s links...",
             query.collection.foreign_id, len(rows), len(entities), len(links))
Example #12
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
Example #13
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #14
def index_aggregate(stage, collection, entity_id=None, sync=False):
    """Project the contents of the collections aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_id=entity_id)
    entities = (_process_entity(e, sync=sync) for e in entities)
    index_bulk(collection, entities, job_id=stage.job.id)
    refresh_collection(collection.id)