Esempio n. 1
0
def reindex(step=5000):
    """Re-index every frame, walking the frames in pages of ``step`` rows.

    Frames are read ordered by ``Frame.submitted_at`` descending and each
    one is passed to ``index``. The walk ends at the first empty page.

    :param step: number of frames fetched per page (also the offset
        increment between pages).
    """
    # The existing index is not dropped first; that call is disabled:
    #elastic.indices.delete_index("datawire")
    q = Frame.all().order_by(Frame.submitted_at.desc())
    for offset in count(0, step):
        log.info("Re-indexing at %s", offset)
        # Fetch the page once and check it for emptiness instead of
        # issuing a separate COUNT query followed by the same SELECT.
        page = list(q.limit(step).offset(offset))
        if not page:
            return
        for frame_obj in page:
            index(frame_obj)
Esempio n. 2
0
def matchall():
    """ Run matching again for every frame that can be loaded. """
    from datawire.store import load_frame
    from datawire.model import Frame
    from datawire.processing.matching import match

    for ref in Frame.all():
        loaded = load_frame(ref.urn)
        # Frames whose stored content cannot be loaded are skipped.
        if loaded is not None:
            match(loaded)
Esempio n. 3
0
def generate_frame(service_key, event_key, data):
    """Build, persist and publish a new frame for a service event.

    :param service_key: key used to look up the submitting service.
    :param event_key: key used to look up the event within that service.
    :param data: mapping read via ``.get``; ``body`` is the frame payload
        and ``headers`` (optional) may carry ``X-Source-Location``,
        ``X-Details-Location`` and ``X-Action-Time``.
    :returns: the URN of the newly created frame.
    :raises NotFound: if the service or the event does not exist.
    :raises BadRequest: if a frame with the same content hash exists.
    """
    service = Service.by_key(service_key)
    if service is None:
        raise NotFound('No such service: %s' % service_key)
    event = Event.by_key(service, event_key)
    if event is None:
        raise NotFound('No such event: %s' % event_key)

    frame = {
        'service': service_key,
        'event': event_key,
        'data': data.get('body')
    }
    # A submission without a 'headers' entry is treated like one whose
    # headers are all absent, instead of failing on ``None.get``.
    headers = data.get('headers') or {}
    # The arguments to update() are evaluated before the update happens,
    # so the hash covers only service, event and data.
    frame.update({
        'source_url': parse_url(headers.get('X-Source-Location')),
        'details_url': parse_url(headers.get('X-Details-Location')),
        'hash': data_hash(frame),
        'action_at': parse_datetime(headers.get('X-Action-Time')),
        'submitted_at': datetime.utcnow()
    })

    # The action time defaults to the submission time and is never
    # allowed to be later than it.
    if not frame['action_at']:
        frame['action_at'] = frame['submitted_at']
    else:
        frame['action_at'] = min(frame['action_at'], frame['submitted_at'])

    frame['urn'] = Frame.to_urn(frame)

    if Frame.by_hash(frame['hash']) is not None:
        raise BadRequest('Duplicate content, hash: %(hash)s' % frame)

    Frame.create(service, event, frame)
    store_frame(frame)
    db.session.commit()

    # Publish only after the commit, once to each queue.
    log.info("created: %(urn)s (%(hash)s)", frame)
    routing_key = 'matching.%s.%s' % (service_key, event_key)
    publish(matching_queue, routing_key, frame)
    routing_key = 'indexing.%s.%s' % (service_key, event_key)
    publish(indexing_queue, routing_key, frame)
    return frame['urn']
Esempio n. 4
0
def generate_frame(service_key, event_key, data):
    """Create a frame from a submission, store it and queue follow-up work.

    :param service_key: key used to look up the submitting service.
    :param event_key: key used to look up the event within that service.
    :param data: mapping read via ``.get``; ``body`` is the frame payload
        and ``headers`` (optional) may carry ``X-Source-Location``,
        ``X-Details-Location`` and ``X-Action-Time``.
    :returns: the URN of the newly created frame.
    :raises NotFound: if the service or the event does not exist.
    :raises BadRequest: if a frame with the same content hash exists.
    """
    service = Service.by_key(service_key)
    if service is None:
        raise NotFound('No such service: %s' % service_key)
    event = Event.by_key(service, event_key)
    if event is None:
        raise NotFound('No such event: %s' % event_key)

    frame = {
        'service': service_key,
        'event': event_key,
        'data': data.get('body')
    }
    # Fall back to an empty mapping so a submission without 'headers'
    # does not raise AttributeError on the lookups below.
    headers = data.get('headers') or {}
    # data_hash() sees the frame before update() runs, i.e. only the
    # service, event and data entries.
    frame.update({
        'source_url': parse_url(headers.get('X-Source-Location')),
        'details_url': parse_url(headers.get('X-Details-Location')),
        'hash': data_hash(frame),
        'action_at': parse_datetime(headers.get('X-Action-Time')),
        'submitted_at': datetime.utcnow()
    })

    # Missing action time falls back to the submission time; a supplied
    # one is capped at the submission time.
    if not frame['action_at']:
        frame['action_at'] = frame['submitted_at']
    else:
        frame['action_at'] = min(frame['action_at'], frame['submitted_at'])

    frame['urn'] = Frame.to_urn(frame)

    if Frame.by_hash(frame['hash']) is not None:
        raise BadRequest('Duplicate content, hash: %(hash)s' % frame)

    Frame.create(service, event, frame)
    store_frame(frame)
    db.session.commit()

    # The frame is committed before it is announced on either queue.
    log.info("created: %(urn)s (%(hash)s)", frame)
    routing_key = 'matching.%s.%s' % (service_key, event_key)
    publish(matching_queue, routing_key, frame)
    routing_key = 'indexing.%s.%s' % (service_key, event_key)
    publish(indexing_queue, routing_key, frame)
    return frame['urn']
Esempio n. 5
0
def backsearch(entity, step=5000):
    """Match already-stored frames against ``entity``.

    Frames are walked in pages of ``step`` rows, ordered by
    ``Frame.submitted_at`` descending, for page offsets below
    ``BACKSEARCH_LIMIT``. The walk stops early at the first empty page or
    once at least ``BACKSEARCH_FIND`` matches have been found.

    :param entity: object providing ``pattern``, ``text`` and ``id``.
    :param step: number of frames fetched per page.
    """
    # TODO: Check if the string is already tracked, use existing results.
    found_count = 0
    pattern = entity.pattern
    q = Frame.all().order_by(Frame.submitted_at.desc())
    for offset in range(0, BACKSEARCH_LIMIT, step):
        log.info("Backsearch [%s] at %s (found: %s)", entity.text, offset, found_count)
        # Fetch the page once and check it for emptiness instead of
        # issuing a separate COUNT query followed by the same SELECT.
        page = list(q.limit(step).offset(offset))
        if not page:
            return
        for frame_obj in page:
            frame = load_frame(frame_obj.urn)
            if frame is None:
                # NOTE(review): load_frame appears able to return None
                # for a frame that cannot be loaded; skip it rather than
                # handing None to match().
                continue
            matches = match(frame, pattern, [entity.id])
            found_count += len(matches)
            # Commit only when this frame actually produced matches.
            if len(matches):
                db.session.commit()
            if found_count >= BACKSEARCH_FIND:
                return
Esempio n. 6
0
def backsearch(entity, step=5000):
    """Search previously stored frames for matches of ``entity``.

    Frames are walked in pages of ``step`` rows, ordered by
    ``Frame.submitted_at`` descending, for page offsets below
    ``BACKSEARCH_LIMIT``. The walk stops early at the first empty page or
    once at least ``BACKSEARCH_FIND`` matches have been found.

    :param entity: object providing ``pattern``, ``text`` and ``id``.
    :param step: number of frames fetched per page.
    """
    # TODO: Check if the string is already tracked, use existing results.
    found_count = 0
    pattern = entity.pattern
    q = Frame.all().order_by(Frame.submitted_at.desc())
    for offset in range(0, BACKSEARCH_LIMIT, step):
        log.info("Backsearch [%s] at %s (found: %s)", entity.text, offset,
                 found_count)
        # One query per page: materialise it and test for emptiness,
        # rather than running COUNT and then the same SELECT.
        page = list(q.limit(step).offset(offset))
        if not page:
            return
        for frame_obj in page:
            frame = load_frame(frame_obj.urn)
            if frame is None:
                # NOTE(review): load_frame appears able to return None
                # for a frame that cannot be loaded; skip it rather than
                # handing None to match().
                continue
            matches = match(frame, pattern, [entity.id])
            found_count += len(matches)
            # Commit only when this frame actually produced matches.
            if len(matches):
                db.session.commit()
            if found_count >= BACKSEARCH_FIND:
                return
Esempio n. 7
0
def user_index(id):
    """Return a pager of frames found by an entity-filtered search.

    If the request carries ``entity`` arguments, one term filter per
    requested entity is placed under an ``and`` filter. Otherwise one term
    filter per entity belonging to user ``id`` is placed under an ``or``
    filter. Hits are sorted by ``action_at`` descending, and the matching
    ``Frame`` rows are returned in hit order.
    """
    require.user_id(id)

    # Shared with the query body below and populated afterwards.
    filter_clause = {}
    esq = {
        "query": {
            "filtered": {
                "query": {"match_all": {}}, "filter": filter_clause
            }
        },
        "sort": [{"action_at": {"order": "desc"}}],
        "size": get_limit(),
        "from": get_offset(),
        "facets": {"entities": {
            "terms": {"field": "entities"}}
        }
    }

    requested = request.args.getlist('entity')
    if requested:
        filter_clause['and'] = [
            {"term": {"entities": entity_id}} for entity_id in requested
        ]
    else:
        owned = Entity.all().filter(Entity.user_id == id)
        filter_clause['or'] = [
            {"term": {"entities": entity.id}} for entity in owned
        ]

    res = elastic.search_raw(esq, elastic_index, 'frame')
    frame_urns = [hit['_id'] for hit in res['hits']['hits']]

    # Load the rows in one query, then restore the search hit order.
    # A URN with no matching row yields a None entry.
    rows = Frame.all().filter(Frame.urn.in_(frame_urns))
    by_urn = {row.urn: row for row in rows}
    frames = [by_urn.get(urn) for urn in frame_urns]

    return query_pager(frames, 'frames.user_index',
                       count=res['hits']['total'],
                       paginate=False,
                       id=id)
Esempio n. 8
0
def index():
    """Return a pager over all frames, ordered by ``action_at`` descending."""
    newest_first = Frame.all().order_by(Frame.action_at.desc())
    return query_pager(newest_first, 'frames.index')
Esempio n. 9
0
def handle_indexing(body, message):
    """Index the frame referenced by an incoming indexing message.

    ``body['urn']`` identifies the frame to look up; the routing key from
    the message's delivery info is only used in the log line.
    """
    delivery_info = message.delivery_info
    routing_key = delivery_info.get('routing_key')
    urn = body['urn']
    log.info('%s - indexing: %s', routing_key, urn)
    index(Frame.by_urn(urn))