def reindex(step=5000):
    # elastic.indices.delete_index("datawire")
    from itertools import count  # unbounded paging loop below
    q = Frame.all().order_by(Frame.submitted_at.desc())
    for offset in count(0, step):
        log.info("Re-indexing at %s", offset)
        if 0 == q.limit(step).offset(offset).count():
            return
        for frame_obj in q.limit(step).offset(offset):
            index(frame_obj)
def matchall():
    """ Re-do all matching. """
    from datawire.store import load_frame
    from datawire.model import Frame
    from datawire.processing.matching import match

    for frame_ref in Frame.all():
        frame = load_frame(frame_ref.urn)
        if frame is None:
            continue
        match(frame)
def generate_frame(service_key, event_key, data):
    service = Service.by_key(service_key)
    if service is None:
        raise NotFound('No such service: %s' % service_key)
    event = Event.by_key(service, event_key)
    if event is None:
        raise NotFound('No such event: %s' % event_key)

    frame = {
        'service': service_key,
        'event': event_key,
        'data': data.get('body')
    }
    headers = data.get('headers')
    frame.update({
        'source_url': parse_url(headers.get('X-Source-Location')),
        'details_url': parse_url(headers.get('X-Details-Location')),
        'hash': data_hash(frame),
        'action_at': parse_datetime(headers.get('X-Action-Time')),
        'submitted_at': datetime.utcnow()
    })

    # The action time must never lie in the future of the submission time.
    if not frame['action_at']:
        frame['action_at'] = frame['submitted_at']
    else:
        frame['action_at'] = min(frame['action_at'], frame['submitted_at'])

    frame['urn'] = Frame.to_urn(frame)

    # De-duplicate on the content hash computed above.
    if Frame.by_hash(frame['hash']) is not None:
        raise BadRequest('Duplicate content, hash: %(hash)s' % frame)

    Frame.create(service, event, frame)
    store_frame(frame)
    db.session.commit()
    log.info("created: %(urn)s (%(hash)s)", frame)

    # Fan the new frame out to the matching and indexing workers.
    routing_key = 'matching.%s.%s' % (service_key, event_key)
    publish(matching_queue, routing_key, frame)
    routing_key = 'indexing.%s.%s' % (service_key, event_key)
    publish(indexing_queue, routing_key, frame)
    return frame['urn']
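# A minimal sketch of the payload generate_frame() expects. The header names
# are taken from the function above; the concrete values, the body shape and
# the service/event keys are illustrative assumptions, not part of this module.
#
#   data = {
#       'headers': {
#           'X-Source-Location': 'http://example.org/feed',
#           'X-Details-Location': 'http://example.org/feed/item/1',
#           'X-Action-Time': '2014-01-01T12:00:00Z'
#       },
#       'body': {'title': 'An example event'}
#   }
#   urn = generate_frame('my-service', 'my-event', data)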
def backsearch(entity, step=5000):
    # TODO: Check if the string is already tracked, use existing results.
    found_count = 0
    pattern = entity.pattern
    q = Frame.all().order_by(Frame.submitted_at.desc())
    for offset in range(0, BACKSEARCH_LIMIT, step):
        log.info("Backsearch [%s] at %s (found: %s)",
                 entity.text, offset, found_count)
        if 0 == q.limit(step).offset(offset).count():
            return
        for frame_obj in q.limit(step).offset(offset):
            frame = load_frame(frame_obj.urn)
            matches = match(frame, pattern, [entity.id])
            found_count += len(matches)
            if len(matches):
                db.session.commit()
        # Stop paging once enough matches have been collected.
        if found_count >= BACKSEARCH_FIND:
            return
def user_index(id):
    require.user_id(id)
    esq = {
        "query": {
            "filtered": {
                "query": {"match_all": {}},
                "filter": {}
            }
        },
        "sort": [{"action_at": {"order": "desc"}}],
        "size": get_limit(),
        "from": get_offset(),
        "facets": {
            "entities": {"terms": {"field": "entities"}}
        }
    }

    # With explicit ?entity= filters, require all of them; otherwise show
    # frames matching any of the user's own entities.
    filters = request.args.getlist('entity')
    if len(filters):
        esq['query']['filtered']['filter']['and'] = []
        for entity_id in filters:
            fq = {"term": {"entities": entity_id}}
            esq['query']['filtered']['filter']['and'].append(fq)
    else:
        esq['query']['filtered']['filter']['or'] = []
        for entity in Entity.all().filter(Entity.user_id == id):
            fq = {"term": {"entities": entity.id}}
            esq['query']['filtered']['filter']['or'].append(fq)

    res = elastic.search_raw(esq, elastic_index, 'frame')
    frame_urns = [r['_id'] for r in res['hits']['hits']]

    # Load the frames from the database, preserving the search result order.
    q = Frame.all().filter(Frame.urn.in_(frame_urns))
    frames = dict([(f.urn, f) for f in q])
    frames = [frames.get(urn) for urn in frame_urns]
    return query_pager(frames, 'frames.user_index',
                       count=res['hits']['total'], paginate=False, id=id)
def index():
    q = Frame.all()
    q = q.order_by(Frame.action_at.desc())
    return query_pager(q, 'frames.index')
def handle_indexing(body, message):
    routing_key = message.delivery_info.get('routing_key')
    log.info('%s - indexing: %s', routing_key, body['urn'])
    frame_obj = Frame.by_urn(body['urn'])
    index(frame_obj)
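# A sketch of how handle_indexing() could be wired up as a kombu consumer
# callback (message.delivery_info suggests kombu). The connection URL and the
# queue binding are assumptions; only the (body, message) callback signature
# is taken from the function above.
#
#   from kombu import Connection, Consumer
#
#   with Connection('amqp://localhost//') as conn:
#       with Consumer(conn, queues=[indexing_queue],
#                     callbacks=[handle_indexing]):
#           while True:
#               conn.drain_events()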