def wc(query):
    stop_words = set(stopwords.words('english'))
    # generating a corpus-specific stopword list
    stopset_state_specific = {
        'review', 'na', 'declassifiedreleased', 'unclassified', 'confidential',
        'secret', 'disposition', 'released', 'approved', 'document',
        'classification', 'restrictions', 'state', 'department', 'date', 'eo',
        'handling'
    }
    stop_set = stop_words.union(stopset_state_specific)
    q = {"query": {"match": {"file": query}}}
    r = es.search(body=q, index=es_index, fields=["file", "body"])
    # switched to return 'body' instead of 'file':
    # 'body' is the portion of the 'file' that has been regex'd by the uploader
    # to include the most relevant information (e.g. excluding headers)
    data = r['hits']['hits'][0]['fields']['body'][0]
    # no_white = re.sub(r'\s', ' ', data)
    # updated to disallow numbers from the wordcloud
    no_white = re.sub(r'[^A-Za-z\s]', '', data)
    w_c = dict(Counter(word_tokenize(no_white)))
    frequency = [{"text": k, "size": v * 3} for k, v in w_c.items()]
    frequency = [f for f in frequency
                 if f['size'] > 3 and f['text'].lower() not in stop_set]
    return json.dumps(frequency)
def geo_endpoint():
    last_query = session.get('last_query', None)
    if last_query is None:
        return json.dumps([])
    query = last_query['query']
    q = {"query": {"query_string": {"query": query}}}
    r = es.search(body=q, index=es_index,
                  fields=["entities", "title", "file"], size=100000)
    locations = []
    for hit in r['hits']['hits']:
        # collect the pre-extracted location entities attached to each hit
        entity_locations = []
        try:
            entities = json.loads(hit['fields']['entities'][0])
            for ent in entities:
                if ent['category'] == 'locations':
                    entity_locations.append(ent)
        except (KeyError, IndexError, ValueError, TypeError):
            pass
        try:
            doc_file = str(hit['fields']['file'][0].replace('\n', '<br>'))
        except (KeyError, IndexError):
            continue
        for location in entity_locations:
            try:
                locations.append({
                    'lat': location['entity']['lat'],
                    'lon': location['entity']['lon'],
                    'name': location['entity']['placename'],
                    'title': hit['fields']['title'],
                    'file': doc_file
                })
            except KeyError:
                continue
    return json.dumps(locations)
def more_like_this(doc_id):
    """
    Returns similar documents
    :param doc_id:
    :return:
    """
    q = {
        "query": {
            "more_like_this": {
                "docs": [{
                    "_index": "dossiers",
                    "_type": "attachment",
                    "_id": doc_id
                }]
            }
        }
    }
    response = es.search(body=q, index=es_index, fields=['title'], size=10)
    results = {'results': []}
    try:
        for r in response['hits']['hits']:
            results['results'].append({
                'id': r['_id'],
                'name': r['fields']['title'][0]
            })
    except (KeyError, IndexError):
        pass
    return jsonify(results)
def dps_top_brands(kwargs):
    granularity = chg_granularity(kwargs['granularity'])
    if kwargs['start_date'] > kwargs['end_date']:
        return "wrong end_date"
    qry_field = "paired_brand.keyword"
    date_field = "pairing_created_on"
    cardinal_field = "paired_device_id"
    if kwargs['mno'] == 'all':
        qry_dsl = dps_query_without_mno(granularity, kwargs['trend_qty'],
                                        qry_field, date_field, cardinal_field,
                                        kwargs)
    else:
        match_para = "operator_name"
        qry_dsl = dps_query_with_mno(granularity, kwargs['trend_qty'],
                                     qry_field, date_field, match_para,
                                     cardinal_field, kwargs)
    qry = es.search(index=conf['dps_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry
def wc(query):
    stopset = set(stopwords.words('english'))
    q = {"fields": ["file"], "query": {"term": {"file": query}}}
    r = es.search(body=q, index=DEFAULT_INDEX)
    frequency = []
    documents = []
    for hit in r['hits']['hits']:
        text = hit['fields']['file'][0]
        # collapse whitespace, then strip punctuation
        nowhite = re.sub(r'\s', ' ', text)
        nowhite = re.sub(r'[^\w\s]', '', nowhite)
        documents.append(word_tokenize(nowhite))
    docflat = [item for sublist in documents for item in sublist]
    word_counts = dict(Counter(docflat))
    for k, v in word_counts.items():
        frequency.append({"text": k, "size": v * 3})
    frequency = [f for f in frequency
                 if f['size'] > 6 and f['text'].lower() not in stopset]
    return json.dumps(frequency)
def url_fetch(query=""):
    if not query:
        last_query = session.get('last_query', None)
        if last_query is not None:
            query = last_query['query']
    q = {"fields": ["file"], "query": {"term": {"file": query}}}
    r = es.search(body=q, index=es_index)
    data = r['hits']['hits']
    urls = []
    pn = []
    for doc in data:
        # pull URLs and phone numbers out of the raw document text
        text = doc['fields']['file'][0]
        urls.append(re.findall(r'(https?://[^\s]+)', text))
        try:
            for match in phonenumbers.PhoneNumberMatcher(text, region=None):
                pn.append({
                    'number': phonenumbers.format_number(
                        match.number, phonenumbers.PhoneNumberFormat.E164),
                    'location': geocoder.description_for_number(match.number, "en")
                })
        except KeyError:
            pass
    urls = [u for u in urls if u]
    urls_flat = [item for sublist in urls for item in sublist]
    return json.dumps({'urls': dict(Counter(urls_flat)), 'pn': pn})
def imeis_status(reg_status, qry_field, aggs_size):
    qry_dsl = {
        "aggs": {
            "aggs_1": {
                "terms": {
                    "field": qry_field,
                    "size": aggs_size,
                    "order": {"_count": "desc"}
                }
            }
        },
        "size": 0,
        "query": {"terms": {qry_field: reg_status}}
    }
    qry = es.search(index=conf['drs_reg_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry['aggregations']
def wc(query):
    stop_words = set(stopwords.words('english'))
    # generating a corpus-specific stopword list
    stopset_state_specific = {
        'review', 'na', 'declassifiedreleased', 'unclassified', 'confidential',
        'secret', 'disposition', 'released', 'approved', 'document',
        'classification', 'restrictions', 'state', 'department', 'date', 'eo',
        'handling'
    }
    stopset = stop_words.union(stopset_state_specific)
    q = {
        "fields": ["file", "body"],  # added body to query
        "query": {"match": {"file": query}}
    }
    r = es.search(body=q, index=DEFAULT_INDEX)
    # switched to return 'body' instead of 'file', which is the portion of the
    # 'file' that has been regex'd by the uploader to include the most relevant
    # information (e.g. excluding headers)
    data = r['hits']['hits'][0]['fields']['body'][0]
    # collapse whitespace, then drop everything except letters so numbers are
    # excluded from the wordcloud
    nowhite = re.sub(r'\s', ' ', data)
    nowhite = re.sub(r'[^A-Za-z\s]', '', nowhite)
    wt = word_tokenize(nowhite)
    word_counts = dict(Counter(wt))
    frequency = [{"text": k, "size": v * 3} for k, v in word_counts.items()]
    frequency = [f for f in frequency
                 if f['size'] > 3 and f['text'].lower() not in stopset]
    return json.dumps(frequency)
def search_endpoint(query=None, page=None, box_only=False):
    if not query and not page:
        last_query = session.get("last_query", None)
        if last_query:
            query, page = last_query["query"], last_query["page"]
        else:
            # better error
            return abort(404)
    if not page:
        page = 1
    session["last_query"] = {"query": query, "page": page, "ids": []}
    session["history"] = amend_history(session.get("history", list()),
                                       session["last_query"])
    # convert 1-based pages to record offsets for ES (10 results per page)
    start = (int(page) - 1) * 10
    q = {
        "fields": ["title", "highlight", "entities", "owner"],
        "from": start,
        "query": {"match": {"file": query}},
        "highlight": {
            "fields": {"file": {}},
            "pre_tags": ["<span class='highlight'>"],
            "post_tags": ["</span>"]
        },
    }
    raw_response = es.search(body=q, index=DEFAULT_INDEX, df="file", size=10)
    hits = []
    for resp in raw_response["hits"]["hits"]:
        # Store returned ids
        session["last_query"]["ids"].append(resp["_id"])
        if is_owner(resp["fields"]["owner"][0]):
            # Flatten structure for individual hits
            hits.append({
                "id": resp["_id"],
                "title": resp["fields"]["title"][0],
                "highlight": resp["highlight"]["file"][0],
                "permissions": True,
            })
        else:
            hits.append({
                "id": resp["_id"],
                "title": resp["fields"]["title"][0],
                "permissions": False,
            })
    results = {
        "hits": hits,
        "took": float(raw_response["took"]) / 1000,
        "total": "{:,}".format(raw_response["hits"]["total"]),
        "total_int": int(raw_response["hits"]["total"]),
        "query": query,
        "from": int(page),
    }
    if box_only:
        return render_template("search-results-box.html", results=results)
    return render_template("search-template.html", results=results,
                           history=session["history"])
def more_like_this(doc_id):
    '''
    Returns similar documents
    '''
    q = {
        "fields": ["title"],
        "query": {
            "more_like_this": {
                "docs": [{
                    "_index": "dossiers",
                    "_type": "attachment",
                    "_id": doc_id
                }]
            }
        },
        "size": 10
    }
    response = es.search(body=q, index=DEFAULT_INDEX)
    results = {'results': []}
    try:
        for r in response['hits']['hits']:
            results['results'].append({
                'id': r['_id'],
                'name': r['fields']['title'][0]
            })
    except (KeyError, IndexError):
        pass
    return jsonify(results)
def search_results(query):
    page = int(request.args.get('page', 1))
    per_page = app.config['PER_PAGE']
    body = search_body(query)
    response = es.search(
        index=app.config['ES_INDEX'],
        # doc_type='',
        from_=(page - 1) * per_page,
        size=per_page,
        body=body
    )
    search_results = response['hits']['hits']
    results_count = response['hits']['total']
    pagination = Pagination(
        css_framework=app.config['CSS_FRAMEWORK'],
        page=page,
        total=results_count,
        per_page=per_page)
    return render_template('search_results.html',
                           query=query,
                           search_results=search_results,
                           pagination=pagination,
                           count=results_count)
def csv(dsl, start, size):
    search_results = es.search(index='suborders',
                               doc_type='suborders',
                               scroll='1m',
                               body=dsl,
                               _source=[
                                   'order_number',
                                   'suborder_number',
                                   'date_received',
                                   'order_type',
                                   'customer',
                                   'metadata'
                               ],
                               size=size,
                               from_=start)
    sid = search_results['_scroll_id']
    scroll_size = search_results['hits']['total']
    scroll_results = search_results['hits']['hits']

    # keep scrolling until a page comes back empty
    while scroll_size > 0:
        results = es.scroll(scroll_id=sid, scroll='1m')
        sid = results['_scroll_id']  # the scroll id can change between pages
        scroll_size = len(results['hits']['hits'])
        scroll_results += results['hits']['hits']

    return scroll_results
def incident_type_case_status(kwargs, qry_field, aggs_size, cardinal_field):
    qry_dsl = {
        "aggs": {
            "aggs_1": {
                "terms": {
                    "field": qry_field,
                    "size": aggs_size,
                    "order": {"_count": "desc"}
                },
                "aggs": {
                    "unique_devices": {
                        "cardinality": {
                            "field": cardinal_field,
                            "precision_threshold": kwargs['precision_threshold']
                        }
                    }
                }
            }
        },
        "size": 0
    }
    qry = es.search(index=conf['lsds_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry['aggregations']['aggs_1']['buckets']
def core_total_imeis(kwargs, field3, qry_index):
    qry_dsl = {
        "aggs": {
            "unique_imeis": {
                "cardinality": {
                    "field": "imei_norm.keyword",
                    "precision_threshold": kwargs['precision_threshold']
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [{"exists": {"field": field3}}]
            }
        }
    }
    qry = es.search(index=qry_index, body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry['aggregations']['unique_imeis']['value']
def more_like_this(doc_id):
    """
    Returns similar documents
    :param doc_id:
    :return:
    """
    q = {
        "query": {
            "more_like_this": {
                "docs": [{
                    "_index": "dossiers",
                    "_type": "attachment",
                    "_id": doc_id
                }]
            }
        }
    }
    response = es.search(body=q, index=es_index, fields=['title'], size=10)
    results = {'results': []}
    try:
        for r in response['hits']['hits']:
            results['results'].append({
                'id': r['_id'],
                'name': r['fields']['title'][0]
            })
    except (KeyError, IndexError):
        pass
    return jsonify(results)
def url_fetch(query=""):
    # query="http"
    if not query:
        query = session['last_query']['query']
    q = {"fields": ["file"], "query": {"term": {"file": query}}}
    r = es.search(body=q, index=DEFAULT_INDEX)
    data = r['hits']['hits']
    urls = []
    pn = []
    for doc in data:
        text = doc['fields']['file'][0]
        urls.append(re.findall(r'(https?://[^\s]+)', text))
        try:
            for match in phonenumbers.PhoneNumberMatcher(text, region=None):
                pn.append({
                    'number': phonenumbers.format_number(
                        match.number, phonenumbers.PhoneNumberFormat.E164),
                    'location': geocoder.description_for_number(match.number, "en")
                })
        except KeyError:
            pass
    urls = [u for u in urls if u]
    urls_flat = [item for sublist in urls for item in sublist]
    return json.dumps({'urls': dict(Counter(urls_flat)), 'pn': pn})
def wc(query):
    stop_words = set(stopwords.words('english'))
    # generating a corpus-specific stopword list
    stopset_state_specific = {
        'review', 'na', 'declassifiedreleased', 'unclassified', 'confidential',
        'secret', 'disposition', 'released', 'approved', 'document',
        'classification', 'restrictions', 'state', 'department', 'date', 'eo',
        'handling'
    }
    stop_set = stop_words.union(stopset_state_specific)
    q = {"query": {"match": {"file": query}}}
    r = es.search(body=q, index=es_index, fields=["file", "body"])
    # switched to return 'body' instead of 'file':
    # 'body' is the portion of the 'file' that has been regex'd by the uploader
    # to include the most relevant information (e.g. excluding headers)
    data = r['hits']['hits'][0]['fields']['body'][0]
    # no_white = re.sub(r'\s', ' ', data)
    # updated to disallow numbers from the wordcloud
    no_white = re.sub(r'[^A-Za-z\s]', '', data)
    w_c = dict(Counter(word_tokenize(no_white)))
    frequency = [{"text": k, "size": v * 3} for k, v in w_c.items()]
    frequency = [f for f in frequency
                 if f['size'] > 3 and f['text'].lower() not in stop_set]
    return json.dumps(frequency)
def es_search():
    app.logger.info('{} - {}'.format(request.remote_addr, request.url))
    query = request.args.get('q')
    results = es.search(index=app.config.get('INDEX_NAME'), q=query)
    hits = results['hits']['hits']
    if not hits:
        abort(404)
    return jsonify({'results': hits})
def device_count(kwargs):
    qry_dsl = {
        "aggs": {
            "unique_devices": {
                "cardinality": {
                    "field": "device_id",
                    "precision_threshold": kwargs['precision_threshold']
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [{"match": {"registration_status": "Approved"}}]
            }
        }
    }
    qry = es.search(index=conf['drs_reg_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry['aggregations']
def query(index, query):
    if not es:
        return []
    body = {"query": {"multi_match": {"query": query, "fields": ["*"]}}}
    search = es.search(index=index, body=body)
    results = [int(result["_id"]) for result in search["hits"]["hits"]]
    return results
def core_registration_status_details(kwargs):
    if kwargs['start_date'] > kwargs['end_date']:
        return "wrong end_date"
    qry_dsl = {
        "aggs": {
            "registration_status": {
                "terms": {"field": "status.keyword", "size": 9},
                "aggs": {
                    "unique_brands": {
                        "cardinality": {
                            "field": "brand_name.keyword",
                            "precision_threshold": kwargs['precision_threshold']
                        }
                    },
                    "unique_models": {
                        "cardinality": {
                            "field": "model.keyword",
                            "precision_threshold": kwargs['precision_threshold']
                        }
                    }
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "range": {
                        "start_date": {
                            "gte": kwargs['start_date'],
                            "lte": kwargs['end_date']
                        }
                    }
                }, {
                    "exists": {"field": "status.keyword"}
                }],
                "must_not": [{"exists": {"field": "end_date"}}]
            }
        }
    }
    qry = es.search(index=conf['core_indices']['core_reglist_index'],
                    body=qry_dsl, request_timeout=conf['request_timeout'])
    return qry
def viz_endpoint(query):
    q = {
        "_source": ["entity"],
        "fields": ["entities", "title"],
        "query": {"term": {"file": query}},
        "size": 100
    }
    r = es.search(body=q, index=DEFAULT_INDEX)
    graph = document_graph(r["hits"]["hits"])
    return json.dumps(graph)
def query_index(index, query, page, per_page):
    if not es:
        return [], 0
    # page through results using the supplied page / per_page arguments
    search = es.search(
        index=index,
        body={'query': {'multi_match': {'query': query, 'fields': ['*']}},
              'from': (page - 1) * per_page, 'size': per_page})
    ids = [int(hit['_id']) for hit in search['hits']['hits']]
    return ids, search['hits']['total']['value']
def autocomplete_cities():
    """Autocomplete for cities."""
    query = request.args.get("query")
    redis_key = f"autocomplete_cities|{query}"

    # Try to find with Redis.
    try:
        result = redis_store.get(redis_key)
        redis_is_connected = True
        if result:
            return jsonify(suggestions=pickle.loads(result))
    except RedisConnectionError:
        redis_is_connected = False

    # Try to find with Elasticsearch.
    try:
        cities = es.search(
            index="airtickets-city-index",
            from_=0,
            size=10,
            doc_type="CityName",
            body={
                "query": {
                    "bool": {
                        "must": {
                            "match_phrase_prefix": {"value": {"query": query}}
                        }
                    }
                },
                "sort": {"population": {"order": "desc"}},
            },
        )
        result = [city["_source"] for city in cities["hits"]["hits"]]
    except (ElasticConnectionError, NotFoundError, AttributeError):
        # Try to find with PostgreSQL.
        cities = (CityName.query.join(City.city)
                  .filter(CityName.name.like(query + "%"))
                  .distinct(City.population, CityName.city_id)
                  .order_by(City.population.desc(), CityName.city_id)
                  .limit(10).all())
        result = [city.autocomplete_serialize() for city in cities]

    if redis_is_connected:
        redis_store.set(redis_key, pickle.dumps(result), 86400)

    return jsonify(suggestions=result)
def imei_distribution(kwargs):
    qry_dsl = {
        "aggs": {
            "block_cond": {
                "terms": {
                    "field": "cond_name.keyword",
                    "size": conf['num_core_blk_conds'],
                    "order": {"_count": "desc"}
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"match_all": {}},
                    {"range": {
                        "block_date": {
                            "gte": kwargs['start_date'],
                            "lte": kwargs['end_date']
                        }
                    }},
                    {"exists": {"field": "block_date"}},
                    {"exists": {"field": "start_date"}}
                ],
                "must_not": [{"exists": {"field": "end_date"}}]
            }
        }
    }
    qry = es.search(index=conf['core_indices']['core_classification_data'],
                    body=qry_dsl, request_timeout=conf['request_timeout'])
    return qry
def viz_all():
    q = {"query": {"match_all": {}}}
    r = es.search(body=q, index=es_index, fields=["entities", "title"], size=100)
    data = r['hits']['hits']
    graph = document_graph(data)
    return json.dumps(graph)
def request_doc(doc_id):
    q = {
        "query": {"match": {"_id": doc_id}},
    }
    response = es.search(body=q, index=DEFAULT_INDEX)
    return response
def search_query():
    search = request.args['query']
    field = request.args.get('fields', default="")
    try:
        p = field.split(',')
        dict_query = {
            "query": {
                "bool": {
                    "must": [{
                        "query_string": {
                            "query": search,
                            "fields": p,
                        }
                    }],
                    "should": [{
                        "multi_match": {
                            "query": search,
                            "type": "most_fields",
                            "fields": p
                        }
                    }],
                }
            }
        }
        result = es.search(index="dictionary", doc_type="words", body=dict_query)
        res = result["hits"]["hits"]
        filtered_result = []
        for x in res:
            fin = x['_source']
            filtered_result.append({
                'word': fin['word'],
                'meaning': fin['meaning'],
                'synonym': fin['synonym'],
                'antonym': fin['antonym'],
                'word_origin': fin['word_origin'],
                'example': fin['example'],
            })
        return jsonify({"msg": "success", "response": filtered_result})
    except Exception as e:
        print(str(e))
        return jsonify({"msg": "error occurred", "error": str(e)})
def lsds_01_total_reported_devices(kwargs):
    granularity = chg_granularity(kwargs['granularity'])
    if kwargs['start_date'] > kwargs['end_date']:
        return "wrong end_date"
    qry_dsl = {
        "aggs": {
            "time_range": {
                "date_histogram": {
                    "field": "case_reported_date",
                    "interval": granularity
                },
                "aggs": {
                    "unique_devices": {
                        "cardinality": {
                            "field": "case_id",
                            "precision_threshold": conf['precision_threshold']
                        }
                    }
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "range": {
                        "case_reported_date": {
                            "gte": kwargs['start_date'],
                            "lte": kwargs['end_date']
                        }
                    }
                }],
                "filter": [{
                    "bool": {
                        "should": [{"exists": {"field": "reported_imeis.keyword"}}],
                        "minimum_should_match": 1
                    }
                }]
            }
        }
    }
    qry = es.search(index=conf['lsds_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry
def bulk_search(queries):
    data = tablib.Dataset(headers=['filename', 'id', 'query'])
    for q in queries:
        r = es.search(q=q, fields=['title'], size=100, index="dossiers",
                      doc_type="attachment")
        for res in r['hits']['hits']:
            title = res['fields']['title'][0]
            _id = res['_id']
            data.append((title, _id, q))
    return data
def viz_all():
    q = {
        "fields": ["entities", "title"],
        "query": {"match_all": {}},
        "size": 100
    }
    r = es.search(body=q, index=DEFAULT_INDEX)
    graph = document_graph(r['hits']['hits'])
    return json.dumps(graph)
def search_results(query):
    res = es.search(index="microblog", doc_type="post",
                    body={"query": {"match": {"body": query}}})
    post_ids = []
    for hit in res['hits']['hits']:
        app.logger.debug(hit)
        post_ids.append(hit['_source']['id'])
    app.logger.debug(post_ids)
    posts = g.user.followed_posts().filter(Post.id.in_(post_ids))
    return render_template('search_results.html', query=query, results=posts)
def view_search(key):
    res = es.search(index=index, doc_type=_type,
                    body={"query": {"match": {"name": key}}})
    print(res)
    if res['hits']['hits']:
        for item in res['hits']['hits']:
            # mirror the Elasticsearch hit into MongoDB via the Employee model
            e = Employee(name=item['_source']['name'], job=item['_source']['Job'])
            e.save()
            print("done")
            return jsonify({'name': item['_source']['name'],
                            'Designation': item['_source']['Job'],
                            'store': 'mongo'})
    else:
        return Response('Record Not Found')
def history_query():
    """ AND query over all active history terms """
    terms = active_history_terms(session["history"])
    body = {
        "_source": ["entity"],
        "fields": ["entities", "title"],
        "query": {
            "constant_score": {
                "filter": {
                    "terms": {"file": terms, "execution": "and"}
                }
            }
        },
    }
    r = es.search(body=body, index=DEFAULT_INDEX, size=100)
    graph = make_response(json.dumps(document_graph(r["hits"]["hits"])))
    return graph
def search(dsl, start, size):
    search_results = es.search(index='suborders',
                               doc_type='suborders',
                               body=dsl,
                               _source=[
                                   'order_number',
                                   'suborder_number',
                                   'date_received',
                                   'order_type',
                                   'current_status',
                                   'customer'
                               ],
                               size=size,
                               from_=start)
    return search_results
def drs_registered_imeis_approved(kwargs):
    granularity = chg_granularity(kwargs['granularity'])
    if kwargs['start_date'] > kwargs['end_date']:
        return "wrong end_date"
    qry_dsl = {
        "aggs": {
            "time_range": {
                "date_histogram": {
                    "field": "registration_date",
                    "interval": granularity
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"match_all": {}},
                    {"match": {"registration_status.keyword": "Approved"}},
                    {"range": {
                        "registration_date": {
                            "gte": kwargs['start_date'],
                            "lte": kwargs['end_date']
                        }
                    }}
                ],
                "filter": [{
                    "bool": {
                        "should": [{"exists": {"field": "imeis.keyword"}}],
                        "minimum_should_match": 1
                    }
                }]
            }
        }
    }
    qry = es.search(index=conf['drs_reg_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry
def print(dsl, start, size):  # note: shadows the built-in print()
    search_results = es.search(index='suborders',
                               doc_type='suborders',
                               body=dsl,
                               _source=[
                                   'suborder_number',
                                   'order_type',
                                   'order_number',
                                   'date_submitted',
                                   'customer',
                                   'metadata',
                                   'multiple_items',
                                   'order_types'
                               ],
                               size=size,
                               from_=start)
    return search_results
def drs_single_importer_status(kwargs):
    if kwargs['start_date'] > kwargs['end_date']:
        return "wrong end_date"
    qry_dsl = {
        "aggs": {
            "aggs_1": {
                "terms": {
                    "field": "registration_status.keyword",
                    "size": 15,
                    "order": {"_count": "desc"}
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "must": [{
                    "match": {"registered_user": kwargs['importer_name']}
                }, {
                    "range": {
                        "registration_date": {
                            "gte": kwargs['start_date'],
                            "lte": kwargs['end_date']
                        }
                    }
                }],
                "filter": {
                    "bool": {
                        "should": [{"exists": {"field": "registration_status"}}],
                        "minimum_should_match": 1
                    }
                }
            }
        }
    }
    qry = es.search(index=conf['drs_reg_index'], body=qry_dsl,
                    request_timeout=conf['request_timeout'])
    return qry
def match_all():
    try:
        query = {"query": {"match_all": {}}}
        result = es.search(index="dict_word", doc_type="word", body=query)
        res = result["hits"]["hits"]
        filtered_result = []
        for i in res:
            filtered_result.append(i["_source"])
        return jsonify({"response": "success", "data": filtered_result})
    except Exception as e:
        print(str(e))
        return jsonify({"response": "failure", "error": str(e)})
def request_doc(doc_id):
    """
    Searches elastic index for a document matching a particular ID.

    :param str doc_id: A specific document ID
    :return: results of elastic search matching doc_id
    """
    q = {
        "query": {"match": {"_id": doc_id}},
    }
    return es.search(body=q, index=es_index)
def viz_all():
    q = {"query": {"match_all": {}}}
    r = es.search(body=q, index=es_index, fields=["entities", "title"], size=100)
    data = r['hits']['hits']
    graph = document_graph(data)
    return json.dumps(graph)
def match_all():
    try:
        dict_query = {"query": {"match_all": {}}}
        result = es.search(index="dictionary", doc_type="words", body=dict_query)
        res = result["hits"]["hits"]
        filtered_result = []
        for x in res:
            filtered_result.append(x['_source'])
        return jsonify({"msg": "success", "response": filtered_result})
    except Exception as e:
        print(str(e))
        return jsonify({"msg": "error_occurred", "error": str(e)})
def more_like_this(doc_id):
    """
    Returns similar documents
    """
    q = {
        "fields": ["title"],
        "query": {
            "more_like_this": {
                "docs": [{
                    "_index": "dossiers",
                    "_type": "attachment",
                    "_id": doc_id
                }]
            }
        },
        "size": 10,
    }
    response = es.search(body=q, index=DEFAULT_INDEX)
    results = {"results": []}
    try:
        for r in response["hits"]["hits"]:
            results["results"].append({"id": r["_id"],
                                       "name": r["fields"]["title"][0]})
    except (KeyError, IndexError):
        pass
    return jsonify(results)
def stolen_imeis_on_network(kwargs):
    qry_dsl = {
        "aggs": {
            "MNOs": {
                "terms": {
                    "field": "mno_operator.keyword",
                    "size": conf['num_of_mnos'],
                    "order": {"_count": "desc"}
                }
            }
        },
        "size": 0,
        "query": {
            "bool": {
                "filter": [{
                    "match_all": {}
                }, {
                    "match": {"triplet_year": kwargs['trend_year']}
                }, {
                    "match": {"triplet_month": kwargs['trend_month']}
                }, {
                    "bool": {
                        "should": [{
                            "match_phrase": {
                                "blacklist_reasons.keyword":
                                    "{\"IMEI in local stolen list\"}"
                            }
                        }],
                        "minimum_should_match": 1
                    }
                }]
            }
        }
    }
    qry = es.search(index=conf['core_indices']['join_core_mno-blacklist'],
                    body=qry_dsl, request_timeout=conf['request_timeout'])
    return qry
def search_results(query):
    res = es.search(index="microblog", doc_type="post",
                    body={"query": {"match": {"body": query}}})
    post_ids = []
    for hit in res['hits']['hits']:
        app.logger.debug(hit)
        post_ids.append(hit['_source']['id'])
    app.logger.debug(post_ids)
    posts = g.user.followed_posts().filter(Post.id.in_(post_ids))
    return render_template('search_results.html', query=query, results=posts)
def es_search2():
    if not app.config.get('IS_ES_INDEX'):
        return 'Sorry, you need to enable Elasticsearch first.'
    app.logger.info('{} - {}'.format(request.remote_addr, request.url))
    query = request.args.get('q')
    results = es.search(index=app.config.get('ES_INDEX_NAME'),
                        doc_type=app.config.get('ES_TYPE_NAME'),
                        q=query)
    hits = results['hits']['hits']
    entries = []
    for hit in hits:
        entries.append(Entry.get(Entry.id == hit['_id']))
    return render_template('search.jinja2', entries=entries, search=query)
def get_order():
    results = []
    date = request.get_json()
    res = es.search(index="orders",
                    body={"query": {"term": {"date": f"{date['date']}"}}})
    for hit in res['hits']['hits']:
        result = {}
        src = hit['_source']
        order_date = src['date']
        order = Mongo.get_doc_by_id('orders', src['id_order'])
        for part in order:
            order_id_shop = part['id_shop']
            order_id_stock = part['id_stock']
            order_goods = part['goods']
            stock_info = Postgres.return_stock_info(order_id_stock)
            order_stock_title = stock_info[0]
            order_stock_address = stock_info[1]
            shop_info = Mongo.get_doc_by_part('shops', order_id_shop)
            for info in shop_info:
                order_shop_title = info['title']
                order_shop_address = info['address']
            result.update({'Order date': order_date})
            result.update({'Shop name': order_shop_title})
            result.update({'Shop address': order_shop_address})
            result.update({'Warehouse name': order_stock_title})
            result.update({'Warehouse address': order_stock_address})
            result.update({'Goods': {}})
            for goods in order_goods:
                result['Goods'].update({goods: order_goods[goods]})
        results.append(result)
    for i in results:
        print(i)
    return '.'
def wc(query):
    stopset = set(stopwords.words("english"))
    q = {"fields": ["file"], "query": {"term": {"file": query}}}
    r = es.search(body=q, index=DEFAULT_INDEX)
    data = r["hits"]["hits"][0]["fields"]["file"][0]
    # collapse whitespace, then strip punctuation
    nowhite = re.sub(r"\s", " ", data)
    nowhite = re.sub(r"[^\w\s]", "", nowhite)
    wt = word_tokenize(nowhite)
    word_counts = dict(Counter(wt))
    frequency = [{"text": k, "size": v * 3} for k, v in word_counts.items()]
    frequency = [f for f in frequency
                 if f["size"] > 3 and f["text"].lower() not in stopset]
    return json.dumps(frequency)
def viz_endpoint(query):
    # url = '{}/_search'.format(es_path)
    q = {
        "_source": ["entity"],
        "fields": ["entities", "title"],
        "query": {"match": {"file": query}},
        "size": 150
    }
    r = es.search(body=q, index=es_index)
    data = r['hits']['hits']
    graph = document_graph(data)
    return json.dumps(graph)
def bulk_search(queries):
    """
    :param list queries: List of elasticsearch queries
    :return tablib.Dataset:
    """
    data = tablib.Dataset(headers=['filename', 'id', 'query'])
    for q in queries:
        r = es.search(q=q, fields=['title'], size=100, index=es_index,
                      doc_type="attachment")
        for res in r['hits']['hits']:
            title = res['fields']['title'][0]
            _id = res['_id']
            data.append((title, _id, q))
    return data
def geo_endpoint():
    query = session['last_query']['query']
    q = {"fields": ["file"], "query": {"term": {"file": query}}}
    r = es.search(body=q, index=DEFAULT_INDEX)
    locations = []
    for hit in r['hits']['hits']:
        # run the gazetteer over the document text and collect matched places
        text = re.sub(r'\s', ' ', str(hit['fields']['file']))
        for location in geodict_lib.find_locations_in_text(text):
            for token in location['found_tokens']:
                locations.append({'lat': token['lat'],
                                  'lon': token['lon'],
                                  'name': token['matched_string']})
    return json.dumps(locations)
def history_query():
    """ AND query over all active history terms """
    terms = active_history_terms(session['history'])
    body = {
        "_source": ["entity"],
        "fields": ["entities", "title"],
        "query": {
            "constant_score": {
                "filter": {
                    "terms": {"file": terms, "execution": "and"}
                }
            }
        }
    }
    r = es.search(body=body, index=es_index, size=100)
    data = r['hits']['hits']
    graph = make_response(json.dumps(document_graph(data)))
    return graph
def serve_timeline(query=None, page=None, box_only=True, dates={}):
    if request.method == "POST":
        json_dict = request.get_json()
        dates = json_dict['dates']
        startdate = dates[0][0:10]
        enddate = dates[1][0:10]
        if startdate == enddate:
            startdate = "1973-01-01"
            enddate = "1974-01-01"
    if not query and not page:
        last_query = session.get('last_query', None)
        if last_query:
            query, page = last_query['query'], last_query['page']
        else:
            # better error
            return abort(404)
    if not page:
        page = 1
    session['last_query'] = {'query': query, 'page': page, 'ids': []}
    # convert 1-based pages to record offsets for ES (10 results per page)
    start = (int(page) - 1) * 10
    q = {
        "fields": ["title", "highlight", "entities", "owner", "date"],
        "from": start,
        "query": {"match": {"file": query}},
        "filter": {
            "range": {
                "date": {
                    "gte": startdate,
                    "lte": enddate,
                    "format": "yyyy-MM-dd"
                }
            }
        },
        "highlight": {
            "fields": {"file": {}},
            "pre_tags": ["<span class='highlight'>"],
            "post_tags": ["</span>"]
        }
    }
    raw_response = es.search(body=q, index=DEFAULT_INDEX, df="file", size=10)
    hits = []
    for resp in raw_response['hits']['hits']:
        # Store returned ids
        session['last_query']['ids'].append(resp['_id'])
        if is_owner(resp['fields']['owner'][0]):
            # Flatten structure for individual hits
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'highlight': resp['highlight']['file'][0],
                         'permissions': True})
        else:
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'permissions': False})
    results = {
        'hits': hits,
        'took': float(raw_response['took']) / 1000,
        'total': "{:,}".format(raw_response['hits']['total']),
        'total_int': int(raw_response['hits']['total']),
        'query': query,
        'from': int(page)
    }
    if box_only:
        return render_template('search-results-box.html', results=results)
    return render_template('search-template.html', results=results)
def timeline_new(query=None, page=None, box_only=False):
    if not query and not page:
        last_query = session.get('last_query', None)
        if last_query:
            query, page = last_query['query'], last_query['page']
        else:
            # better error
            return abort(404)
    if not page:
        page = 1
    session['last_query'] = {'query': query, 'page': page, 'ids': []}
    # convert 1-based pages to record offsets for ES (10 results per page)
    start = (int(page) - 1) * 10

    # find the overall date range of the index so the timeline can be padded
    # to whole weeks
    q_daterange = {
        "aggs": {
            "max_date": {"max": {"field": "date"}},
            "min_date": {"min": {"field": "date"}}
        }
    }
    response = es.search(body=q_daterange, index=DEFAULT_INDEX)
    min_date_datetime = round_month_down(datetime.datetime.strptime(
        response['aggregations']['min_date']['value_as_string'],
        "%Y-%m-%dT%H:%M:%S.%fZ"))
    max_date_datetime = round_month_up(datetime.datetime.strptime(
        response['aggregations']['max_date']['value_as_string'],
        "%Y-%m-%dT%H:%M:%S.%fZ"))
    min_date = min_date_datetime.strftime(format="%Y-%m-%d")
    max_date = max_date_datetime.strftime(format="%Y-%m-%d")
    time_delta = week_delta(min_date_datetime, max_date_datetime)

    # build a weekly date range to join the histogram buckets against
    rng = pd.date_range(min_date, periods=time_delta, freq='w')
    rng = rng.tolist()
    rng = [date + datetime.timedelta(days=1) for date in rng]
    rng = [date.strftime("%Y-%m-%d") for date in rng]
    rngframe = pd.DataFrame(index=rng)
    timeline_minimum = min_date_datetime - datetime.timedelta(days=7)
    timeline_minimum = timeline_minimum.strftime(format="%Y-%m-%d")

    q = {
        "fields": ["title", "highlight", "entities", "owner", "date"],
        "from": start,
        "query": {"match": {"file": query}},
        "highlight": {
            "fields": {"file": {}},
            "pre_tags": ["<span class='highlight'>"],
            "post_tags": ["</span>"]
        },
        "aggs": {
            "articles_over_time": {
                "date_histogram": {"field": "date", "interval": "week"}
            },
            "max_date": {"max": {"field": "date"}},
            "min_date": {"min": {"field": "date"}}
        }
    }
    response = es.search(body=q, index=DEFAULT_INDEX)

    # join the weekly histogram onto the full date range and fill empty weeks
    df = pd.DataFrame(response['aggregations']['articles_over_time']['buckets'])
    df['Date'] = df.key_as_string.apply(lambda x: str(x[:10]))
    df.columns = ['Count', 'key', 'key_as_string', 'Date']
    df = df.drop(['key', 'key_as_string'], axis=1)
    df = df.set_index('Date')
    output = rngframe.join(df, how="left")
    output = output.fillna(0)
    output = output.reset_index()
    output.columns = ['Date', 'Count']
    date_count_json = output.to_json(orient='records')
    out = {'date_data': date_count_json, 'time_min': timeline_minimum}
    return json.dumps(out)
def geo_endpoint():
    query = session['last_query']['query']
    q = {
        "size": 100000,
        "fields": ["entities", "title", "file"],
        "query": {"query_string": {"query": query}}
    }
    r = es.search(body=q, index=DEFAULT_INDEX)
    locations = []
    for hit in r['hits']['hits']:
        # collect the pre-extracted location entities attached to each hit
        entity_locations = []
        try:
            entities = json.loads(hit['fields']['entities'][0])
            for ent in entities:
                if ent['category'] == 'locations':
                    entity_locations.append(ent)
        except (KeyError, IndexError, ValueError, TypeError):
            pass
        try:
            doc_file = str(hit['fields']['file'][0].replace('\n', '<br>'))
        except (KeyError, IndexError):
            continue
        for location in entity_locations:
            try:
                locations.append({'lat': location['entity']['lat'],
                                  'lon': location['entity']['lon'],
                                  'name': location['entity']['placename'],
                                  'title': hit['fields']['title'],
                                  'file': doc_file})
            except KeyError:
                continue
    return json.dumps(locations)
def serve_geo_new(query=None, page=None, box_only=True, bounds={}):
    if request.method == "POST":
        json_dict = request.get_json()
        try:
            bounds = json_dict['bounds']['bounds']
            southwest_lat = bounds['southwest_lat']
            southwest_lon = bounds['southwest_lon']
            northeast_lat = bounds['northeast_lat']
            northeast_lon = bounds['northeast_lon']
        except (KeyError, TypeError):
            # fall back to a bounding box that covers the whole map
            southwest_lat = -84
            southwest_lon = -170
            northeast_lat = 85
            northeast_lon = 189
    if not query and not page:
        last_query = session.get('last_query', None)
        if last_query:
            query, page = last_query['query'], last_query['page']
        else:
            # better error
            return abort(404)
    if not page:
        page = 1
    session['last_query'] = {'query': query, 'page': page, 'ids': []}
    # convert 1-based pages to record offsets for ES (10 results per page)
    start = (int(page) - 1) * 10
    q = {
        "fields": ["title", "highlight", "entities", "owner", "body"],
        "from": start,
        "query": {
            "filtered": {
                "query": {"match": {"file": query}},
                "filter": {
                    "geo_bounding_box": {
                        "locs": {
                            "top_left": {
                                "lat": northeast_lat,
                                "lon": southwest_lon
                            },
                            "bottom_right": {
                                "lat": southwest_lat,
                                "lon": northeast_lon
                            }
                        }
                    }
                }
            }
        },
        "highlight": {
            "fields": {"file": {}},
            "pre_tags": ["<span class='highlight'>"],
            "post_tags": ["</span>"]
        }
    }
    raw_response = es.search(body=q, index=DEFAULT_INDEX, df="file", size=10)
    hits = []
    for resp in raw_response['hits']['hits']:
        # Store returned ids
        session['last_query']['ids'].append(resp['_id'])
        text = resp['fields']['body'][0]
        text = re.sub('\\n\\n', '\\n', text)
        text = re.sub('\\n', '<br>', text)
        if is_owner(resp['fields']['owner'][0]):
            # Flatten structure for individual hits
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'highlight': resp['highlight']['file'][0],
                         'permissions': True,
                         'body': text})
        else:
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'permissions': False})
    results = {
        'hits': hits,
        'took': float(raw_response['took']) / 1000,
        'total': "{:,}".format(raw_response['hits']['total']),
        'total_int': int(raw_response['hits']['total']),
        'query': query,
        'from': int(page)
    }
    if box_only:
        return render_template('search-results-map.html', results=results)
    return render_template('search-template.html', results=results)
def serve_clusters(query=None, page=None, box_only=True, dates={}, documents={}):
    if request.method == "POST":
        json_dict = request.get_json()
    if not query and not page:
        last_query = session.get('last_query', None)
        if last_query:
            query, page = last_query['query'], last_query['page']
        else:
            # better error
            return abort(404)
    # restrict the text query to the document ids supplied by the cluster view
    q = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"file": query}},
                    {"terms": {"_id": json_dict['documents']}}
                ]
            }
        },
        "fields": ["title", "highlight", "entities", "owner", "date"],
        "highlight": {
            "fields": {
                "file": {
                    "number_of_fragments": 1,
                    "pre_tags": ["<span class='highlight'>"],
                    "post_tags": ["</span>"]
                }
            }
        }
    }
    raw_response = es.search(body=q, index=DEFAULT_INDEX, df="file", size=10)
    hits = []
    for resp in raw_response['hits']['hits']:
        # Store returned ids
        session['last_query']['ids'].append(resp['_id'])
        if is_owner(resp['fields']['owner'][0]):
            # Flatten structure for individual hits
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'highlight': resp['highlight']['file'][0],
                         'permissions': True})
        else:
            hits.append({'id': resp['_id'],
                         'title': resp['fields']['title'][0],
                         'permissions': False})
    results = {
        'hits': hits,
        'took': float(raw_response['took']) / 1000,
        'total': "{:,}".format(raw_response['hits']['total']),
        'total_int': int(raw_response['hits']['total']),
        'query': query,
        'from': int(page)
    }
    if box_only:
        return render_template('search-results-box.html', results=results)
    return render_template('search-template.html', results=results)
def view_elastic_data():
    res = es.search(index=index, doc_type=_type,
                    body={"query": {"match_all": {}}})
    return jsonify(res)