def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = json.dumps([request.POST[x] for x in request.POST.keys() if 'match_txt' in x])
        search = Search(author=request.user,
                        search_content=s_content,
                        description=desc,
                        dataset=Dataset.objects.get(pk=int(request.session['dataset'])),
                        query=json.dumps(q))
        search.save()
        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')

    return HttpResponse()

def _get_color(self, color_code_list):
    if not color_code_list:
        return 'none'

    if self._average_colors:
        red, green, blue = 0, 0, 0
        for color_code in color_code_list:
            # "none" can end up in color_code_list (e.g. when _derive_highlight_data
            # uses >= instead of >), so check for it here and bail out with 'none'.
            if color_code.startswith('none'):
                logger = LogManager(__name__, '_GET_COLOR')
                logger.set_context('color_code_list', color_code_list)
                logger.info('Highlighter color_code_list contained "none", returning "none"')
                return 'none'
            else:
                r, g, b = int(color_code[1:3], 16), int(color_code[3:5], 16), int(color_code[5:], 16)
                red += r
                green += g
                blue += b

        red = int(red / len(color_code_list))
        green = int(green / len(color_code_list))
        blue = int(blue / len(color_code_list))
        return "#%02x%02x%02x" % (red, green, blue)
    else:
        return Counter(color_code_list).most_common(1)[0][0]

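# Illustrative sketch (not part of the class above): how the hex-colour averaging
# branch of _get_color behaves. _average_hex_colors is a hypothetical standalone
# re-implementation of that branch, for demonstration only.
def _average_hex_colors(color_code_list):
    red = green = blue = 0
    for color_code in color_code_list:
        # Each code is assumed to look like "#RRGGBB"
        red += int(color_code[1:3], 16)
        green += int(color_code[3:5], 16)
        blue += int(color_code[5:], 16)
    n = len(color_code_list)
    return "#%02x%02x%02x" % (red // n, green // n, blue // n)

# Averaging pure red and pure blue gives a mid-purple.
assert _average_hex_colors(["#ff0000", "#0000ff"]) == "#7f007f"
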
def update(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    if 'model_pk' in parameters:
        model = {"pk": parameters["model_pk"],
                 "description": parameters["model_description"],
                 "unique_id": parameters["model_uuid"]}
        request.session['model'] = model

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')

    if 'dataset[]' in parameters:
        # TODO: check that these are valid mapping ids before changing session['dataset']
        new_datasets = parameters.getlist('dataset[]')
        new_datasets = [new_dataset for new_dataset in new_datasets
                        if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset))]
        request.session['dataset'] = new_datasets

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', new_datasets)
        logger.info('datasets_updated')

    ds = Datasets().activate_datasets(request.session)
    #es_m = ds.build_manager(ES_Manager)

    return HttpResponseRedirect(URL_PREFIX + '/')

def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        log_dict = {'task': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}
        logging.getLogger(ERROR_LOGGER).error("Documents queried failed", extra=log_dict, exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')

    return out

def update(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    if 'model' in parameters:
        model = str(parameters['model'])
        request.session['model'] = model

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')

    if 'dataset' in parameters:
        # TODO: check that this is a valid mapping_id before changing session['dataset']
        new_dataset = parameters['dataset']
        if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset)):
            request.session['dataset'] = new_dataset

            logger.clean_context()
            logger.set_context('user_name', request.user.username)
            logger.set_context('new_dataset', new_dataset)
            logger.info('dataset_updated')

    ds = Datasets().activate_dataset(request.session)
    es_m = ds.build_manager(ES_Manager)

    return HttpResponseRedirect(URL_PREFIX + '/')

def remove_facts_from_document(self, rm_facts_dict, bs=7500):
    """Remove the given facts from all documents, given a dict of [str]fact name -> [str]fact value."""
    logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

    try:
        # Clear the readonly block just in case the index has been set to read-only
        self.es_m.clear_readonly_block()

        query = self._fact_deletion_query(rm_facts_dict)
        self.es_m.load_combined_query(query)
        response = self.es_m.scroll(size=bs, field_scroll=self.field)
        scroll_id = response['_scroll_id']
        total_docs = response['hits']['total']
        docs_left = total_docs  # DEBUG
        print('Starting.. Total docs - ', total_docs)  # DEBUG
        batch = 0

        while total_docs > 0:
            print('Docs left:', docs_left)  # DEBUG
            data = ''
            for document in response['hits']['hits']:
                new_field = []  # The new facts field
                for fact in document['_source'][self.field]:
                    # If the fact name is among the keys marked for removal...
                    if fact["fact"] in rm_facts_dict:
                        # ...keep the fact only if its value is not among the values to delete
                        if fact['str_val'] not in rm_facts_dict.getlist(fact["fact"]):
                            new_field.append(fact)
                    else:
                        new_field.append(fact)

                # Append the bulk update action and the partial document with the filtered facts
                data += json.dumps({"update": {"_id": document['_id'], "_type": document['_type'], "_index": document['_index']}}) + '\n'
                document = {'doc': {self.field: new_field}}
                data += json.dumps(document) + '\n'

            response = self.es_m.scroll(scroll_id=scroll_id, size=bs, field_scroll=self.field)
            total_docs = len(response['hits']['hits'])
            docs_left -= bs  # DEBUG
            scroll_id = response['_scroll_id']
            self.es_m.plain_post_bulk(self.es_m.es_url, data)
        print('DONE')  # DEBUG

        logger.set_context('docs_left', total_docs)
        logger.set_context('batch', batch)
        logger.info('remove_facts_from_document')
    except:
        print(traceback.format_exc())
        logger.set_context('es_params', self.es_params)
        logger.exception('remove_facts_from_document_failed')

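# Illustrative sketch (not part of the class above): the shape of the NDJSON bulk
# payload accumulated in `data` by remove_facts_from_document. Each scrolled document
# contributes two lines: an "update" action and a partial doc carrying the filtered
# facts field. The ids, index name and the "texta_facts" field name are hypothetical.
import json

def _example_bulk_payload(doc_id, doc_type, index, kept_facts, field='texta_facts'):
    action = json.dumps({"update": {"_id": doc_id, "_type": doc_type, "_index": index}})
    partial = json.dumps({"doc": {field: kept_facts}})
    return action + '\n' + partial + '\n'

print(_example_bulk_payload('abc123', 'doc', 'my_index', [{"fact": "PER", "str_val": "Alice"}]))
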
def delete(request):
    logger = LogManager(__name__, 'DELETE SEARCH')

    search_id = request.GET['pk']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_id', search_id)

    try:
        Search.objects.get(pk=search_id).delete()
        logger.info('search_deleted')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')

    return HttpResponse(search_id)

def delete(request):
    post_data = json.loads(request.POST['data'])

    logger = LogManager(__name__, 'DELETE SEARCH')
    search_ids = post_data['pks']
    logger.set_context('user_name', request.user.username)
    logger.set_context('search_ids', search_ids)

    try:
        for search_id in search_ids:
            Search.objects.get(pk=search_id).delete()
            logger.info('search_deleted:' + search_id)
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.exception('search_deletion_failed')

    return HttpResponse()

def update_model(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    try:
        model = {"pk": parameters["model_pk"],
                 "description": parameters["model_description"],
                 "unique_id": parameters["model_uuid"]}
        request.session['model'] = model

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_model', model)
        logger.info('model_updated')
        return HttpResponse(json.dumps({'status': 'success'}))
    except:
        return HttpResponse(json.dumps({'status': 'error'}))

def save(request):
    logger = LogManager(__name__, 'SAVE SEARCH')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)

    es_params = request.POST
    es_m.build(es_params)
    combined_query = es_m.get_combined_query()

    try:
        q = combined_query
        desc = request.POST['search_description']
        s_content = {}

        # Group the match texts by their match field into a dict for the search_content JSON
        for x in request.POST.keys():
            if 'match_txt' in x:
                # Get the numeric suffix of the field, e.g. match_txt_1 -> 1, match_txt_1533 -> 1533
                field_id = x.rsplit("_", 1)[-1]
                match_field = request.POST['match_field_' + field_id]
                if match_field in s_content.keys():
                    s_content[match_field].append(request.POST[x])
                else:
                    s_content[match_field] = [request.POST[x]]

        search = Search(author=request.user, search_content=json.dumps(s_content), description=desc, query=json.dumps(q))
        logger.info('Saving search for datasets: {}'.format(request.session['dataset']))
        search.save()
        for dataset_id in request.session['dataset']:
            dataset = Dataset.objects.get(pk=int(dataset_id))
            search.datasets.add(dataset)
        search.save()

        logger.set_context('user_name', request.user.username)
        logger.set_context('search_id', search.id)
        logger.info('search_saved')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('es_params', es_params)
        logger.exception('search_saving_failed')

    return HttpResponse()

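# Illustrative sketch (not part of the view above): how the match_txt_<n> /
# match_field_<n> POST parameters are folded into the search_content dict.
# The parameter values below are hypothetical.
post = {
    'match_txt_1': 'president', 'match_field_1': 'text.lemmas',
    'match_txt_2': 'minister',  'match_field_2': 'text.lemmas',
    'match_txt_7': 'Tallinn',   'match_field_7': 'text',
}

s_content = {}
for key in post:
    if 'match_txt' in key:
        field_id = key.rsplit('_', 1)[-1]               # e.g. 'match_txt_7' -> '7'
        match_field = post['match_field_' + field_id]
        s_content.setdefault(match_field, []).append(post[key])

# s_content == {'text.lemmas': ['president', 'minister'], 'text': ['Tallinn']}
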
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}),
            exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')

    return out

def update_dataset(request):
    logger = LogManager(__name__, 'CHANGE_SETTINGS')
    parameters = request.POST

    try:
        # TODO: check that these are valid mapping ids before changing session['dataset']
        new_datasets = parameters.getlist('dataset[]')
        new_datasets = [new_dataset for new_dataset in new_datasets
                        if request.user.has_perm('permission_admin.can_access_dataset_' + str(new_dataset))]
        request.session['dataset'] = new_datasets

        logger.clean_context()
        logger.set_context('user_name', request.user.username)
        logger.set_context('new_datasets', new_datasets)
        logger.info('datasets_updated')

        ds = Datasets().activate_datasets(request.session)
        return HttpResponse(json.dumps({'status': 'success'}))
    except:
        return HttpResponse(json.dumps({'status': 'error'}))

def facts_agg(es_params, request):
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    aggregation_data = es_params['aggregate_over']
    aggregation_data = json.loads(aggregation_data)
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                # Filter response
                bucket_filter = '{0}.'.format(original_aggregation_field.lower())
                final_bucket = []
                for b in response['aggregations']['strings']['buckets']:
                    if bucket_filter in b['key']:
                        fact_name = b['key'].split('.')[-1]
                        b['key'] = fact_name
                        final_bucket.append(b)
                final_bucket = final_bucket[:aggregation_size]
                response['aggregations']['distinct_values']['value'] = len(final_bucket)
                response['aggregations']['strings']['buckets'] = final_bucket

                normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon + labels))
                query_results.append({'name': name, 'data': normalised_counts, 'labels': labels})
                distinct_values.append({'name': name, 'data': response['aggregations']['distinct_values']['value']})

        es_m.build(es_params)
        # FIXME: this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            # Filter response
            bucket_filter = '{0}.'.format(original_aggregation_field.lower())
            final_bucket = []
            for b in response['aggregations']['strings']['buckets']:
                if bucket_filter in b['key']:
                    fact_name = b['key'].split('.')[-1]
                    b['key'] = fact_name
                    final_bucket.append(b)
            final_bucket = final_bucket[:aggregation_size]
            response['aggregations']['distinct_values']['value'] = len(final_bucket)
            response['aggregations']['strings']['buckets'] = final_bucket

            normalised_counts, labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon + labels))
            query_results.append({'name': 'Query', 'data': normalised_counts, 'labels': labels})
            distinct_values.append({'name': 'Query', 'data': response['aggregations']['distinct_values']['value']})

        data = [a + zero_list(len(query_results)) for a in map(list, zip(*[lexicon]))]
        data = [['Word'] + [query_result['name'] for query_result in query_results]] + data

        for i, word in enumerate(lexicon):
            for j, query_result in enumerate(query_results):
                for k, label in enumerate(query_result['labels']):
                    if word == label:
                        data[i + 1][j + 1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')
    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data) * 15
    table_height = table_height if table_height > 500 else 500

    return {'data': [data[0]] + sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),
            'height': table_height,
            'type': 'bar',
            'distinct_values': json.dumps(distinct_values)}

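# Illustrative sketch (not part of the view above): the shape of the `data` table that
# facts_agg builds before sorting, assuming zero_list(n) returns [0] * n. With a lexicon
# of two fact names and two (hypothetical) query_results entries, the result is a header
# row followed by one row of counts per fact name.
lexicon = ['ORG', 'PER']
query_results = [
    {'name': 'Saved search', 'labels': ['ORG', 'PER'], 'data': [12, 5]},
    {'name': 'Query',        'labels': ['PER'],        'data': [3]},
]

data = [a + [0] * len(query_results) for a in map(list, zip(*[lexicon]))]
data = [['Word'] + [qr['name'] for qr in query_results]] + data
for i, word in enumerate(lexicon):
    for j, qr in enumerate(query_results):
        for k, label in enumerate(qr['labels']):
            if word == label:
                data[i + 1][j + 1] = qr['data'][k]

# data == [['Word', 'Saved search', 'Query'], ['ORG', 12, 0], ['PER', 5, 3]]
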
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')
    try:
        start_time = time.time()
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)
        es_m.build(es_params)

        # DEFINING THE EXAMPLE SIZE
        es_m.set_query_parameter('from', es_params['examples_start'])
        es_m.set_query_parameter('size', es_params['num_examples'])

        # HIGHLIGHTING THE MATCHING FIELDS
        pre_tag = '<span class="[HL]" style="background-color:#FFD119">'
        post_tag = "</span>"
        highlight_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
        for field in es_params:
            if 'match_field' in field and es_params['match_operator_' + field.split('_')[-1]] != 'must_not':
                f = es_params[field]
                highlight_config['fields'][f] = {"number_of_fragments": 0}
        es_m.set_query_parameter('highlight', highlight_config)

        response = es_m.search()

        out['iTotalRecords'] = response['hits']['total']
        out['iTotalDisplayRecords'] = response['hits']['total']  # number of docs
        if int(out['iTotalDisplayRecords']) > 10000:  # Allow fewer pages if over the page limit
            out['iTotalDisplayRecords'] = '10000'

        # Get column names from the ES mapping
        out['column_names'] = es_m.get_column_names()

        for hit in response['hits']['hits']:
            hit_id = str(hit['_id'])
            row = OrderedDict([(x, '') for x in out['column_names']])  # OrderedDict to remember column names with their content

            inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
            name_to_inner_hits = defaultdict(list)
            for inner_hit_name, inner_hit in inner_hits.items():
                hit_type, _, _ = inner_hit_name.rsplit('_', 2)
                for inner_hit_hit in inner_hit['hits']['hits']:
                    source = inner_hit_hit['_source']
                    source['hit_type'] = hit_type
                    name_to_inner_hits[source['doc_path']].append(source)

            # Fill the row content respecting the order of the columns
            cols_data = {}
            for col in out['column_names']:
                # If the content is nested, break the flat name into a path list
                field_path = col.split('.')

                # Get content for this field path:
                # - Start with the hit structure
                # - For every field in field_path, retrieve the specific content
                # - Repeat this until arriving at the last field
                # - If a field in the field_path is not in this hit structure,
                #   make the content empty (to allow dynamic mapping without breaking alignment)
                content = hit['_source']
                for p in field_path:
                    if col == u'texta_facts' and p in content:
                        new_content = []
                        facts = ['{ "' + x["fact"] + '": "' + x["str_val"] + '"}' for x in sorted(content[p], key=lambda k: k['fact'])]
                        fact_counts = Counter(facts)

                        facts = sorted(list(set(facts)))
                        facts_dict = [json.loads(x) for x in facts]
                        for i, d in enumerate(facts_dict):
                            for k in d:
                                # Make fact names bold for the searcher
                                if '<b>' + k + '</b>' not in new_content:
                                    new_content.append('<b>' + k + '</b>')
                                new_content.append(' {}: {}'.format(d[k], fact_counts[facts[i]]))
                        content = '\n'.join(new_content)
                    else:
                        content = content[p] if p in content else ''

                # Strip fields with whitespace in front
                try:
                    old_content = content.strip()
                except:
                    old_content = content

                # Substitute the feature value with the value highlighted by Elasticsearch
                if col in highlight_config['fields'] and 'highlight' in hit:
                    content = hit['highlight'][col][0] if col in hit['highlight'] else ''

                # Prettify and standardize highlights
                highlight_data = []
                if name_to_inner_hits[col]:
                    color_map = ColorPicker.get_color_map(keys={hit['fact'] for hit in name_to_inner_hits[col]})
                    for inner_hit in name_to_inner_hits[col]:
                        datum = {
                            'spans': json.loads(inner_hit['spans']),
                            'name': inner_hit['fact'],
                            'category': '[{0}]'.format(inner_hit['hit_type']),
                            'color': color_map[inner_hit['fact']]
                        }
                        if inner_hit['hit_type'] == 'fact_val':
                            datum['value'] = inner_hit['str_val']
                        highlight_data.append(datum)

                    content = Highlighter(average_colors=True, derive_spans=True,
                                          additional_style_string='font-weight: bold;').highlight(
                        old_content,
                        highlight_data,
                        tagged_text=content)
                # else:
                #     # When using old-format documents this sometimes breaks in the Highlighter;
                #     # check whether content is a string instead of, for example, a list.
                #     highlight_data = []
                #     if isinstance(content, str) or isinstance(content, bytes):
                #         content = Highlighter(average_colors=True, derive_spans=True,
                #                               additional_style_string='font-weight: bold;').highlight(
                #             old_content,
                #             highlight_data,
                #             tagged_text=content)

                # Append the final content of this col to the row
                if row[col] == '':
                    row[col] = content
                cols_data[col] = {'highlight_data': highlight_data, 'content': content, 'old_content': old_content}

            # Transliterate the highlighting between different cols
            translit_search_cols = ['text', 'translit', 'lemmas']
            hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_search_cols]  # To get the value before '.' as well
            row = highlight_transliterately(cols_data, row, hl_cols=hl_cols)

            # Check whether the user wants to see the full text or a short version
            for col in row:
                if 'show_short_version' in es_params.keys():
                    row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'])

            out['aaData'].append(row.values())

        out['lag'] = time.time() - start_time

        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')

        return out

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        return out