def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    # Activate the datasets selected in the session and build the Elasticsearch query
    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        log_dict = {'task': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}
        logging.getLogger(ERROR_LOGGER).error("Documents queried failed", extra=log_dict, exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        # Fall back to an empty result table so the caller still gets a valid structure
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')

    return out
def export_pages(request):
    es_params = request.session.get('export_args')

    if es_params is not None:
        # '*' means export every matching row; otherwise export only the requested page
        if es_params['num_examples'] == '*':
            response = StreamingHttpResponse(get_all_rows(es_params, request), content_type='text/csv')
        else:
            response = StreamingHttpResponse(get_rows(es_params, request), content_type='text/csv')

        response['Content-Disposition'] = 'attachment; filename="%s"' % (es_params['filename'])
        return response

    logger = LogManager(__name__, 'SEARCH CORPUS')
    logger.set_context('user_name', request.user.username)
    logger.error('export pages failed, parameters empty')
    return HttpResponse()
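# A minimal sketch (not part of the original code) of how 'export_args' could be placed
# in the session before export_pages() is called. The view name below and the literal
# filename are assumptions; only 'num_examples' and 'filename' are read directly by
# export_pages() above, any other keys would be consumed by get_rows()/get_all_rows().
def remember_export_args(request):
    request.session['export_args'] = {
        'num_examples': '*',             # '*' -> stream every matching row via get_all_rows()
        'filename': 'search_export.csv', # used for the Content-Disposition header
    }
    return export_pages(request)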
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)

    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')

    return out
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    try:
        start_time = time.time()
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)
        es_m.build(es_params)

        # DEFINING THE EXAMPLE SIZE
        es_m.set_query_parameter('from', es_params['examples_start'])
        es_m.set_query_parameter('size', es_params['num_examples'])

        # HIGHLIGHTING THE MATCHING FIELDS
        pre_tag = '<span class="[HL]" style="background-color:#FFD119">'
        post_tag = "</span>"
        highlight_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
        for field in es_params:
            if 'match_field' in field and es_params['match_operator_' + field.split('_')[-1]] != 'must_not':
                f = es_params[field]
                highlight_config['fields'][f] = {"number_of_fragments": 0}
        es_m.set_query_parameter('highlight', highlight_config)

        response = es_m.search()

        out['iTotalRecords'] = response['hits']['total']
        out['iTotalDisplayRecords'] = response['hits']['total']  # number of docs
        if int(out['iTotalDisplayRecords']) > 10000:  # Allow less pages if over page limit
            out['iTotalDisplayRecords'] = '10000'

        # get column names from ES mapping
        out['column_names'] = es_m.get_column_names()

        for hit in response['hits']['hits']:
            hit_id = str(hit['_id'])
            row = OrderedDict([(x, '') for x in out['column_names']])  # OrderedDict to remember column names with their content

            inner_hits = hit['inner_hits'] if 'inner_hits' in hit else {}
            name_to_inner_hits = defaultdict(list)
            for inner_hit_name, inner_hit in inner_hits.items():
                hit_type, _, _ = inner_hit_name.rsplit('_', 2)
                for inner_hit_hit in inner_hit['hits']['hits']:
                    source = inner_hit_hit['_source']
                    source['hit_type'] = hit_type
                    name_to_inner_hits[source['doc_path']].append(source)

            # Fill the row content respecting the order of the columns
            cols_data = {}
            for col in out['column_names']:
                # If the content is nested, need to break the flat name into a path list
                field_path = col.split('.')

                # Get content for this field path:
                #   - Start with the hit structure
                #   - For every field in field_path, retrieve the specific content
                #   - Repeat this until arriving at the last field
                #   - If the field in the field_path is not in this hit structure,
                #     make content empty (to allow dynamic mapping without breaking alignment)
                content = hit['_source']
                for p in field_path:
                    if col == u'texta_facts' and p in content:
                        new_content = []
                        facts = ['{ "' + x["fact"] + '": "' + x["str_val"] + '"}' for x in sorted(content[p], key=lambda k: k['fact'])]
                        fact_counts = Counter(facts)
                        facts = sorted(list(set(facts)))
                        facts_dict = [json.loads(x) for x in facts]
                        for i, d in enumerate(facts_dict):
                            for k in d:
                                # Make fact names bold for searcher
                                if '<b>' + k + '</b>' not in new_content:
                                    new_content.append('<b>' + k + '</b>')
                                new_content.append(' {}: {}'.format(d[k], fact_counts[facts[i]]))
                        content = '\n'.join(new_content)
                    else:
                        content = content[p] if p in content else ''

                # To strip fields with whitespace in front
                try:
                    old_content = content.strip()
                except:
                    # content may be a non-string (e.g. a nested dict or list) without .strip()
                    old_content = content

                # Substitute feature value with value highlighted by Elasticsearch
                if col in highlight_config['fields'] and 'highlight' in hit:
                    content = hit['highlight'][col][0] if col in hit['highlight'] else ''

                # Prettify and standardize highlights
                highlight_data = []
                if name_to_inner_hits[col]:
                    color_map = ColorPicker.get_color_map(keys={ih['fact'] for ih in name_to_inner_hits[col]})
                    for inner_hit in name_to_inner_hits[col]:
                        datum = {
                            'spans': json.loads(inner_hit['spans']),
                            'name': inner_hit['fact'],
                            'category': '[{0}]'.format(inner_hit['hit_type']),
                            'color': color_map[inner_hit['fact']]
                        }
                        if inner_hit['hit_type'] == 'fact_val':
                            datum['value'] = inner_hit['str_val']
                        highlight_data.append(datum)
                    content = Highlighter(average_colors=True, derive_spans=True,
                                          additional_style_string='font-weight: bold;').highlight(
                        old_content,
                        highlight_data,
                        tagged_text=content)
                # else:
                #     # WHEN USING OLD FORMAT DOCUMENTS, SOMETIMES BREAKS AT HIGHLIGHTER, CHECK IF ITS STRING INSTEAD OF FOR EXAMPLE LIST
                #     highlight_data = []
                #     if (isinstance(content, str)) or (isinstance(content, bytes)):
                #         content = Highlighter(average_colors=True, derive_spans=True,
                #                               additional_style_string='font-weight: bold;').highlight(
                #             old_content,
                #             highlight_data,
                #             tagged_text=content)

                # Append the final content of this col to the row
                if row[col] == '':
                    row[col] = content
                cols_data[col] = {'highlight_data': highlight_data, 'content': content, 'old_content': old_content}

            # Transliterate the highlighting between different cols
            translit_search_cols = ['text', 'translit', 'lemmas']
            hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_search_cols]  # To get value before '.' as well
            row = highlight_transliterately(cols_data, row, hl_cols=hl_cols)

            # Check if user wants to see full text or short version
            for col in row:
                if 'show_short_version' in es_params.keys():
                    row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'])

            out['aaData'].append(row.values())

        out['lag'] = time.time() - start_time
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')

        return out

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')
        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        return out
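# A hedged usage sketch for search() above, not part of the original code. The keys
# mirror the ones the function reads ('examples_start', 'num_examples', 'match_field_*',
# 'match_operator_*', 'short_version_n_char'); any further query keys expected by
# es_m.build() are omitted, and the field name 'text.lemmas' is only an illustrative
# value, not something defined here.
def example_search_call(request):
    es_params = {
        'examples_start': 0,             # offset passed to the ES 'from' parameter
        'num_examples': 10,              # page size passed to the ES 'size' parameter
        'match_field_1': 'text.lemmas',  # field whose matches get highlighted
        'match_operator_1': 'must',      # 'must_not' fields are skipped for highlighting
        'short_version_n_char': 200,     # used only when 'show_short_version' is present
    }
    return search(es_params, request)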