Example #1
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        log_dict = {
            'task': 'SEARCH DOCUMENTS',
            'event': 'documents_queried_failed'
        }
        logging.getLogger(ERROR_LOGGER).error("Documents queried failed",
                                              extra=log_dict,
                                              exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {
            'column_names': [],
            'aaData': [],
            'iTotalRecords': 0,
            'iTotalDisplayRecords': 0,
            'lag': 0
        }

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')
    return out
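The `out` dictionary in these examples follows the legacy DataTables 1.9 server-side response shape (`aaData` for row data, `iTotalRecords`/`iTotalDisplayRecords` for counts). Since the success and failure paths build the same empty payload, it could be factored into a helper; a minimal sketch (the helper name is hypothetical, not part of texta):

def empty_search_result():
    # Hypothetical helper: the DataTables-style payload returned when a
    # search fails or matches nothing.
    return {
        'column_names': [],          # table header, taken from the ES mapping
        'aaData': [],                # one list of cell values per hit
        'iTotalRecords': 0,          # total hits reported by Elasticsearch
        'iTotalDisplayRecords': 0,   # hits available for paging
        'lag': 0                     # seconds spent executing the search
    }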
Example #2
def export_pages(request):

    es_params = request.session.get('export_args')
    if es_params is not None:
        if es_params['num_examples'] == '*':
            response = StreamingHttpResponse(get_all_rows(es_params, request), content_type='text/csv')
        else:
            response = StreamingHttpResponse(get_rows(es_params, request), content_type='text/csv')

        response['Content-Disposition'] = 'attachment; filename="%s"' % (es_params['filename'])

        return response

    logger = LogManager(__name__, 'SEARCH CORPUS')
    logger.set_context('user_name', request.user.username)
    logger.error('export pages failed, parameters empty')
    return HttpResponse()
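StreamingHttpResponse takes an iterator and streams it to the client, so get_rows and get_all_rows are presumably generators that yield CSV output a row at a time rather than materializing the whole export in memory. A minimal sketch of that pattern, using the standard csv-writer-over-a-pseudo-buffer idiom (the generator body, the 'features' key, and the scroll_hits helper are assumptions, not texta's actual code):

import csv

class Echo:
    # Pseudo-buffer: csv.writer calls write(), which simply returns the value.
    def write(self, value):
        return value

def get_rows_sketch(es_params, request):
    writer = csv.writer(Echo())
    yield writer.writerow(es_params['features'])      # header row ('features' key assumed)
    for hit in scroll_hits(es_params, request):       # scroll_hits is hypothetical
        yield writer.writerow([hit.get(f, '') for f in es_params['features']])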
Example #3
File: views.py Project: ekt68/texta
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    ds = Datasets().activate_datasets(request.session)
    es_m = ds.build_manager(ES_Manager)
    es_m.build(es_params)
    try:
        out = execute_search(es_m, es_params)
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(
            json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}

    logger.set_context('query', es_m.get_combined_query())
    logger.set_context('user_name', request.user.username)
    logger.info('documents_queried')
    return out
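Examples #1 and #3 are the same function and differ only in how they hand structure to ERROR_LOGGER: #1 passes a dict through logging's extra argument, while #3 serializes the dict into the message string. The extra route keeps the fields machine-readable, provided the handler's formatter knows about them; a minimal sketch of such a formatter (illustrative only, not texta's actual logging configuration):

import json
import logging

class JsonEventFormatter(logging.Formatter):
    # Illustrative: merge the custom attributes passed via extra= into one JSON line.
    def format(self, record):
        return json.dumps({
            'message': record.getMessage(),
            'task': getattr(record, 'task', None),
            'event': getattr(record, 'event', None)
        })

handler = logging.StreamHandler()
handler.setFormatter(JsonEventFormatter())
logging.getLogger('error_logger').addHandler(handler)  # logger name assumed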
Example #4
def search(es_params, request):
    logger = LogManager(__name__, 'SEARCH CORPUS')

    try:

        start_time = time.time()
        out = {'column_names': [],
               'aaData': [],
               'iTotalRecords': 0,
               'iTotalDisplayRecords': 0,
               'lag': 0}

        ds = Datasets().activate_dataset(request.session)
        es_m = ds.build_manager(ES_Manager)
        es_m.build(es_params)

        # DEFINING THE EXAMPLE SIZE
        es_m.set_query_parameter('from', es_params['examples_start'])
        es_m.set_query_parameter('size', es_params['num_examples'])

        # HIGHLIGHTING THE MATCHING FIELDS
        pre_tag = '<span class="[HL]" style="background-color:#FFD119">'
        post_tag = "</span>"
        highlight_config = {"fields": {}, "pre_tags": [pre_tag], "post_tags": [post_tag]}
        for field in es_params:
            if 'match_field' in field and es_params['match_operator_'+field.split('_')[-1]] != 'must_not':
                f = es_params[field]
                highlight_config['fields'][f] = {"number_of_fragments": 0}
        es_m.set_query_parameter('highlight', highlight_config)
        response = es_m.search()

        out['iTotalRecords'] = response['hits']['total']
        out['iTotalDisplayRecords'] = response['hits']['total'] # number of docs

        if int(out['iTotalDisplayRecords']) > 10000: # cap paging at Elasticsearch's 10,000-hit result window
            out['iTotalDisplayRecords'] = '10000'

        # get column names from the ES mapping
        out['column_names'] = es_m.get_column_names()

        for hit in response['hits']['hits']:
            hit_id = str(hit['_id'])
            row = OrderedDict([(x, '') for x in out['column_names']]) # OrderedDict to remember column names with their content

            inner_hits = hit.get('inner_hits', {})
            name_to_inner_hits = defaultdict(list)
            for inner_hit_name, inner_hit in inner_hits.items():
                hit_type, _, _ = inner_hit_name.rsplit('_', 2)
                for inner_hit_hit in inner_hit['hits']['hits']:
                    source = inner_hit_hit['_source']
                    source['hit_type'] = hit_type
                    name_to_inner_hits[source['doc_path']].append(source)


            # Fill the row content respecting the order of the columns
            cols_data = {}
            for col in out['column_names']:

                # If the content is nested, break the flat name into a path list
                field_path = col.split('.')

                # Get the content for this field path:
                #   - start with the hit structure
                #   - for every field in field_path, retrieve that field's content
                #   - repeat until the last field is reached
                #   - if a field in field_path is missing from this hit, leave the
                #     content empty (so dynamic mapping does not break alignment)
                content = hit['_source']
                for p in field_path:
                    if col == u'texta_facts' and p in content:
                        new_content = []
                        facts = ['{ "'+x["fact"]+'": "'+x["str_val"]+'"}' for x in sorted(content[p], key=lambda k: k['fact'])]
                        fact_counts = Counter(facts)

                        facts = sorted(list(set(facts)))
                        facts_dict = [json.loads(x) for x in facts]
                        for i, d in enumerate(facts_dict):
                            for k in d:
                                # Make fact names bold for the searcher
                                if '<b>'+k+'</b>' not in new_content:
                                    new_content.append('<b>'+k+'</b>')
                                new_content.append('    {}: {}'.format(d[k], fact_counts[facts[i]]))
                        content = '\n'.join(new_content)
                    else:
                        content = content[p] if p in content else ''


                # Strip surrounding whitespace when the content is a string
                try:
                    old_content = content.strip()
                except AttributeError:  # content may be a list or other non-string
                    old_content = content

                # Substitute feature value with value highlighted by Elasticsearch
                if col in highlight_config['fields'] and 'highlight' in hit:
                    content = hit['highlight'][col][0] if col in hit['highlight'] else ''
                # Prettify and standardize highlights
                highlight_data = []
                if name_to_inner_hits[col]:
                    color_map = ColorPicker.get_color_map(keys={ih['fact'] for ih in name_to_inner_hits[col]})
                    for inner_hit in name_to_inner_hits[col]:
                        datum = {
                            'spans': json.loads(inner_hit['spans']),
                            'name': inner_hit['fact'],
                            'category': '[{0}]'.format(inner_hit['hit_type']),
                            'color': color_map[inner_hit['fact']]
                        }

                        if inner_hit['hit_type'] == 'fact_val':
                            datum['value'] = inner_hit['str_val']
                        highlight_data.append(datum)

                    content = Highlighter(average_colors=True, derive_spans=True,
                                          additional_style_string='font-weight: bold;').highlight(
                                              old_content,
                                              highlight_data,
                                              tagged_text=content)
                # NOTE: old-format documents sometimes broke the highlighter here when
                # the content was not a string; if that recurs, guard the call above
                # with isinstance(content, (str, bytes)).

                # Append this column's final content to the row
                if row[col] == '':
                    row[col] = content

                cols_data[col] = {'highlight_data': highlight_data, 'content': content, 'old_content': old_content}

            # Transliterate the highlighting between different cols
            translit_search_cols = ['text', 'translit', 'lemmas']
            hl_cols = [x for x in cols_data if len(x.split('.')) > 1 and x.split('.')[-1] in translit_search_cols] # nested fields whose leaf is a translit column
            row = highlight_transliterately(cols_data, row, hl_cols=hl_cols)

            # Show either the full text or a shortened version, per user preference
            if 'show_short_version' in es_params:
                for col in row:
                    row[col] = additional_option_cut_text(row[col], es_params['short_version_n_char'])

            out['aaData'].append(row.values())

        # Compute lag and log the query once, after all hits are processed
        out['lag'] = time.time() - start_time
        logger.set_context('query', es_m.get_combined_query())
        logger.set_context('user_name', request.user.username)
        logger.info('documents_queried')

        return out

    except Exception as e:
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process': 'SEARCH DOCUMENTS', 'event': 'documents_queried_failed'}), exc_info=True)

        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.error('documents_queried_failed')

        out = {'column_names': [], 'aaData': [], 'iTotalRecords': 0, 'iTotalDisplayRecords': 0, 'lag': 0}
        return out
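The texta_facts branch in the loop above is the densest part: it serializes each (fact, str_val) pair to a small JSON string, counts duplicates with Counter, de-duplicates, then renders each fact name once in bold followed by its values and counts. The same transformation in isolation, runnable on its own (the sample facts are invented):

import json
from collections import Counter

sample_facts = [{'fact': 'PER', 'str_val': 'Alice'},
                {'fact': 'PER', 'str_val': 'Alice'},
                {'fact': 'ORG', 'str_val': 'ACME'}]

serialized = ['{ "' + x['fact'] + '": "' + x['str_val'] + '"}'
              for x in sorted(sample_facts, key=lambda k: k['fact'])]
fact_counts = Counter(serialized)

lines = []
for s in sorted(set(serialized)):
    for name, value in json.loads(s).items():
        if '<b>' + name + '</b>' not in lines:
            lines.append('<b>' + name + '</b>')        # bold each fact name once
        lines.append('    {}: {}'.format(value, fact_counts[s]))

print('\n'.join(lines))
# <b>ORG</b>
#     ACME: 1
# <b>PER</b>
#     Alice: 2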