Beispiel #1
0
 def __init__(self, request):
     """Bind the instance to the request's POST data and the active dataset.

     The dataset activated for ``request.session`` is stored together with
     its index, its mapping and an ``ES_Manager`` built from both.
     ``self.field`` is fixed to ``'texta_facts'``.
     """
     post_data = request.POST
     active_dataset = Datasets().activate_dataset(request.session)
     index_name = active_dataset.get_index()
     mapping_name = active_dataset.get_mapping()

     self.es_params = post_data
     self.ds = active_dataset
     self.index = index_name
     self.mapping = mapping_name
     self.es_m = ES_Manager(index_name, mapping_name)
     self.field = 'texta_facts'
Beispiel #2
0
 def __init__(self, es_index, es_mapping, field, query):
     """Remember the dataset coordinates and prepare a query-loaded ES manager.

     ``es_index``, ``es_mapping`` and ``field`` are stored unchanged; an
     ``ES_Manager`` is created for the index/mapping pair and ``query`` is
     loaded into it via ``load_combined_query``.
     """
     self.es_index, self.es_mapping, self.field = es_index, es_mapping, field

     # Keep a short local alias while the manager is being configured.
     self.es_m = manager = ES_Manager(es_index, es_mapping)
     manager.load_combined_query(query)
Beispiel #3
0
def open_close_dataset(request):
    """Open or close the Elasticsearch index behind a dataset.

    Reads ``dataset_id`` and ``open_close`` from the POST data.  The index
    is opened when ``open_close`` equals ``'open'`` and closed for any other
    value.  Always answers with an empty ``HttpResponse``.
    """
    target = Dataset.objects.get(pk=request.POST['dataset_id'])

    wants_open = request.POST['open_close'] == 'open'
    toggle = ES_Manager.open_index if wants_open else ES_Manager.close_index
    toggle(target.index)

    return HttpResponse()
Beispiel #4
0
def open_close_dataset(request):
    """Open or close the Elasticsearch index behind a dataset.

    Reads ``dataset_id`` and ``open_close`` from the POST data.  The index
    is opened when ``open_close`` equals ``'open'`` and closed for any other
    value.  Always answers with an empty ``HttpResponse``.
    """
    target = Dataset.objects.get(pk=request.POST['dataset_id'])

    wants_open = request.POST['open_close'] == 'open'
    toggle = ES_Manager.open_index if wants_open else ES_Manager.close_index
    toggle(target.index)

    return HttpResponse()
Beispiel #5
0
    def parse_request(self, request):
        """Populate the instance from the request's POST data and session.

        Sets ``lookup_types`` and ``key_constraints`` (comma-separated POST
        values split into lists), ``content`` (the stripped last line of the
        POSTed ``content``), the active dataset's index and mapping, an
        ``ES_Manager`` for them, and the requesting user.
        """
        post = request.POST
        self.lookup_types = post['lookup_types'].split(',')
        self.key_constraints = post['key_constraints'].split(',')

        # Only the text after the final newline is kept.
        last_line = post['content'].split('\n')[-1]
        self.content = last_line.strip()
        print(self.content)

        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user
Beispiel #6
0
def delete_index(request):
    """Delete every dataset listed in the POSTed ``dataset_ids[]``.

    For each id the matching ``can_access_dataset_<id>`` permission, the
    Elasticsearch index and the ``Dataset`` row are removed, in that order.

    Returns:
        HttpResponseRedirect to the permission admin page.
    """
    # The content type is the same for every dataset, so resolve it once
    # instead of on every loop iteration.
    content_type = ContentType.objects.get_for_model(Dataset)

    for dataset_id in request.POST.getlist('dataset_ids[]'):
        dataset = Dataset.objects.get(pk=dataset_id)
        Permission.objects.get(
            codename='can_access_dataset_' + str(dataset.id),
            content_type=content_type,
        ).delete()

        ES_Manager.delete_index(dataset.index)
        dataset.delete()

    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')
Beispiel #7
0
def delete_index(request):
    """Delete every dataset listed in the POSTed ``dataset_ids[]``.

    For each id the matching ``can_access_dataset_<id>`` permission, the
    Elasticsearch index and the ``Dataset`` row are removed, in that order.

    Returns:
        HttpResponseRedirect to the permission admin page.
    """
    # The content type is the same for every dataset, so resolve it once
    # instead of on every loop iteration.
    content_type = ContentType.objects.get_for_model(Dataset)

    for dataset_id in request.POST.getlist('dataset_ids[]'):
        dataset = Dataset.objects.get(pk=dataset_id)
        Permission.objects.get(
            codename='can_access_dataset_' + str(dataset.id),
            content_type=content_type,
        ).delete()

        ES_Manager.delete_index(dataset.index)
        dataset.delete()

    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')
Beispiel #8
0
def index(request):
    """Render the dataset importer page.

    The template context carries the supported format lists (archives,
    single documents, document collections, databases), all import jobs,
    the datasets the requesting user may access, completed model-training
    tasks (highest primary key first) and the available analyzers.
    """
    page = loader.get_template('dataset_importer.html')
    import_jobs = DatasetImport.objects.all()

    archives, single_documents, collections, databases = (
        collect_map_entries(format_map)
        for format_map in (extractor_map, entity_reader_map,
                           collection_reader_map, database_reader_map)
    )

    accessible = Datasets().get_allowed_datasets(request.user)

    trained = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value)
    completed = trained.filter(status__iexact=Task.STATUS_COMPLETED)
    models_newest_first = completed.order_by('-pk')

    analyzer_list = ES_Manager.get_analyzers()

    # NOTE: the 'enabled_input_types' and 'enabled_preprocessors' entries
    # were commented out in this view and are not part of the context.
    context = dict(
        archive_formats=archives,
        single_document_formats=single_documents,
        document_collection_formats=collections,
        database_formats=databases,
        language_models=models_newest_first,
        allowed_datasets=accessible,
        jobs=import_jobs,
        analyzers=analyzer_list,
    )

    return HttpResponse(page.render(context, request))
Beispiel #9
0
def index(request):
    """Render the dataset importer page.

    The template context carries the supported format lists (archives,
    single documents, document collections, databases), all import jobs,
    the datasets the requesting user may access, completed model-training
    tasks (highest primary key first) and the available analyzers.
    """
    page = loader.get_template('dataset_importer.html')
    import_jobs = DatasetImport.objects.all()

    archives, single_documents, collections, databases = (
        collect_map_entries(format_map)
        for format_map in (extractor_map, entity_reader_map,
                           collection_reader_map, database_reader_map)
    )

    accessible = Datasets().get_allowed_datasets(request.user)

    trained = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value)
    completed = trained.filter(status__iexact=Task.STATUS_COMPLETED)
    models_newest_first = completed.order_by('-pk')

    analyzer_list = ES_Manager.get_analyzers()

    # NOTE: the 'enabled_input_types' and 'enabled_preprocessors' entries
    # were commented out in this view and are not part of the context.
    context = dict(
        archive_formats=archives,
        single_document_formats=single_documents,
        document_collection_formats=collections,
        database_formats=databases,
        language_models=models_newest_first,
        allowed_datasets=accessible,
        jobs=import_jobs,
        analyzers=analyzer_list,
    )

    return HttpResponse(page.render(context, request))
Beispiel #10
0
def get_example_texts(request, field, value):
    """Return highlighted example snippets where ``field`` matches ``value``.

    Runs a ``match`` query (``size`` 10) with highlighting on ``field``
    against the dataset active in ``request.session`` and collects the first
    highlight fragment of every highlighted field of every hit.

    Returns:
        list: highlight fragments in hit order.  Hits that carry no
        ``highlight`` entry (or an empty fragment list) contribute nothing
        instead of raising.
    """
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({
        "size": 10,
        "highlight": {"fields": {field: {}}},
        "query": {"match": {field: value}},
    })
    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    return [
        fragments[0]
        for hit in response['hits']['hits']
        # A hit is not guaranteed to have a "highlight" key; treat a
        # missing one as "no fragments" rather than failing with KeyError.
        for fragments in hit.get('highlight', {}).values()
        if fragments
    ]
Beispiel #11
0
def index(request):
    """Render the permission admin page.

    The context contains all users annotated with their dataset
    permissions, the datasets, the Elasticsearch indices sorted by name,
    the datasets the requesting user may access and the completed
    model-training tasks (highest primary key first).
    """
    # Sort the indices alphabetically by their name.
    es_indices = sorted(ES_Manager.get_indices(), key=lambda entry: entry['index'])
    known_datasets = get_datasets(indices=es_indices)

    annotated_users = annotate_users_with_permissions(User.objects.all(),
                                                      known_datasets)
    page = loader.get_template('permission_admin.html')
    accessible = Datasets().get_allowed_datasets(request.user)

    trained = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value)
    completed = trained.filter(status__iexact=Task.STATUS_COMPLETED)
    models_newest_first = completed.order_by('-pk')

    context = {
        'users': annotated_users,
        'datasets': known_datasets,
        'indices': es_indices,
        'STATIC_URL': STATIC_URL,
        'URL_PREFIX': URL_PREFIX,
        'allowed_datasets': accessible,
        'language_models': models_newest_first,
    }
    return HttpResponse(page.render(context, request))
Beispiel #12
0
def add_dataset(request):
    """Create a ``Dataset`` from POST data and return it as JSON.

    Reads ``index``, ``mapping`` and ``access`` from the POST data, saves
    the dataset, sets up its access permission and answers with the
    dataset's fields plus ``status``, ``docs_count`` and ``store_size``:

    * taken from the Elasticsearch index whose name equals the dataset's
      index, when there is one;
    * otherwise ``'open'`` / ``'multiindex'`` / ``'multiindex'`` when the
      index name contains ``*``.

    ``author`` is replaced by the requesting user's username and ``_state``
    is blanked so the payload is JSON serialisable.
    """
    dataset = Dataset(author=request.user,
                      index=request.POST['index'],
                      mapping=request.POST['mapping'],
                      daterange="",
                      access=request.POST['access'])
    dataset.save()

    create_dataset_access_permission_and_propagate(dataset,
                                                   request.POST['access'])
    indices = ES_Manager.get_indices()

    # Work on a copy: writing into dataset.__dict__ directly would overwrite
    # the model instance's own attributes (including its ``_state``).
    ds_out = dict(dataset.__dict__)

    # Wildcard defaults are applied up front so they are also present when
    # Elasticsearch reports no indices at all; an exact match overrides them.
    if '*' in ds_out['index']:
        ds_out['status'] = 'open'
        ds_out['docs_count'] = 'multiindex'
        ds_out['store_size'] = 'multiindex'

    for index in indices:
        if index['index'] == ds_out['index']:
            ds_out['status'] = index['status']
            ds_out['docs_count'] = index['docs_count']
            ds_out['store_size'] = index['store_size']
            break

    ds_out['_state'] = ''
    ds_out['author'] = request.user.get_username()
    return JsonResponse(ds_out)
Beispiel #13
0
 def get_tags_by_id(self, doc_id):
     """Return the ``texta_tags`` of a document as a list of tokens.

     The document is fetched with ``ES_Manager.plain_get`` from
     ``<es_url>/<index>/<mapping>/<doc_id>``.  A document whose ``_source``
     has no ``texta_tags`` entry yields an empty list.
     """
     url_parts = (self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
     document = ES_Manager.plain_get('{0}/{1}/{2}/{3}'.format(*url_parts))
     # str.split() without arguments splits on any whitespace run.
     return document['_source'].get('texta_tags', "").split()
Beispiel #14
0
 def __init__(self, es_index, es_mapping, field, query):
     """Remember the dataset coordinates and prepare a query-loaded ES manager.

     ``es_index``, ``es_mapping`` and ``field`` are stored unchanged; an
     ``ES_Manager`` is created for the index/mapping pair and ``query`` is
     loaded into it via ``load_combined_query``.
     """
     self.es_index, self.es_mapping, self.field = es_index, es_mapping, field

     # Keep a short local alias while the manager is being configured.
     self.es_m = manager = ES_Manager(es_index, es_mapping)
     manager.load_combined_query(query)
Beispiel #15
0
def delete_index(request):
    """Remove one dataset and its Elasticsearch index.

    The POSTed ``index`` value is used as the ``Dataset`` primary key.

    Returns:
        HttpResponseRedirect to the permission admin page.
    """
    dataset_pk = request.POST['index']
    # The index name is read before remove_dataset() runs; presumably that
    # helper deletes the Dataset row - verify against its definition.
    index_name = Dataset.objects.get(pk=dataset_pk).index

    remove_dataset(dataset_pk)
    # The call's return value was previously bound to an unused local.
    ES_Manager.delete_index(index_name)

    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')
Beispiel #16
0
def delete_index(request):
    """Remove one dataset and its Elasticsearch index.

    The POSTed ``index`` value is used as the ``Dataset`` primary key.

    Returns:
        HttpResponseRedirect to the permission admin page.
    """
    dataset_pk = request.POST['index']
    # The index name is read before remove_dataset() runs; presumably that
    # helper deletes the Dataset row - verify against its definition.
    index_name = Dataset.objects.get(pk=dataset_pk).index

    remove_dataset(dataset_pk)
    # The call's return value was previously bound to an unused local.
    ES_Manager.delete_index(index_name)

    return HttpResponseRedirect(URL_PREFIX + '/permission_admin/')
Beispiel #17
0
 def get_tags_by_id(self, doc_id):
     """Return the ``texta_tags`` of a document as a list of tokens.

     The document is fetched with ``ES_Manager.plain_get`` from
     ``<es_url>/<index>/<mapping>/<doc_id>``.  A document whose ``_source``
     has no ``texta_tags`` entry yields an empty list.
     """
     url_parts = (self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
     document = ES_Manager.plain_get('{0}/{1}/{2}/{3}'.format(*url_parts))
     # str.split() without arguments splits on any whitespace run.
     return document['_source'].get('texta_tags', "").split()
Beispiel #18
0
 def check_if_analyzer_exists(self):
     """Validate the analyzer named in ``self.post_dict["analyzer"]``.

     Raises:
         ValueError: when the name is not among the analyzers reported by
             ``ES_Manager.get_analyzers()``.
     """
     known_analyzers = ES_Manager.get_analyzers()
     requested = self.post_dict["analyzer"]
     names = [entry["analyzer"] for entry in known_analyzers]

     if requested in names:
         return

     message = "Analyzer '{0}' not available. Available analyzers are: '{1}'"
     raise ValueError(message.format(requested, names))
Beispiel #19
0
 def get_allowed_datasets(self, user):
     """Return the datasets ``user`` is permitted to access.

     Datasets come from ``self.sort_datasets`` applied to the current
     Elasticsearch indices; one is kept when the user holds the
     ``permission_admin.can_access_dataset_<id>`` permission for it.
     """
     indices = ES_Manager.get_indices()

     permitted = []
     for entry in self.sort_datasets(indices):
         codename = 'permission_admin.can_access_dataset_' + str(entry['id'])
         if user.has_perm(codename):
             permitted.append(entry)
     return permitted
Beispiel #20
0
    def __init__(self, request):
        """Run the whole aggregation for ``request`` during construction.

        Steps, in order: bind to the active dataset, read the POST
        parameters, derive the date range and its intervals, build the
        aggregation query, execute it and parse the responses into
        ``self.agg_data``.
        """
        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # Aggregation parameters come straight from the POST data.
        params = request.POST
        self.es_params = params
        bucket_interval = params["interval_1"]

        self.daterange = self._get_daterange(params)

        intervals = self._get_date_intervals(self.daterange, bucket_interval)
        self.ranges, self.date_labels = intervals
        self.agg_query = self.prepare_agg_query()

        # Execute the aggregation, then turn the raw responses into the
        # JSON-ready structure.
        raw_results = self.aggregate()
        self.agg_data = self.parse_responses(raw_results)
Beispiel #21
0
    def parse_request(self, request):
        """Populate the instance from the request's POST data and session.

        Sets ``lookup_types`` and ``key_constraints`` (comma-separated POST
        values split into lists), ``content`` (the stripped last line of the
        POSTed ``content``), the active dataset's index and mapping, an
        ``ES_Manager`` for them, and the requesting user.
        """
        post = request.POST
        self.lookup_types = post['lookup_types'].split(',')
        self.key_constraints = post['key_constraints'].split(',')

        # Only the text after the final newline is kept.
        last_line = post['content'].split('\n')[-1]
        self.content = last_line.strip()
        print(self.content)

        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user
Beispiel #22
0
def index(request):
    """Render the permission admin page.

    The context contains all users annotated with their dataset
    permissions, the datasets, the Elasticsearch indices sorted by name,
    the datasets the requesting user may access and the completed
    model-training tasks (highest primary key first).
    """
    # Sort the indices alphabetically by their name.
    es_indices = sorted(ES_Manager.get_indices(), key=lambda entry: entry['index'])
    known_datasets = get_datasets(indices=es_indices)

    annotated_users = annotate_users_with_permissions(User.objects.all(),
                                                      known_datasets)
    page = loader.get_template('permission_admin.html')
    accessible = Datasets().get_allowed_datasets(request.user)

    trained = Task.objects.filter(task_type=TaskTypes.TRAIN_MODEL.value)
    completed = trained.filter(status__iexact=Task.STATUS_COMPLETED)
    models_newest_first = completed.order_by('-pk')

    context = {
        'users': annotated_users,
        'datasets': known_datasets,
        'indices': es_indices,
        'STATIC_URL': STATIC_URL,
        'URL_PREFIX': URL_PREFIX,
        'allowed_datasets': accessible,
        'language_models': models_newest_first,
    }
    return HttpResponse(page.render(context, request))
Beispiel #23
0
def get_example_texts(request, field, value):
    """Return highlighted example snippets where ``field`` matches ``value``.

    Runs a ``match`` query (``size`` 10) with highlighting on ``field``
    against the dataset active in ``request.session`` and collects the first
    highlight fragment of every highlighted field of every hit.

    Returns:
        list: highlight fragments in hit order.  Hits that carry no
        ``highlight`` entry (or an empty fragment list) contribute nothing
        instead of raising.
    """
    ds = Datasets().activate_dataset(request.session)
    dataset = ds.get_index()
    mapping = ds.get_mapping()

    query = json.dumps({
        "size": 10,
        "highlight": {"fields": {field: {}}},
        "query": {"match": {field: value}},
    })
    response = ES_Manager.plain_scroll(es_url, dataset, mapping, query)

    return [
        fragments[0]
        for hit in response['hits']['hits']
        # A hit is not guaranteed to have a "highlight" key; treat a
        # missing one as "no fragments" rather than failing with KeyError.
        for fragments in hit.get('highlight', {}).values()
        if fragments
    ]
Beispiel #24
0
    def __init__(self, request):
        """Run the whole aggregation for ``request`` during construction.

        Steps, in order: bind to the active dataset, read the POST
        parameters, derive the date range and its intervals, build the
        aggregation query, execute it and parse the responses into
        ``self.agg_data``.
        """
        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # Aggregation parameters come straight from the POST data.
        params = request.POST
        self.es_params = params
        bucket_interval = params["interval_1"]

        self.daterange = self._get_daterange(params)

        intervals = self._get_date_intervals(self.daterange, bucket_interval)
        self.ranges, self.date_labels = intervals
        self.agg_query = self.prepare_agg_query()

        # Execute the aggregation, then turn the raw responses into the
        # JSON-ready structure.
        raw_results = self.aggregate()
        self.agg_data = self.parse_responses(raw_results)
Beispiel #25
0
def more_like_this(request):
    """Return documents similar to those listed in a JSON POST body.

    The body is decoded as UTF-8 JSON and validated with
    ``ValidateFormSerializer``; the validated ``like`` entries are resolved
    to ``_id``/``_index``/``_type`` triples and handed to
    ``ES_Manager.more_like_this``.

    Returns:
        JsonResponse:
        * 405 for any method other than POST (the view used to fall
          through and return ``None``);
        * 400 when the body cannot be decoded, fails validation, contains
          no ``like`` documents, or the result contains an
          ``"elasticsearch"`` key;
        * 200 with the hits otherwise.
    """
    if request.method != "POST":
        return JsonResponse(
            {"detail": 'Method "{}" not allowed.'.format(request.method)},
            status=405)

    try:
        utf8_post_payload = json.loads(request.body.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        return JsonResponse({"json": str(e)}, status=400)

    valid_request = ValidateFormSerializer(data=utf8_post_payload)
    if not valid_request.is_valid():
        logging.getLogger(ERROR_LOGGER).error("Request: {}, Response: {}".format(request.POST, valid_request.errors))
        return JsonResponse(valid_request.errors, status=400)

    post_data = valid_request.validated_data
    fields = list(post_data["fields"])
    size = post_data.get("size", 10)
    returned_fields = post_data.get("returned_fields", None)
    if_agg_only = post_data.get("if_agg_only", False)

    like = []
    dataset = None
    for document in post_data["like"]:
        dataset = Dataset.objects.get(pk=document["dataset_id"])
        like.append({
            "_id": document["document_id"],
            "_index": dataset.index,
            "_type": dataset.mapping,
        })

    # Without any "like" document there is no dataset to query; this used
    # to surface as an unbound-variable error on the call below.
    if dataset is None:
        return JsonResponse(
            {"like": ["At least one document is required."]}, status=400)

    # NOTE: ``dataset`` is the dataset of the last "like" document.
    hits = ES_Manager.more_like_this(
        elastic_url=es_url,
        fields=fields,
        like=like,
        size=size,
        dataset=dataset,
        return_fields=returned_fields,
        filters=post_data.get("filters", []),
        aggregations=post_data.get("aggregations", []),
        include=post_data.get("include", False),
        if_agg_only=if_agg_only,
    )

    status = 400 if "elasticsearch" in hits else 200
    return JsonResponse(hits, status=status)
Beispiel #26
0
def add_dataset(request):
    """Create a ``Dataset`` from POST data and return it as JSON.

    Reads ``index``, ``mapping`` and ``access`` from the POST data, saves
    the dataset, sets up its access permission and answers with the
    dataset's fields plus ``status``, ``docs_count`` and ``store_size``:

    * taken from the Elasticsearch index whose name equals the dataset's
      index, when there is one;
    * otherwise ``'open'`` / ``'multiindex'`` / ``'multiindex'`` when the
      index name contains ``*``.

    ``author`` is replaced by the requesting user's username and ``_state``
    is blanked so the payload is JSON serialisable.
    """
    dataset = Dataset(author=request.user,
                      index=request.POST['index'],
                      mapping=request.POST['mapping'],
                      daterange="",
                      access=request.POST['access'])
    dataset.save()

    create_dataset_access_permission_and_propagate(dataset,
                                                   request.POST['access'])
    indices = ES_Manager.get_indices()

    # Work on a copy: writing into dataset.__dict__ directly would overwrite
    # the model instance's own attributes (including its ``_state``).
    ds_out = dict(dataset.__dict__)

    # Wildcard defaults are applied up front so they are also present when
    # Elasticsearch reports no indices at all; an exact match overrides them.
    if '*' in ds_out['index']:
        ds_out['status'] = 'open'
        ds_out['docs_count'] = 'multiindex'
        ds_out['store_size'] = 'multiindex'

    for index in indices:
        if index['index'] == ds_out['index']:
            ds_out['status'] = index['status']
            ds_out['docs_count'] = index['docs_count']
            ds_out['store_size'] = index['store_size']
            break

    ds_out['_state'] = ''
    ds_out['author'] = request.user.get_username()
    return JsonResponse(ds_out)
Beispiel #27
0
def more_like_this(request):
    """Return documents similar to those listed in a JSON POST body.

    The body is decoded as UTF-8 JSON and validated with
    ``ValidateFormSerializer``; every requested field is queried through
    its ``.keyword`` sub-field, and the validated ``like`` entries are
    resolved to ``_id``/``_index``/``_type`` triples and handed to
    ``ES_Manager.more_like_this``.

    Returns:
        JsonResponse:
        * 405 for any method other than POST (the view used to fall
          through and return ``None``);
        * 400 when the body cannot be decoded, fails validation, contains
          no ``like`` documents, or the result contains an
          ``"elasticsearch"`` key;
        * 200 with the hits otherwise.
    """
    if request.method != "POST":
        return JsonResponse(
            {"detail": 'Method "{}" not allowed.'.format(request.method)},
            status=405)

    try:
        utf8_post_payload = json.loads(request.body.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        return JsonResponse({"json": str(e)}, status=400)

    valid_request = ValidateFormSerializer(data=utf8_post_payload)
    if not valid_request.is_valid():
        logging.getLogger(ERROR_LOGGER).error("Request: {}, Response: {}".format(request.POST, valid_request.errors))
        return JsonResponse(valid_request.errors, status=400)

    post_data = valid_request.validated_data
    fields = ["{}.keyword".format(field) for field in post_data["fields"]]
    size = post_data.get("size", 10)
    returned_fields = post_data.get("returned_fields", None)
    if_agg_only = post_data.get("if_agg_only", False)

    like = []
    dataset = None
    for document in post_data["like"]:
        dataset = Dataset.objects.get(pk=document["dataset_id"])
        like.append({
            "_id": document["document_id"],
            "_index": dataset.index,
            "_type": dataset.mapping,
        })

    # Without any "like" document there is no dataset to query; this used
    # to surface as an unbound-variable error on the call below.
    if dataset is None:
        return JsonResponse(
            {"like": ["At least one document is required."]}, status=400)

    # NOTE: ``dataset`` is the dataset of the last "like" document.
    hits = ES_Manager.more_like_this(
        elastic_url=es_url,
        fields=fields,
        like=like,
        size=size,
        dataset=dataset,
        return_fields=returned_fields,
        filters=post_data.get("filters", []),
        aggregations=post_data.get("aggregations", []),
        if_agg_only=if_agg_only,
    )

    status = 400 if "elasticsearch" in hits else 200
    return JsonResponse(hits, status=status)
Beispiel #28
0
def get_datasets(indices=None):
    """Return all ``Dataset`` objects, enriched with index statistics.

    Args:
        indices: optional iterable of dicts with at least the keys
            ``index``, ``status`` and ``store_size``.  When falsy, no
            statistics are attached.

    Returns:
        The ``Dataset.objects.all()`` queryset.  NOTE(review): the list
        ``datasets_out`` built below is never returned; callers receive the
        queryset instead.  Presumably iterating it again yields the same
        instances that were mutated here - confirm before changing.
    """
    datasets = Dataset.objects.all()
    datasets_out = []
    for dataset in datasets:
        # __dict__ is the instance's own attribute dict, so every key
        # written to ds_out below also becomes an attribute of `dataset`.
        ds_out = dataset.__dict__

        if indices:
            # No break: every entry of `indices` is visited for each dataset.
            for index in indices:
                if index['index'] == ds_out['index']:
                    ds_out['status'] = index['status']
                    ds_out['docs_count'] = ES_Manager.single_index_count(
                        index["index"])  # The docs count passed in via `indices` is wrong, so it is queried again.
                    ds_out['store_size'] = index['store_size']

                # Dataset names containing '*' get fixed placeholder values
                # on every non-matching entry.
                elif '*' in ds_out['index']:
                    ds_out['status'] = 'open'
                    ds_out['docs_count'] = 'multiindex'
                    ds_out['store_size'] = 'multiindex'

        datasets_out.append(ds_out)

    return datasets
Beispiel #29
0
def get_analyzer_names(request):
    """Answer with ``{"analyzers": [...]}`` listing every analyzer's name.

    The names are the ``"analyzer"`` values of the entries returned by
    ``ES_Manager.get_analyzers()``.  ``request`` itself is not inspected.
    """
    names = [entry["analyzer"] for entry in ES_Manager.get_analyzers()]
    return JsonResponse({"analyzers": names})
Beispiel #30
0
def get_mappings(request):
    """Return the mappings of the index named by the ``index`` GET parameter.

    The result of ``ES_Manager.get_mappings`` is serialised with
    ``json.dumps`` and sent as the body of a plain ``HttpResponse``.
    """
    index_name = request.GET['index']
    mappings = ES_Manager.get_mappings(index_name)
    payload = json.dumps(mappings)
    return HttpResponse(payload)
Beispiel #31
0
def _build_match_phrase_query(match_field, phrase, slop):
    """Return a ``match_phrase`` query body; ``slop`` is added only when positive."""
    phrase_clause = {"query": phrase}
    if slop > 0:
        phrase_clause["slop"] = slop
    return {"query": {"match_phrase": {match_field: phrase_clause}}}


def _record_mwe_responses(responses, phrases, min_freq, user, final,
                          mwe_counter):
    """Fold one batch of multisearch responses into ``final``.

    ``responses[j]`` is the answer for ``phrases[j]``.  Phrases whose hit
    total reaches ``min_freq`` are grouped under the conceptualised form of
    their alphabetically sorted words.

    Returns:
        int: the next free MWE id (``mwe_counter`` advanced by the number
        of phrases recorded).
    """
    for j, response in enumerate(responses):
        freq = response['hits']['total']
        if freq < min_freq:
            continue

        phrase = phrases[j]
        sorted_phrase = ' '.join(sorted(phrase.split(' ')))
        group_key = conceptualise_phrase(sorted_phrase, user)

        if group_key not in final:
            final[group_key] = {
                'total_freq': 0,
                'mwes': [],
                'display_name': {
                    'freq': 0,
                    'label': False
                },
                # Group ids are handed out in creation order, so the next
                # id equals the number of groups that already exist.
                'id': len(final)
            }
        group = final[group_key]

        group['total_freq'] += freq
        group['mwes'].append({
            'mwe': phrase,
            'freq': freq,
            'accepted': False,
            'id': mwe_counter
        })
        mwe_counter += 1
        group['mwes'].sort(reverse=True, key=lambda k: k['freq'])

        # The most frequent phrase seen so far names the group.
        if freq > group['display_name']['freq']:
            group['display_name']['freq'] = freq
            group['display_name']['label'] = phrase
    return mwe_counter


def find_mappings(request):
    """Mine multi-word expressions (MWEs) from the selected lexicons.

    Reads ``slop``, ``max_len``, ``min_len``, ``min_freq``, ``match_field``,
    ``description`` and ``lexicons[]`` from the POST data.  For every
    permutation of lexicon words of length ``min_len``..``max_len`` that
    takes its words from pairwise different lexicons, a ``match_phrase``
    query is sent (in batches of 50) against the active dataset.  Phrases
    with at least ``min_freq`` hits are grouped and the result is stored as
    JSON in a ``Run`` record, which is marked ``'completed'``.

    Returns:
        None.  Any exception is printed and logged to the error logger
        rather than propagated.
    """
    # Stays None when the failure happens before the Run exists, so the
    # error handler below can log without tripping over an unbound name.
    new_run = None
    try:
        slop = int(request.POST['slop'])
        max_len = int(request.POST['max_len'])
        min_len = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']

        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        # word_index maps each word to the positions of the lexicons that
        # contain it.
        lexicon = []
        word_index = {}
        num_lexicons = 0
        for i, lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons += 1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                word_index.setdefault(word, []).append(i)
        lexicon = list(set(lexicon))
        min_len = min(min_len, num_lexicons)

        mwe_counter = 0
        phrases = []
        final = {}
        data = []
        new_run = Run(minimum_frequency=min_freq,
                      maximum_length=max_len,
                      minimum_length=min_len,
                      run_status='running',
                      run_started=datetime.now(),
                      run_completed=None,
                      user=request.user,
                      description=description)
        new_run.save()
        logging.getLogger(INFO_LOGGER).info(
            json.dumps({
                'process': 'MINE MWEs',
                'event': 'mwe_mining_started',
                'args': {
                    'user_name': request.user.username,
                    'run_id': new_run.id,
                    'slop': slop,
                    'min_len': min_len,
                    'max_len': max_len,
                    'min_freq': min_freq,
                    'match_field': match_field,
                    'desc': description
                }
            }))

        # The multisearch header line is identical for every query.
        header = json.dumps({"index": dataset, "mapping": mapping})

        for i in range(min_len, max_len + 1):
            print('Permutation len:', i)
            for permutation in itertools.permutations(lexicon, i):
                word_indices = list(
                    flatten([word_index[word] for word in permutation]))
                # Skip permutations that use the same lexicon twice.
                if len(word_indices) != len(set(word_indices)):
                    continue

                phrase = ' '.join(permutation)
                query = _build_match_phrase_query(match_field, phrase, slop)
                data.append(header + '\n' + json.dumps(query))
                phrases.append(phrase)

                if len(data) == batch_size:
                    mwe_counter = _record_mwe_responses(
                        ES_Manager.plain_multisearch(es_url, dataset, mapping,
                                                     data),
                        phrases, min_freq, request.user, final, mwe_counter)
                    data = []
                    phrases = []
            logging.getLogger(INFO_LOGGER).info(
                json.dumps({
                    'process': 'MINE MWEs',
                    'event': 'mwe_mining_progress',
                    'args': {
                        'user_name': request.user.username,
                        'run_id': new_run.id
                    },
                    'data': {
                        'permutations_processed': i + 1 - min_len,
                        'total_permutations': max_len - min_len + 1
                    }
                }))

        # Flush the last, partially filled batch; with nothing queued there
        # is nothing to ask Elasticsearch.
        if data:
            m_response = ES_Manager.plain_multisearch(es_url, dataset,
                                                      mapping, data)
            mwe_counter = _record_mwe_responses(m_response, phrases, min_freq,
                                                request.user, final,
                                                mwe_counter)

        for group in final.values():
            group['concept_name'] = {'freq': -1, 'label': ''}

        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results = json.dumps(final)
        r.save()
        logging.getLogger(INFO_LOGGER).info(
            json.dumps({
                'process': 'MINE MWEs',
                'event': 'mwe_mining_completed',
                'args': {
                    'user_name': request.user.username,
                    'run_id': new_run.id
                }
            }))
    except Exception as e:
        print(e)
        logging.getLogger(ERROR_LOGGER).error(json.dumps({
            'process': 'MINE MWEs',
            'event': 'mwe_mining_failed',
            'args': {
                'user_name': request.user.username,
                'run_id': new_run.id if new_run is not None else None
            }
        }),
                                              exc_info=True)
Beispiel #32
0
 def check_if_analyzer_exists(self):
     ELASTICSEARCH_ANALYZERS = ES_Manager.get_analyzers()
     user_sent_analyzer = self.post_dict["analyzer"]
     available_analyzer_names = list(map(lambda x: x["analyzer"], ELASTICSEARCH_ANALYZERS))
     if user_sent_analyzer not in available_analyzer_names:
         raise ValueError("Analyzer '{0}' not available. Available analyzers are: '{1}'".format(user_sent_analyzer, available_analyzer_names))
Beispiel #33
0
class AggManager:
    """ Manage Searcher aggregations and plotting preparations
    """
    def __init__(self,request):
        """Run the whole aggregation pipeline for one request.

        Activates the dataset stored in the session, derives the date ranges
        and the aggregation query from the POST parameters, executes the
        aggregation and stores the parsed result in ``self.agg_data``.
        """
        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # Everything below is driven by the raw POST parameters
        self.es_params = request.POST
        chosen_interval = self.es_params["interval_1"]

        self.daterange = self._get_daterange(self.es_params)
        date_ranges, date_labels = self._get_date_intervals(self.daterange, chosen_interval)
        self.ranges = date_ranges
        self.date_labels = date_labels

        # The query must exist before aggregate() runs, which reads self.agg_query
        self.agg_query = self.prepare_agg_query()

        # Execute, then turn the raw responses into the JSON-ready structure
        raw_results = self.aggregate()
        self.agg_data = self.parse_responses(raw_results)



    @staticmethod
    def _get_daterange(es_params):
        daterange = {"min":es_params["agg_daterange_from_1"],"max":es_params["agg_daterange_to_1"]}
        return daterange


    @staticmethod
    def _get_date_intervals(daterange,interval):
        if daterange['min'] and daterange['max']:
            frmt = "%Y-%m-%d"
            start_datetime = datetime.strptime(daterange['min'],frmt)
            end_datetime = datetime.strptime(daterange['max'],frmt)
        
            if interval == 'year':
                rdelta = relativedelta(years=+1)
            elif interval == 'quarter':
                rdelta = relativedelta(months=+3)
            elif interval == 'month':
                rdelta = relativedelta(months=+1)
            elif interval == 'week':
                rdelta = relativedelta(weeks=+1)
            elif interval == 'day':
                rdelta = relativedelta(days=+1)

            next_calculated_datetime = start_datetime + rdelta
            dates = [start_datetime, next_calculated_datetime]
            labels = [start_datetime.strftime(frmt),next_calculated_datetime.strftime(frmt)]

            while next_calculated_datetime < end_datetime:
                next_calculated_datetime += rdelta
                dates.append(next_calculated_datetime)
                labels.append(next_calculated_datetime.strftime(frmt))

            dates.append(end_datetime)
            labels.append(end_datetime.strftime(frmt))

            dates_str = []
            for i,date in enumerate(dates[1:]):
                dates_str.append({'from':dates[i].strftime(frmt),'to':date.strftime(frmt)})

            return dates_str,labels

        else:

            return [],[]


    def prepare_agg_query(self):
        es_params = self.es_params

        agg_field_1 = es_params["agg_field_1"]
        agg_field_1 = json.loads(agg_field_1)
        sort_by_1 = es_params["sort_by_1"]
        agg_field_2 = es_params["agg_field_2"]
        agg_field_2 = json.loads(agg_field_2)
        sort_by_2 = es_params["sort_by_2"]

        try:
            agg_size_1 = int(es_params["agg_size_1"])        
            agg_size_2 = int(es_params["agg_size_2"])
        except KeyError:
            agg_size_1 = 10
            agg_size_2 = 10

        field_type_to_name = {'date': 'daterange', 'string':'string', 'text': 'string', 'keyword': 'string', 'facts': 'fact', 'fact_str_val': 'fact_str_val', 'fact_num_val': 'fact_num_val'}

        agg_name_1 = field_type_to_name[agg_field_1['type']]
        agg_name_2 = field_type_to_name[agg_field_2['type']]

        # If aggregating over text field, use .keyword instead
        if  agg_field_1['type'] == 'text' and sort_by_1 in ['terms', 'significant_terms']: # NEW PY REQUIREMENT
            agg_field_1['path'] = '{0}.keyword'.format(agg_field_1['path'])
        if  agg_field_2['type'] == 'text' and sort_by_2 in ['terms', 'significant_terms']: # NEW PY REQUIREMENT
            agg_field_2['path'] = '{0}.keyword'.format(agg_field_2['path'])

        # 1st LEVEL AGGREGATION
        agg = self.create_agg(agg_name_1,sort_by_1,agg_field_1["path"],agg_size_1)

        if agg_name_1 == 'fact' and es_params["agg_field_2_selected"] == 'false':
            agg[agg_name_1]["aggs"][agg_name_1]['aggs']['fact_str_val'] = \
                self.create_agg('fact_str_val', sort_by_1, agg_field_1['path'], agg_size_1)['fact_str_val']['aggs']['fact_str_val']

        # 2nd LEVEL AGGREGATION
        if es_params["agg_field_2_selected"] == 'true':
            agg_2 = self.create_agg(agg_name_2,sort_by_2,agg_field_2["path"],agg_size_2)
            if agg_name_1 == 'fact' and agg_name_2 == 'fact_str_val':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs'] = agg_2[agg_name_2]['aggs']
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents'] = {"reverse_nested": {}}
            elif 'fact' in agg_name_1 and agg_name_2 == 'string':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents']['aggs'] = agg_2
            else:
                if agg_name_2 == 'fact':
                    agg[agg_name_1]["aggregations"] = agg_2
                    agg[agg_name_1]["aggregations"][agg_name_2]['aggs'][agg_name_2]['aggs'] = self.create_agg('fact_str_val', sort_by_2, agg_field_2['path'], agg_size_2)['fact_str_val']['aggs']
                else:
                    agg[agg_name_1]["aggregations"] = agg_2
        
        return agg


    def create_agg(self, agg_name, sort_by, path, size):
        if agg_name == "daterange":
            return {agg_name: {"date_range": {"field": path, "format": date_format, "ranges": self.ranges}}}
        elif agg_name == 'fact':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.fact", "size": size},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        elif agg_name == 'fact_str_val':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.str_val", "size": size, 'order': {'documents.doc_count': 'desc'}},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        elif agg_name == 'fact_num_val':
            return {
                agg_name: {
                    "nested": {"path": "texta_facts"},
                    "aggs": {
                        agg_name: {
                            sort_by: {"field": "texta_facts.num_val", "size": size, 'order': {'documents.doc_count': 'desc'}},
                            "aggs": {"documents": {"reverse_nested": {}}}
                        }
                    }
                }
            }
        else:
            return {agg_name: {sort_by: {"field": path, "size": size}}}


    def aggregate(self):
        responses = []
        out = {}

        # EXECUTE SAVED SEARCHES
        for item in self.es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=self.es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                self.es_m.load_combined_query(saved_query)
                self.es_m.set_query_parameter("aggs", self.agg_query)
                response = self.es_m.search()
                responses.append({"id":"search_"+str(s.pk),"label":name,"response":response})

        # EXECUTE THE LIVE QUERY
        if "ignore_active_search" not in self.es_params:
            self.es_m.build(self.es_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            self.es_m.set_query_parameter("size", 0)
            response = self.es_m.search()
            #raise Exception(self.es_m.combined_query['main']['aggs'])
            responses.append({"id":"query","label":"Current Search","response":response})

        out["responses"] = responses

        # EXECUTE EMPTY TIMELINE QUERY IF RELATIVE FREQUENCY SELECTED     
        if json.loads(self.es_params["agg_field_1"])["type"] == "date" and self.es_params["freq_norm_1"] == "relative_frequency":
            empty_params = {}
            self.es_m.build(empty_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            response = self.es_m.search()
            out["empty_timeline_response"] = response

        return out


    def parse_responses(self,agg_results):
        """ Parses ES responses into JSON structure and normalises daterange frequencies if necessary
        """

        total_freqs = {}
        agg_data = []

        if "empty_timeline_response" in agg_results:
            for bucket in agg_results["empty_timeline_response"]["aggregations"]["daterange"]["buckets"]:
                total_freqs[bucket["from_as_string"]] = bucket["doc_count"]

        for i,response in enumerate(agg_results["responses"]):
            aggs = response["response"]["aggregations"]
            output_type = None
            response_out = []

            for agg_name,agg_results in aggs.items():
                output_type = agg_name

                if agg_name == 'daterange':
                    response_out.extend(self._parse_daterange_buckets(agg_results['buckets'], total_freqs, self.es_params['freq_norm_1']))
                elif agg_name == 'string':
                    response_out.extend(self._parse_string_buckets(agg_results['buckets']))
                elif agg_name == 'fact':
                    response_out.extend(self._parse_fact_buckets(agg_results['fact']['buckets']))
                elif agg_name == 'fact_str_val' or agg_name == 'fact_num_val':
                    response_out.extend(self._parse_fact_buckets(agg_results[agg_name]['buckets']))

            agg_data.append({"data":response_out,"type":output_type,"label":response["label"]})

        return agg_data
        
    def _parse_daterange_buckets(self, buckets, total_freqs, freq_norm_1):
        results = []

        for bucket in buckets:
            new = {"children":[]}
            new["key"] = bucket["from_as_string"]
            # Normalises frequencies
            if freq_norm_1 == "relative_frequency":
                try:
                    new["val"] = str(round(float(bucket["doc_count"])/float(total_freqs[bucket["from_as_string"]]),5))
                except ZeroDivisionError:
                    new["val"] = 0
            else:
                new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {'key': inner_bucket['key'], 'val': inner_bucket['doc_count']}
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val']['buckets']:
                        grandchildren.append({'key': super_inner_bucket['key'], 'val': super_inner_bucket['documents']['doc_count']})

                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val']['buckets']:
                    new['children'].append({'key': inner_bucket['key'], 'val': inner_bucket['documents']['doc_count']})

            results.append(new)

        return results

    def _parse_string_buckets(self, buckets):
        results = []

        for bucket in buckets:
            new = {"children":[]}

            new["key"] = bucket["key"]
            new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {'key': inner_bucket['key'], 'val': inner_bucket['doc_count']}
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val']['buckets']:
                        grandchildren.append({'key': super_inner_bucket['key'], 'val': super_inner_bucket['documents']['doc_count']})

                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val']['buckets']:
                    new['children'].append({'key': inner_bucket['key'], 'val': inner_bucket['documents']['doc_count']})

            results.append(new)

        return results

    def _parse_fact_buckets(self, buckets):
        results = []

        for bucket in buckets:
            new = {"children": []}

            new["key"] = bucket["key"]
            new["val"] = bucket["documents"]["doc_count"]

            if 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['documents']['doc_count']
                    new['children'].append(child)

            elif 'documents' in bucket and 'string' in bucket['documents']:
                for inner_bucket in bucket['documents']['string']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['doc_count']
                    new['children'].append(child)

            results.append(new)

        return results


    def _parse_fact_val_results(self, buckets):
        pass

    def output_to_searcher(self):
        count_dict = defaultdict(defaultdict)
        children_dict = defaultdict(dict)
        i = 0

        data_out = []

        for agg in self.agg_data:
            if agg["type"] == "daterange":
                i+=1
                for row in agg["data"]:
                    count_dict[row["key"]][i] = row["val"]
                    if row["children"]:
                        children_dict[row["key"]][i] = {"data":row["children"],"label":agg["label"]}
            else:
                data_out.append(agg)

        combined_daterange_data = []
        labels = [a["label"] for a in self.agg_data]
        
        for row in sorted(count_dict.items(),key=lambda l:l[0]):
            new_row = dict(row[1])
            new_row["date"] = row[0]
            combined_daterange_data.append(new_row)

        daterange_data = {"type":"daterange",
                          "data":combined_daterange_data,
                          "ykeys":list(range(1,i+1)),
                          "labels":labels,
                          "children":dict(children_dict)}

        if daterange_data["data"]:
            data_out.append(daterange_data)

        return data_out
Beispiel #34
0
def _filter_fact_buckets(response, original_aggregation_field, max_buckets):
    """Restrict the 'strings' buckets of an ES response to one field, in place.

    Keeps only buckets whose key contains ``"<field>."`` (field lower-cased),
    renames each kept bucket to the part after the last dot, truncates the
    list to ``max_buckets`` and stores the resulting length as the
    ``distinct_values`` value.
    """
    bucket_filter = '{0}.'.format(original_aggregation_field.lower())
    kept = []
    for bucket in response['aggregations']['strings']['buckets']:
        if bucket_filter in bucket['key']:
            bucket['key'] = bucket['key'].split('.')[-1]
            kept.append(bucket)
    kept = kept[:max_buckets]
    response['aggregations']['distinct_values']['value'] = len(kept)
    response['aggregations']['strings']['buckets'] = kept


def facts_agg(es_params, request):
    """Aggregate fact names for the saved searches and the live query.

    Args:
        es_params: request parameters; ``aggregate_over`` is a JSON string
            with a ``path`` key, ``sort_by`` names the ES aggregation to use,
            and every key containing ``saved_search`` holds a ``Search`` pk.
        request: the HTTP request (session selects the dataset, user is logged).

    Returns:
        A dict with ``data`` (header row followed by one row per fact name,
        sorted by descending row sum), ``height``, ``type`` (``'bar'``) and
        ``distinct_values`` (JSON string). If the aggregation fails the error
        is logged and whatever was collected so far is returned; at minimum a
        header-only table.
    """
    logger = LogManager(__name__, 'FACTS AGGREGATION')

    distinct_values = []
    query_results = []
    lexicon = []
    # Header-only table so the code after the except block still has a `data`
    # to work with; previously a failure inside the try raised NameError there.
    data = [['Word']]
    aggregation_data = json.loads(es_params['aggregate_over'])
    original_aggregation_field = aggregation_data['path']
    aggregation_field = 'texta_link.facts'

    try:
        aggregation_size = 50
        aggregations = {"strings": {es_params['sort_by']: {"field": aggregation_field, 'size': 0}},
                        "distinct_values": {"cardinality": {"field": aggregation_field}}}

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()
        date_range = ds.get_date_range()
        es_m = ES_Manager(dataset, mapping, date_range)

        for item in es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                es_m.load_combined_query(saved_query)
                es_m.set_query_parameter('aggs', aggregations)
                response = es_m.search()

                _filter_fact_buckets(response, original_aggregation_field, aggregation_size)

                normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
                lexicon = list(set(lexicon+labels))
                query_results.append({'name':name,'data':normalised_counts,'labels':labels})
                distinct_values.append({'name':name,'data':response['aggregations']['distinct_values']['value']})

        es_m.build(es_params)
        # FIXME
        # this is confusing for the user
        if not es_m.is_combined_query_empty():
            es_m.set_query_parameter('aggs', aggregations)
            response = es_m.search()

            _filter_fact_buckets(response, original_aggregation_field, aggregation_size)

            normalised_counts,labels = normalise_agg(response, es_m, es_params, 'strings')
            lexicon = list(set(lexicon+labels))
            query_results.append({'name':'Query','data':normalised_counts,'labels':labels})
            distinct_values.append({'name':'Query','data':response['aggregations']['distinct_values']['value']})

        # One row per word: the word followed by a zero-filled cell per query result
        data = [[word]+zero_list(len(query_results)) for word in lexicon]
        data = [['Word']+[query_result['name'] for query_result in query_results]]+data

        for i,word in enumerate(lexicon):
            for j,query_result in enumerate(query_results):
                for k,label in enumerate(query_result['labels']):
                    if word == label:
                        data[i+1][j+1] = query_result['data'][k]

        logger.set_context('user_name', request.user.username)
        logger.info('facts_aggregation_queried')

    except Exception as e:
        print('-- Exception[{0}] {1}'.format(__name__, e))
        logger.set_context('user_name', request.user.username)
        logger.exception('facts_aggregation_query_failed')

    table_height = len(data)*15
    table_height = table_height if table_height > 500 else 500
    return {'data':[data[0]]+sorted(data[1:], key=lambda x: sum(x[1:]), reverse=True),'height':table_height,'type':'bar','distinct_values':json.dumps(distinct_values)}
Beispiel #35
0
class Autocomplete:
    def __init__(self):
        self.es_m = None
        self.lookup_type = None
        self.key_constraints = None
        self.content = None
        self.user = None
        self.limit = None

    def parse_request(self, request):
        """Load the autocomplete state from a POST request.

        Reads the comma-separated ``lookup_types`` and ``key_constraints``,
        keeps only the stripped last line of ``content`` as the text to
        complete, activates the session's dataset and builds the ES manager
        for it, and remembers the requesting user.
        """
        post = request.POST
        self.lookup_types = post['lookup_types'].split(',')
        self.key_constraints = post['key_constraints'].split(',')
        # Only the last line of the submitted text is completed
        self.content = post['content'].split('\n')[-1].strip()
        # (A leftover debug print of the user-submitted content was removed here.)

        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user

    def suggest(self, limit=10):
        self.limit = limit

        suggestions = {}

        for i, lookup_type in enumerate(self.lookup_types):
            if lookup_type == 'FACT_NAME':
                suggestions['FACT_NAME'] = self._get_facts('fact', lookup_type)
            elif lookup_type == 'FACT_VAL':
                suggestions['FACT_VAL'] = self._get_facts(
                    'str_val',
                    lookup_type,
                    key_constraint=self.key_constraints[i])
            elif lookup_type == 'CONCEPT':
                suggestions['CONCEPT'] = self._get_concepts()
            elif lookup_type == 'LEXICON':
                suggestions['LEXICON'] = self._get_lexicons()
        return suggestions

    def _get_facts(self, agg_subfield, lookup_type, key_constraint=None):
        """Suggest fact names or fact values from a nested ``texta_facts`` aggregation.

        The query buckets by fact name and, inside each, by string values
        matching ``<content>.*`` (at most ``self.limit`` per fact). For
        ``FACT_VAL`` the values are returned, restricted to the fact named
        ``key_constraint`` when one is given; for any other lookup type the
        fact names themselves are returned.
        """
        value_terms = {
            "field": "texta_facts.str_val",
            "size": self.limit,
            "include": "{0}.*".format(self.content)
        }
        name_agg = {
            "terms": {"field": "texta_facts.fact"},
            "aggs": {"fact_values": {"terms": value_terms}}
        }
        agg_query = {
            agg_subfield: {
                "nested": {"path": "texta_facts"},
                "aggs": {agg_subfield: name_agg}
            }
        }

        self.es_m.build('')
        self.es_m.set_query_parameter("aggs", agg_query)
        name_buckets = self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]

        if lookup_type != 'FACT_VAL':
            return [
                self._format_suggestion(name_bucket["key"], name_bucket["key"])
                for name_bucket in name_buckets
            ]

        facts = []
        for name_bucket in name_buckets:
            # With a constraint, only the values of that one fact are wanted
            if key_constraint and name_bucket["key"] != key_constraint:
                continue
            facts += [
                self._format_suggestion(value_bucket["key"], value_bucket["key"])
                for value_bucket in name_bucket["fact_values"]["buckets"]
            ]

        return facts

    def _get_concepts(self):
        concepts = []

        if len(self.content) > 0:
            terms = Term.objects.filter(term__startswith=self.content).filter(
                author=self.user)
            seen = {}
            for term in terms[:self.limit]:
                for term_concept in TermConcept.objects.filter(term=term.pk):
                    concept = term_concept.concept
                    concept_term = (concept.pk, term.term)

                    if concept_term not in seen:
                        seen[concept_term] = True

                        display_term = term.term.replace(
                            self.content,
                            '<font color="red">' + self.content + '</font>')
                        display_text = '<b>{0}</b>@C{1}-{2}'.format(
                            display_term, concept.pk,
                            concept.descriptive_term.term)

                        suggestion = self._format_suggestion(
                            concept.descriptive_term.term,
                            display_text,
                            resource_id=concept.pk)
                        concepts.append(suggestion)

        return concepts

    def _get_lexicons(self):
        suggested_lexicons = []

        if len(self.content) > 0:
            lexicons = Lexicon.objects.filter(
                name__startswith=self.content).filter(author=self.user)
            for lexicon in lexicons:
                display_term = lexicon.name.replace(
                    self.content,
                    '<font color="red">' + self.content + '</font>')
                display_text = '<b>{0}</b>@L{1}-{2}'.format(
                    display_term, lexicon.pk, lexicon.name)

                suggestion = self._format_suggestion(lexicon.name,
                                                     display_text,
                                                     resource_id=lexicon.pk)
                suggested_lexicons.append(suggestion)

        return suggested_lexicons

    @staticmethod
    def _format_suggestion(entry_text, display_text, resource_id=''):
        return {
            'entry_text': entry_text,
            'display_text': display_text,
            'resource_id': resource_id
        }
Beispiel #36
0
def get_next_page_data(query, es_from, last_page, query_data, request):
    """Collect the next page of grammar-matched rows from Elasticsearch.

    Starting at offset ``es_from``, fetches result pages until
    ``query_data['page_length']`` rows whose match status agrees with
    ``query_data['polarity']`` have been gathered or the results run out.

    Args:
        query: ES query dict; its ``'from'`` key is overwritten as paging advances.
        es_from: offset to start from, or ``None`` when there is nothing to fetch.
        last_page: number of the previously served page.
        query_data: dict providing ``page_length``, ``dataset``, ``mapping``,
            ``polarity``, ``inclusive_instructions`` and the ids stored in the
            ``GrammarPageMapping``.
        request: HTTP request; its user is recorded as the mapping's author.

    Returns:
        A dict with ``rows``, ``from``, ``page``, ``end`` and ``total``. When
        ``es_from`` is ``None`` an empty page is returned without querying.
    """
    start_from = None
    end = None
    rows = []
    page_length = query_data['page_length']

    if es_from is None:
        return {
            'rows': rows,
            'from': start_from,
            'page': last_page,
            'end': None,
            'total': None
        }

    dataset = query_data['dataset']
    mapping = query_data['mapping']
    polarity = query_data['polarity']
    inclusive_instructions = query_data['inclusive_instructions']

    query['from'] = es_from
    response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    # Primed so the first loop test behaves as if the previous response was a full page
    hit_idx = page_length - 1

    # Keep fetching while the page is not full, the response has hits and the
    # previous response contained exactly page_length hits.
    while (len(rows) < page_length
           and 'hits' in response
           and 'hits' in response['hits']
           and response['hits']['hits']
           and hit_idx + 1 == page_length):
        for hit_idx, hit in enumerate(response['hits']['hits']):
            if len(rows) >= page_length:
                break

            # Flatten one level of nested fields into 'field.subfield' names
            feature_dict = {}
            for field_name in hit['_source']:
                field_value = hit['_source'][field_name]
                if isinstance(field_value, dict):
                    for subfield_name, subfield_value in field_value.items():
                        combined_field_name = '{0}.{1}'.format(
                            field_name, subfield_name)
                        feature_dict[combined_field_name] = subfield_value
                else:
                    feature_dict[field_name] = field_value

            sorted_feature_names = sorted(feature_dict)

            # Row column 0 is the document id, so feature columns are 1-based
            feature_to_idx_map = defaultdict(list)
            for feature_idx, feature in enumerate(sorted_feature_names):
                feature_to_idx_map[feature.split('.')[0]].append(feature_idx + 1)

            row = [hit['_id']]
            row.extend(feature_dict[feature_name]
                       for feature_name in sorted_feature_names)
            layer_dict = matcher.LayerDict(feature_dict)

            inclusive_matches = inclusive_instructions.match(layer_dict)

            # Keep the row if polarity is positive and it matched, or negative and it did not
            if (polarity == 'positive') == bool(inclusive_matches):
                if len(rows) == 0:
                    start_from = query['from'] + hit_idx
                end = query['from'] + hit_idx

                if inclusive_matches:
                    row = highlight(row, feature_to_idx_map, inclusive_matches)

                rows.append(row)

        query['from'] = query['from'] + hit_idx + 1

        if len(rows) >= page_length:
            break
        response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    # NOTE(review): `end` is tested for truthiness here and in the return value,
    # so an end offset of 0 is treated like "no match" -- confirm this is intended.
    if end:
        GrammarPageMapping(
            search_id=query_data['search_id'],
            inclusive_grammar=query_data['inclusive_grammar_id'],
            exclusive_grammar=query_data['exclusive_grammar_id'],
            page=query_data['requested_page'],
            polarity=query_data['polarity'],
            elastic_start=start_from,
            elastic_end=end + 1,
            author=request.user).save()

    return {
        'rows': rows,
        'from': start_from,
        'page': last_page + 1,
        'end': (end + (0 if last_page == 0 else 1)) if end else end,
        'total': response['hits']['total']
    }
Beispiel #37
0
def get_analyzer_names(request):
    """Respond with the analyzer names reported by ES_Manager, as JSON.

    The payload has a single key, ``"analyzers"``, holding the list of names.
    """
    analyzers = ES_Manager.get_analyzers()
    names = [entry["analyzer"] for entry in analyzers]
    return JsonResponse({"analyzers": names})
Beispiel #38
0
def get_mappings(request):
    """Respond with the JSON-encoded mappings of the index named in the query string."""
    index_name = request.GET['index']
    mappings = ES_Manager.get_mappings(index_name)
    payload = json.dumps(mappings)
    return HttpResponse(payload)
Beispiel #39
0
class AggManager:
    """ Manage Searcher aggregations and plotting preparations
    """
    def __init__(self, request):
        """Build, run and parse the aggregation described by the request.

        The active dataset is taken from the session, the aggregation
        parameters from ``request.POST``. The parsed result is stored in
        ``self.agg_data``.
        """
        active_dataset = Datasets().activate_dataset(request.session)
        self.dataset = active_dataset.get_index()
        self.mapping = active_dataset.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        # Prepare the aggregation query from the posted parameters
        self.es_params = request.POST
        interval = self.es_params["interval_1"]
        self.daterange = self._get_daterange(self.es_params)
        date_ranges, date_labels = self._get_date_intervals(
            self.daterange, interval)
        self.ranges = date_ranges
        self.date_labels = date_labels
        self.agg_query = self.prepare_agg_query()

        # Execute the aggregation, then parse the responses into a JSON-ready structure
        raw_results = self.aggregate()
        self.agg_data = self.parse_responses(raw_results)

    @staticmethod
    def _get_daterange(es_params):
        daterange = {
            "min": es_params["agg_daterange_from_1"],
            "max": es_params["agg_daterange_to_1"]
        }
        return daterange

    @staticmethod
    def _get_date_intervals(daterange, interval):
        if daterange['min'] and daterange['max']:
            frmt = "%Y-%m-%d"
            start_datetime = datetime.strptime(daterange['min'], frmt)
            end_datetime = datetime.strptime(daterange['max'], frmt)

            if interval == 'year':
                rdelta = relativedelta(years=+1)
            elif interval == 'quarter':
                rdelta = relativedelta(months=+3)
            elif interval == 'month':
                rdelta = relativedelta(months=+1)
            elif interval == 'week':
                rdelta = relativedelta(weeks=+1)
            elif interval == 'day':
                rdelta = relativedelta(days=+1)

            next_calculated_datetime = start_datetime + rdelta
            dates = [start_datetime, next_calculated_datetime]
            labels = [
                start_datetime.strftime(frmt),
                next_calculated_datetime.strftime(frmt)
            ]

            while next_calculated_datetime < end_datetime:
                next_calculated_datetime += rdelta
                dates.append(next_calculated_datetime)
                labels.append(next_calculated_datetime.strftime(frmt))

            dates.append(end_datetime)
            labels.append(end_datetime.strftime(frmt))

            dates_str = []
            for i, date in enumerate(dates[1:]):
                dates_str.append({
                    'from': dates[i].strftime(frmt),
                    'to': date.strftime(frmt)
                })

            return dates_str, labels

        else:

            return [], []

    def prepare_agg_query(self):
        es_params = self.es_params

        agg_field_1 = es_params["agg_field_1"]
        agg_field_1 = json.loads(agg_field_1)
        sort_by_1 = es_params["sort_by_1"]
        agg_field_2 = es_params["agg_field_2"]
        agg_field_2 = json.loads(agg_field_2)
        sort_by_2 = es_params["sort_by_2"]

        try:
            agg_size_1 = int(es_params["agg_size_1"])
            agg_size_2 = int(es_params["agg_size_2"])
        except KeyError:
            agg_size_1 = 10
            agg_size_2 = 10

        field_type_to_name = {
            'date': 'daterange',
            'string': 'string',
            'text': 'string',
            'keyword': 'string',
            'facts': 'fact',
            'fact_str_val': 'fact_str_val',
            'fact_num_val': 'fact_num_val'
        }

        agg_name_1 = field_type_to_name[agg_field_1['type']]
        agg_name_2 = field_type_to_name[agg_field_2['type']]

        # If aggregating over text field, use .keyword instead
        if agg_field_1['type'] == 'text' and sort_by_1 in [
                'terms', 'significant_terms'
        ]:  # NEW PY REQUIREMENT
            agg_field_1['path'] = '{0}.keyword'.format(agg_field_1['path'])
        if agg_field_2['type'] == 'text' and sort_by_2 in [
                'terms', 'significant_terms'
        ]:  # NEW PY REQUIREMENT
            agg_field_2['path'] = '{0}.keyword'.format(agg_field_2['path'])

        # 1st LEVEL AGGREGATION
        agg = self.create_agg(agg_name_1, sort_by_1, agg_field_1["path"],
                              agg_size_1)

        if agg_name_1 == 'fact' and es_params[
                "agg_field_2_selected"] == 'false':
            agg[agg_name_1]["aggs"][agg_name_1]['aggs']['fact_str_val'] = \
                self.create_agg('fact_str_val', sort_by_1, agg_field_1['path'], agg_size_1)['fact_str_val']['aggs']['fact_str_val']

        # 2nd LEVEL AGGREGATION
        if es_params["agg_field_2_selected"] == 'true':
            agg_2 = self.create_agg(agg_name_2, sort_by_2, agg_field_2["path"],
                                    agg_size_2)
            if agg_name_1 == 'fact' and agg_name_2 == 'fact_str_val':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs'] = agg_2[
                    agg_name_2]['aggs']
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents'] = {
                    "reverse_nested": {}
                }
            elif 'fact' in agg_name_1 and agg_name_2 == 'string':
                agg[agg_name_1]['aggs'][agg_name_1]['aggs']['documents'][
                    'aggs'] = agg_2
            else:
                if agg_name_2 == 'fact':
                    agg[agg_name_1]["aggregations"] = agg_2
                    agg[agg_name_1]["aggregations"][agg_name_2]['aggs'][
                        agg_name_2]['aggs'] = self.create_agg(
                            'fact_str_val', sort_by_2, agg_field_2['path'],
                            agg_size_2)['fact_str_val']['aggs']
                else:
                    agg[agg_name_1]["aggregations"] = agg_2

        return agg

    def create_agg(self, agg_name, sort_by, path, size):
        if agg_name == "daterange":
            return {
                agg_name: {
                    "date_range": {
                        "field": path,
                        "format": date_format,
                        "ranges": self.ranges
                    }
                }
            }
        elif agg_name == 'fact':
            return {
                agg_name: {
                    "nested": {
                        "path": "texta_facts"
                    },
                    "aggs": {
                        agg_name: {
                            sort_by: {
                                "field": "texta_facts.fact",
                                "size": size
                            },
                            "aggs": {
                                "documents": {
                                    "reverse_nested": {}
                                }
                            }
                        }
                    }
                }
            }
        elif agg_name == 'fact_str_val':
            return {
                agg_name: {
                    "nested": {
                        "path": "texta_facts"
                    },
                    "aggs": {
                        agg_name: {
                            sort_by: {
                                "field": "texta_facts.str_val",
                                "size": size,
                                'order': {
                                    'documents.doc_count': 'desc'
                                }
                            },
                            "aggs": {
                                "documents": {
                                    "reverse_nested": {}
                                }
                            }
                        }
                    }
                }
            }
        elif agg_name == 'fact_num_val':
            return {
                agg_name: {
                    "nested": {
                        "path": "texta_facts"
                    },
                    "aggs": {
                        agg_name: {
                            sort_by: {
                                "field": "texta_facts.num_val",
                                "size": size,
                                'order': {
                                    'documents.doc_count': 'desc'
                                }
                            },
                            "aggs": {
                                "documents": {
                                    "reverse_nested": {}
                                }
                            }
                        }
                    }
                }
            }
        else:
            return {agg_name: {sort_by: {"field": path, "size": size}}}

    def aggregate(self):
        responses = []
        out = {}

        # EXECUTE SAVED SEARCHES
        for item in self.es_params:
            if 'saved_search' in item:
                s = Search.objects.get(pk=self.es_params[item])
                name = s.description
                saved_query = json.loads(s.query)
                self.es_m.load_combined_query(saved_query)
                self.es_m.set_query_parameter("aggs", self.agg_query)
                response = self.es_m.search()
                responses.append({
                    "id": "search_" + str(s.pk),
                    "label": name,
                    "response": response
                })

        # EXECUTE THE LIVE QUERY
        if "ignore_active_search" not in self.es_params:
            self.es_m.build(self.es_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            self.es_m.set_query_parameter("size", 0)
            response = self.es_m.search()
            #raise Exception(self.es_m.combined_query['main']['aggs'])
            responses.append({
                "id": "query",
                "label": "Current Search",
                "response": response
            })

        out["responses"] = responses

        # EXECUTE EMPTY TIMELINE QUERY IF RELATIVE FREQUENCY SELECTED
        if json.loads(self.es_params["agg_field_1"]
                      )["type"] == "date" and self.es_params[
                          "freq_norm_1"] == "relative_frequency":
            empty_params = {}
            self.es_m.build(empty_params)
            self.es_m.set_query_parameter("aggs", self.agg_query)
            response = self.es_m.search()
            out["empty_timeline_response"] = response

        return out

    def parse_responses(self, agg_results):
        """ Parses ES responses into JSON structure and normalises daterange frequencies if necessary
        """

        total_freqs = {}
        agg_data = []

        if "empty_timeline_response" in agg_results:
            for bucket in agg_results["empty_timeline_response"][
                    "aggregations"]["daterange"]["buckets"]:
                total_freqs[bucket["from_as_string"]] = bucket["doc_count"]

        for i, response in enumerate(agg_results["responses"]):
            aggs = response["response"]["aggregations"]
            output_type = None
            response_out = []

            for agg_name, agg_results in aggs.items():
                output_type = agg_name

                if agg_name == 'daterange':
                    response_out.extend(
                        self._parse_daterange_buckets(
                            agg_results['buckets'], total_freqs,
                            self.es_params['freq_norm_1']))
                elif agg_name == 'string':
                    response_out.extend(
                        self._parse_string_buckets(agg_results['buckets']))
                elif agg_name == 'fact':
                    response_out.extend(
                        self._parse_fact_buckets(
                            agg_results['fact']['buckets']))
                elif agg_name == 'fact_str_val' or agg_name == 'fact_num_val':
                    response_out.extend(
                        self._parse_fact_buckets(
                            agg_results[agg_name]['buckets']))

            agg_data.append({
                "data": response_out,
                "type": output_type,
                "label": response["label"]
            })

        return agg_data

    def _parse_daterange_buckets(self, buckets, total_freqs, freq_norm_1):
        results = []

        for bucket in buckets:
            new = {"children": []}
            new["key"] = bucket["from_as_string"]
            # Normalises frequencies
            if freq_norm_1 == "relative_frequency":
                try:
                    new["val"] = str(
                        round(
                            float(bucket["doc_count"]) /
                            float(total_freqs[bucket["from_as_string"]]), 5))
                except ZeroDivisionError:
                    new["val"] = 0
            else:
                new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {
                        'key': inner_bucket['key'],
                        'val': inner_bucket['doc_count']
                    }
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val'][
                            'buckets']:
                        grandchildren.append({
                            'key':
                            super_inner_bucket['key'],
                            'val':
                            super_inner_bucket['documents']['doc_count']
                        })

                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val'][
                        'buckets']:
                    new['children'].append({
                        'key':
                        inner_bucket['key'],
                        'val':
                        inner_bucket['documents']['doc_count']
                    })

            results.append(new)

        return results

    def _parse_string_buckets(self, buckets):
        results = []

        for bucket in buckets:
            new = {"children": []}

            new["key"] = bucket["key"]
            new["val"] = bucket["doc_count"]

            if "string" in bucket:
                for bucket_2 in bucket["string"]["buckets"]:
                    child = {}
                    child["key"] = bucket_2["key"]
                    child["val"] = bucket_2["doc_count"]
                    new["children"].append(child)
            elif 'fact' in bucket:
                for inner_bucket in bucket['fact']['fact']['buckets']:
                    child = {
                        'key': inner_bucket['key'],
                        'val': inner_bucket['doc_count']
                    }
                    grandchildren = []
                    for super_inner_bucket in inner_bucket['fact_str_val'][
                            'buckets']:
                        grandchildren.append({
                            'key':
                            super_inner_bucket['key'],
                            'val':
                            super_inner_bucket['documents']['doc_count']
                        })

                    child['children'] = grandchildren
                    new['children'].append(child)
            elif 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['fact_str_val'][
                        'buckets']:
                    new['children'].append({
                        'key':
                        inner_bucket['key'],
                        'val':
                        inner_bucket['documents']['doc_count']
                    })

            results.append(new)

        return results

    def _parse_fact_buckets(self, buckets):
        results = []

        for bucket in buckets:
            new = {"children": []}

            new["key"] = bucket["key"]
            new["val"] = bucket["documents"]["doc_count"]

            if 'fact_str_val' in bucket:
                for inner_bucket in bucket['fact_str_val']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['documents']['doc_count']
                    new['children'].append(child)

            elif 'documents' in bucket and 'string' in bucket['documents']:
                for inner_bucket in bucket['documents']['string']['buckets']:
                    child = {}
                    child['key'] = inner_bucket['key']
                    child['val'] = inner_bucket['doc_count']
                    new['children'].append(child)

            results.append(new)

        return results

    def _parse_fact_val_results(self, buckets):
        pass

    def output_to_searcher(self):
        count_dict = defaultdict(defaultdict)
        children_dict = defaultdict(dict)
        i = 0

        data_out = []

        for agg in self.agg_data:
            if agg["type"] == "daterange":
                i += 1
                for row in agg["data"]:
                    count_dict[row["key"]][i] = row["val"]
                    if row["children"]:
                        children_dict[row["key"]][i] = {
                            "data": row["children"],
                            "label": agg["label"]
                        }
            else:
                data_out.append(agg)

        combined_daterange_data = []
        labels = [a["label"] for a in self.agg_data]

        for row in sorted(count_dict.items(), key=lambda l: l[0]):
            new_row = dict(row[1])
            new_row["date"] = row[0]
            combined_daterange_data.append(new_row)

        daterange_data = {
            "type": "daterange",
            "data": combined_daterange_data,
            "ykeys": list(range(1, i + 1)),
            "labels": labels,
            "children": dict(children_dict)
        }

        if daterange_data["data"]:
            data_out.append(daterange_data)

        return data_out
Beispiel #40
0
class FactManager:
    """ Manage Searcher facts, like deleting/storing, adding facts.
    """
    def __init__(self, request):
        """Bind the manager to the session's active dataset and the posted parameters."""
        self.es_params = request.POST
        self.ds = Datasets().activate_dataset(request.session)

        index, mapping = self.ds.get_index(), self.ds.get_mapping()
        self.index = index
        self.mapping = mapping
        self.es_m = ES_Manager(index, mapping)

        # Name of the document field holding the facts
        self.field = 'texta_facts'

    def remove_facts_from_document(self, rm_facts_dict, bs=7500):
        '''Remove the given fact name/value pairs from every document that contains them.

        Parameters:
            rm_facts_dict -- multi-value mapping of fact name to the values to
                remove; must support ``in`` and ``getlist(name)``
            bs - [int=7500] -- scroll page size, i.e. documents rewritten per bulk request

        Matching documents are scrolled page by page; each document's facts
        field is rewritten without the removed facts and sent back as a bulk
        update. Failures are logged and not re-raised.
        '''
        logger = LogManager(__name__, 'FACT MANAGER REMOVE FACTS')

        try:
            # Clears readonly block just in case the index has been set to read only
            self.es_m.clear_readonly_block()

            query = self._fact_deletion_query(rm_facts_dict)
            self.es_m.load_combined_query(query)
            response = self.es_m.scroll(size=bs, field_scroll=self.field)
            scroll_id = response['_scroll_id']
            total_docs = response['hits']['total']
            docs_left = total_docs  # DEBUG
            print('Starting.. Total docs - ', total_docs)  # DEBUG
            batch = 0
            while total_docs > 0:
                print('Docs left:', docs_left)  # DEBUG
                bulk_lines = []
                for document in response['hits']['hits']:
                    # Keep every fact whose name is not targeted, or whose
                    # value is not among the values to delete for that name
                    new_field = [
                        fact for fact in document['_source'][self.field]
                        if fact["fact"] not in rm_facts_dict
                        or fact['str_val'] not in rm_facts_dict.getlist(
                            fact["fact"])
                    ]
                    # Update dataset: action line followed by the partial document
                    bulk_lines.append(
                        json.dumps({
                            "update": {
                                "_id": document['_id'],
                                "_type": document['_type'],
                                "_index": document['_index']
                            }
                        }))
                    bulk_lines.append(
                        json.dumps({'doc': {
                            self.field: new_field
                        }}))
                # Every bulk line, including the last, is newline-terminated
                data = ''.join(line + '\n' for line in bulk_lines)

                # The next page is fetched before the current one is posted
                response = self.es_m.scroll(scroll_id=scroll_id,
                                            size=bs,
                                            field_scroll=self.field)
                total_docs = len(response['hits']['hits'])
                docs_left -= bs  # DEBUG
                scroll_id = response['_scroll_id']
                self.es_m.plain_post_bulk(self.es_m.es_url, data)
                batch += 1
            print('DONE')  # DEBUG

            logger.set_context('docs_left', total_docs)
            logger.set_context('batch', batch)
            logger.info('remove_facts_from_document')
        except Exception:
            # Best effort: log the failure, but let KeyboardInterrupt/SystemExit through
            print(traceback.format_exc())
            logger.set_context('es_params', self.es_params)
            logger.exception('remove_facts_from_document_failed')

    def tag_documents_with_fact(self, es_params, tag_name, tag_value,
                                tag_field):
        '''Tag the documents of the current search with a fact spanning the whole tag_field.

        NOTE(review): only the hits of the first scroll response are processed;
        confirm whether further pages are expected to be tagged as well.
        '''
        self.es_m.build(es_params)
        self.es_m.load_combined_query(self.es_m.combined_query)

        first_page = self.es_m.scroll()

        bulk_lines = []
        for hit in first_page['hits']['hits']:
            source = hit['_source']

            # The fact covers the field from its first to its last character
            if 'mlp' in tag_field:
                path_parts = tag_field.split('.')
                span_end = len(source[path_parts[0]][path_parts[1]])
            else:
                span_end = len(source[tag_field].strip())
            span = [0, span_end]

            facts = source[self.field]
            facts.append({
                "str_val": tag_value,
                "spans": str([span]),
                "fact": tag_name,
                "doc_path": tag_field
            })

            # Bulk format: action line followed by the partial document
            action = {
                "update": {
                    "_id": hit['_id'],
                    "_type": hit['_type'],
                    "_index": hit['_index']
                }
            }
            bulk_lines.append(json.dumps(action) + '\n')
            bulk_lines.append(json.dumps({'doc': {self.field: facts}}) + '\n')

        self.es_m.plain_post_bulk(self.es_m.es_url, ''.join(bulk_lines))

        update_url = '{0}/{1}/_update_by_query?refresh&conflicts=proceed'.format(
            self.es_m.es_url, self.index)
        requests.post(update_url, headers=self.es_m.HEADERS)

    def count_cooccurrences(self, fact_pairs):
        """Finds the counts of cooccuring facts

        Arguments:
            fact_pairs {list of tuples of tuples} -- Example:[(('ORG', 'Riigikohus'),('PER', 'Jaan')), (('ORG', 'Riigikohus'),('PER', 'Peeter'))]

        Returns:
            [int list] -- Occurances of the given facts
        """
        queries = []
        for fact_pair in fact_pairs:
            fact_constraints = []

            for fact in fact_pair:
                constraint = {
                    "nested": {
                        "path": "texta_facts",
                        "query": {
                            "bool": {
                                "must": [{
                                    "term": {
                                        "texta_facts.fact": fact[0]
                                    }
                                }, {
                                    "term": {
                                        "texta_facts.str_val": fact[1]
                                    }
                                }]
                            }
                        }
                    }
                }
                fact_constraints.append(constraint)

            query = {"query": {"bool": {"must": fact_constraints}}, "size": 0}
            queries.append(json.dumps(query))

        header = json.dumps({"index": self.index})
        data = "\n".join(["{0}\n{1}".format(header, q)
                          for q in queries]) + "\n"

        responses = requests.post("{0}/{1}/_msearch".format(
            self.es_m.es_url, self.index),
                                  data=data,
                                  headers={"Content-Type": "application/json"})
        counts = [
            response["hits"]["total"]
            for response in responses.json()['responses']
        ]

        return counts

    def facts_via_aggregation(self, size=15):
        """Finds all facts from current search.
        Parameters:
            size - [int=15] -- Amount of fact values per fact name to search in query
        Returns:
            facts - [dict] -- Details for each fact, ex: {'PER - kostja': {'id': 0, 'name': 'PER', 'value': 'kostja', 'doc_count': 44}}
            fact_combinations - [list of tuples] -- All possible combinations of all facts: [(('FIRST_FACTNAME', 'FIRST_FACTVAL'), ('SECOND_FACTNAME', 'SECOND_FACTVAL'))]
            unique_fact_names - [list of string] -- All unique fact names
        """

        aggs = {
            "facts": {
                "nested": {
                    "path": "texta_facts"
                },
                "aggs": {
                    "fact_names": {
                        "terms": {
                            "field": "texta_facts.fact"
                        },
                        "aggs": {
                            "fact_values": {
                                "terms": {
                                    "field": "texta_facts.str_val",
                                    "size": size
                                }
                            }
                        }
                    }
                }
            }
        }
        self.es_m.build(self.es_params)
        self.es_m.set_query_parameter('aggs', aggs)

        response = self.es_m.search()

        response_aggs = response['aggregations']['facts']['fact_names'][
            'buckets']

        facts = {}
        fact_combinations = []
        fact_count = 0
        unique_fact_names = []
        for bucket in response_aggs:
            unique_fact_names.append(bucket['key'])
            for fact in bucket['fact_values']['buckets']:
                facts[bucket['key'] + " - " + fact['key']] = {
                    'id': fact_count,
                    'name': bucket['key'],
                    'value': fact['key'],
                    'doc_count': fact['doc_count']
                }
                fact_combinations.append((bucket['key'], fact['key']))
                fact_count += 1

        fact_combinations = [
            x for x in itertools.combinations(fact_combinations, 2)
        ]
        return (facts, fact_combinations, unique_fact_names)

    def fact_graph(self, search_size):
        facts, fact_combinations, unique_fact_names = self.facts_via_aggregation(
            size=search_size)
        # Get cooccurrences and remove values with 0
        fact_combinations = {
            k: v
            for k, v in dict(
                zip(fact_combinations,
                    self.count_cooccurrences(fact_combinations))).items()
            if v != 0
        }
        shapes = [
            "circle", "cross", "diamond", "square", "triangle-down",
            "triangle-up"
        ]
        types = dict(zip(unique_fact_names, itertools.cycle(shapes)))

        nodes = []
        for i, fact in enumerate(facts):
            nodes.append({
                "source": facts[fact]['id'],
                "size": facts[fact]['doc_count'],
                "score": facts[fact]['doc_count'],
                "name": facts[fact]['name'],
                "id": facts[fact]['value'],
                "type": types[facts[fact]['name']]
            })
            # Track max/min count
            count = facts[fact]['doc_count']
            if i == 0:
                max_node_size = count
                min_node_size = count
            max_node_size = max(max_node_size, count)
            min_node_size = min(min_node_size, count)

        links = []
        max_link_size = 0
        for fact in fact_combinations.keys():
            max_link_size = max(max_link_size, fact_combinations[fact])
            links.append({
                "source": facts[fact[0][0] + " - " + fact[0][1]]['id'],
                "target": facts[fact[1][0] + " - " + fact[1][1]]['id'],
                "count": fact_combinations[fact]
            })

        graph_data = json.dumps({"nodes": nodes, "links": links})
        return (graph_data, unique_fact_names, max_node_size, max_link_size,
                min_node_size)

    def _fact_deletion_query(self, rm_facts_dict):
        '''Creates the query for fact deletion based on dict of facts {name: val}'''
        fact_queries = []
        for key in rm_facts_dict:
            for val in rm_facts_dict.getlist(key):
                fact_queries.append({
                    "bool": {
                        "must": [{
                            "match": {
                                self.field + ".fact": key
                            }
                        }, {
                            "match": {
                                self.field + ".str_val": val
                            }
                        }]
                    }
                })

        query = {
            "main": {
                "query": {
                    "nested": {
                        "path": self.field,
                        "query": {
                            "bool": {
                                "should": fact_queries
                            }
                        }
                    }
                },
                "_source": [self.field]
            }
        }

        return query
Beispiel #41
0
class EsDataClassification(object):
    """Runs classifiers over the documents selected by a query and writes the
    resulting tags into each document's ``texta_tags`` field."""

    def __init__(self, es_index, es_mapping, field, query):
        """
        :param es_index: name of the Elasticsearch index
        :param es_mapping: mapping (document type) inside the index
        :param field: dot-separated path of the field fed to the classifiers
        :param query: combined query loaded into the ES manager
        """
        # Dataset info
        self.es_index = es_index
        self.es_mapping = es_mapping
        self.field = field
        # Build ES manager
        self.es_m = ES_Manager(es_index, es_mapping)
        self.es_m.load_combined_query(query)

    def get_total_documents(self):
        """Return the document count reported by the ES manager."""
        return self.es_m.get_total_documents()

    def get_tags_by_id(self, doc_id):
        """Return the whitespace-split ``texta_tags`` of one document.

        :param doc_id: id of the document to fetch
        :return: list of tags; empty when the document has no ``texta_tags``
        """
        request_url = '{0}/{1}/{2}/{3}'.format(self.es_m.es_url, self.es_index, self.es_mapping, doc_id)
        response = ES_Manager.plain_get(request_url)
        source = response['_source']
        tags = source['texta_tags'] if 'texta_tags' in source else ""
        return tags.split()

    def apply_classifiers(self, classifiers, classifier_tags, batch_size=1000):
        """Scroll through all matching documents and tag them in batches.

        :param classifiers: one classifier or a list of them; each must offer
            ``predict(list_of_field_values)``
        :param classifier_tags: tag (or list of tags) parallel to ``classifiers``
        :param batch_size: number of documents classified and updated per bulk
            request
        :return: dict with ``total_processed``, ``total_positive``,
            ``total_negative`` and ``total_documents``; the positive/negative
            entries are plain numbers for a single classifier and lists
            (one entry per classifier) otherwise
        :raises EsIteratorError: when a scroll response timed out or no shard
            answered successfully
        """
        if not isinstance(classifiers, list):
            classifiers = [classifiers]

        if not isinstance(classifier_tags, list):
            classifier_tags = [classifier_tags]

        response = self.es_m.scroll()
        scroll_id = response['_scroll_id']
        total_hits = response['hits']['total']

        total_processed = 0
        # Running totals over ALL batches; starting at zero also covers the
        # case where the query matches no documents at all.
        positive_docs_per_classifier = [0] * len(classifiers)
        docs_batch = []

        def flush(batch):
            # Classify one batch and fold its counts into the running totals.
            batch_counts = self._apply_classifiers_to_documents(batch, classifiers, classifier_tags)
            for classifier_idx, count in enumerate(batch_counts):
                positive_docs_per_classifier[classifier_idx] += count
            return len(batch)

        while total_hits > 0:

            # Check errors in the database request
            if (response['_shards']['total'] > 0 and response['_shards']['successful'] == 0) or response['timed_out']:
                msg = 'Elasticsearch failed to retrieve documents: ' \
                      '*** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(response['_shards'],
                                                                              response['timed_out'], response['took'])
                raise EsIteratorError(msg)

            for hit in response['hits']['hits']:
                docs_batch.append((str(hit['_id']), hit['_source']))

                if len(docs_batch) >= batch_size:
                    # Count the batch BEFORE it is reset.
                    total_processed += flush(docs_batch)
                    docs_batch = []

            # New scroll request; keep using the most recent scroll id.
            response = self.es_m.scroll(scroll_id=scroll_id)
            scroll_id = response.get('_scroll_id', scroll_id)
            total_hits = len(response['hits']['hits'])

        if docs_batch:
            total_processed += flush(docs_batch)

        single_classifier = len(classifiers) == 1

        data = {}
        data['total_processed'] = total_processed
        if single_classifier:
            data['total_positive'] = positive_docs_per_classifier[0]
            data['total_negative'] = total_processed - positive_docs_per_classifier[0]
        else:
            data['total_positive'] = positive_docs_per_classifier
            data['total_negative'] = [
                total_processed - positive_docs_count for positive_docs_count in positive_docs_per_classifier
            ]
        data['total_documents'] = self.get_total_documents()

        return data

    def _apply_classifiers_to_documents(self, documents, classifiers, classifier_tags):
        """Classify a batch of documents and bulk-update their tags.

        :param documents: list of (doc_id, document) entries
        :param classifiers: list of classifiers offering ``predict``
        :param classifier_tags: tags parallel to ``classifiers``
        :return: list with, per classifier, the sum of its predictions for
            this batch
        """
        field_path_components = self.field.split('.')
        fields_data = []

        for _, source in documents:
            # Traverse the nested fields to reach the sought input text/data for the classifier
            field_data = source
            for field_path_component in field_path_components:
                field_data = field_data[field_path_component]
            fields_data.append(field_data)

        positive_docs = []
        classifiers_predictions = []

        for classifier in classifiers:
            predictions = classifier.predict(fields_data)
            classifiers_predictions.append(predictions)
            # int() keeps the count a plain Python number even when predict()
            # returns a numeric array type.
            positive_docs.append(int(sum(predictions)))

        bulk_update_content = []
        for document_idx, (document_id, document) in enumerate(documents):
            if 'texta_tags' in document:
                # Drop empty entries so an empty tag string does not turn into a tag.
                existing_tags = set(tag.strip() for tag in document['texta_tags'].split('\n') if tag.strip())
            else:
                existing_tags = set()

            tags = set(existing_tags)
            for classifier_idx, classifier_predictions in enumerate(classifiers_predictions):
                if classifier_predictions[document_idx] == 1:
                    tags.add(classifier_tags[classifier_idx])

            # Update when ANY classifier contributed a new tag, not only the last one.
            if tags != existing_tags:
                bulk_update_content.append(json.dumps({
                    'update': {
                        '_id':    document_id,
                        '_index': self.es_index,
                        '_type':  self.es_mapping
                    }
                }))
                bulk_update_content.append(json.dumps({
                    'doc': {
                        'texta_tags': '\n'.join(sorted(tags))
                    }
                }))

        if bulk_update_content:
            # The bulk body must end with a newline.
            bulk_update_content.append('')
            self.es_m.plain_post_bulk(self.es_m.es_url, '\n'.join(bulk_update_content))

        return positive_docs
Beispiel #42
0
def get_next_page_data(query, es_from, last_page, query_data, request):
    """Collect the next page of rows whose grammar match agrees with the polarity.

    Hits are fetched with ``ES_Manager.plain_search`` starting at ``es_from``;
    further searches are issued until ``page_length`` rows are collected or a
    search returns fewer hits than ``page_length``.

    :param query: Elasticsearch query dict; its ``'from'`` entry is overwritten
        and advanced in place
    :param es_from: offset to start from, or ``None`` when there is nothing
        more to fetch
    :param last_page: number of the previously served page
    :param query_data: dict providing ``page_length``, ``dataset``, ``mapping``,
        ``polarity``, ``inclusive_instructions`` and the ids stored in the
        ``GrammarPageMapping`` record
    :param request: request whose ``user`` is stored as the mapping's author
    :return: dict with ``rows``, ``from``, ``page``, ``end`` and ``total``
    """
    start_from = None
    end = None
    rows = []
    page_length = query_data['page_length']

    if es_from is None:
        return {'rows':rows,'from':start_from,'page':last_page,'end':None, 'total':None}

    dataset = query_data['dataset']
    mapping = query_data['mapping']
    polarity = query_data['polarity']

    inclusive_instructions = query_data['inclusive_instructions']

    query['from'] = es_from
    response = ES_Manager.plain_search(es_url, dataset, mapping, query)

    # Pretend the "previous" search returned a full page so the loop is entered.
    hit_idx = page_length-1

    # Keep searching while the page is not full, the response has hits and the
    # last search returned a full page (hit_idx is the index of the last hit seen).
    while len(rows) < page_length and 'hits' in response and 'hits' in response['hits'] and response['hits']['hits'] and hit_idx+1 == page_length:
        for hit_idx, hit in enumerate(response['hits']['hits']):
            if len(rows) >= page_length:
                break

            # Flatten one level of nested objects into "field.subfield" columns.
            feature_dict = {}

            for field_name in hit['_source']:
                field_value = hit['_source'][field_name]
                if isinstance(field_value, dict):
                    for subfield_name, subfield_value in field_value.items():
                        combined_field_name = '{0}.{1}'.format(field_name, subfield_name)
                        feature_dict[combined_field_name] = subfield_value
                else:
                    feature_dict[field_name] = field_value

            sorted_feature_names = sorted(feature_dict)

            # Column 0 of a row is the document id, hence the +1 offset.
            feature_to_idx_map = defaultdict(list)
            for feature_idx, feature in enumerate(sorted_feature_names):
                feature_to_idx_map[feature.split('.')[0]].append(feature_idx+1)

            row = [hit['_id']]
            row.extend([feature_dict[feature_name] for feature_name in sorted_feature_names])
            layer_dict = matcher.LayerDict(feature_dict)

            inclusive_matches = inclusive_instructions.match(layer_dict)

            if (polarity == 'positive') == bool(inclusive_matches): # add row if polarity is positive and we have a match or negative and dont
                if len(rows) == 0:
                    start_from = query['from'] + hit_idx
                end = query['from'] + hit_idx

                if inclusive_matches:
                    row = highlight(row, feature_to_idx_map, inclusive_matches)

                rows.append(row)

        query['from'] = query['from'] + hit_idx + 1

        if len(rows) >= page_length:
            break
        response = ES_Manager.plain_search(es_url, dataset, mapping,query)

    # NOTE(review): `end` may legitimately be 0 (first hit of the result set);
    # the truthiness checks below treat that like "no row found" - confirm intent.
    if end:
        GrammarPageMapping(search_id=query_data['search_id'], inclusive_grammar=query_data['inclusive_grammar_id'],
                        exclusive_grammar=query_data['exclusive_grammar_id'], page=query_data['requested_page'], polarity=query_data['polarity'],
                        elastic_start=start_from, elastic_end=end+1, author=request.user).save()

    return {'rows':rows,'from':start_from,'page':last_page+1,'end':(end+(0 if last_page == 0 else 1)) if end else end, 'total':response['hits']['total']}
Beispiel #43
0
def _collect_mwe_hits(responses, phrases, min_freq, user, final, mwe_counter):
    """Fold one batch of multisearch responses into the grouped MWE results.

    :param responses: multisearch responses, parallel to ``phrases``
    :param phrases: the phrases that were queried, in request order
    :param min_freq: minimum hit count for a phrase to be recorded
    :param user: user passed on to ``conceptualise_phrase``
    :param final: result dict keyed by conceptualised phrase; updated in place
    :param mwe_counter: id to give to the next recorded MWE
    :return: the updated ``mwe_counter``
    :raises KeyError: when a response has no ``['hits']['total']``
    """
    for j, response in enumerate(responses):
        freq = response['hits']['total']
        if freq < min_freq:
            continue

        phrase = phrases[j]
        sorted_phrase = ' '.join(sorted(phrase.split(' ')))
        group_key = conceptualise_phrase(sorted_phrase, user)

        if group_key not in final:
            # Group ids are handed out sequentially, so the next id equals the
            # number of groups created so far.
            final[group_key] = {'total_freq':0,'mwes':[],'display_name':{'freq':0,'label':False},'id':len(final)}

        group = final[group_key]
        group['total_freq'] += freq
        group['mwes'].append({'mwe':phrase,'freq':freq,'accepted':False,'id':mwe_counter})
        mwe_counter += 1
        group['mwes'].sort(reverse=True,key=lambda k: k['freq'])

        # The most frequent variant becomes the group's display name.
        if freq > group['display_name']['freq']:
            group['display_name']['freq'] = freq
            group['display_name']['label'] = phrase

    return mwe_counter


def find_mappings(request):
    """Mine multi-word expressions from the permutations of the selected lexicons.

    Reads ``slop``, ``max_len``, ``min_len``, ``min_freq``, ``match_field``,
    ``description`` and ``lexicons[]`` from ``request.POST``, creates a ``Run``
    record, queries every permutation (length ``min_len``..``max_len``, at most
    one word per lexicon) as a phrase query in batches, and stores the grouped
    results as JSON on the run. Any exception is printed and logged to the
    error logger instead of being propagated.

    :param request: request providing ``POST``, ``session`` and ``user``
    :return: None
    """
    # Bound up front so the failure handler can refer to it even when the
    # error happens before the run record exists.
    new_run = None
    try:
        slop     = int(request.POST['slop'])
        max_len  = int(request.POST['max_len'])
        min_len  = int(request.POST['min_len'])
        min_freq = int(request.POST['min_freq'])
        match_field = request.POST['match_field']
        description = request.POST['description']

        batch_size = 50

        # Define selected mapping
        ds = Datasets().activate_dataset(request.session)
        dataset = ds.get_index()
        mapping = ds.get_mapping()

        # word_index maps each word to the positions of the lexicons containing it.
        lexicon = []
        word_index = {}
        num_lexicons = 0
        for i,lexicon_id in enumerate(request.POST.getlist('lexicons[]')):
            num_lexicons +=1
            for word in Word.objects.filter(lexicon=lexicon_id):
                word = word.wrd
                lexicon.append(word)
                word_index.setdefault(word, []).append(i)
        lexicon = list(set(lexicon))
        if min_len > num_lexicons:
            min_len = num_lexicons

        mwe_counter = 0
        phrases = []
        final   = {}
        data = []
        new_run = Run(minimum_frequency=min_freq,maximum_length=max_len,minimum_length=min_len,run_status='running',run_started=datetime.now(),run_completed=None,user=request.user,description=description)
        new_run.save()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_started','args':{'user_name':request.user.username,'run_id':new_run.id,'slop':slop,'min_len':min_len,'max_len':max_len,'min_freq':min_freq,'match_field':match_field,'desc':description}}))

        for i in range(min_len,max_len+1):
            print('Permutation len:',i)
            for permutation in itertools.permutations(lexicon,i):
                word_indices = list(flatten([word_index[word] for word in permutation]))
                # Skip permutations that use the same lexicon more than once.
                if len(word_indices) != len(set(word_indices)):
                    continue

                permutation = ' '.join(permutation)
                if slop > 0:
                    query = {"query": {"match_phrase": {match_field: {"query": permutation,"slop": slop}}}}
                else:
                    query = {"query": {"match_phrase": {match_field: {"query": permutation}}}}
                data.append(json.dumps({"index":dataset,"mapping":mapping})+'\n'+json.dumps(query))
                phrases.append(permutation)

                if len(data) == batch_size:
                    responses = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
                    mwe_counter = _collect_mwe_hits(responses, phrases, min_freq, request.user, final, mwe_counter)
                    data = []
                    phrases = []
            logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_progress','args':{'user_name':request.user.username,'run_id':new_run.id},'data':{'permutations_processed':i+1-min_len,'total_permutations':max_len-min_len+1}}))

        # Flush the last, partially filled batch; nothing to ask when it is empty.
        if data:
            m_response = ES_Manager.plain_multisearch(es_url, dataset, mapping, data)
            mwe_counter = _collect_mwe_hits(m_response, phrases, min_freq, request.user, final, mwe_counter)

        for key in final:
            final[key]['concept_name'] = {'freq':-1,'label':''}
        r = Run.objects.get(pk=new_run.pk)
        r.run_completed = datetime.now()
        r.run_status = 'completed'
        r.results =json.dumps(final)
        r.save()
        logging.getLogger(INFO_LOGGER).info(json.dumps({'process':'MINE MWEs','event':'mwe_mining_completed','args':{'user_name':request.user.username,'run_id':new_run.id}}))
    except Exception as e:
        print(e)
        run_id = new_run.id if new_run is not None else None
        logging.getLogger(ERROR_LOGGER).error(json.dumps({'process':'MINE MWEs','event':'mwe_mining_failed','args':{'user_name':request.user.username,'run_id':run_id}}),exc_info=True)
Beispiel #44
0
	def get_allowed_datasets(self, user):
		"""Return the datasets the given user is permitted to access.

		The indices reported by ``ES_Manager.get_indices()`` are passed through
		``self.sort_datasets``; a dataset is kept when the user holds the
		``permission_admin.can_access_dataset_<id>`` permission for it.
		"""
		permission_prefix = 'permission_admin.can_access_dataset_'
		allowed = []
		for dataset in self.sort_datasets(ES_Manager.get_indices()):
			if user.has_perm(permission_prefix + str(dataset['id'])):
				allowed.append(dataset)
		return allowed
Beispiel #45
0
class EsDataClassification(object):
    """Runs classifiers over the documents selected by a query and writes the
    resulting tags into each document's ``texta_tags`` field."""

    def __init__(self, es_index, es_mapping, field, query):
        """
        :param es_index: name of the Elasticsearch index
        :param es_mapping: mapping (document type) inside the index
        :param field: dot-separated path of the field fed to the classifiers
        :param query: combined query loaded into the ES manager
        """
        # Dataset info
        self.es_index = es_index
        self.es_mapping = es_mapping
        self.field = field
        # Build ES manager
        self.es_m = ES_Manager(es_index, es_mapping)
        self.es_m.load_combined_query(query)

    def get_total_documents(self):
        """Return the document count reported by the ES manager."""
        return self.es_m.get_total_documents()

    def get_tags_by_id(self, doc_id):
        """Return the whitespace-split ``texta_tags`` of one document.

        :param doc_id: id of the document to fetch
        :return: list of tags; empty when the document has no ``texta_tags``
        """
        request_url = '{0}/{1}/{2}/{3}'.format(self.es_m.es_url, self.es_index,
                                               self.es_mapping, doc_id)
        response = ES_Manager.plain_get(request_url)
        source = response['_source']
        tags = source['texta_tags'] if 'texta_tags' in source else ""
        return tags.split()

    def apply_classifiers(self, classifiers, classifier_tags, batch_size=1000):
        """Scroll through all matching documents and tag them in batches.

        :param classifiers: one classifier or a list of them; each must offer
            ``predict(list_of_field_values)``
        :param classifier_tags: tag (or list of tags) parallel to ``classifiers``
        :param batch_size: number of documents classified and updated per bulk
            request
        :return: dict with ``total_processed``, ``total_positive``,
            ``total_negative`` and ``total_documents``; the positive/negative
            entries are plain numbers for a single classifier and lists
            (one entry per classifier) otherwise
        :raises EsIteratorError: when a scroll response timed out or no shard
            answered successfully
        """
        if not isinstance(classifiers, list):
            classifiers = [classifiers]

        if not isinstance(classifier_tags, list):
            classifier_tags = [classifier_tags]

        response = self.es_m.scroll()
        scroll_id = response['_scroll_id']
        total_hits = response['hits']['total']

        total_processed = 0
        # Running totals over ALL batches; starting at zero also covers the
        # case where the query matches no documents at all.
        positive_docs_per_classifier = [0] * len(classifiers)
        docs_batch = []

        def flush(batch):
            # Classify one batch and fold its counts into the running totals.
            batch_counts = self._apply_classifiers_to_documents(
                batch, classifiers, classifier_tags)
            for classifier_idx, count in enumerate(batch_counts):
                positive_docs_per_classifier[classifier_idx] += count
            return len(batch)

        while total_hits > 0:

            # Check errors in the database request
            if (response['_shards']['total'] > 0
                    and response['_shards']['successful']
                    == 0) or response['timed_out']:
                msg = 'Elasticsearch failed to retrieve documents: ' \
                      '*** Shards: {0} *** Timeout: {1} *** Took: {2}'.format(response['_shards'],
                                                                              response['timed_out'], response['took'])
                raise EsIteratorError(msg)

            for hit in response['hits']['hits']:
                docs_batch.append((str(hit['_id']), hit['_source']))

                if len(docs_batch) >= batch_size:
                    # Count the batch BEFORE it is reset.
                    total_processed += flush(docs_batch)
                    docs_batch = []

            # New scroll request; keep using the most recent scroll id.
            response = self.es_m.scroll(scroll_id=scroll_id)
            scroll_id = response.get('_scroll_id', scroll_id)
            total_hits = len(response['hits']['hits'])

        if docs_batch:
            total_processed += flush(docs_batch)

        single_classifier = len(classifiers) == 1

        data = {}
        data['total_processed'] = total_processed
        if single_classifier:
            data['total_positive'] = positive_docs_per_classifier[0]
            data['total_negative'] = (
                total_processed - positive_docs_per_classifier[0])
        else:
            data['total_positive'] = positive_docs_per_classifier
            data['total_negative'] = [
                total_processed - positive_docs_count
                for positive_docs_count in positive_docs_per_classifier
            ]
        data['total_documents'] = self.get_total_documents()

        return data

    def _apply_classifiers_to_documents(self, documents, classifiers,
                                        classifier_tags):
        """Classify a batch of documents and bulk-update their tags.

        :param documents: list of (doc_id, document) entries
        :param classifiers: list of classifiers offering ``predict``
        :param classifier_tags: tags parallel to ``classifiers``
        :return: list with, per classifier, the sum of its predictions for
            this batch
        """
        field_path_components = self.field.split('.')
        fields_data = []

        for _, source in documents:
            # Traverse the nested fields to reach the sought input text/data for the classifier
            field_data = source
            for field_path_component in field_path_components:
                field_data = field_data[field_path_component]
            fields_data.append(field_data)

        positive_docs = []
        classifiers_predictions = []

        for classifier in classifiers:
            predictions = classifier.predict(fields_data)
            classifiers_predictions.append(predictions)
            # int() keeps the count a plain Python number even when predict()
            # returns a numeric array type.
            positive_docs.append(int(sum(predictions)))

        bulk_update_content = []
        for document_idx, (document_id, document) in enumerate(documents):
            if 'texta_tags' in document:
                # Drop empty entries so an empty tag string does not turn into a tag.
                existing_tags = set(
                    tag.strip()
                    for tag in document['texta_tags'].split('\n')
                    if tag.strip())
            else:
                existing_tags = set()

            tags = set(existing_tags)
            for classifier_idx, classifier_predictions in enumerate(
                    classifiers_predictions):
                if classifier_predictions[document_idx] == 1:
                    tags.add(classifier_tags[classifier_idx])

            # Update when ANY classifier contributed a new tag, not only the last one.
            if tags != existing_tags:
                bulk_update_content.append(
                    json.dumps({
                        'update': {
                            '_id': document_id,
                            '_index': self.es_index,
                            '_type': self.es_mapping
                        }
                    }))
                bulk_update_content.append(
                    json.dumps(
                        {'doc': {
                            'texta_tags': '\n'.join(sorted(tags))
                        }}))

        if bulk_update_content:
            # The bulk body must end with a newline.
            bulk_update_content.append('')
            self.es_m.plain_post_bulk(self.es_m.es_url,
                                      '\n'.join(bulk_update_content))

        return positive_docs
Beispiel #46
0
class Autocomplete:
    """Produces autocomplete suggestions (fact names, fact values, concepts and
    lexicons) for the text the user is currently typing."""

    def __init__(self):
        self.es_m = None
        # `lookup_type` is kept for backward compatibility only; the methods
        # of this class read `lookup_types`.
        self.lookup_type = None
        self.lookup_types = []
        self.key_constraints = None
        self.content = None
        self.dataset = None
        self.mapping = None
        self.user = None
        self.limit = None

    def parse_request(self,request):
        """Read the lookup parameters from ``request.POST`` and bind the ES
        manager to the dataset activated for ``request.session``.

        ``lookup_types`` and ``key_constraints`` are comma-separated lists;
        ``content`` is the stripped last line of the posted text.
        """
        self.lookup_types = request.POST['lookup_types'].split(',')
        self.key_constraints = request.POST['key_constraints'].split(',')
        self.content = request.POST['content'].split('\n')[-1].strip()

        ds = Datasets().activate_dataset(request.session)
        self.dataset = ds.get_index()
        self.mapping = ds.get_mapping()
        self.es_m = ES_Manager(self.dataset, self.mapping)

        self.user = request.user

    def suggest(self,limit=10):
        """Collect suggestions for every requested lookup type.

        :param limit: stored as ``self.limit`` and used by the lookups as the
            maximum number of fact values / terms considered
        :return: dict mapping each recognised lookup type (``FACT_NAME``,
            ``FACT_VAL``, ``CONCEPT``, ``LEXICON``) to a list of suggestions;
            unknown lookup types are ignored
        """
        self.limit = limit
        key_constraints = self.key_constraints or []

        suggestions = {}

        for i,lookup_type in enumerate(self.lookup_types):
            if lookup_type == 'FACT_NAME':
                suggestions['FACT_NAME'] = self._get_facts('fact', lookup_type)
            elif lookup_type == 'FACT_VAL':
                # A missing constraint for this position means "no constraint".
                key_constraint = key_constraints[i] if i < len(key_constraints) else None
                suggestions['FACT_VAL'] = self._get_facts('str_val', lookup_type, key_constraint=key_constraint)
            elif lookup_type == 'CONCEPT':
                suggestions['CONCEPT'] = self._get_concepts()
            elif lookup_type == 'LEXICON':
                suggestions['LEXICON'] = self._get_lexicons()
        return suggestions

    def _get_facts(self, agg_subfield, lookup_type, key_constraint=None):
        """Suggest fact names or fact values from a nested terms aggregation.

        :param agg_subfield: name under which the aggregation is registered
            and read back from the response
        :param lookup_type: ``'FACT_VAL'`` returns the values found in the
            ``fact_values`` sub-buckets; any other value returns the bucket
            keys themselves
        :param key_constraint: for ``'FACT_VAL'``, restrict the values to the
            bucket whose key equals this fact name (ignored when empty)
        :return: list of suggestion dicts
        """
        agg_query = {
            agg_subfield: {
                "nested": {"path": "texta_facts"},
                "aggs": {
                    agg_subfield: {
                        "terms": {"field": "texta_facts.fact"},
                        "aggs": {
                            "fact_values": {
                                "terms": {
                                    "field": "texta_facts.str_val",
                                    "size": self.limit,
                                    "include": "{0}.*".format(self.content)
                                }
                            }
                        }
                    }
                }
            }
        }

        self.es_m.build('')
        self.es_m.set_query_parameter("aggs", agg_query)
        buckets = self.es_m.search()["aggregations"][agg_subfield][agg_subfield]["buckets"]

        if lookup_type != 'FACT_VAL':
            return [self._format_suggestion(bucket["key"], bucket["key"]) for bucket in buckets]

        facts = []
        for bucket in buckets:
            if key_constraint and bucket["key"] != key_constraint:
                continue
            facts += [self._format_suggestion(sub_bucket["key"], sub_bucket["key"]) for sub_bucket in bucket["fact_values"]["buckets"]]

        return facts

    def _get_concepts(self):
        """Suggest the concepts of the user's terms that start with the content.

        :return: list of suggestion dicts, one per distinct (concept, term)
            pair; empty when the content is empty
        """
        concepts = []

        if len(self.content) > 0:
            terms = Term.objects.filter(term__startswith=self.content).filter(author=self.user)
            seen = set()
            for term in terms[:self.limit]:
                for term_concept in TermConcept.objects.filter(term=term.pk):
                    concept = term_concept.concept
                    concept_term = (concept.pk,term.term)

                    if concept_term in seen:
                        continue
                    seen.add(concept_term)

                    # NOTE(review): term text is placed into HTML unescaped - confirm
                    # the consumer treats it as trusted markup.
                    display_term = term.term.replace(self.content,'<font color="red">'+self.content+'</font>')
                    display_text = '<b>{0}</b>@C{1}-{2}'.format(display_term,concept.pk,concept.descriptive_term.term)

                    suggestion = self._format_suggestion(concept.descriptive_term.term,display_text,resource_id=concept.pk)
                    concepts.append(suggestion)

        return concepts

    def _get_lexicons(self):
        """Suggest the user's lexicons whose name starts with the content.

        :return: list of suggestion dicts; empty when the content is empty
        """
        suggested_lexicons = []

        if len(self.content) > 0:
            lexicons = Lexicon.objects.filter(name__startswith=self.content).filter(author=self.user)
            for lexicon in lexicons:
                display_term = lexicon.name.replace(self.content,'<font color="red">'+self.content+'</font>')
                display_text = '<b>{0}</b>@L{1}-{2}'.format(display_term,lexicon.pk,lexicon.name)

                suggestion = self._format_suggestion(lexicon.name,display_text,resource_id=lexicon.pk)
                suggested_lexicons.append(suggestion)

        return suggested_lexicons

    @staticmethod
    def _format_suggestion(entry_text,display_text,resource_id=''):
        """Pack one suggestion into the dict layout returned to the caller."""
        return {'entry_text':entry_text,'display_text':display_text,'resource_id':resource_id}