Example #1
def api_document_tags_list(request, user, params):
    """ Get document tags (via auth_token)
    """
    dataset_id = params.get('dataset', None)
    document_ids = params.get('document_ids', None)

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    resp = mass_helper.get_document_by_ids(document_ids)

    data = []
    for doc in resp['hits']['hits']:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG':
                doc_id = doc['_id']
                doc_path = f['doc_path']
                doc_tag = f['str_val']
                data.append({'document_id': doc_id, 'field': doc_path, 'tag': doc_tag})

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
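A minimal caller sketch for the view above (dataset and document ids are hypothetical; an invalid dataset id would instead yield the 400 response built in the validation block):

params = {'dataset': '42', 'document_ids': ['doc_1', 'doc_2']}  # hypothetical ids
response = api_document_tags_list(None, None, params)  # request and user are unused by this view
rows = json.loads(response.content)
# each row: {'document_id': ..., 'field': ..., 'tag': ...}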
Example #2
def api_search_list(request, user, params):
    """ Get list of available searches for API user (via auth_token)
    """

    # Read all params
    dataset_id = int(params['dataset'])

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    # Build response structure
    data = []
    dataset = Dataset(pk=dataset_id)
    search_list = list(Search.objects.filter(dataset=dataset))
    for search in search_list:
        row = {
            'dataset': dataset_id,
            'search': search.id,
            'description': search.description
        }
        data.append(row)

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
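Output sketch: each row mirrors the Search fields selected above (values hypothetical):

response = api_search_list(None, user, {'dataset': '3'})
for row in json.loads(response.content):
    # e.g. {'dataset': 3, 'search': 17, 'description': 'my saved search'}
    print(row['search'], row['description'])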
Example #3
def api_mass_train_tagger(request, user, params):
    """ Apply mass train tagger (via auth_token)
    """
    # Read all params
    dataset_id = params.get('dataset', None)
    selected_tags = set(params.get('tags', []))
    field = params.get("field", None)
    normalizer_opt = params.get("normalizer_opt", "0")
    classifier_opt = params.get("classifier_opt", "0")
    reductor_opt = params.get("reductor_opt", "0")
    extractor_opt = params.get("extractor_opt", "0")
    retrain_only = params.get("retrain_only", False)  # read but not passed on to schedule_tasks below

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    
    data = mass_helper.schedule_tasks(selected_tags, normalizer_opt,
                                      classifier_opt, reductor_opt,
                                      extractor_opt, field, dataset_id, user)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
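A sketch of the expected parameter payload; the four *_opt codes are passed to MassHelper.schedule_tasks as strings, defaulting to "0" for each pipeline component (tag names hypothetical):

params = {
    'dataset': '7',
    'tags': ['spam', 'sports'],  # hypothetical tag names to train on
    'field': 'text',
    'normalizer_opt': '0',
    'classifier_opt': '0',
    'reductor_opt': '0',
    'extractor_opt': '0',
}
response = api_mass_train_tagger(None, user, params)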
Example #4
def api_tag_list(request, user, params):
    """ Get list of available tags for API user (via auth_token)
    """
    dataset_id = params['dataset']
    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    tag_set = mass_helper.get_unique_tags()
    tag_frequency = mass_helper.get_tag_frequency(tag_set)
    tag_models = {tagger.description
                  for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)}

    data = []
    for tag in sorted(tag_frequency.keys()):
        count = tag_frequency[tag]
        has_model = tag in tag_models
        doc = {'description': tag,
               'count': count,
               'has_model': has_model}
        data.append(doc)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
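Output sketch: each entry pairs a tag with its document count and a flag for whether a trained model already exists under that description (sample values hypothetical):

response = api_tag_list(None, user, {'dataset': '7'})
for entry in json.loads(response.content):
    # e.g. {'description': 'sports', 'count': 1500, 'has_model': True}
    if not entry['has_model']:
        print('no trained tagger yet for', entry['description'])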
Example #5
def api_field_list(request, user, params):
    """ Get list of available fields for API user (via auth_token)
    """
    dataset_id = params['dataset']
    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    fields = get_fields(es_m)
    data = sorted([x['path'] for x in fields])
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
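The response is simply a sorted list of field paths, so a caller can feed it straight into the field parameter of the other endpoints (dataset id hypothetical):

response = api_field_list(None, user, {'dataset': '7'})
fields = json.loads(response.content)  # e.g. ['author', 'text', 'title']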
Example #6
def api_tag_feedback(request, user, params):
    """ Apply tag feedback (via auth_token)
        Currently works correctly with one tag per document. Needs further development.
    """
    decision_id = params.get('decision_id', None)

    if not decision_id:
        error = {'error': 'no decision_id supplied'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')
    
    doc_path = params.get('doc_path', None)

    if not doc_path:
        error = {'error': 'no doc_path supplied. cannot index feedback'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    prediction = params.get('prediction', None)

    if not prediction:
        error = {'error': 'no prediction supplied'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    feedback_obj = TagFeedback.update(user, decision_id, prediction)

    # retrieve dataset id from task params
    params = Task.objects.get(pk=feedback_obj.tagger.pk).parameters
    params_json = json.loads(params)
    dataset_id = params_json['dataset']
    tagger_name = params_json['description']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    document = json.loads(feedback_obj.document)
    in_dataset = int(feedback_obj.in_dataset)

    data = {'success': True}

    # check if document already indexed in ES
    if in_dataset == 0:
        es_m = ds.build_manager(ES_Manager)

        # add tag to the document
        if prediction > 0:
            # build the fact entry recording the confirmed tag
            new_fact = {"fact": "TEXTA_TAG",
                        "str_val": tagger_name,
                        "doc_path": doc_path,
                        "spans": "[[0,0]]"}
            document['texta_facts'] = [new_fact]
        
        es_m.add_document(document)

        feedback_obj.in_dataset = 1
        feedback_obj.save()
        data['feedback_indexed'] = True
    else:
        data['feedback_indexed'] = False

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
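A sketch of the three required parameters; leaving any of them out short-circuits into one of the 400 responses above (ids and values hypothetical):

params = {
    'decision_id': '123',  # id of the tagging decision being corrected
    'doc_path': 'text',    # field the tag applies to
    'prediction': 1,       # > 0 attaches the tag fact before indexing
}
response = api_tag_feedback(None, user, params)
data = json.loads(response.content)
# {'success': True, 'feedback_indexed': True} on first indexing;
# 'feedback_indexed' is False if the document was already in the dataset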
Example #7
def api_hybrid_tagger(request, user, params):
    """ Apply hybrid tagger (via auth_token)
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20

    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))

    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for the preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)    
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']
    # Build Tag frequency
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                if doc_tag not in tag_freq:
                    tag_freq[doc_tag] = 0
                tag_freq[doc_tag] += 1

    # Top Tags to limit the number of taggers
    top_tags = [t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)]
    top_tags = set(top_tags[:max_taggers])
    # Perform tag selection
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag in tag_freq:
        selected = 0
        count = tag_freq[tag]
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag,
                                'selected': selected,
                                'count': count})
    # Filter tags
    tagger_search = Task.objects.filter(
        task_type=TaskTypes.TRAIN_TAGGER.value).filter(
            status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search
               if tagger.description in candidate_tags]
    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {"error": "no similar documents have tags count above threshold"}
    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
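Input sketch for the hybrid tagger; 'search' must be the pk of an existing Search, and the two tuning knobs fall back to the defaults defined above (values hypothetical):

params = {
    'dataset': '7',
    'search': '17',             # pk of a saved Search
    'field': 'text',
    'max_taggers': 10,          # optional, defaults to 20
    'min_count_threshold': 30,  # optional, defaults to 50
}
response = api_hybrid_tagger(None, user, params)
data = json.loads(response.content)
# data['task'] holds the queued task reference, or an error entry when no
# tag in the 1000-document neighborhood clears the threshold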