def api_document_tags_list(request, user, params):
    """ Get document tags (via auth_token) """
    dataset_id = params.get('dataset', None)
    document_ids = params.get('document_ids', None)

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Reject the request when the dataset could not be activated
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        return HttpResponse(json.dumps(error), status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    resp = mass_helper.get_document_by_ids(document_ids)

    # Collect every TEXTA_TAG fact from the returned documents
    data = []
    for doc in resp['hits']['hits']:
        for fact in doc['_source'].get('texta_facts', []):
            if fact['fact'] == 'TEXTA_TAG':
                data.append({'document_id': doc['_id'],
                             'field': fact['doc_path'],
                             'tag': fact['str_val']})

    return HttpResponse(json.dumps(data), status=200, content_type='application/json')
def api_search_list(request, user, params):
    """ Get list of available searches for API user (via auth_token).

    Expects params['dataset'] to hold an integer dataset id; returns a JSON
    list of {dataset, search, description} rows for each saved Search
    attached to that dataset.
    """
    # Validate the dataset parameter up front: a missing or non-numeric
    # value previously raised KeyError/ValueError (a 500) instead of the
    # 400 error payload every sibling endpoint returns.
    try:
        dataset_id = int(params['dataset'])
    except (KeyError, TypeError, ValueError):
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    # Build response structure
    dataset = Dataset(pk=dataset_id)
    data = [{'dataset': dataset_id,
             'search': search.id,
             'description': search.description}
            for search in Search.objects.filter(dataset=dataset)]
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_mass_train_tagger(request, user, params):
    """ Apply mass train tagger (via auth_token).

    Schedules one training task per selected tag and returns the scheduling
    result as JSON. The *_opt parameters default to "0" (the first choice
    of the corresponding option lists).
    """
    # Read all params
    dataset_id = params.get('dataset', None)
    selected_tags = set(params.get('tags', []))
    field = params.get("field", None)
    normalizer_opt = params.get("normalizer_opt", "0")
    classifier_opt = params.get("classifier_opt", "0")
    reductor_opt = params.get("reductor_opt", "0")
    extractor_opt = params.get("extractor_opt", "0")
    # NOTE(review): a "retrain_only" parameter was previously read here but
    # never passed to schedule_tasks; it remains ignored until the helper
    # supports it.

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    data = mass_helper.schedule_tasks(selected_tags, normalizer_opt, classifier_opt,
                                      reductor_opt, extractor_opt, field, dataset_id, user)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_tag_list(request, user, params):
    """ Get list of available tags for API user (via auth_token).

    Returns, for each unique tag in the dataset, its document count and
    whether a trained tagger model exists for it.
    """
    dataset_id = params['dataset']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    tag_set = mass_helper.get_unique_tags()
    tag_frequency = mass_helper.get_tag_frequency(tag_set)
    # Set comprehension (was set([...])): skips building a throwaway list
    tag_models = {tagger.description
                  for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)}

    data = []
    for tag in sorted(tag_frequency.keys()):
        count = tag_frequency[tag]
        has_model = tag in tag_models
        doc = {'description': tag, 'count': count, 'has_model': has_model}
        data.append(doc)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_document_tags_list(request, user, params):
    """ Get document tags (via auth_token) """
    dataset_id = params.get('dataset', None)
    document_ids = params.get('document_ids', None)

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    if not ds.is_active():
        # Unknown or inactive dataset -> 400 with an error payload
        payload = json.dumps({'error': 'invalid dataset parameter'})
        return HttpResponse(payload, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    helper = MassHelper(es_m)
    response = helper.get_document_by_ids(document_ids)

    # One row per TEXTA_TAG fact across the fetched documents
    data = [
        {'document_id': hit['_id'], 'field': fact['doc_path'], 'tag': fact['str_val']}
        for hit in response['hits']['hits']
        for fact in hit['_source'].get('texta_facts', [])
        if fact['fact'] == 'TEXTA_TAG'
    ]
    return HttpResponse(json.dumps(data), status=200, content_type='application/json')
def api_tag_list(request, user, params):
    """ Get list of available tags for API user (via auth_token).

    Response rows pair each unique dataset tag with its frequency and a
    flag telling whether a trained tagger model exists for it.
    """
    dataset_id = params['dataset']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    mass_helper = MassHelper(es_m)
    tag_set = mass_helper.get_unique_tags()
    tag_frequency = mass_helper.get_tag_frequency(tag_set)
    # Set comprehension instead of set([...]) -- no intermediate list
    tag_models = {
        tagger.description
        for tagger in Task.objects.filter(task_type=TaskTypes.TRAIN_TAGGER.value)
    }

    data = []
    for tag in sorted(tag_frequency.keys()):
        count = tag_frequency[tag]
        has_model = tag in tag_models
        doc = {'description': tag, 'count': count, 'has_model': has_model}
        data.append(doc)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_field_list(request, user, params):
    """ Get list of available fields for API user (via auth_token).

    Returns the sorted field paths of the active dataset as a JSON list.
    """
    dataset_id = params['dataset']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    es_m = ds.build_manager(ES_Manager)
    fields = get_fields(es_m)
    # sorted() takes any iterable; no need to materialize a list first
    data = sorted(x['path'] for x in fields)
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_tag_feedback(request, user, params):
    """ Apply tag feedback (via auth_token).

    Currently working correctly with 1 tag per document.
    Needs further development.
    """
    decision_id = params.get('decision_id', None)
    if not decision_id:
        error = {'error': 'no decision ID supported'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    doc_path = params.get('doc_path', None)
    if not doc_path:
        error = {'error': 'no doc_path supported. cannot index feedback'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    prediction = params.get('prediction', None)
    if not prediction:
        error = {'error': 'no prediction supported'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    feedback_obj = TagFeedback.update(user, decision_id, prediction)

    # Retrieve dataset id from task params.
    # NOTE: was bound to a local named `params`, shadowing (and discarding)
    # this view's own `params` argument -- renamed to task_params.
    task_params = Task.objects.get(pk=feedback_obj.tagger.pk).parameters
    params_json = json.loads(task_params)
    dataset_id = params_json['dataset']
    tagger_name = params_json['description']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    document = json.loads(feedback_obj.document)
    in_dataset = int(feedback_obj.in_dataset)
    data = {'success': True}

    # check if document already indexed in ES
    if in_dataset == 0:
        es_m = ds.build_manager(ES_Manager)
        # add tag to the document
        # NOTE(review): prediction comes straight from request params and may
        # be a str; `> 0` assumes a numeric value -- confirm callers
        if prediction > 0:
            new_fact = {"fact": "TEXTA_TAG",
                        "str_val": tagger_name,
                        "doc_path": doc_path,
                        "spans": "[[0,0]]"}
            document['texta_facts'] = [new_fact]
            es_m.add_document(document)
            feedback_obj.in_dataset = 1
            feedback_obj.save()
            data['feedback_indexed'] = True
        else:
            data['feedback_indexed'] = False

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_hybrid_tagger(request, user, params):
    """ Apply hybrid tagger (via auth_token).

    Finds documents similar to the given saved search, selects the most
    frequent tags among them, and schedules a text_tagger preprocessor task
    using the completed taggers trained for those tags.
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20

    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))

    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']

    # Build tag frequency (dict.get replaces the explicit membership test)
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                tag_freq[doc_tag] = tag_freq.get(doc_tag, 0) + 1

    # Top tags to limit the number of taggers
    top_tags = [t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)]
    top_tags = set(top_tags[:max_taggers])

    # Perform tag selection (iterate items() -- both key and count are used)
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag, count in tag_freq.items():
        selected = 0
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({'tag': tag, 'selected': selected, 'count': count})

    # Filter tags: only completed taggers whose description is a candidate
    tagger_search = Task.objects.filter(
        task_type=TaskTypes.TRAIN_TAGGER.value).filter(status=Task.STATUS_COMPLETED)
    taggers = [tagger.id for tagger in tagger_search if tagger.description in candidate_tags]

    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task.
        # NOTE(review): task_type is a TaskTypes member; json.dumps below
        # assumes it serializes (e.g. a str-based enum) -- other endpoints
        # use .value, confirm.
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {"error": "no similar documents have tags count above threshold"}

    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_tag_feedback(request, user, params):
    """ Apply tag feedback (via auth_token).

    Currently working correctly with 1 tag per document.
    Needs further development.
    """
    decision_id = params.get('decision_id', None)
    if not decision_id:
        error = {'error': 'no decision ID supported'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    doc_path = params.get('doc_path', None)
    if not doc_path:
        error = {'error': 'no doc_path supported. cannot index feedback'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    prediction = params.get('prediction', None)
    if not prediction:
        error = {'error': 'no prediction supported'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    feedback_obj = TagFeedback.update(user, decision_id, prediction)

    # Retrieve dataset id and tagger name from the stored task parameters.
    # Renamed from `params` to avoid shadowing this view's `params` argument.
    task_params = Task.objects.get(pk=feedback_obj.tagger.pk).parameters
    params_json = json.loads(task_params)
    dataset_id = params_json['dataset']
    tagger_name = params_json['description']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    document = json.loads(feedback_obj.document)
    in_dataset = int(feedback_obj.in_dataset)
    data = {'success': True}

    # check if document already indexed in ES
    if in_dataset == 0:
        es_m = ds.build_manager(ES_Manager)
        # add tag to the document
        # NOTE(review): `prediction > 0` assumes a numeric prediction value;
        # request params may deliver a str -- confirm with callers.
        if prediction > 0:
            new_fact = {
                "fact": "TEXTA_TAG",
                "str_val": tagger_name,
                "doc_path": doc_path,
                "spans": "[[0,0]]"
            }
            document['texta_facts'] = [new_fact]
            es_m.add_document(document)
            feedback_obj.in_dataset = 1
            feedback_obj.save()
            data['feedback_indexed'] = True
        else:
            data['feedback_indexed'] = False

    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')
def api_hybrid_tagger(request, user, params):
    """ Apply hybrid tagger (via auth_token).

    Selects frequent tags among documents similar to the given search and
    queues a text_tagger preprocessor task over the matching trained taggers.
    """
    DEFAULT_TAGS_THRESHOLD = 50
    DEFAULT_MAX_TAGGERS = 20

    dataset_id = params['dataset']
    search = params['search']
    field = params['field']
    max_taggers = int(params.get('max_taggers', DEFAULT_MAX_TAGGERS))
    min_count_threshold = int(
        params.get('min_count_threshold', DEFAULT_TAGS_THRESHOLD))

    if 'description' not in params:
        params['description'] = "via API call"
    # Parameter projection for preprocessor task
    task_type = TaskTypes.APPLY_PREPROCESSOR
    params["preprocessor_key"] = "text_tagger"
    params["text_tagger_feature_names"] = params['field']

    ds = Datasets()
    ds.activate_datasets_by_id(dataset_id, use_default=False)
    # Check if dataset_id is valid
    if not ds.is_active():
        error = {'error': 'invalid dataset parameter'}
        data_json = json.dumps(error)
        return HttpResponse(data_json, status=400, content_type='application/json')

    param_query = json.loads(Search.objects.get(pk=int(search)).query)
    es_m = ds.build_manager(ES_Manager)
    es_m.load_combined_query(param_query)
    # Get similar documents in a neighborhood of size 1000
    response = es_m.more_like_this_search([field], search_size=1000)
    docs = response['hits']['hits']

    # Build tag frequency; dict.get avoids the separate membership check
    tag_freq = {}
    for doc in docs:
        for f in doc['_source'].get('texta_facts', []):
            if f['fact'] == 'TEXTA_TAG' and f['doc_path'] == field:
                doc_tag = f['str_val']
                tag_freq[doc_tag] = tag_freq.get(doc_tag, 0) + 1

    # Top tags to limit the number of taggers
    top_tags = [
        t[0] for t in sorted(tag_freq.items(), key=lambda x: x[1], reverse=True)
    ]
    top_tags = set(top_tags[:max_taggers])

    # Perform tag selection; items() because both key and count are needed
    data = {'task': {}, 'explain': []}
    candidate_tags = set()
    for tag, count in tag_freq.items():
        selected = 0
        if count >= min_count_threshold and tag in top_tags:
            selected = 1
            candidate_tags.add(tag)
        data['explain'].append({
            'tag': tag,
            'selected': selected,
            'count': count
        })

    # Filter tags: keep completed taggers whose description was selected
    tagger_search = Task.objects.filter(
        task_type=TaskTypes.TRAIN_TAGGER.value).filter(
            status=Task.STATUS_COMPLETED)
    taggers = [
        tagger.id for tagger in tagger_search
        if tagger.description in candidate_tags
    ]

    # Create Task if taggers is not zero
    if len(taggers) > 0:
        description = params['description']
        params['text_tagger_taggers'] = taggers
        # Create execution task
        task_id = create_task(task_type, description, params, user)
        # Add task to queue
        task = Task.get_by_id(task_id)
        task.update_status(Task.STATUS_QUEUED)
        # Return reference to task.
        # NOTE(review): task_type is a TaskTypes member here (not .value as
        # elsewhere); json.dumps below assumes it is JSON-serializable --
        # confirm TaskTypes is a str-based enum.
        data['task'] = {
            'task_id': task_id,
            'task_type': task_type,
            'status': task.status,
            'user': task.user.username
        }
    else:
        # If here, no taggers were selected
        data['task'] = {
            "error": "no similar documents have tags count above threshold"
        }

    # Generate response
    data['min_count_threshold'] = min_count_threshold
    data['max_taggers'] = max_taggers
    data_json = json.dumps(data)
    return HttpResponse(data_json, status=200, content_type='application/json')