def tag_doc(self, request, pk=None, project_pk=None):
    """Returns list of tags for input document."""
    serializer = TaggerTagDocumentSerializer(data=request.data)
    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    # retrieve tagger object
    tagger_object = self.get_object()
    # check if tagger exists
    if not tagger_object.model.path:
        raise NonExistantModelError()
    # declare input_document variable
    input_document = serializer.validated_data['doc']
    # load field data
    tagger_field_data = json.loads(tagger_object.fields)
    # validate input document
    input_document = validate_input_document(input_document, tagger_field_data)
    if isinstance(input_document, Exception):
        return input_document
    # apply tagger
    tagger_response = apply_tagger(
        tagger_object.id,
        input_document,
        input_type='doc',
        lemmatize=serializer.validated_data['lemmatize'],
        feedback=serializer.validated_data['feedback_enabled'],
    )
    # if feedback was enabled, add url
    tagger_response = add_finite_url_to_feedback(tagger_response, request)
    return Response(tagger_response, status=status.HTTP_200_OK)
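# A minimal request sketch for tag_doc above. The payload keys follow the
# TaggerTagDocumentSerializer usage in the code; the URL layout and the field
# name inside "doc" are illustrative assumptions, not confirmed by this file:
#
#   POST /projects/<project_pk>/taggers/<pk>/tag_doc/
#   {
#       "doc": {"comment_content": "some raw text"},   # field name is hypothetical
#       "lemmatize": false,
#       "feedback_enabled": false
#   }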
def list_features(self, request, pk=None, project_pk=None):
    """Returns list of features for the extractor."""
    extractor: CRFExtractor = self.get_object()
    # check if model exists
    if not extractor.model.path:
        raise NonExistantModelError()
    crf_model = extractor.load_extractor()
    feature_info = crf_model.get_features()
    return Response(feature_info, status=status.HTTP_200_OK)
def multitag_text(self, request, pk=None, project_pk=None):
    """
    Applies list of tagger objects inside project to any text.
    This is different from Tagger Group as **all** taggers in project are used
    and they do not have to reside in the same Tagger Group.
    Returns list of tags.
    """
    serializer = TaggerMultiTagSerializer(data=request.data)
    # validate serializer
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    # get project object
    project_object = Project.objects.get(pk=project_pk)
    # get completed taggers from project
    taggers = Tagger.objects.filter(project=project_object).filter(task__status=Task.STATUS_COMPLETED)
    # filter again by the tagger ids given in the request, if any
    if serializer.validated_data['taggers']:
        taggers = taggers.filter(pk__in=serializer.validated_data['taggers'])
    # error if filtering resulted in 0 taggers
    if not taggers:
        raise NonExistantModelError(detail='No tagging models available.')
    # retrieve params
    lemmatize = serializer.validated_data['lemmatize']
    feedback = serializer.validated_data['feedback_enabled']
    text = serializer.validated_data['text']
    hide_false = serializer.validated_data['hide_false']
    # error if redis not available
    if not get_redis_status()['alive']:
        raise RedisNotAvailable()
    # lemmatize text just once before giving it to taggers!
    if lemmatize:
        text = CeleryLemmatizer().lemmatize(text)
    # tag text using celery group primitive
    group_task = group(
        apply_tagger.s(tagger.pk, text, input_type='text', lemmatize=False, feedback=feedback)
        for tagger in taggers
    )
    group_results = [
        result for result in group_task.apply(queue=CELERY_SHORT_TERM_TASK_QUEUE).get()
        if result
    ]
    # remove non-hits
    if hide_false is True:
        group_results = [result for result in group_results if result['result']]
    # if feedback was enabled, add urls
    group_results = [add_finite_url_to_feedback(result, request) for result in group_results]
    # sort & return tags
    sorted_tags = sorted(group_results, key=lambda k: k['probability'], reverse=True)
    return Response(sorted_tags, status=status.HTTP_200_OK)
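# Illustrative response shape for multitag_text, inferred from the code above:
# each item is one tagger's output, non-hits are optionally dropped via
# 'hide_false', and items are sorted by the 'probability' key in descending
# order. Only the 'result' and 'probability' keys are confirmed by this code;
# any other keys shown here are assumptions:
#
#   [
#       {"tag": "...", "result": true, "probability": 0.97},
#       {"tag": "...", "result": true, "probability": 0.85}
#   ]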
def list_features(self, request, pk=None, project_pk=None):
    """Returns list of features for the tagger. By default, features are sorted by their relevance in descending order."""
    if self.request.method == 'GET':
        serializer = TaggerListFeaturesSerializer(data=request.query_params)
    elif self.request.method == 'POST':
        serializer = TaggerListFeaturesSerializer(data=request.data)
    # validate input before doing any model work
    serializer.is_valid(raise_exception=True)
    # retrieve tagger object
    tagger_object: Tagger = self.get_object()
    # check if tagger exists
    if not tagger_object.model.path:
        raise NonExistantModelError()
    # retrieve model
    tagger = TextTagger()
    tagger.load_django(tagger_object)
    try:
        # get feature names
        features = tagger.get_feature_names()
    except Exception:
        return Response(
            {'error': 'Error loading feature names. Are you using HashingVectorizer? It does not support feature names!'},
            status=status.HTTP_400_BAD_REQUEST
        )
    feature_coefs = tagger.get_feature_coefs()
    supports = tagger.get_supports()
    # keep only features selected by the model's support mask
    selected_features = [feature for i, feature in enumerate(features) if supports[i]]
    # keep only features with positive coefficients
    selected_features = [
        {'feature': feature, 'coefficient': feature_coefs[i]}
        for i, feature in enumerate(selected_features)
        if feature_coefs[i] > 0
    ]
    selected_features = sorted(selected_features, key=lambda k: k['coefficient'], reverse=True)
    size = serializer.validated_data['size']
    features_to_show = selected_features[:size]
    feature_info = {
        'total_features': len(selected_features),
        'showing_features': len(features_to_show),
        'features': features_to_show
    }
    return Response(feature_info, status=status.HTTP_200_OK)
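# Example response shape for list_features, taken directly from the
# feature_info dict built above (the feature names and coefficient values
# are made up for illustration):
#
#   {
#       "total_features": 1204,
#       "showing_features": 2,
#       "features": [
#           {"feature": "invoice", "coefficient": 2.31},
#           {"feature": "payment", "coefficient": 1.87}
#       ]
#   }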
def tag_text(self, request, pk=None, project_pk=None):
    serializer = TaggerTagTextSerializer(data=request.data)
    # check if valid request
    serializer.is_valid(raise_exception=True)
    # retrieve tagger object
    tagger_object = self.get_object()
    # check if tagger exists
    if not tagger_object.model:
        raise NonExistantModelError()
    # apply tagger
    text = serializer.validated_data['text']
    feedback = serializer.validated_data['feedback_enabled']
    prediction = apply_tagger(tagger_object, text, feedback=feedback)
    prediction = add_finite_url_to_feedback(prediction, request)
    return Response(prediction, status=status.HTTP_200_OK)
def tag_random_doc(self, request, pk=None, project_pk=None):
    """Returns prediction for a random document in Elasticsearch."""
    # get tagger object
    tagger_object = self.get_object()
    # check if tagger model exists
    if not tagger_object.model.path:
        raise NonExistantModelError()
    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = tagger_object.get_available_or_all_indices(indices)
    # retrieve tagger fields
    tagger_fields = json.loads(tagger_object.fields)
    if not ElasticCore().check_if_indices_exist(indices):
        return Response(
            {'error': f'One or more indices from {list(indices)} do not exist'},
            status=status.HTTP_400_BAD_REQUEST
        )
    # retrieve random document
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    # filter out correct fields from the document
    random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}
    # apply tagger
    tagger_response = apply_tagger(tagger_object.id, random_doc_filtered, input_type='doc')
    response = {"document": random_doc, "prediction": tagger_response}
    return Response(response, status=status.HTTP_200_OK)
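# Minimal request sketch for tag_random_doc. The 'indices' key and the
# per-item 'name' field follow the serializer usage above; the index name
# itself is hypothetical:
#
#   { "indices": [{"name": "my_index"}] }
#
# The response pairs the sampled document with the tagger's prediction, as
# built in the final dict above:
#
#   { "document": {...}, "prediction": {...} }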
def tag_text(self, request, pk=None, project_pk=None):
    """
    API endpoint for tagging raw text with tagger group.
    """
    logging.getLogger(INFO_LOGGER).info("[Tag Text] Starting tag_text...")
    data = request.data
    serializer = TaggerGroupTagTextSerializer(data=data)
    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    hybrid_tagger_object = self.get_object()
    # check if any of the models are ready
    if not hybrid_tagger_object.taggers.filter(task__status=Task.STATUS_COMPLETED):
        raise NonExistantModelError()
    # error if redis not available
    if not get_redis_status()['alive']:
        raise RedisNotAvailable()
    # retrieve params
    text = serializer.validated_data['text']
    n_similar_docs = serializer.validated_data['n_similar_docs']
    n_candidate_tags = serializer.validated_data['n_candidate_tags']
    lemmatize = serializer.validated_data['lemmatize']
    use_ner = serializer.validated_data['use_ner']
    feedback = serializer.validated_data['feedback_enabled']
    tagger_group_id = self.get_object().pk
    # update text and tags with MLP
    text, tags = get_mlp(tagger_group_id, text, lemmatize=lemmatize, use_ner=use_ner)
    # retrieve tag candidates
    tag_candidates = get_tag_candidates(
        tagger_group_id,
        text,
        ignore_tags=tags,
        n_similar_docs=n_similar_docs,
        max_candidates=n_candidate_tags
    )
    # get tags
    tags += apply_tagger_group(
        tagger_group_id,
        text,
        tag_candidates,
        request,
        input_type='text',
        feedback=feedback
    )
    return Response(tags, status=status.HTTP_200_OK)
def tag_text(self, request, pk=None, project_pk=None):
    """Returns list of tags for input text."""
    serializer = TaggerTagTextSerializer(data=request.data)
    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    # retrieve tagger object
    tagger_object = self.get_object()
    # check if tagger exists
    if not tagger_object.model.path:
        raise NonExistantModelError()
    # apply tagger
    tagger_response = apply_tagger(
        tagger_object.id,
        serializer.validated_data['text'],
        input_type='text',
        lemmatize=serializer.validated_data['lemmatize'],
        feedback=serializer.validated_data['feedback_enabled']
    )
    # if feedback was enabled, add url
    tagger_response = add_finite_url_to_feedback(tagger_response, request)
    return Response(tagger_response, status=status.HTTP_200_OK)
def tag_text(self, request, pk=None, project_pk=None):
    serializer = CRFExtractorTagTextSerializer(data=request.data)
    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    # retrieve extractor object
    extractor: CRFExtractor = self.get_object()
    # check if extractor model exists
    if not extractor.model.path:
        raise NonExistantModelError()
    # apply mlp
    text = serializer.validated_data["text"]
    with allow_join_result():
        mlp = apply_mlp_on_list.apply_async(
            kwargs={"texts": [text], "analyzers": extractor.mlp_analyzers},
            queue=CELERY_MLP_TASK_QUEUE
        ).get()
        mlp_document = mlp[0]
    # apply extractor
    extractor_response = apply_crf_extractor(extractor.id, mlp_document)
    return Response(extractor_response, status=status.HTTP_200_OK)
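# Note on the allow_join_result() context above: Celery normally blocks
# calling .get() on an AsyncResult from within another task to avoid
# deadlocks, and allow_join_result() explicitly opts out of that guard so the
# MLP result can be awaited synchronously before running the extractor. The
# request body itself is just {"text": "..."}, per the
# CRFExtractorTagTextSerializer usage above.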
def tag_text(self, request, pk=None, project_pk=None):
    serializer = BertTagTextSerializer(data=request.data)
    # check if valid request
    serializer.is_valid(raise_exception=True)
    # retrieve tagger object
    tagger_object = self.get_object()
    # check if tagger exists
    if not tagger_object.model:
        raise NonExistantModelError()
    # apply tagger
    text = serializer.validated_data['text']
    feedback = serializer.validated_data['feedback_enabled']
    persistent = serializer.validated_data['persistent']
    # decide whether to store the model in cache
    if not persistent:
        prediction = apply_tagger(tagger_object, text, feedback=feedback)
    else:
        prediction = apply_persistent_bert_tagger.s(
            text, tagger_object.pk, feedback=feedback
        ).apply_async().get()
    prediction = add_finite_url_to_feedback(prediction, request)
    return Response(prediction, status=status.HTTP_200_OK)
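# Design note on the 'persistent' flag above, inferred from the dispatch
# logic: the non-persistent branch loads the BERT model for a one-off
# prediction, while the persistent branch routes the call through the
# apply_persistent_bert_tagger Celery task so a long-lived worker can keep
# the loaded model cached between requests. The exact caching strategy lives
# in that task's implementation, not in this view.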
def tag_random_doc(self, request, pk=None, project_pk=None):
    """
    API endpoint for tagging a random document.
    """
    logging.getLogger(INFO_LOGGER).info("[Tag Random doc] Starting tag_random_doc...")
    # get hybrid tagger object
    hybrid_tagger_object = self.get_object()
    # check if any of the models are ready
    if not hybrid_tagger_object.taggers.filter(task__status=Task.STATUS_COMPLETED):
        raise NonExistantModelError()
    # retrieve tagger fields from the first object
    first_tagger = hybrid_tagger_object.taggers.first()
    tagger_fields = json.loads(first_tagger.fields)
    # error if redis not available
    if not get_redis_status()['alive']:
        raise RedisNotAvailable('Redis not available. Check if Redis is running.')
    serializer = TagRandomDocSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    indices = [index["name"] for index in serializer.validated_data["indices"]]
    indices = first_tagger.get_available_or_all_indices(indices)
    if not ElasticCore().check_if_indices_exist(indices):
        return Response(
            {'error': f'One or more indices from {list(indices)} do not exist'},
            status=status.HTTP_400_BAD_REQUEST
        )
    # retrieve random document
    random_doc = ElasticSearcher(indices=indices).random_documents(size=1)[0]
    # filter out correct fields from the document
    random_doc_filtered = {k: v for k, v in random_doc.items() if k in tagger_fields}
    tagger_group_id = self.get_object().pk
    # combine document field values into one string
    combined_texts = '\n'.join(random_doc_filtered.values())
    combined_texts, tags = get_mlp(tagger_group_id, combined_texts, lemmatize=False)
    # retrieve tag candidates
    tag_candidates = get_tag_candidates(tagger_group_id, combined_texts, ignore_tags=tags)
    # get tags
    tags += apply_tagger_group(tagger_group_id, random_doc_filtered, tag_candidates, request, input_type='doc')
    # return document with tags
    response = {"document": random_doc, "tags": tags}
    return Response(response, status=status.HTTP_200_OK)
def tag_doc(self, request, pk=None, project_pk=None):
    """
    API endpoint for tagging JSON documents with tagger group.
    """
    logging.getLogger(INFO_LOGGER).info("[Tag Doc] Starting tag_doc...")
    data = request.data
    serializer = TaggerGroupTagDocumentSerializer(data=data)
    # check if valid request
    if not serializer.is_valid():
        raise SerializerNotValid(detail=serializer.errors)
    hybrid_tagger_object = self.get_object()
    # check if any of the models are ready
    if not hybrid_tagger_object.taggers.filter(task__status=Task.STATUS_COMPLETED):
        raise NonExistantModelError()
    # error if redis not available
    if not get_redis_status()['alive']:
        raise RedisNotAvailable('Redis not available. Check if Redis is running.')
    # retrieve field data from the first element;
    # we can do that safely because all taggers inside
    # a hybrid tagger instance are trained on the same fields
    hybrid_tagger_field_data = json.loads(hybrid_tagger_object.taggers.first().fields)
    # declare input_document variable
    input_document = serializer.validated_data['doc']
    # validate input document
    input_document = validate_input_document(input_document, hybrid_tagger_field_data)
    if isinstance(input_document, Exception):
        return input_document
    # combine document field values into one string
    combined_texts = '\n'.join(input_document.values())
    # retrieve tag candidate params
    n_similar_docs = serializer.validated_data['n_similar_docs']
    n_candidate_tags = serializer.validated_data['n_candidate_tags']
    lemmatize = serializer.validated_data['lemmatize']
    use_ner = serializer.validated_data['use_ner']
    feedback = serializer.validated_data['feedback_enabled']
    tagger_group_id = self.get_object().pk
    # update text and tags with MLP
    combined_texts, tags = get_mlp(tagger_group_id, combined_texts, lemmatize=lemmatize, use_ner=use_ner)
    # retrieve tag candidates
    tag_candidates = get_tag_candidates(
        tagger_group_id,
        combined_texts,
        ignore_tags=tags,
        n_similar_docs=n_similar_docs,
        max_candidates=n_candidate_tags
    )
    # get tags
    tags += apply_tagger_group(
        tagger_group_id,
        input_document,
        tag_candidates,
        request,
        input_type='doc',
        lemmatize=lemmatize,
        feedback=feedback
    )
    return Response(tags, status=status.HTTP_200_OK)
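# A hedged request sketch for the tagger group tag_doc endpoint above. The
# payload keys mirror the TaggerGroupTagDocumentSerializer usage in the code;
# the URL layout, the document field names, and the numeric values are
# illustrative assumptions:
#
#   POST /projects/<project_pk>/tagger_groups/<pk>/tag_doc/
#   {
#       "doc": {"title": "...", "body": "..."},   # field names are hypothetical
#       "n_similar_docs": 10,
#       "n_candidate_tags": 10,
#       "lemmatize": false,
#       "use_ner": false,
#       "feedback_enabled": false
#   }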