Example #1
 def tag_doc(self, request, pk=None, project_pk=None):
     """Returns list of tags for input document."""
     serializer = TaggerTagDocumentSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger exists
     if not tagger_object.model.path:
         raise NonExistantModelError()
     # declare input_document variable
     input_document = serializer.validated_data['doc']
     # load field data
     tagger_field_data = json.loads(tagger_object.fields)
     # validate input document
     input_document = validate_input_document(input_document,
                                              tagger_field_data)
     if isinstance(input_document, Exception):
         return input_document
     # apply tagger
     tagger_response = apply_tagger(
         tagger_object.id,
         input_document,
         input_type='doc',
         lemmatize=serializer.validated_data['lemmatize'],
         feedback=serializer.validated_data['feedback_enabled'],
     )
     # if feedback was enabled, add url
     tagger_response = add_finite_url_to_feedback(tagger_response, request)
     return Response(tagger_response, status=status.HTTP_200_OK)
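For reference, a minimal client-side sketch of calling this tag_doc endpoint follows. The host, URL path, and token are assumptions made for illustration only; the payload keys (doc, lemmatize, feedback_enabled) mirror the serializer fields read above.

    import requests

    # hypothetical host, route and token -- adjust to your deployment
    URL = "http://localhost:8000/api/v2/projects/1/taggers/3/tag_doc/"
    HEADERS = {"Authorization": "Token <your-api-token>"}

    payload = {
        "doc": {"text": "A document field to be tagged."},
        "lemmatize": False,
        "feedback_enabled": False,
    }

    response = requests.post(URL, json=payload, headers=HEADERS)
    print(response.status_code, response.json())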
Example #2
 def list_features(self, request, pk=None, project_pk=None):
     """Returns list of features for the extactor."""
     extractor: CRFExtractor = self.get_object()
     # check if model exists
     if not extractor.model.path:
         raise NonExistantModelError()
     crf_model = extractor.load_extractor()
     feature_info = crf_model.get_features()
     return Response(feature_info, status=status.HTTP_200_OK)
Example #3
 def multitag_text(self, request, pk=None, project_pk=None):
     """
     Applies list of tagger objects inside project to any text.
     This is different from Tagger Group, as **all** taggers in the project are used and they do not have to reside in the same Tagger Group.
     Returns list of tags.
     """
     serializer = TaggerMultiTagSerializer(data=request.data)
     # validate serializer
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # get project object
     project_object = Project.objects.get(pk=project_pk)
     # get available taggers from project
     taggers = Tagger.objects.filter(project=project_object).filter(
         task__status=Task.STATUS_COMPLETED)
     # filter again
     if serializer.validated_data['taggers']:
         taggers = taggers.filter(
             pk__in=serializer.validated_data['taggers'])
     # error if filtering resulted in 0 taggers
     if not taggers:
         raise NonExistantModelError(detail='No tagging models available.')
     # retrieve params
     lemmatize = serializer.validated_data['lemmatize']
     feedback = serializer.validated_data['feedback_enabled']
     text = serializer.validated_data['text']
     hide_false = serializer.validated_data['hide_false']
     # error if redis not available
     if not get_redis_status()['alive']:
         raise RedisNotAvailable()
     # lemmatize text just once before giving it to taggers!
     if lemmatize:
         text = CeleryLemmatizer().lemmatize(text)
     # tag text using celery group primitive
     group_task = group(
         apply_tagger.s(tagger.pk,
                        text,
                        input_type='text',
                        lemmatize=False,
                        feedback=feedback) for tagger in taggers)
     group_results = [
         a for a in group_task.apply(
             queue=CELERY_SHORT_TERM_TASK_QUEUE).get() if a
     ]
     # remove non-hits
     if hide_false is True:
         group_results = [a for a in group_results if a['result']]
     # if feedback was enabled, add urls
     group_results = [
         add_finite_url_to_feedback(a, request) for a in group_results
     ]
     # sort & return tags
     sorted_tags = sorted(group_results,
                          key=lambda k: k['probability'],
                          reverse=True)
     return Response(sorted_tags, status=status.HTTP_200_OK)
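The fan-out above relies on the Celery group primitive: one apply_tagger signature per tagger, executed together and joined into a single list of results. A minimal, self-contained sketch of that pattern, with a toy task standing in for apply_tagger:

    from celery import Celery, group

    app = Celery("sketch", broker="memory://", backend="cache+memory://")

    @app.task
    def toy_tag(tagger_id, text):
        # stand-in for apply_tagger: pretend each tagger returns one tag result
        return {"tag": f"tag_{tagger_id}", "result": True, "probability": 1.0 / tagger_id}

    # one signature per tagger, executed as a group and joined, as in multitag_text
    job = group(toy_tag.s(tagger_id, "some text") for tagger_id in (1, 2, 3))
    results = [r for r in job.apply().get() if r]  # .apply() runs the group eagerly in-process
    results = sorted(results, key=lambda k: k["probability"], reverse=True)
    print(results)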
Example #4
    def list_features(self, request, pk=None, project_pk=None):
        """Returns list of features for the tagger. By default, features are sorted by their relevance in descending order."""

        if self.request.method == 'GET':
            serializer = TaggerListFeaturesSerializer(
                data=request.query_params)

        elif self.request.method == 'POST':
            serializer = TaggerListFeaturesSerializer(data=request.data)

        # retrieve tagger object
        tagger_object: Tagger = self.get_object()
        # check if tagger exists
        if not tagger_object.model.path:
            raise NonExistantModelError()
        # retrieve model
        tagger = TextTagger()
        tagger.load_django(tagger_object)
        try:
            # get feature names
            features = tagger.get_feature_names()
        except Exception:
            return Response(
                {
                    'error':
                    'Error loading feature names. Are you using HashingVectorizer? It does not support feature names!'
                },
                status=status.HTTP_400_BAD_REQUEST)

        feature_coefs = tagger.get_feature_coefs()
        supports = tagger.get_supports()
        selected_features = [
            feature for i, feature in enumerate(features) if supports[i]
        ]
        selected_features = [{
            'feature': feature,
            'coefficient': feature_coefs[i]
        } for i, feature in enumerate(selected_features)
                             if feature_coefs[i] > 0]
        selected_features = sorted(selected_features,
                                   key=lambda k: k['coefficient'],
                                   reverse=True)

        serializer.is_valid(raise_exception=True)
        size = serializer.validated_data['size']
        features_to_show = selected_features[:size]

        feature_info = {
            'total_features': len(selected_features),
            'showing_features': len(features_to_show),
            'features': features_to_show
        }
        return Response(feature_info, status=status.HTTP_200_OK)
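As a rough illustration of where feature names, coefficients, and supports can come from, here is a scikit-learn sketch; TextTagger's internals are not shown in this listing, so the mapping noted in the comments is an assumption. The HashingVectorizer caveat in the error message above exists because a hashing vectorizer keeps no vocabulary from which names could be reported.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression

    texts = ["good fast service", "bad slow service", "good quick support"]
    labels = [1, 0, 1]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)
    model = LogisticRegression().fit(X, labels)

    features = vectorizer.get_feature_names_out()  # roughly tagger.get_feature_names()
    coefs = model.coef_[0]                         # roughly tagger.get_feature_coefs()
    supports = [True] * len(features)              # stand-in for tagger.get_supports()

    selected = [
        {"feature": f, "coefficient": coefs[i]}
        for i, f in enumerate(features)
        if supports[i] and coefs[i] > 0
    ]
    selected.sort(key=lambda k: k["coefficient"], reverse=True)
    print(selected)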
Example #5
 def tag_text(self, request, pk=None, project_pk=None):
     serializer = TaggerTagTextSerializer(data=request.data)
     # check if valid request
     serializer.is_valid(raise_exception=True)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger exists
     if not tagger_object.model:
         raise NonExistantModelError()
     # apply tagger
     text = serializer.validated_data['text']
     feedback = serializer.validated_data['feedback_enabled']
     prediction = apply_tagger(tagger_object, text, feedback=feedback)
     prediction = add_finite_url_to_feedback(prediction, request)
     return Response(prediction, status=status.HTTP_200_OK)
Example #6
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """Returns prediction for a random document in Elasticsearch."""
        # get tagger object
        tagger_object = self.get_object()
        # check if the tagger model exists
        if not tagger_object.model.path:
            raise NonExistantModelError()

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = tagger_object.get_available_or_all_indices(indices)

        # retrieve tagger fields
        tagger_fields = json.loads(tagger_object.fields)
        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {
                    'error':
                    f'One or more indices from {list(indices)} do not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]

        # filter out correct fields from the document
        random_doc_filtered = {
            k: v
            for k, v in random_doc.items() if k in tagger_fields
        }

        # apply tagger
        tagger_response = apply_tagger(tagger_object.id,
                                       random_doc_filtered,
                                       input_type='doc')
        response = {"document": random_doc, "prediction": tagger_response}
        return Response(response, status=status.HTTP_200_OK)
Example #7
    def tag_text(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging raw text with tagger group.
        """
        logging.getLogger(INFO_LOGGER).info("[Tag Text] Starting tag_text...")
        data = request.data
        serializer = TaggerGroupTagTextSerializer(data=data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable()
        # declare tag candidates variables
        text = serializer.validated_data['text']
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        tagger_group_id = self.get_object().pk
        # update text and tags with MLP
        text, tags = get_mlp(tagger_group_id,
                             text,
                             lemmatize=lemmatize,
                             use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            text,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   text,
                                   tag_candidates,
                                   request,
                                   input_type='text',
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)
Example #8
 def tag_text(self, request, pk=None, project_pk=None):
     """Returns list of tags for input text."""
     serializer = TaggerTagTextSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger exists
     if not tagger_object.model.path:
         raise NonExistantModelError()
     # apply tagger
     tagger_response = apply_tagger(
         tagger_object.id,
         serializer.validated_data['text'],
         input_type='text',
         lemmatize=serializer.validated_data['lemmatize'],
         feedback=serializer.validated_data['feedback_enabled'])
     # if feedback was enabled, add url
     tagger_response = add_finite_url_to_feedback(tagger_response, request)
     return Response(tagger_response, status=status.HTTP_200_OK)
Example #9
 def tag_text(self, request, pk=None, project_pk=None):
     serializer = CRFExtractorTagTextSerializer(data=request.data)
     # check if valid request
     if not serializer.is_valid():
         raise SerializerNotValid(detail=serializer.errors)
     # retrieve tagger object
     extractor: CRFExtractor = self.get_object()
     # check if tagger exists
     if not extractor.model.path:
         raise NonExistantModelError()
     # apply mlp
     text = serializer.validated_data["text"]
     with allow_join_result():
         mlp = apply_mlp_on_list.apply_async(kwargs={"texts": [text], "analyzers": extractor.mlp_analyzers}, queue=CELERY_MLP_TASK_QUEUE).get()
         mlp_document = mlp[0]
     # apply extractor
     extractor_response = apply_crf_extractor(
         extractor.id,
         mlp_document
     )
     return Response(extractor_response, status=status.HTTP_200_OK)
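Celery's allow_join_result context manager lifts the guard that raises a RuntimeError when result.get() is called from within a running task; the view above wraps its synchronous MLP call in it. A minimal sketch of the pattern (the helper name is hypothetical):

    from celery.result import allow_join_result

    def run_subtask_synchronously(subtask_signature):
        # suppress Celery's "never call result.get() within a task" guard,
        # as tag_text does when it waits for apply_mlp_on_list
        with allow_join_result():
            return subtask_signature.apply_async().get()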
Example #10
 def tag_text(self, request, pk=None, project_pk=None):
     serializer = BertTagTextSerializer(data=request.data)
     # check if valid request
     serializer.is_valid(raise_exception=True)
     # retrieve tagger object
     tagger_object = self.get_object()
     # check if tagger exists
     if not tagger_object.model:
         raise NonExistantModelError()
     # apply tagger
     text = serializer.validated_data['text']
     feedback = serializer.validated_data['feedback_enabled']
     persistent = serializer.validated_data['persistent']
     # decide whether to store the model in cache
     if not persistent:
         prediction = apply_tagger(tagger_object, text, feedback=feedback)
     else:
         prediction = apply_persistent_bert_tagger.s(
             text, tagger_object.pk, feedback=feedback).apply_async().get()
     prediction = add_finite_url_to_feedback(prediction, request)
     return Response(prediction, status=status.HTTP_200_OK)
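A hypothetical request body for this endpoint; the keys mirror the serializer fields read above, the values are illustrative only.

    payload = {
        "text": "Text to classify with the BERT tagger.",
        "feedback_enabled": False,
        # keep the model cached by routing through apply_persistent_bert_tagger
        "persistent": True,
    }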
Example #11
    def tag_random_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging a random document.
        """
        logging.getLogger(INFO_LOGGER).info(
            f"[Tag Random doc] Starting tag_random_doc...")
        # get hybrid tagger object
        hybrid_tagger_object = self.get_object()

        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()

        # retrieve tagger fields from the first object
        first_tagger = hybrid_tagger_object.taggers.first()
        tagger_fields = json.loads(first_tagger.fields)
        # error if redis not available

        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')

        serializer = TagRandomDocSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        indices = [
            index["name"] for index in serializer.validated_data["indices"]
        ]
        indices = first_tagger.get_available_or_all_indices(indices)

        if not ElasticCore().check_if_indices_exist(indices):
            return Response(
                {
                    'error':
                    f'One or more indices from {list(indices)} do not exist'
                },
                status=status.HTTP_400_BAD_REQUEST)

        # retrieve random document
        random_doc = ElasticSearcher(indices=indices).random_documents(
            size=1)[0]
        # filter out correct fields from the document
        random_doc_filtered = {
            k: v
            for k, v in random_doc.items() if k in tagger_fields
        }

        tagger_group_id = self.get_object().pk

        # combine document field values into one string
        combined_texts = '\n'.join(random_doc_filtered.values())
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=False)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   random_doc_filtered,
                                   tag_candidates,
                                   request,
                                   input_type='doc')
        # return document with tags
        response = {"document": random_doc, "tags": tags}
        return Response(response, status=status.HTTP_200_OK)
Example #12
    def tag_doc(self, request, pk=None, project_pk=None):
        """
        API endpoint for tagging JSON documents with tagger group.
        """
        logging.getLogger(INFO_LOGGER).info("[Tag Doc] Starting tag_doc...")
        data = request.data
        serializer = TaggerGroupTagDocumentSerializer(data=data)
        # check if valid request
        if not serializer.is_valid():
            raise SerializerNotValid(detail=serializer.errors)
        hybrid_tagger_object = self.get_object()
        # check if any of the models ready
        if not hybrid_tagger_object.taggers.filter(
                task__status=Task.STATUS_COMPLETED):
            raise NonExistantModelError()
        # error if redis not available
        if not get_redis_status()['alive']:
            raise RedisNotAvailable(
                'Redis not available. Check if Redis is running.')
        # retrieve field data from the first element
        # we can do that safely because all taggers inside a
        # hybrid tagger instance are trained on the same fields
        hybrid_tagger_field_data = json.loads(
            hybrid_tagger_object.taggers.first().fields)
        # declare input_document variable
        input_document = serializer.validated_data['doc']
        # validate input document
        input_document = validate_input_document(input_document,
                                                 hybrid_tagger_field_data)
        if isinstance(input_document, Exception):
            return input_document
        # combine document field values into one string
        combined_texts = '\n'.join(input_document.values())

        # declare tag candidates variables
        n_similar_docs = serializer.validated_data['n_similar_docs']
        n_candidate_tags = serializer.validated_data['n_candidate_tags']
        lemmatize = serializer.validated_data['lemmatize']
        use_ner = serializer.validated_data['use_ner']
        feedback = serializer.validated_data['feedback_enabled']

        tagger_group_id = self.get_object().pk

        # update text and tags with MLP
        combined_texts, tags = get_mlp(tagger_group_id,
                                       combined_texts,
                                       lemmatize=lemmatize,
                                       use_ner=use_ner)
        # retrieve tag candidates
        tag_candidates = get_tag_candidates(tagger_group_id,
                                            combined_texts,
                                            ignore_tags=tags,
                                            n_similar_docs=n_similar_docs,
                                            max_candidates=n_candidate_tags)
        # get tags
        tags += apply_tagger_group(tagger_group_id,
                                   input_document,
                                   tag_candidates,
                                   request,
                                   input_type='doc',
                                   lemmatize=lemmatize,
                                   feedback=feedback)
        return Response(tags, status=status.HTTP_200_OK)
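For orientation, a sketch of the serializer fields the tag_doc endpoint above reads from validated_data; the real TaggerGroupTagDocumentSerializer may use different defaults, validators, and help texts.

    from rest_framework import serializers

    # assumed field set, inferred from the validated_data keys used above
    class TaggerGroupTagDocumentSerializerSketch(serializers.Serializer):
        doc = serializers.DictField()
        n_similar_docs = serializers.IntegerField(default=10)
        n_candidate_tags = serializers.IntegerField(default=10)
        lemmatize = serializers.BooleanField(default=False)
        use_ner = serializers.BooleanField(default=False)
        feedback_enabled = serializers.BooleanField(default=False)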