Example #1
import pickle

from django.db import transaction

# ClassifierModel, ClassifiedDocument, ClassifiedExcerpt and the
# classification_confidence helper are assumed to come from the project.


def main(*args, **kwargs):
    classifier_models = ClassifierModel.objects.all()
    classifiers_map = {c.id: pickle.loads(c.data) for c in classifier_models}
    # first run for ClassifiedDocuments
    chunksize = 20
    chunkcounter = 0
    while True:
        frm = chunkcounter * chunksize
        to = (chunkcounter + 1) * chunksize
        # materialise the chunk so the dicts mutated below are reused later
        texts = list(ClassifiedDocument.objects
                     .values('id', 'text', 'classifier')[frm:to])
        if not texts:
            print("No more data")
            break
        print('RUNNING CHUNK', chunkcounter)

        for x in texts:
            clf = classifiers_map.get(x['classifier'])
            clfn = clf.classify_as_label_probs(clf.preprocess(x['text']))
            x['classification_probabilities'] = clfn
            x['confidence'] = classification_confidence(clfn)

        # update the documents inside one atomic transaction
        with transaction.atomic():
            for x in texts:
                probs = x['classification_probabilities']
                ClassifiedDocument.objects.filter(id=x['id']).update(
                    classification_label=probs[0][0],
                    confidence=x['confidence'],
                    classification_probabilities=probs)

        # now the excerpts
        for x in texts:
            excerpts = ClassifiedExcerpt.objects.filter(
                classified_document__id=x['id']).values(
                    'id', 'start_pos', 'end_pos')
            for y in excerpts:
                print("EXC ID", y['id'])
                clf = classifiers_map.get(x['classifier'])
                clfn = clf.classify_as_label_probs(
                    clf.preprocess(x['text'][y['start_pos']:y['end_pos']]))
                y['classification_probabilities'] = clfn
                y['confidence'] = classification_confidence(clfn)

            # update the excerpts
            with transaction.atomic():
                for y in excerpts:
                    probs = y['classification_probabilities']
                    ClassifiedExcerpt.objects.filter(id=y['id']).update(
                        classification_label=probs[0][0],
                        classification_probabilities=probs,
                        confidence=y['confidence'])
        chunkcounter += 1
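A minimal sketch of how the chunked re-classification above could be exposed as a Django management command so it can be run from the CLI; the command wrapper itself is an assumption and is not part of the source.

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Re-classify stored documents and excerpts in chunks"

    def handle(self, *args, **options):
        # delegates to the chunked re-classification loop above
        main()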
Example #2
def get_confidences(classifier, test_data):
    """
    Get the confidence values for all the test datasets.
    @classifier: classifier object
    @test_data: test data specific to the classifier

    Returns {'correct_confidences': [float], 'incorrect_confidences': [float]}
    """
    # confidences for correct and incorrect prediction
    correct_confidences = []
    incorrect_confidences = []

    for text, label in test_data:
        classification = classifier.classify_as_label_probs(text)
        confidence = classification_confidence(classification)
        classified_label = classification[0][0]  # label with max probability
        if classified_label == label:  # prediction was correct
            correct_confidences.append(confidence)
        else:
            incorrect_confidences.append(confidence)
    print("correct:", len(correct_confidences), "incorrect:",
          len(incorrect_confidences))
    return {
        'correct_confidences': correct_confidences,
        'incorrect_confidences': incorrect_confidences
    }
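A hedged usage sketch for get_confidences(): deriving a simple acceptance threshold from the two confidence distributions. The classifier and test_data objects are assumed to already exist, and the midpoint rule is purely illustrative, not from the source.

confidences = get_confidences(classifier, test_data)
correct = confidences['correct_confidences']
incorrect = confidences['incorrect_confidences']
if correct and incorrect:
    # illustrative rule: threshold halfway between the two mean confidences
    threshold = (sum(correct) / len(correct) +
                 sum(incorrect) / len(incorrect)) / 2.0
    print("suggested confidence threshold:", threshold)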
Example #3
def update_classified_documents_with_classifier(classifier_model):
    """
    Update the existing classified documents and excerpts with the classifier
    """
    classifier = pickle.loads(classifier_model.data)
    print("Updating classified docs ...")
    for doc in ClassifiedDocument.objects.all():
        classified = classify_text(classifier, doc.text)
        doc.classifier = classifier_model
        doc.confidence = classification_confidence(classified)
        doc.classification_label = classified[0][0]
        doc.classification_probabilities = classified
        doc.save()
    print("Updated classified docs ...")
    print("Updating classified excerpts ...")
    for excerpt in ClassifiedExcerpt.objects.all():
        classified = classify_text(classifier, excerpt.text)
        excerpt.classification_label = classified[0][0]
        excerpt.confidence = classification_confidence(classified)
        excerpt.classification_probabilities = classified
        excerpt.save()
    print("Updated classified excerpts ...")
Example #4
    def classification_confidence(self):
        # delegates to the module-level classification_confidence() helper
        return classification_confidence(self.classification_probabilities)
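The classification_confidence() helper that these examples call is never shown. Below is a minimal sketch of one plausible definition, assuming the input is the list of (label, probability) pairs sorted by descending probability that callers pass in; the margin rule is an assumption, not the project's actual formula.

def classification_confidence(classification_probs):
    # assumed input: [(label, probability), ...] sorted high to low
    if not classification_probs:
        return 0.0
    top = classification_probs[0][1]
    if len(classification_probs) == 1:
        return top
    # illustrative choice: margin between the two most probable labels
    return top - classification_probs[1][1]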
Example #5
    def post(self, request, version):
        data = dict(request.data.items())
        validation_details = self._validate_classification_params(data)
        if not validation_details['status']:
            return Response(validation_details['error_data'],
                            status=status.HTTP_400_BAD_REQUEST)
        # check if deeper and doc_id present
        deeper = bool(data.get('deeper'))
        if deeper and data.get('doc_id'):
            # get already classified data
            try:
                classified_doc = ClassifiedDocument.objects.get(
                    id=data['doc_id'])
                return_data = ClassifiedDocumentSerializer(classified_doc).data
                return_data['excerpts_classification'] = \
                    ClassifiedExcerptSerializer(
                        classified_doc.excerpts, many=True
                    ).data
                return Response(return_data)
            except ClassifiedDocument.DoesNotExist:
                return Response({'error': 'Classified Document not found'},
                                status=status.HTTP_404_NOT_FOUND)
            except Exception:
                return Response(
                    {'status': False, 'message': 'Invalid doc_id'},
                    status=status.HTTP_400_BAD_REQUEST)
        classifier = self.classifiers.get(version)

        if not classifier:
            return Response(
                {'status': False, 'message': 'Classifier not found'},
                status=status.HTTP_404_NOT_FOUND)
        text = data['text']

        # get language
        language = langdetect.detect(text)
        original = None
        try:
            if language != 'en':
                original = text
                logger.info("Text is not in English; translating")
                translation = self.translator.translate(text)
                translated = translation.text
                text = translated
                logger.info("Translated text: {}".format(translated))
        except Exception as e:
            logger.warning("Exception while translating text. {}".format(e))

        classified = classify_text(classifier['classifier'], text)

        if not data.get('deeper'):
            return Response({
                'classification': classified,
                'classification_confidence':
                    classification_confidence(classified),
            })

        # Create classified Document
        grp_id = data.get('group_id')

        extra_info = {"language": language}
        if original:
            extra_info['original'] = original

        doc = ClassifiedDocument.objects.create(
            text=text,
            classifier=classifier['classifier_model'],
            confidence=classified[0][1],
            classification_label=classified[0][0],
            classification_probabilities=classified,
            group_id=grp_id,
            extra_info=extra_info)

        # add the doc to a cluster, but only if it is a new document
        if not data.get('doc_id'):
            # doc_id is sent for an already present doc;
            # we only want to cluster new documents
            assign_cluster_to_doc.delay(doc.id)

        classified_excerpts = classify_lead_excerpts(
            classifier['classifier'],
            text,
        )
        # create excerpts
        excerpts = []
        for x in classified_excerpts:
            excerpts.append(
                ClassifiedExcerpt.objects.create(
                    classified_document=doc,
                    start_pos=x['start_pos'],
                    end_pos=x['end_pos'],
                    classification_label=x['classification'][0][0],
                    confidence=x['classification'][0][1],
                    classification_probabilities=x['classification']))
        ret = ClassifiedDocumentSerializer(doc).data
        ret['excerpts_classification'] = ClassifiedExcerptSerializer(
            excerpts, many=True).data
        return Response(ret)
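A hedged request sketch for the endpoint above; the URL path and version segment are assumptions, and the field values are illustrative.

import requests  # illustrative client; any HTTP client works

payload = {
    'text': 'Severe flooding has displaced thousands of families.',
    'deeper': True,
    'group_id': '42',
}
# endpoint path is assumed; 'v2' stands in for the version that selects
# the classifier inside post()
resp = requests.post('http://localhost:8000/api/v2/classify/', data=payload)
print(resp.json())  # serialized document plus excerpts_classification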
Example #6
    def post(self, request, version):
        data = dict(request.data.items())
        validation_details = self._validate_classification_params(data)
        if not validation_details['status']:
            return Response(validation_details['error_data'],
                            status=status.HTTP_400_BAD_REQUEST)
        # check if deeper and doc_id present
        deeper = bool(data.get('deeper'))
        if deeper and data.get('doc_id'):
            # get already classified data
            try:
                classified_doc = ClassifiedDocument.objects.get(
                    id=data['doc_id'])
                return_data = ClassifiedDocumentSerializer(classified_doc).data
                return_data['excerpts_classification'] = \
                    ClassifiedExcerptSerializer(
                        classified_doc.excerpts, many=True
                    ).data
                return Response(return_data)
            except ClassifiedDocument.DoesNotExist:
                return Response({'error': 'Classified Document not found'},
                                status=status.HTTP_404_NOT_FOUND)
            except Exception:
                return Response(
                    {'status': False, 'message': 'Invalid doc_id'},
                    status=status.HTTP_400_BAD_REQUEST)
        classifier = self.classifiers.get(version)

        if not classifier:
            return Response(
                {'status': False, 'message': 'Classifier not found'},
                status=status.HTTP_404_NOT_FOUND)
        text = data['text']
        classified = classify_text(classifier['classifier'], text)

        if not data.get('deeper'):
            return Response({
                'classification': classified,
                'classification_confidence':
                    classification_confidence(classified),
            })

        # Create classified Document
        grp_id = data.get('group_id')

        # get language
        language = langdetect.detect(text)

        doc = ClassifiedDocument.objects.create(
            text=text,
            classifier=classifier['classifier_model'],
            confidence=classified[0][1],
            classification_label=classified[0][0],
            classification_probabilities=classified,
            group_id=grp_id,
            extra_info={"language": language})
        classified_excerpts = classify_lead_excerpts(
            classifier['classifier'],
            text,
        )
        # create excerpts
        excerpts = []
        for x in classified_excerpts:
            excerpts.append(
                ClassifiedExcerpt.objects.create(
                    classified_document=doc,
                    start_pos=x['start_pos'],
                    end_pos=x['end_pos'],
                    classification_label=x['classification'][0][0],
                    confidence=x['classification'][0][1],
                    classification_probabilities=x['classification']))
        ret = ClassifiedDocumentSerializer(doc).data
        ret['excerpts_classification'] = ClassifiedExcerptSerializer(
            excerpts, many=True).data
        return Response(ret)