Beispiel #1
0
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Assigns a classifier label to a single ClassifiedSample.

        Nothing happens when the sample already carries a label or when the
        job's classifier is not trained yet. When the classifier yields no
        label, a retry of the current task is requested. Otherwise the label
        is stored and an 'EventSampleClassified' event is sent.

        :param sample_id: id of the ClassifiedSample to label.
        :param from_name: not used inside this function.
    """
    classified = ClassifiedSample.objects.get(id=sample_id)
    if classified.label:
        # Already labelled - nothing to do.
        return

    job = classified.job
    if not job.is_classifier_trained():
        # If classifier is not trained, return - it will be reclassified if
        # the classifier finishes training
        return

    predicted = classifier_factory.create_classifier(job.id).classify(
        classified)

    if predicted is None:
        # Something went wrong
        message = '[Classification] Got None label for sample %d. Retrying.'
        log.warning(message % classified.id)
        # One minute, doubled for every retry, wrapping around every six
        # retries; never more than one hour.
        exponent = current.request.retries % 6
        delay = min(60 * 2 ** exponent, 60 * 60 * 1)
        # NOTE(review): the statements below are only skipped if
        # current.retry() raises - confirm against the Celery version in use.
        current.retry(countdown=delay, max_retries=None)

    ClassifiedSample.objects.filter(id=sample_id).update(label=predicted)

    event_fields = {
        'job_id': job.id,
        'class_id': classified.id,
        'sample_id': classified.sample.id,
    }
    send_event('EventSampleClassified', **event_fields)
Beispiel #2
0
    def setUp(self):
        """
            Prepares a user, an active job, six unsaved Sample objects with
            a ClassifiedSample each, and the classifier created for the job.
        """
        self.u = User.objects.create_user(username='******', password='******')

        gold = [{'url': '10clouds.com', 'label': LABEL_YES}]
        self.job = Job.objects.create_active(
            account=self.u.get_profile(),
            gold_samples=gold,
        )

        # Texts of the training samples, in the same order as self.labels.
        texts = (
            'Mechanical squirrel screwdriver over car',
            'Screwdriver fix mechanical bike bolts',
            'Brown banana apple pinapple potato',
            'apple pinapple potato',
            'Hippo tree over lagoon',
            'Green tan with true fox',
        )
        self.train_data = [
            Sample(job=self.job, source_type='', text=text) for text in texts
        ]
        self.labels = [LABEL_YES] * 2 + [LABEL_NO] * 4

        self.classified = []
        for sample, label in zip(self.train_data, self.labels):
            entry = ClassifiedSample.objects.create(
                job=self.job,
                sample=sample,
                label=label,
            )
            self.classified.append(entry)

        self.classifier247 = classifier_factory.create_classifier(
            job_id=self.job.id)
Beispiel #3
0
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Labels the ClassifiedSample with the given id.

        Returns early when the sample is already labelled or the job's
        classifier is not trained. A None label from the classifier triggers
        a retry request for the current task; any other label is saved and
        announced with an 'EventSampleClassified' event.

        :param sample_id: id of the ClassifiedSample to process.
        :param from_name: accepted but not read by this function.
    """
    target = ClassifiedSample.objects.get(id=sample_id)
    if target.label:
        return

    job = target.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    trained = job.is_classifier_trained()
    if not trained:
        return

    classifier = classifier_factory.create_classifier(job.id)
    new_label = classifier.classify(target)

    if new_label is None:
        # Something went wrong
        warning = (
            '[Classification] Got None label for sample %d. Retrying.'
            % target.id
        )
        log.warning(warning)
        # Back-off in seconds: 60 doubled per retry (cycling every six
        # retries), limited to one hour.
        one_hour = 60 * 60 * 1
        backoff = min(60 * 2 ** (current.request.retries % 6), one_hour)
        current.retry(countdown=backoff, max_retries=None)

    matching = ClassifiedSample.objects.filter(id=sample_id)
    matching.update(label=new_label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=target.id,
        sample_id=target.sample.id,
    )
Beispiel #4
0
    def testMetrics(self):
        """
            Checks that, after update_classifier_stats runs with a stubbed
            analyze(), the newest ClassifierPerformance entry of the job
            contains the TPR, TNR and AUC metrics.
        """
        classifier = classifier_factory.create_classifier(self.job.id)

        # Mock classifier.analyze() method so we don't use up resources
        def fake_analyze():
            matrix = {
                LABEL_YES: {LABEL_YES: 5.0, LABEL_NO: 3.0},
                LABEL_NO: {LABEL_YES: 2.0, LABEL_NO: 7.0},
            }
            return {'modelDescription': {'confusionMatrix': matrix}}

        classifier.analyze = fake_analyze
        update_classifier_stats(classifier, self.job)

        # 1 is from SimpleClassifier train on create,
        # 1 is from update_classifier_stats above
        self.assertEqual(ClassifierPerformance.objects.count(), 2)

        by_newest = ClassifierPerformance.objects.filter(
            job=self.job).order_by('-id')
        latest = by_newest[0]

        for metric in ('TPR', 'TNR', 'AUC'):
            self.assertIn(metric, latest.value)
Beispiel #5
0
    def testMetrics(self):
        """
            Runs update_classifier_stats against a classifier whose analyze()
            is replaced by a stub and verifies that the most recent
            ClassifierPerformance of the job exposes TPR, TNR and AUC.
        """
        classifier = classifier_factory.create_classifier(self.job.id)

        def stub_analyze():
            # Stand-in for classifier.analyze() so we don't use up resources
            yes_row = {LABEL_YES: 5.0, LABEL_NO: 3.0}
            no_row = {LABEL_YES: 2.0, LABEL_NO: 7.0}
            return {
                'modelDescription': {
                    'confusionMatrix': {
                        LABEL_YES: yes_row,
                        LABEL_NO: no_row,
                    },
                },
            }

        classifier.analyze = stub_analyze
        update_classifier_stats(classifier, self.job)

        # 1 is from SimpleClassifier train on create,
        # 1 is from update_classifier_stats above
        stored_total = ClassifierPerformance.objects.count()
        self.assertEqual(stored_total, 2)

        performance = ClassifierPerformance.objects.filter(
            job=self.job,
        ).order_by('-id')[0]

        expected_metrics = ['TPR', 'TNR', 'AUC']
        for name in expected_metrics:
            self.assertIn(name, performance.value)
Beispiel #6
0
def train(set_id):
    """
        Trains the classifier of a training set's job on that set's samples.

        Once training returns, the job is fetched again and an
        'EventClassifierTrained' event is sent if it reports a trained
        classifier.

        :param set_id: id of the TrainingSet to train on.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job_id = training_set.job.id

    classifier = classifier_factory.create_classifier(job_id)
    sample_stream = (entry for entry in training_set.training_samples.all())
    classifier.train(sample_stream, set_id=set_id)

    # Fetch the job again - presumably so that changes made during training
    # are visible here.
    refreshed_job = Job.objects.get(id=job_id)
    if not refreshed_job.is_classifier_trained():
        return

    send_event("EventClassifierTrained", job_id=refreshed_job.id)
Beispiel #7
0
 def testClassifierFactory(self):
     """
         Checks the class of the classifier created for a new job, the
         number of 'SimpleClassifier' entries stored for that job, and that
         classifying a newly created sample gives a truthy result.
     """
     gold = [{'url': '10clouds.com', 'label': LABEL_YES}]
     job = Job.objects.create_active(
         account=self.u.get_profile(),
         gold_samples=gold,
     )

     classifier = classifier_factory.create_classifier(job.id)
     self.assertEqual(classifier.__class__, Classifier247)

     simple_entries = Classifier.objects.filter(
         job=job,
         type='SimpleClassifier',
     )
     self.assertEqual(simple_entries.count(), 2)

     created = ClassifiedSample.objects.create_by_owner(
         job=job,
         url='http://google.com',
     )
     # Load the sample again by id before classifying it.
     reloaded = ClassifiedSample.objects.get(id=created.id)
     self.assertTrue(classifier.classify(reloaded))
Beispiel #8
0
def train(set_id):
    """
        Feeds the samples of a TrainingSet to the classifier of its job.

        When the job, looked up again after training, reports a trained
        classifier, an 'EventClassifierTrained' event is sent.

        :param set_id: id of the TrainingSet whose samples are used.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job = training_set.job

    classifier = classifier_factory.create_classifier(job.id)
    classifier.train(
        (item for item in training_set.training_samples.all()),
        set_id=set_id,
    )

    # The job is looked up again by id before its trained state is checked.
    job = Job.objects.get(id=job.id)
    trained = job.is_classifier_trained()
    if trained:
        send_event("EventClassifierTrained", job_id=job.id)
Beispiel #9
0
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Assigns a classifier label to a single BeatTheMachineSample.

        An already labelled sample is left untouched. A retry of the current
        task is requested when the job's classifier is not trained or when
        the classifier yields no label. Otherwise the label is stored, the
        sample's BTM status is updated and an 'EventSampleBTM' event is sent.

        :param sample_id: id of the BeatTheMachineSample to label.
        :param from_name: not used inside this function.
    """
    def retry_later():
        # One minute doubled for every previous retry, at most 24 hours.
        delay = min(60 * 2 ** current.request.retries, 60 * 60 * 24)
        current.retry(countdown=delay)

    start_message = '[BTMClassification] Got sample %d for classification.'
    log.info(start_message % sample_id)

    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        retry_later()

    predicted = classifier_factory.create_classifier(job.id).classify(
        btm_sample)
    if predicted is None:
        # Something went wrong
        failure_message = (
            '[BTMClassification] Got None label for sample %d. Retrying.')
        log.warning(failure_message % btm_sample.id)
        retry_later()

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=predicted)
    btm_sample.updateBTMStatus()

    event_fields = {
        'job_id': job.id,
        'btm_id': btm_sample.id,
        'sample_id': btm_sample.sample.id,
    }
    send_event('EventSampleBTM', **event_fields)
Beispiel #10
0
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Labels the BeatTheMachineSample with the given id.

        Returns early for a sample that already has a label. Requests a
        retry of the current task when the job's classifier is untrained or
        returns None. On success the label is saved, updateBTMStatus() is
        called on the sample and an 'EventSampleBTM' event is sent.

        :param sample_id: id of the BeatTheMachineSample to process.
        :param from_name: accepted but not read by this function.
    """
    log.info(
        '[BTMClassification] Got sample %d for classification.' % sample_id
    )

    sample = BeatTheMachineSample.objects.get(id=sample_id)
    if sample.label:
        return

    job = sample.job
    # Upper limit for the retry countdown, in seconds (24 hours).
    one_day = 60 * 60 * 24

    # If classifier is not trained, retry later
    trained = job.is_classifier_trained()
    if not trained:
        backoff = min(60 * 2 ** current.request.retries, one_day)
        current.retry(countdown=backoff)

    classifier = classifier_factory.create_classifier(job.id)
    new_label = classifier.classify(sample)
    if new_label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.'
            % sample.id
        )
        backoff = min(60 * 2 ** current.request.retries, one_day)
        current.retry(countdown=backoff)

    matching = BeatTheMachineSample.objects.filter(id=sample_id)
    matching.update(label=new_label)
    sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=sample.id,
        sample_id=sample.sample.id,
    )
Beispiel #11
0
    def testGoogleP(self):
        """
            Exercises a classifier created while JOB_DEFAULT_CLASSIFIER is
            patched to 'GooglePredictionClassifier' and while `build`,
            `GSConnection` and `Key` in the classifiers module are mocked.
        """
        # Canned responses keyed by the name of the mocked API method that was
        # called last. Storing an Exception instance under a key makes
        # execute() raise it instead of returning a value.
        results = {
            'insert': '',
            'analyze': {
                'modelDescription': {
                    'confusionMatrix': {
                        LABEL_YES: {
                            LABEL_YES: 1,
                            LABEL_NO: 0,
                        },
                        LABEL_NO: {
                            LABEL_YES: 0,
                            LABEL_NO: 1,
                        }
                    }
                }
            },
        }

        # Chainable stand-in: every method returns the mock itself, most of
        # them record their name in self.method, and execute() then serves
        # the matching entry from `results`.
        class MockGooglePrediction(object):
            def trainedmodels(self, *args, **kwargs):
                return self

            def insert(self, *args, **kwargs):
                self.method = 'insert'
                return self

            def analyze(self, *args, **kwargs):
                self.method = 'analyze'
                return self

            def execute(self, *args, **kwargs):
                result = results[self.method]
                if isinstance(result, Exception):
                    raise result
                return result

            def predict(self, *args, **kwargs):
                self.method = 'predict'
                return self

            def get(self, *args, **kwargs):
                self.method = 'get'
                return self

        # Replacement for `build` in the classifiers module; it ignores its
        # arguments and hands out a fresh mock.
        def build(*args, **kwargs):
            return MockGooglePrediction()

        # NOTE(review): the patches started below are not stopped in this
        # method - presumably tearDown stops them; confirm.
        target = 'urlannotator.main.factories.settings.JOB_DEFAULT_CLASSIFIER'
        self.patch = mock.patch(target, new='GooglePredictionClassifier')
        self.patch.start()

        target = 'urlannotator.classification.classifiers.build'
        self.patch_api = mock.patch(target, new=build)
        self.patch_api.start()

        target = 'urlannotator.classification.classifiers.GSConnection'
        self.patch_bucket = mock.patch(target)
        self.patch_bucket.start()

        target = 'urlannotator.classification.classifiers.Key'
        self.patch_key = mock.patch(target)
        self.patch_key.start()

        u = User.objects.create_user(username='******', password='******')

        job = Job.objects.create_active(
            account=u.get_profile(),
            gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}])

        # analyze() is called once with a canned result and once with an
        # Exception stored for 'analyze'. Neither call is asserted on, so the
        # test only requires that they do not raise.
        classifier = classifier_factory.create_classifier(job.id)
        classifier.analyze()

        results['analyze'] = Exception()
        classifier.analyze()

        # get_train_status(): a plain status is returned as is, an Exception
        # stored for 'get' gives CLASS_TRAIN_STATUS_RUNNING, and a status of
        # 'ERROR: test' raises ClassifierTrainingCriticalError.
        results['get'] = {'trainingStatus': 'test'}
        self.assertEqual(classifier.get_train_status(), 'test')
        results['get'] = Exception()
        self.assertEqual(classifier.get_train_status(), CLASS_TRAIN_STATUS_RUNNING)
        results['get'] = {'trainingStatus': 'ERROR: test'}
        with self.assertRaises(ClassifierTrainingCriticalError):
            classifier.get_train_status()

        # Train with an Exception stored for 'get', first from explicit
        # samples and then by training set id, then mark the job's classifier
        # as trained.
        results['get'] = Exception()
        train_set = job.trainingset_set.all()[0]
        classifier.train(samples=train_set.training_samples.all())
        classifier.train(set_id=train_set.id, turn_off=True)
        job.set_classifier_trained()

        # Canned prediction: classify() should return its 'outputLabel' and
        # classify_with_info() the whole response.
        results['predict'] = {'outputLabel': LABEL_YES, 'outputMulti': [{'label': LABEL_YES, 'score': 1},{'label': LABEL_NO, 'score': 0}]}
        cs = ClassifiedSample.objects.create_by_owner(
            job=job,
            url='http://google.com',
        )
        # Refresh the Classified Sample
        cs = ClassifiedSample.objects.get(id=cs.id)
        self.assertEqual(classifier.classify(sample=cs), LABEL_YES)
        self.assertEqual(classifier.classify_with_info(sample=cs), results['predict'])

        # With the classifier's model attribute cleared, both calls are
        # expected to return None.
        classifier.model = None
        self.assertEqual(classifier.classify(sample=cs), None)
        self.assertEqual(classifier.classify_with_info(sample=cs), None)