def classify(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the ClassifiedSample with the given id.

        Does nothing when the sample already has a label or when the job's
        classifier is not trained. Otherwise asks the job's classifier for a
        label, stores it on the sample row and sends an
        'EventSampleClassified' event. When the classifier yields None the
        task is retried with a growing countdown instead.

        :param sample_id: id of the ClassifiedSample to classify.
        :param from_name: not used by this function.
        :param args: not used by this function.
        :param kwargs: not used by this function.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        # Already labelled - nothing left to do.
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.',
            class_sample.id,
        )
        # Countdown doubles from 1 minute and wraps every 6 retries; the
        # one-hour cap is an upper bound on top of that.
        current.retry(
            countdown=min(60 * 2 ** (current.request.retries % 6),
                          60 * 60 * 1),
            max_retries=None,
        )
        # retry() is expected to raise and abort this run. Should it return
        # instead, stop here so that a None label is never stored and no
        # event is sent for an unclassified sample.
        return

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
def setUp(self):
    """
        Prepares the fixtures shared by the tests: a user, an active job
        with one gold sample, six labelled ClassifiedSample rows and a
        classifier created for the job.
    """
    self.u = User.objects.create_user(username='******', password='******')
    self.job = Job.objects.create_active(
        account=self.u.get_profile(),
        gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}])

    # Training texts, in the same order as self.labels below.
    texts = (
        'Mechanical squirrel screwdriver over car',
        'Screwdriver fix mechanical bike bolts',
        'Brown banana apple pinapple potato',
        'apple pinapple potato',
        'Hippo tree over lagoon',
        'Green tan with true fox',
    )
    self.train_data = [
        Sample(job=self.job, source_type='', text=text) for text in texts
    ]
    self.labels = [
        LABEL_YES, LABEL_YES,
        LABEL_NO, LABEL_NO, LABEL_NO, LABEL_NO,
    ]

    # One ClassifiedSample per (sample, label) pair.
    self.classified = []
    for sample, label in zip(self.train_data, self.labels):
        created = ClassifiedSample.objects.create(
            job=self.job,
            sample=sample,
            label=label,
        )
        self.classified.append(created)

    self.classifier247 = classifier_factory.create_classifier(
        job_id=self.job.id,
    )
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the ClassifiedSample with the given id.

        Does nothing when the sample already has a label or when the job's
        classifier is not trained. Otherwise asks the job's classifier for a
        label, stores it on the sample row and sends an
        'EventSampleClassified' event. When the classifier yields None the
        task is retried with a growing countdown instead.

        :param sample_id: id of the ClassifiedSample to classify.
        :param from_name: not used by this function.
        :param args: not used by this function.
        :param kwargs: not used by this function.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        # Already labelled - nothing left to do.
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.',
            class_sample.id,
        )
        # Countdown doubles from 1 minute and wraps every 6 retries; the
        # one-hour cap is an upper bound on top of that.
        current.retry(
            countdown=min(60 * 2 ** (current.request.retries % 6),
                          60 * 60 * 1),
            max_retries=None,
        )
        # retry() is expected to raise and abort this run. Should it return
        # instead, stop here so that a None label is never stored and no
        # event is sent for an unclassified sample.
        return

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
def testMetrics(self):
    """
        Checks that update_classifier_stats() stores a ClassifierPerformance
        entry whose value contains the TPR, TNR and AUC metrics.
    """
    def fake_analyze():
        # Canned confusion matrix returned in place of a real analysis.
        return {
            'modelDescription': {
                'confusionMatrix': {
                    LABEL_YES: {
                        LABEL_YES: 5.0,
                        LABEL_NO: 3.0,
                    },
                    LABEL_NO: {
                        LABEL_YES: 2.0,
                        LABEL_NO: 7.0,
                    }
                }
            }
        }

    # Mock classifier.analyze() method so we don't use up resources
    classifier = classifier_factory.create_classifier(self.job.id)
    classifier.analyze = fake_analyze

    update_classifier_stats(classifier, self.job)

    # 1 is from SimpleClassifier train on create,
    # 1 is from update_classifier_stats above
    self.assertEqual(ClassifierPerformance.objects.count(), 2)

    # Highest id first, i.e. the entry listed first when ordering by '-id'.
    latest = ClassifierPerformance.objects.filter(
        job=self.job).order_by('-id')[0]

    for metric in ('TPR', 'TNR', 'AUC'):
        self.assertIn(metric, latest.value)
def testMetrics(self):
    """
        Checks that update_classifier_stats() stores a ClassifierPerformance
        entry whose value contains the TPR, TNR and AUC metrics.
    """
    def fake_analyze():
        # Canned confusion matrix returned in place of a real analysis.
        return {
            'modelDescription': {
                'confusionMatrix': {
                    LABEL_YES: {
                        LABEL_YES: 5.0,
                        LABEL_NO: 3.0,
                    },
                    LABEL_NO: {
                        LABEL_YES: 2.0,
                        LABEL_NO: 7.0,
                    }
                }
            }
        }

    # Mock classifier.analyze() method so we don't use up resources
    classifier = classifier_factory.create_classifier(self.job.id)
    classifier.analyze = fake_analyze

    update_classifier_stats(classifier, self.job)

    # 1 is from SimpleClassifier train on create,
    # 1 is from update_classifier_stats above
    self.assertEqual(ClassifierPerformance.objects.count(), 2)

    # Highest id first, i.e. the entry listed first when ordering by '-id'.
    latest = ClassifierPerformance.objects.filter(
        job=self.job).order_by('-id')[0]

    for metric in ('TPR', 'TNR', 'AUC'):
        self.assertIn(metric, latest.value)
def train(set_id):
    """
        Trains a job's classifier on the TrainingSet with the given id.

        The set's training samples are handed to the classifier created for
        the set's job. Afterwards an "EventClassifierTrained" event is sent
        if the job reports that its classifier is trained.

        :param set_id: id of the TrainingSet to train on.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job_id = training_set.job.id

    trainer = classifier_factory.create_classifier(job_id)
    sample_stream = (entry for entry in training_set.training_samples.all())
    trainer.train(sample_stream, set_id=set_id)

    # Load the job again before checking its trained flag - presumably
    # training changes the job's state in the database.
    refreshed_job = Job.objects.get(id=job_id)
    if not refreshed_job.is_classifier_trained():
        return

    send_event(
        "EventClassifierTrained",
        job_id=refreshed_job.id,
    )
def testClassifierFactory(self):
    """
        Checks that the factory returns a Classifier247 for a fresh active
        job, that two 'SimpleClassifier' entries exist for the job, and that
        classifying a newly created sample gives a truthy result.
    """
    account = self.u.get_profile()
    job = Job.objects.create_active(
        account=account,
        gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}],
    )

    classifier = classifier_factory.create_classifier(job.id)
    self.assertEqual(classifier.__class__, Classifier247)

    simple_entries = Classifier.objects.filter(
        job=job,
        type='SimpleClassifier',
    )
    self.assertEqual(simple_entries.count(), 2)

    created = ClassifiedSample.objects.create_by_owner(
        job=job,
        url='http://google.com',
    )
    # Fetch the sample again from the database before classifying it.
    fresh = ClassifiedSample.objects.get(id=created.id)
    self.assertTrue(classifier.classify(fresh))
def train(set_id):
    """
        Trains a job's classifier on the TrainingSet with the given id.

        The set's training samples are handed to the classifier created for
        the set's job. Afterwards an "EventClassifierTrained" event is sent
        if the job reports that its classifier is trained.

        :param set_id: id of the TrainingSet to train on.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job_id = training_set.job.id

    trainer = classifier_factory.create_classifier(job_id)
    sample_stream = (entry for entry in training_set.training_samples.all())
    trainer.train(sample_stream, set_id=set_id)

    # Load the job again before checking its trained flag - presumably
    # training changes the job's state in the database.
    refreshed_job = Job.objects.get(id=job_id)
    if not refreshed_job.is_classifier_trained():
        return

    send_event(
        "EventClassifierTrained",
        job_id=refreshed_job.id,
    )
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the BeatTheMachineSample with the given id.

        Does nothing when the sample already has a label. When the job's
        classifier is not trained, or the classifier yields None, the task
        is retried with an exponentially growing countdown capped at 24
        hours. On success the label is stored on the sample row,
        updateBTMStatus() is called on the sample and an 'EventSampleBTM'
        event is sent.

        :param sample_id: id of the BeatTheMachineSample to classify.
        :param from_name: not used by this function.
        :param args: not used by this function.
        :param kwargs: not used by this function.
    """
    def retry_later():
        # Countdown doubles from 1 minute per retry, capped at 24 hours.
        current.retry(
            countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))

    log.info(
        '[BTMClassification] Got sample %d for classification.',
        sample_id,
    )

    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        # Already labelled - nothing left to do.
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        retry_later()
        # retry() is expected to raise and abort this run. Should it return
        # instead, stop here rather than use an untrained classifier.
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.',
            btm_sample.id,
        )
        retry_later()
        # Same guard as above: never store a None label or send the event.
        return

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    # NOTE(review): the label was written through a queryset update, so
    # btm_sample.label in memory still holds the value loaded above -
    # confirm updateBTMStatus() does not depend on it.
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the BeatTheMachineSample with the given id.

        Does nothing when the sample already has a label. When the job's
        classifier is not trained, or the classifier yields None, the task
        is retried with an exponentially growing countdown capped at 24
        hours. On success the label is stored on the sample row,
        updateBTMStatus() is called on the sample and an 'EventSampleBTM'
        event is sent.

        :param sample_id: id of the BeatTheMachineSample to classify.
        :param from_name: not used by this function.
        :param args: not used by this function.
        :param kwargs: not used by this function.
    """
    def retry_later():
        # Countdown doubles from 1 minute per retry, capped at 24 hours.
        current.retry(
            countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))

    log.info(
        '[BTMClassification] Got sample %d for classification.',
        sample_id,
    )

    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        # Already labelled - nothing left to do.
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        retry_later()
        # retry() is expected to raise and abort this run. Should it return
        # instead, stop here rather than use an untrained classifier.
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.',
            btm_sample.id,
        )
        retry_later()
        # Same guard as above: never store a None label or send the event.
        return

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    # NOTE(review): the label was written through a queryset update, so
    # btm_sample.label in memory still holds the value loaded above -
    # confirm updateBTMStatus() does not depend on it.
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
def testGoogleP(self):
    """
        Runs the classifier selected by the patched JOB_DEFAULT_CLASSIFIER
        setting ('GooglePredictionClassifier') against a stubbed API client.

        The stub answers execute() from the `results` dict, keyed by the
        name of the last API method called; the test rewrites those entries
        between steps to drive analyze, training status, training and
        prediction.
    """
    # Canned responses. An Exception instance stored here is raised by
    # execute() instead of being returned.
    results = {
        'insert': '',
        'analyze': {
            'modelDescription': {
                'confusionMatrix': {
                    LABEL_YES: {
                        LABEL_YES: 1,
                        LABEL_NO: 0,
                    },
                    LABEL_NO: {
                        LABEL_YES: 0,
                        LABEL_NO: 1,
                    }
                }
            }
        },
    }

    def recorder(name):
        # Builds a chainable API method that remembers its own name.
        def call(self, *args, **kwargs):
            self.method = name
            return self
        return call

    class MockGooglePrediction(object):
        insert = recorder('insert')
        analyze = recorder('analyze')
        predict = recorder('predict')
        get = recorder('get')

        def trainedmodels(self, *args, **kwargs):
            return self

        def execute(self, *args, **kwargs):
            outcome = results[self.method]
            if isinstance(outcome, Exception):
                raise outcome
            return outcome

    def fake_build(*args, **kwargs):
        return MockGooglePrediction()

    def start_patch(attr, target, **options):
        # Keeps the patcher on the test instance under `attr`, then
        # activates it.
        patcher = mock.patch(target, **options)
        setattr(self, attr, patcher)
        patcher.start()

    start_patch(
        'patch',
        'urlannotator.main.factories.settings.JOB_DEFAULT_CLASSIFIER',
        new='GooglePredictionClassifier',
    )
    start_patch(
        'patch_api',
        'urlannotator.classification.classifiers.build',
        new=fake_build,
    )
    start_patch(
        'patch_bucket',
        'urlannotator.classification.classifiers.GSConnection',
    )
    start_patch(
        'patch_key',
        'urlannotator.classification.classifiers.Key',
    )

    u = User.objects.create_user(username='******', password='******')
    job = Job.objects.create_active(
        account=u.get_profile(),
        gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}])

    classifier = classifier_factory.create_classifier(job.id)

    # analyze() must get through both a regular response and a failing
    # API call without raising.
    classifier.analyze()
    results['analyze'] = Exception()
    classifier.analyze()

    # Training status: plain value, failing API call, reported error.
    results['get'] = {'trainingStatus': 'test'}
    self.assertEqual(classifier.get_train_status(), 'test')

    results['get'] = Exception()
    self.assertEqual(classifier.get_train_status(),
                     CLASS_TRAIN_STATUS_RUNNING)

    results['get'] = {'trainingStatus': 'ERROR: test'}
    with self.assertRaises(ClassifierTrainingCriticalError):
        classifier.get_train_status()

    results['get'] = Exception()

    train_set = job.trainingset_set.all()[0]
    classifier.train(samples=train_set.training_samples.all())
    classifier.train(set_id=train_set.id, turn_off=True)
    job.set_classifier_trained()

    prediction = {
        'outputLabel': LABEL_YES,
        'outputMulti': [
            {'label': LABEL_YES, 'score': 1},
            {'label': LABEL_NO, 'score': 0},
        ],
    }
    results['predict'] = prediction

    created = ClassifiedSample.objects.create_by_owner(
        job=job,
        url='http://google.com',
    )
    # Refresh the Classified Sample
    cs = ClassifiedSample.objects.get(id=created.id)

    self.assertEqual(classifier.classify(sample=cs), LABEL_YES)
    self.assertEqual(classifier.classify_with_info(sample=cs), prediction)

    # What if we remove the classifier's id?!?!
    classifier.model = None
    self.assertEqual(classifier.classify(sample=cs), None)
    self.assertEqual(classifier.classify_with_info(sample=cs), None)