def run(self, *args, **kwargs):
    """
    Fetch missing HITs for active jobs and store the ones that were found.

    Active jobs are selected when their related `tagasaurisjobs` row has
    the `self.hit_name` field empty and the `self.key_name` field set.
    For each of them `get_hit` is queried; a non-empty value that differs
    from the stored one is persisted and announced with `self.event_name`.
    """
    lookup = {}
    lookup['tagasaurisjobs__%s' % self.hit_name] = None
    lookup['tagasaurisjobs__%s__isnull' % self.key_name] = False

    queryset = Job.objects.select_related('tagasaurisjobs').filter(
        status=JOB_STATUS_ACTIVE,
        tagasaurisjobs__isnull=False,
        **lookup
    )
    jobs = queryset.iterator()
    client = make_tagapi_client()

    for job in jobs:
        old_hit = getattr(job.tagasaurisjobs, self.hit_name)
        key = getattr(job.tagasaurisjobs, self.key_name)
        new_hit = get_hit(client, key, job.get_hit_type())

        # Only a non-empty HIT that differs from the stored one is recorded.
        if new_hit and not old_hit == new_hit:
            changes = {self.hit_name: new_hit}
            TagasaurisJobs.objects.filter(urlannotator_job=job).update(
                **changes
            )
            send_event(
                self.event_name,
                job_id=job.id,
                old_hit=old_hit,
                new_hit=new_hit,
            )
def web_content_extraction(sample_id, url=None, *args, **kwargs):
    """
    Extract the text content of a sample's page (links/lynx required).

    :param sample_id: id of the `Sample` whose `text` field is filled in.
    :param url: address to fetch; when None the sample's own url is used.
    :returns: True when the text was stored and `EventSampleContentDone`
        was sent; False when the url is not proper or extraction failed
        (in which case `EventSampleContentFail` is sent).
    """
    if url is None:
        url = Sample.objects.get(id=sample_id).url

    if not is_proper_url(url):
        return False

    sample = Sample.objects.get(id=sample_id)
    try:
        text = get_web_text(url)
        Sample.objects.filter(id=sample_id).update(text=text)
        send_event(
            "EventSampleContentDone",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except subprocess.CalledProcessError as e:
        # Something wrong has happened to links. Couldn't find documentation
        # on error codes - assume bad stuff has happened that retrying won't
        # fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.returncode,
        )
        return False

    # Report success explicitly: the consumer of extraction results checks
    # each result with `is True`, so an implicit None would count as failure.
    return True
def _train(self, samples=[], set_id=0):
    """
    Start training of the writer subclassifier and poll until it is done.

    The writer is told to train on the training set `set_id`; its status
    is then polled with a growing delay (capped at CLASS247_MAX_WAIT).
    A `ClassifierTrainingError` raised while polling is reported with
    `EventClassifierTrainError` and polling goes on.

    NOTE(review): `samples` is not forwarded - the writer always receives
    an empty list. Confirm that training is meant to be driven by `set_id`
    only.
    """
    writer, reader = self.update_self()

    # Trains the subclassifier
    writer.train(samples=[], turn_off=False, set_id=set_id)

    delay = 0
    job_id = ClassifierModel.objects.get(id=self.id).job_id

    while True:
        time.sleep(min(delay, CLASS247_MAX_WAIT))
        try:
            status = writer.get_train_status()
        except ClassifierTrainingError as e:
            # Swallow retry-safe training errors.
            status = CLASS_TRAIN_STATUS_ERROR
            send_event(
                "EventClassifierTrainError",
                job_id=job_id,
                message=e.message,
            )

        if status == CLASS_TRAIN_STATUS_DONE:
            break
        delay += CLASS247_TRAIN_STEP
def create_by_worker(self, *args, **kwargs):
    """
    Create a BTM sample submitted by a Tagasauris worker.

    Expects `job`, `url` and `worker_id` in `kwargs`. The url is sanitized
    and `worker_id` is turned into `source_val`. When a `Sample` for the
    same job and url already exists it is attached and
    `EventNewBTMSample` is sent; otherwise creation of the sample is
    requested through `Sample.objects.create_by_btm`.

    Returns the object produced by `self.create`.
    """
    kwargs['url'] = sanitize_url(kwargs['url'])
    kwargs['source_type'] = SAMPLE_TAGASAURIS_WORKER
    kwargs['source_val'] = kwargs.pop('worker_id')

    job, url = kwargs['job'], kwargs['url']
    try:
        kwargs['sample'] = Sample.objects.get(job=job, url=url)
    except Sample.DoesNotExist:
        pass

    btm_sample = self.create(**kwargs)

    if 'sample' not in kwargs:
        # No sample yet - have one created for this url.
        Sample.objects.create_by_btm(
            job_id=kwargs['job'].id,
            url=kwargs['url'],
            source_val=kwargs['source_val'],
            create_classified=False,
        )
    else:
        # If sample exists, step immediately to classification
        send_event(
            'EventNewBTMSample',
            sample_id=kwargs['sample'].id,
            job_id=kwargs['job'].id,
        )

    return btm_sample
def testVotesProcess(self):
    """
    Votes processing yields a single training sample whose label follows
    the votes: YES after three YES votes, NO after four more NO votes.
    """
    def make_vote(worker, label):
        return WorkerQualityVote(
            sample=self.sample,
            worker=worker,
            label=label,
        )

    def check_single_training_sample(expected_label):
        ts = TrainingSet.objects.newest_for_job(self.job)
        self.assertEqual(len(ts.training_samples.all()), 1)
        training_sample = ts.training_samples.all()[0]
        self.assertEqual(training_sample.label, expected_label)

    WorkerQualityVote.objects.bulk_create(
        make_vote(self.workers[idx], LABEL_YES) for idx in xrange(3)
    )
    send_event('EventProcessVotes')
    check_single_training_sample(LABEL_YES)

    WorkerQualityVote.objects.bulk_create(
        make_vote(self.workers[idx], LABEL_NO) for idx in xrange(3, 7)
    )
    send_event('EventProcessVotes')
    check_single_training_sample(LABEL_NO)
def test_suppression(self):
    """Sending the test event must not leave the target file on disk."""
    event_name = 'TestEvent'
    file_name = "test_file_name"
    file_content = "success"

    send_event(event_name, fname=file_name, content=file_content)

    file_exists = os.path.isfile(file_name)
    self.assertFalse(file_exists)
def classify(sample_id, from_name='', *args, **kwargs):
    """
    Classify the `ClassifiedSample` with the given id.

    Nothing happens when the sample already has a label or when the job's
    classifier is not trained yet. A None label from the classifier makes
    the task retry; otherwise the label is stored and
    `EventSampleClassified` is sent.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        # Already classified.
        return

    job = class_sample.job
    if not job.is_classifier_trained():
        # If classifier is not trained, return - it will be reclassified if
        # the classifier finishes training
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[Classification] Got None label for sample %d. Retrying.'
            % class_sample.id
        )
        backoff = 60 * 2 ** (current.request.retries % 6)
        current.retry(
            countdown=min(backoff, 60 * 60 * 1),
            max_retries=None,
        )

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)
    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
def run(self, *args, **kwargs):
    """
    Poll for HITs that active jobs are still missing.

    Considers active jobs whose `tagasaurisjobs` row has an empty
    `self.hit_name` and a non-null `self.key_name`. Every HIT returned by
    `get_hit` that is non-empty and differs from the stored value is
    written back and reported with `self.event_name`.
    """
    hit_lookup = 'tagasaurisjobs__%s' % self.hit_name
    key_lookup = 'tagasaurisjobs__%s__isnull' % self.key_name
    conditions = {hit_lookup: None, key_lookup: False}

    jobs = Job.objects.select_related('tagasaurisjobs').filter(
        status=JOB_STATUS_ACTIVE,
        tagasaurisjobs__isnull=False,
        **conditions
    ).iterator()
    client = make_tagapi_client()

    for job in jobs:
        old_hit = getattr(job.tagasaurisjobs, self.hit_name)
        new_hit = get_hit(
            client,
            getattr(job.tagasaurisjobs, self.key_name),
            job.get_hit_type(),
        )

        if not new_hit:
            continue
        if old_hit == new_hit:
            continue

        update_kwargs = {self.hit_name: new_hit}
        TagasaurisJobs.objects.filter(urlannotator_job=job).update(
            **update_kwargs
        )
        send_event(
            self.event_name,
            job_id=job.id,
            old_hit=old_hit,
            new_hit=new_hit,
        )
def train_lock(self, func, turn_off=True, *args, **kwargs):
    """
    Locks the classifier during training.

    Calls `self.sync247.modified_lock()` and then runs
    `func(*args, **kwargs)`. Training errors raised by `func` are reported
    as events carrying the job id of this classifier's `ClassifierModel`
    entry, after which the method returns early.

    `turn_off` is not used in the visible part of this method.
    """
    self.sync247.modified_lock()
    # NOTE(review): the clause closing this outer `try` is not visible in
    # this excerpt - presumably it releases the lock; confirm against the
    # full file.
    try:
        # Looked up before running `func` so the job id is available to the
        # error handlers below.
        entry = ClassifierModel.objects.get(id=self.id)
        try:
            func(*args, **kwargs)
        except ClassifierTrainingError, e:
            # Retry-safe error has been propagated up to here whilst
            # it should've been handled in the `func`. Log it and abort.
            send_event(
                "EventClassifierTrainError",
                job_id=entry.job_id,
                message=e.message,
            )
            return
        except ClassifierTrainingCriticalError, e:
            # Really bad things have happened during training. Log it and
            # abort.
            send_event(
                "EventClassifierCriticalTrainError",
                job_id=entry.job_id,
                message=e.message,
            )
            return
def web_screenshot_extraction(sample_id, url=None, *args, **kwargs):
    """
    Capture a screenshot of a sample's page and store it on the sample.

    :param sample_id: id of the `Sample` whose `screenshot` is filled in.
    :param url: address to capture; when None the sample's own url is used.
    :returns: True when the screenshot was stored and
        `EventSampleScreenshotDone` was sent; False when the url is not
        proper or the capture failed (`EventSampleScreenshotFail` is sent
        in the latter case).
    """
    if url is None:
        url = Sample.objects.get(id=sample_id).url

    if not is_proper_url(url):
        return False

    sample = Sample.objects.get(id=sample_id)
    try:
        screenshot = get_web_screenshot(url)
        Sample.objects.filter(id=sample_id).update(screenshot=screenshot)
        send_event(
            "EventSampleScreenshotDone",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except BaseWebkitException as e:
        send_event(
            "EventSampleScreenshotFail",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.status_code,
        )
        return False

    # Report success explicitly: the consumer of extraction results checks
    # each result with `is True`, so an implicit None would count as failure.
    return True
def classify(sample_id, from_name='', *args, **kwargs):
    """
    Run the job's classifier on a single `ClassifiedSample`.

    Returns without doing anything for samples that already carry a label
    and for jobs whose classifier is not trained. When the classifier
    yields None the task is retried with a capped, cyclic back-off;
    otherwise the label is saved and `EventSampleClassified` is emitted.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    already_labelled = class_sample.label
    if already_labelled:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    trained = job.is_classifier_trained()
    if not trained:
        return

    label = classifier_factory.create_classifier(job.id).classify(
        class_sample
    )

    if label is None:
        # Something went wrong
        message = '[Classification] Got None label for sample %d. Retrying.'
        log.warning(message % class_sample.id)
        one_hour = 60 * 60 * 1
        current.retry(
            countdown=min(60 * 2 ** (current.request.retries % 6), one_hour),
            max_retries=None,
        )

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)
    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
def handle(self, event_name, event_kwargs=None, *args, **options):
    """
    Send the event `event_name`.

    `event_kwargs`, when given and non-empty, is a JSON document whose
    members are passed to `send_event` as keyword arguments.
    """
    payload = json.loads(event_kwargs) if event_kwargs else {}

    # We don't use args with send_event!
    send_event(event_name, **payload)
def odesk_complete(request):
    """
    Finish the oDesk OAuth flow started earlier in this session.

    Exchanges the `oauth_verifier` from the query string for an access
    token, reads the oDesk user info and then:

    * for an authenticated user without an oDesk uid - stores the oDesk
      credentials on the profile, sends `EventNewOdeskAssoc` and makes sure
      a `Worker` exists for the oDesk profile key;
    * for an anonymous visitor whose oDesk uid is already known - logs the
      matching account in (or redirects to the login page when
      authentication fails);
    * otherwise - registers a new user, stores the oDesk credentials,
      sends `EventNewOdeskAssoc`, logs the user in and makes sure a
      `Worker` exists.

    Returns an HTTP redirect in every case.
    """
    client = request.session['odesk_client']
    token, secret = client.auth.get_access_token(
        request.GET['oauth_verifier']
    )
    client = odesk.Client(
        settings.ODESK_SERVER_KEY,
        settings.ODESK_SERVER_SECRET,
        oauth_access_token=token,
        oauth_access_token_secret=secret,
        auth='oauth',
    )
    info = client.hr.get_user('me')
    cipher = info['profile_key']

    if request.user.is_authenticated():
        # Work on one profile instance so that all attribute writes are
        # guaranteed to land on the object that gets saved.
        profile = request.user.get_profile()
        if profile.odesk_uid == '':
            profile.odesk_id = info['id']
            profile.odesk_uid = cipher
            profile.odesk_token = token
            profile.odesk_secret = secret
            profile.save()
            send_event("EventNewOdeskAssoc", user_id=request.user.id)

            # Add Worker model on odesk account association
            if not Worker.objects.filter(external_id=cipher).exists():
                w = Worker.objects.create_odesk(external_id=cipher)
                profile.worker_entry = w
                profile.save()
        request.session['success'] = 'You have successfully logged in.'
        return redirect('index')
    else:
        try:
            assoc = Account.objects.get(odesk_uid=cipher)
            u = authenticate(username=assoc.user.username, password='******')
            if not u:
                request.session['error'] = 'Such account already exists.'
                return redirect('login')
            login(request, u)
            return redirect('index')
        except Account.DoesNotExist:
            u = User.objects.create_user(
                email=info['email'],
                username='******'.join(['odesk', cipher]),
                password='******',
            )
            profile = u.get_profile()
            profile.odesk_id = info['id']
            profile.odesk_uid = cipher
            profile.odesk_token = token
            profile.odesk_secret = secret
            profile.full_name = '%s %s' % (info['first_name'],
                                           info['last_name'])
            profile.save()
            # Announce the association only once it is persisted, so event
            # handlers that load the profile see the oDesk data.
            send_event("EventNewOdeskAssoc", user_id=u.id)

            u = authenticate(username=u.username, password='******')
            login(request, u)

            # Create Worker model on odesk account registration
            if not Worker.objects.filter(external_id=cipher).exists():
                w = Worker.objects.create_odesk(external_id=cipher)
                # `u` was replaced by authenticate(); use its own profile.
                worker_profile = u.get_profile()
                worker_profile.worker_entry = w
                worker_profile.save()
            request.session['success'] = 'You have successfuly registered'
            return redirect('settings')
def handle_succeeded(self, **event_data):
    """
    React to a successful charge.

    A base-job charge initializes the job, a BTM-job charge activates BTM
    for it. `EventJobChargeSucceeded` is sent in every case.
    """
    charge = self._get_charge(**event_data)
    charge_type = charge.charge_type

    if charge_type == charge.Type.BASE_JOB:
        charge.job.initialize()
    elif charge_type == charge.Type.BTM_JOB:
        charge.job.activate_btm()

    send_event(
        "EventJobChargeSucceeded",
        job_id=charge.job_id,
        charge_id=charge.id,
    )
def test_altering(self):
    """
    The test event handler must write the content reversed.

    The file produced by the handler is removed even when the assertion
    fails, so a leftover file cannot influence tests that expect it to be
    absent.
    """
    event_name, file_name, file_content = \
        'TestEvent', "test_file_name", "success"

    send_event(event_name, fname=file_name, content=file_content)

    try:
        with open(file_name, 'r') as f:
            self.assertEqual(file_content[::-1], f.readline())
    finally:
        # Guarded: if the handler never created the file, the failure from
        # open() above is the error worth reporting, not a failed remove.
        if os.path.isfile(file_name):
            os.remove(file_name)
def test_proper_matching(self):
    """
    The test event handler must write the content unchanged.

    The file produced by the handler is removed even when the assertion
    fails, so a leftover file cannot influence tests that expect it to be
    absent.
    """
    event_name, file_name, file_content = \
        'TestEvent', "test_file_name", "success"

    send_event(event_name, fname=file_name, content=file_content)

    # due to eager celery task evaluation this should work
    try:
        with open(file_name, 'r') as f:
            self.assertEqual(file_content, f.readline())
    finally:
        # Guarded: if the handler never created the file, the failure from
        # open() above is the error worth reporting, not a failed remove.
        if os.path.isfile(file_name):
            os.remove(file_name)
def create_sample(extraction_result, sample_id, job_id, url, source_type,
                  source_val='', domain='', label=None, silent=False,
                  vote_sample=True, btm_sample=False, training=True,
                  *args, **kwargs):
    """
    Finalize a sample after the web extraction tasks have run.

    :param extraction_result: results of the extraction tasks; every item
        has to be exactly True, otherwise the chain is treated as failed.
    :param sample_id: id of the `Sample` being finalized.
    :param job_id: id of the owning `Job` (must exist).
    :param label: when not None the sample is a gold sample with this label.
    :param silent: when True no event is sent on success.
    :returns: tuple `(extracted, sample_id)`.

    On success the sample's source/flag fields are filled in and, unless
    `silent`, either a `GoldSample` is created (`EventNewGoldSample`) or
    `EventNewBTMSample` / `EventNewSample` is sent. On failure the sample
    is deleted and, for a gold sample, the job's `gold_left` counter is
    decreased.
    """
    extracted = all(x is True for x in extraction_result)
    job = Job.objects.get(id=job_id)

    # Checking if all previous tasks succeeded.
    if extracted:
        # Proper sample entry
        Sample.objects.filter(id=sample_id).update(
            source_type=source_type,
            source_val=source_val,
            domain=domain,
            vote_sample=vote_sample,
            btm_sample=btm_sample,
            training=training,
        )
        sample = Sample.objects.get(id=sample_id)

        if not silent:
            # Golden sample
            if label is not None:
                # GoldSample created sucesfully - pushing event.
                gold = GoldSample(sample=sample, label=label)
                gold.save()
                send_event(
                    "EventNewGoldSample",
                    job_id=job.id,
                    gold_id=gold.id,
                )
            # Ordinary sample
            else:
                # Sample created sucesfully - pushing event.
                send_event(
                    "EventNewBTMSample" if btm_sample else "EventNewSample",
                    job_id=job.id,
                    sample_id=sample_id,
                )
    else:
        # Extraction failed, cleanup.
        Sample.objects.filter(id=sample_id).delete()
        if label is not None:
            # Only decrement while the counter is positive. Matching on
            # `>= 0` would push it to -1, and code waiting for
            # `gold_left == 0` would then never see the golds as finished.
            Job.objects.filter(id=job.id, gold_left__gt=0)\
                .update(gold_left=F('gold_left') - 1)

    return (extracted, sample_id)
def reclassify(self, force=False):
    """
    Request classification of this sample again.

    The request is made when the sample is pending or when `force` is
    True (which reclassifies even a successfully classified sample).
    The call is asynchronous.

    Returns True when the request was sent, False otherwise.
    """
    wanted = self.is_pending() or force
    if not wanted:
        return False

    send_event("EventNewClassifySample", sample_id=self.id)
    return True
def create_classify_sample(result, source_type, create_classified=True,
                           label='', source_val='', *args, **kwargs):
    """
    Create a classified sample from an existing sample (no web extraction
    is needed).

    `result` is a pair `(extraction result, sample id)`.

    Returns False when the extraction failed, the sample id when a label
    was supplied (already classified) and None otherwise. When
    `create_classified` is set, a `ClassifiedSample` is created, the
    worker's collected-urls cache is refreshed and
    `EventNewClassifySample` is sent; a `DatabaseError` makes the task
    retry.
    """
    # If extraction failed - return
    if not result[0]:
        return False

    sample_id = result[1]

    # Don't classify already classified samples
    if label:
        return sample_id

    if not create_classified:
        return

    try:
        sample = Sample.objects.get(id=sample_id)
        label = label or ''

        # Proper sample entry
        class_sample = ClassifiedSample.objects.create(
            job=sample.job,
            url=sample.url,
            sample=sample,
            label=label,
            source_type=source_type,
            source_val=source_val,
        )

        worker = Sample.get_worker(
            source_type=source_type,
            source_val=source_val,
        )
        if worker:
            # Update cache
            worker.get_urls_collected_count_for_job(sample.job, cache=False)

        # Sample created sucesfully - pushing event.
        send_event("EventNewClassifySample", sample_id=class_sample.id)
    except DatabaseError as e:
        # Retry process on db error, such as 'Database is locked'
        delay = min(60 * 2 ** current.request.retries, 60 * 60 * 24)
        create_classify_sample.retry(exc=e, countdown=delay)
def new_vote(self, *args, **kwargs):
    """
    Add a vote and announce it with `EventNewVoteAdded`.

    The keyword arguments are handed to `self._add_vote`; `worker` and
    `sample` are required when a vote is produced. Returns the vote, or
    None when `_add_vote` produced nothing.
    """
    vote = self._add_vote(**kwargs)
    if vote:
        send_event(
            'EventNewVoteAdded',
            worker_id=kwargs['worker'].id,
            sample_id=kwargs['sample'].id,
        )
        return vote
    return None
def testLongTraining(self):
    """
    After `EventTrainingSetCompleted` the job's classifier ends up trained.
    """
    account = self.u.get_profile()
    job = Job.objects.create(account=account, status=JOB_STATUS_ACTIVE)
    training_set = TrainingSet.objects.create(job=job)
    JobFactory().create_classifier(job)

    send_event(
        'EventTrainingSetCompleted',
        set_id=training_set.id,
        job_id=job.id,
    )
    time.sleep(1)

    # Refresh our job object
    refreshed = Job.objects.get(id=job.id)
    self.assertTrue(refreshed.is_classifier_trained())
def reclassify(self, force=False):
    """
    Ask for this sample to be classified again (asynchronously).

    A request is sent for a pending sample, or for any sample when
    `force` is True. Returns True when a request was sent and False when
    nothing was done.
    """
    pending = self.is_pending()
    if pending or force:
        send_event('EventNewClassifySample', sample_id=self.id)
        requested = True
    else:
        requested = False
    return requested
def updateBTMStatus(self, save=True):
    """
    Recalculate and store the BTM status of this sample.

    The status from `calculate_status()` is assigned to `btm_status` and
    passed to `update_points`; the object is saved when `save` is True.
    A sample whose status is `BTM_HUMAN` is announced with
    `EventBTMSendToHuman` (each execution adds another sample for voting).
    """
    new_status = self.calculate_status()
    self.btm_status = new_status
    self.update_points(new_status)

    if save:
        self.save()

    needs_human = new_status == self.BTM_HUMAN
    if needs_human:
        send_event('EventBTMSendToHuman', sample_id=self.id)
def testEventSamplesVoting(self):
    """
    `EventSamplesVoting` creates sample mappings and a voting key; the
    voting HIT shows up after `VotingHITMonitor` has run.
    """
    def first_tag_job():
        return TagasaurisJobs.objects.all()[0]

    def first_mapping():
        return SampleMapping.objects.all()[0]

    self.assertEqual(TagasaurisJobs.objects.count(), 1)
    self.assertEqual(first_tag_job().voting_key, None)
    self.assertEqual(first_tag_job().voting_hit, None)

    send_event('EventSamplesVoting')

    self.assertEqual(SampleMapping.objects.count(), 3)
    self.assertEqual(
        first_mapping().crowscourcing_type,
        SampleMapping.TAGASAURIS,
    )
    self.assertEqual(len(first_mapping().external_id), 32)

    self.assertEqual(TagasaurisJobs.objects.count(), 1)
    self.assertEqual(len(first_tag_job().voting_key), 32)

    VotingHITMonitor.delay()
    self.assertEqual(len(first_tag_job().voting_hit), 32)
def train(set_id):
    """
    Train the job's classifier on the training set `set_id`.

    `EventClassifierTrained` is sent when the freshly loaded job reports
    a trained classifier afterwards.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job_id = training_set.job.id

    classifier = classifier_factory.create_classifier(job_id)
    samples = (entry for entry in training_set.training_samples.all())
    classifier.train(samples, set_id=set_id)

    # Load the job again so the check sees its state after training.
    job = Job.objects.get(id=job_id)
    if not job.is_classifier_trained():
        return

    send_event("EventClassifierTrained", job_id=job.id)
def update_btm_sample(sample_id, *args, **kwargs):
    """
    Attach a newly created sample to the BTM samples waiting for it.

    BTM samples of the same job and url that have no sample yet get this
    one; every unlabelled BTM sample attached to it is then sent for
    classification with `EventNewClassifyBTMSample`.
    """
    sample = Sample.objects.get(id=sample_id)
    match = {'job': sample.job, 'url': sample.url}

    BeatTheMachineSample.objects.filter(sample=None, **match).update(
        sample=sample
    )

    waiting = BeatTheMachineSample.objects.filter(
        sample=sample, label='', **match
    )
    for btm_sample in waiting:
        send_event(
            "EventNewClassifyBTMSample",
            sample_id=btm_sample.id,
            from_name='update_classified',
        )
def handle_failed(self, **event_data):
    """
    React to a failed charge.

    A base-job charge stops the job, a BTM-job charge stops BTM for it.
    The failure is logged as a warning and `EventJobChargeFailed` is sent.
    """
    charge = self._get_charge(**event_data)
    charge_type = charge.charge_type

    if charge_type == charge.Type.BASE_JOB:
        charge.job.stop()
    elif charge_type == charge.Type.BTM_JOB:
        charge.job.stop_btm()

    template = (
        'Stripe callback: Charge {charge_id} ({charge_type}) for '
        'job {job_id} has failed! The job has been stopped.'
    )
    log.warning(template.format(
        charge_id=charge.charge_id,
        job_id=charge.job_id,
        charge_type=charge.charge_type,
    ))

    send_event(
        "EventJobChargeFailed",
        job_id=charge.job_id,
        charge_id=charge.id,
    )
def update_classified_sample(sample_id, *args, **kwargs):
    """
    Attach a newly created sample to the classify requests waiting for it.

    Classified samples of the same job and url that have no sample yet get
    this one; every unlabelled classified sample attached to it is then
    sent for classification with `EventNewClassifySample`.
    """
    sample = Sample.objects.get(id=sample_id)
    match = {'job': sample.job, 'url': sample.url}

    ClassifiedSample.objects.filter(sample=None, **match).update(
        sample=sample
    )

    waiting = ClassifiedSample.objects.filter(
        sample=sample, label='', **match
    )
    for class_sample in waiting:
        send_event(
            "EventNewClassifySample",
            sample_id=class_sample.id,
            from_name='update_classified',
        )
def update_btm_sample(sample_id, *args, **kwargs):
    """
    Monitor sample creation: link the new sample to matching BTM samples.

    All BTM samples with the sample's job and url and no sample assigned
    are pointed at it. Afterwards `EventNewClassifyBTMSample` is sent for
    each BTM sample of that job/url/sample that still has an empty label.
    """
    sample = Sample.objects.get(id=sample_id)
    btm_objects = BeatTheMachineSample.objects

    unassigned = btm_objects.filter(
        job=sample.job,
        url=sample.url,
        sample=None,
    )
    unassigned.update(sample=sample)

    unlabelled = btm_objects.filter(
        job=sample.job,
        url=sample.url,
        sample=sample,
        label='',
    )
    for btm_sample in unlabelled:
        send_event(
            "EventNewClassifyBTMSample",
            sample_id=btm_sample.id,
            from_name='update_classified',
        )
def create_by_owner(self, *args, **kwargs):
    """
    Create a classified sample requested by the job owner.

    Expects `job` and `url` in `kwargs`. When a `Sample` for the same job
    and url already exists it is attached and `EventNewClassifySample` is
    sent; otherwise creation of the sample is requested through
    `Sample.objects.create_by_owner`.

    Returns the object produced by `self.create`.
    """
    self._sanitize(args, kwargs)
    kwargs.update({"source_type": SAMPLE_SOURCE_OWNER, "source_val": ""})

    try:
        existing = Sample.objects.get(job=kwargs["job"], url=kwargs["url"])
    except Sample.DoesNotExist:
        pass
    else:
        kwargs["sample"] = existing

    classified_sample = self.create(**kwargs)

    if "sample" not in kwargs:
        # No sample yet - have one created for this url.
        Sample.objects.create_by_owner(
            job_id=kwargs["job"].id,
            url=kwargs["url"],
            create_classified=False,
            vote_sample=False,
        )
    else:
        # If sample exists, step immediately to classification
        send_event("EventNewClassifySample", sample_id=classified_sample.id)

    return classified_sample
def update_classified_sample(sample_id, *args, **kwargs):
    """
    Monitor sample creation: link the new sample to matching classify
    requests.

    All classified samples with the sample's job and url and no sample
    assigned are pointed at it. Afterwards `EventNewClassifySample` is
    sent for each classified sample of that job/url/sample that still has
    an empty label.
    """
    sample = Sample.objects.get(id=sample_id)
    classified_objects = ClassifiedSample.objects

    unassigned = classified_objects.filter(
        job=sample.job,
        url=sample.url,
        sample=None,
    )
    unassigned.update(sample=sample)

    unlabelled = classified_objects.filter(
        job=sample.job,
        url=sample.url,
        sample=sample,
        label='',
    )
    for class_sample in unlabelled:
        send_event(
            "EventNewClassifySample",
            sample_id=class_sample.id,
            from_name='update_classified',
        )
def watch_gold_status(job_id):
    """
    Watch a job until all of its gold samples are processed.

    Returns at once when the job already has its gold samples done.
    While `gold_left` is non-zero the task retries every two minutes
    without a retry limit. Once it reaches zero the job is marked as
    done and `EventTrainingSetCompleted` is sent for the job's newest
    training set.
    """
    retry_after = 2 * 60
    job = Job.objects.get(id=job_id)

    if job.is_gold_samples_done():
        log.info('watch_gold_status: Job %d has gold samples done.' % job_id)
        return

    golds_pending = job.gold_left != 0
    if golds_pending:
        # If some golds are in progress, retry in 2 minutes. Indefinetly
        watch_gold_status.retry(countdown=retry_after, max_retries=None)

    # Job samples are not done and job.gold_left == 0
    job.set_gold_samples_done()

    newest_set = TrainingSet.objects.newest_for_job(job)
    send_event(
        "EventTrainingSetCompleted",
        set_id=newest_set.id,
        job_id=job.id,
    )
def testBTMSampleIsNoVoting(self):
    """
    BTM samples get their mappings when created by a worker and are not
    mapped a second time by a later `EventSamplesVoting`.
    """
    def count_samples(is_btm):
        return Sample.objects.filter(btm_sample=is_btm).count()

    self.assertEqual(count_samples(False), 3)
    self.assertEqual(count_samples(True), 0)

    send_event('EventSamplesVoting')

    # Only 3 gold samples! No BTM Samples!
    self.assertEqual(SampleMapping.objects.count(), 3)

    for url, worker_id in (('google.com/1', 1234), ('google.com/2', 12345)):
        BeatTheMachineSample.objects.create_by_worker(
            job=self.job,
            url=url,
            label='',
            expected_output=LABEL_YES,
            worker_id=worker_id,
        )

    self.assertEqual(count_samples(False), 3)
    self.assertEqual(count_samples(True), 2)

    # BTM samples should be considered as BTM_HUMAN - sent to verification
    self.assertEqual(SampleMapping.objects.count(), 5)

    Sample.objects.filter(btm_sample=True).update(vote_sample=True)

    # Sample must have screenshot
    screenshot_url = (
        "http://www.10clouds.com/media/v1334047194.07/10c/images/10c_logo.png"
    )
    Sample.objects.all().update(screenshot=screenshot_url)

    send_event('EventSamplesVoting')

    # 5 - incude added BTM Samples.
    self.assertEqual(SampleMapping.objects.count(), 5)
def testBTMVotesProcess(self):
    """
    Processing BTM votes creates one more training set and leaves the
    single BTM sample with the `BTM_HOLE` status.
    """
    def make_btm_vote(worker, label):
        return WorkerQualityVote(
            sample=self.btm_sample.sample,
            worker=worker,
            label=label,
            btm_vote=True,
        )

    WorkerQualityVote.objects.bulk_create(
        make_btm_vote(self.workers[idx], LABEL_YES) for idx in xrange(3)
    )

    sets_before = TrainingSet.objects.count()
    send_event('EventProcessVotes')

    # BTM's sample has `training` == True so we can train on this BTMSample
    self.assertEqual(TrainingSet.objects.count(), sets_before + 1)

    self.assertEqual(BeatTheMachineSample.objects.count(), 1)
    only_btm = BeatTheMachineSample.objects.all()[0]
    self.assertEqual(only_btm.btm_status, BeatTheMachineSample.BTM_HOLE)
def testBTMSampleIsNoVoting(self):
    """
    Worker-created BTM samples are mapped on creation; sending
    `EventSamplesVoting` afterwards does not add further mappings.
    """
    regular = Sample.objects.filter(btm_sample=False)
    btm = Sample.objects.filter(btm_sample=True)

    self.assertEqual(regular.count(), 3)
    self.assertEqual(btm.count(), 0)

    send_event('EventSamplesVoting')

    # Only 3 gold samples! No BTM Samples!
    self.assertEqual(SampleMapping.objects.count(), 3)

    submissions = [
        {'url': 'google.com/1', 'worker_id': 1234},
        {'url': 'google.com/2', 'worker_id': 12345},
    ]
    for submission in submissions:
        BeatTheMachineSample.objects.create_by_worker(
            job=self.job,
            label='',
            expected_output=LABEL_YES,
            **submission
        )

    self.assertEqual(Sample.objects.filter(btm_sample=False).count(), 3)
    self.assertEqual(Sample.objects.filter(btm_sample=True).count(), 2)

    # BTM samples should be considered as BTM_HUMAN - sent to verification
    self.assertEqual(SampleMapping.objects.count(), 5)

    Sample.objects.filter(btm_sample=True).update(vote_sample=True)

    # Sample must have screenshot
    Sample.objects.all().update(
        screenshot="http://www.10clouds.com/media/v1334047194.07/10c/images/10c_logo.png"
    )

    send_event('EventSamplesVoting')

    # 5 - incude added BTM Samples.
    self.assertEqual(SampleMapping.objects.count(), 5)
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
    Classify the `BeatTheMachineSample` with the given id.

    Nothing happens when the sample already has a label. The task is
    retried (exponential back-off, capped at one day) while the job's
    classifier is not trained or when the classifier returns None.
    Otherwise the label is stored, the BTM status is recalculated and
    `EventSampleBTM` is sent.
    """
    log.info(
        '[BTMClassification] Got sample %d for classification.' % sample_id
    )
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(countdown=min(60 * 2 ** current.request.retries,
                                    60 * 60 * 24))

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.'
            % btm_sample.id
        )
        current.retry(countdown=min(60 * 2 ** current.request.retries,
                                    60 * 60 * 24))

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)

    # The queryset update above does not touch the instance in memory.
    # Sync it so the status is computed from the new label and the save
    # done by updateBTMStatus() does not write the old label back.
    btm_sample.label = label
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
def create_by_owner(self, *args, **kwargs):
    """
    Create a classify request on behalf of the job owner.

    `kwargs` has to carry `job` and `url`. An existing `Sample` with that
    job and url is attached to the new object, which is then sent straight
    to classification (`EventNewClassifySample`). Without such a sample
    one is requested via `Sample.objects.create_by_owner`.

    Returns the object produced by `self.create`.
    """
    self._sanitize(args, kwargs)
    kwargs['source_type'] = SAMPLE_SOURCE_OWNER
    kwargs['source_val'] = ''

    job, url = kwargs['job'], kwargs['url']
    try:
        kwargs['sample'] = Sample.objects.get(job=job, url=url)
    except Sample.DoesNotExist:
        pass

    classified_sample = self.create(**kwargs)

    sample_known = 'sample' in kwargs
    if sample_known:
        # If sample exists, step immediately to classification
        send_event('EventNewClassifySample', sample_id=classified_sample.id)
        return classified_sample

    Sample.objects.create_by_owner(
        job_id=kwargs['job'].id,
        url=kwargs['url'],
        create_classified=False,
        vote_sample=False,
    )
    return classified_sample
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
    Run the job's classifier on a single `BeatTheMachineSample`.

    Returns at once for a sample that already has a label. Retries the
    task (exponential back-off, capped at one day) while the classifier
    is not trained or when it returns None. On success the label is
    stored, the BTM status is recalculated and `EventSampleBTM` is sent.
    """
    log.info('[BTMClassification] Got sample %d for classification.'
             % sample_id)
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(
            countdown=min(60 * 2**current.request.retries, 60 * 60 * 24))

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)

    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.'
            % btm_sample.id)
        current.retry(
            countdown=min(60 * 2**current.request.retries, 60 * 60 * 24))

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)

    # The queryset update above does not touch the instance in memory.
    # Sync it so the status is computed from the new label and the save
    # done by updateBTMStatus() does not write the old label back.
    btm_sample.label = label
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
def run(*args, **kwargs):
    """
    Build a training set from new worker votes for every affected job.

    For each job that has samples with new quality votes a `TrainingSet`
    is created and filled from three sources: the quality algorithm's
    decisions, the job's gold samples and the algorithm's BTM decisions.
    `EventTrainingSetCompleted` is sent when at least one training sample
    was added; otherwise the training set is deleted again.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost. In particular, whether the gold-sample loop sits
    inside `if decisions:` should be confirmed against the original file.
    """
    # Jobs having at least one sample with a vote flagged as new.
    active_jobs = Job.objects.\
        filter(sample__workerqualityvote__is_new=True).\
        annotate(Count('sample__workerqualityvote__is_new'))

    for job in active_jobs:
        quality_algorithm = quality_factory.create_algorithm(job)
        decisions = quality_algorithm.extract_decisions()
        # Created up front; removed at the end if nothing can be trained.
        ts = TrainingSet.objects.create(job=job)
        can_train = False
        if decisions:
            log.info(
                'ProcessVotesManager: Creating training set for job %d.' % job.id)
            # `decisions` is used as (sample id, label) pairs.
            dict_decisions = dict(decisions)
            # Samples flagged for training whose decided label is not
            # LABEL_BROKEN.
            samples = Sample.objects.filter(id__in=imap(
                lambda x: x[0],
                ifilter(lambda x: x[1] != LABEL_BROKEN, decisions)), training=True).defer('id')
            for sample in samples:
                TrainingSample.objects.create(
                    set=ts,
                    sample=sample,
                    label=dict_decisions[sample.id],
                )
                can_train = True
            # Gold samples always end up with their gold label, replacing
            # a label added from the decisions above.
            for sample in Sample.objects.\
                    filter(job=job, goldsample__isnull=False).\
                    select_related('goldsample').iterator():
                ts_sample, created = TrainingSample.objects.get_or_create(
                    set=ts,
                    sample=sample,
                )
                can_train = True
                if not created:
                    log.info(
                        'ProcessVotesManager: Overridden gold sample %d.' % sample.id)
                ts_sample.label = sample.goldsample.label
                ts_sample.save()
        # Second pass: decisions about Beat The Machine samples.
        decisions = quality_algorithm.extract_btm_decisions()
        if decisions:
            log.info('ProcessVotesManager: Processing btm decisions %d.' % job.id)
            for sample_id, label in decisions:
                if label == LABEL_BROKEN:
                    log.info(
                        'ProcessVotesManager: Omitted broken label of btm sample %d.' % sample_id)
                    continue
                btms = BeatTheMachineSample.objects.get(
                    sample__id=sample_id)
                btms.recalculate_human(label)
                # Only BTM samples whose sample is flagged for training are
                # added to the training set.
                if btms.sample.training:
                    can_train = True
                    TrainingSample.objects.create(
                        set=ts,
                        sample=btms.sample,
                        label=label,
                    )
        if can_train:
            send_event(
                'EventTrainingSetCompleted',
                set_id=ts.id,
                job_id=job.id,
            )
        else:
            # Nothing was added - drop the training set created above.
            ts.delete()
def copy_sample_to_job(sample_id, job_id, source_type, label='',
                       source_val='', btm_sample=False, *args, **kwargs):
    """
    Copy an existing sample (url, text, screenshot) into another job.

    The copy is a voting/training sample unless `btm_sample` is set.
    After creation the screenshot/content "done" events are sent, followed
    by `EventNewGoldSample` (when `label` is not None - a `GoldSample` is
    created) or `EventNewBTMSample` / `EventNewSample`.

    Returns the id of the already existing sample when creation hits an
    `IntegrityError`; a `DatabaseError` makes the task retry.

    NOTE(review): with the default `label=''` the gold-sample branch is
    taken, because only None selects the ordinary branch - confirm that
    callers pass None for non-gold samples.
    """
    try:
        old_sample = Sample.objects.get(id=sample_id)
        job = Job.objects.get(id=job_id)

        is_regular = not btm_sample
        new_sample = Sample.objects.create(
            job=job,
            url=old_sample.url,
            text=old_sample.text,
            screenshot=old_sample.screenshot,
            source_type=source_type,
            source_val=source_val,
            btm_sample=btm_sample,
            vote_sample=is_regular,
            training=is_regular,
        )

        # The copy already has its screenshot and text.
        for done_event in ("EventSampleScreenshotDone",
                           "EventSampleContentDone"):
            send_event(
                done_event,
                sample_id=new_sample.id,
                sample_url=new_sample.url,
                job_id=new_sample.job_id,
            )

        if label is None:
            # Ordinary sample
            # Sample created sucesfully - pushing event.
            send_event(
                "EventNewBTMSample" if btm_sample else "EventNewSample",
                job_id=job.id,
                sample_id=new_sample.id,
            )
        else:
            # Golden sample
            # GoldSample created sucesfully - pushing event.
            gold = GoldSample(sample=new_sample, label=label)
            gold.save()
            send_event(
                "EventNewGoldSample",
                job_id=job.id,
                gold_id=gold.id,
            )
    except IntegrityError:
        # Such sample has been created in the mean time, dont do anything
        return Sample.objects.get(job=job, url=old_sample.url).id
    except DatabaseError as e:
        # Retry process on db error, such as 'Database is locked'
        delay = min(60 * 2 ** current.request.retries, 60 * 60 * 24)
        copy_sample_to_job.retry(exc=e, countdown=delay)