def get_source_worker(self):
    """ Returns the worker who sent this sample. """
    return Sample.get_worker(
        source_type=self.source_type,
        source_val=self.source_val,
    )
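# Usage sketch (illustrative only): resolving the worker behind a sample and
# refreshing that worker's cached URL count for the sample's job. The helper
# name and the `sample_id` argument are hypothetical; the surrounding Django
# setup is assumed. The cache refresh mirrors create_classify_sample below.
def refresh_source_worker_stats(sample_id):
    sample = Sample.objects.get(id=sample_id)
    worker = sample.get_source_worker()
    if worker:
        worker.get_urls_collected_count_for_job(sample.job, cache=False)
    return worker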
def setUp(self):
    self.u = User.objects.create_user(username='******', password='******')
    self.job = Job.objects.create_active(
        account=self.u.get_profile(),
        gold_samples=[{'url': '10clouds.com', 'label': LABEL_YES}])

    self.train_data = [
        Sample(job=self.job, source_type='',
            text='Mechanical squirrel screwdriver over car'),
        Sample(job=self.job, source_type='',
            text='Screwdriver fix mechanical bike bolts'),
        Sample(job=self.job, source_type='',
            text='Brown banana apple pineapple potato'),
        Sample(job=self.job, source_type='',
            text='apple pineapple potato'),
        Sample(job=self.job, source_type='',
            text='Hippo tree over lagoon'),
        Sample(job=self.job, source_type='',
            text='Green tan with true fox'),
    ]
    self.labels = [LABEL_YES, LABEL_YES, LABEL_NO, LABEL_NO, LABEL_NO,
        LABEL_NO]

    self.classified = []
    for idx, sample in enumerate(self.train_data):
        self.classified.append(ClassifiedSample.objects.create(
            job=self.job,
            sample=sample,
            label=self.labels[idx],
        ))

    self.classifier247 = classifier_factory.create_classifier(
        job_id=self.job.id,
    )
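# A minimal follow-up test sketch built only on the fixtures above; it checks
# that every training sample got a classified counterpart with the expected
# label. The test name is hypothetical and no classifier API is assumed here.
def testSetUpFixtures(self):
    self.assertEqual(len(self.classified), len(self.train_data))
    for idx, class_sample in enumerate(self.classified):
        self.assertEqual(class_sample.job, self.job)
        self.assertEqual(class_sample.label, self.labels[idx])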
def create_classify_sample(result, source_type, create_classified=True,
        label='', source_val='', *args, **kwargs):
    """
        Creates a classified sample from an existing sample, so no web
        extraction is needed.
    """
    # We are given a tuple (extraction result, sample id).
    extraction_result = result[0]

    # If extraction failed - return.
    if not extraction_result:
        return False

    sample_id = result[1]

    # Don't classify already classified samples.
    if label:
        return sample_id

    if create_classified:
        try:
            sample = Sample.objects.get(id=sample_id)
            if not label:
                label = ''

            # Proper sample entry.
            class_sample = ClassifiedSample.objects.create(
                job=sample.job,
                url=sample.url,
                sample=sample,
                label=label,
                source_type=source_type,
                source_val=source_val,
            )

            worker = Sample.get_worker(source_type=source_type,
                source_val=source_val)
            if worker:
                # Update cache.
                worker.get_urls_collected_count_for_job(sample.job,
                    cache=False)

            # Sample created successfully - push the event.
            send_event(
                "EventNewClassifySample",
                sample_id=class_sample.id,
            )
        except DatabaseError, e:
            # Retry on database errors, such as 'Database is locked'.
            create_classify_sample.retry(exc=e,
                countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))
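# Illustrative call sketch: how the task above might be queued when web
# extraction has already succeeded. `.delay` is the standard Celery way to
# enqueue a task; the (True, sample.id) tuple mimics a successful extraction
# result, and the 'owner' source_type value plus the wrapper name are
# assumptions, not taken from this codebase.
def queue_classification(sample):
    create_classify_sample.delay(
        (True, sample.id),      # (extraction result, sample id)
        source_type='owner',    # hypothetical source identifier
        source_val='',
    )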
def forwards(self, orm):
    "Write your forwards methods here."
    # Note: Remember to use orm['appname.ModelName'] rather than
    # "from appname.models...".
    for sample in orm['main.Sample'].objects.filter(goldsample__isnull=True):
        worker = Sample.get_worker(source_type=sample.source_type,
            source_val=sample.source_val)
        if not worker:
            continue

        try:
            orm['crowdsourcing.WorkerQualityVote'].objects.new_vote(
                worker=worker,
                sample=sample,
                label=LABEL_YES,
            )
        except:
            # Such a vote already exists - skip it.
            pass
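# Sketch of a matching backwards method for this data migration (South
# convention). Leaving the created quality votes in place on rollback is an
# assumption, so the method is a no-op.
def backwards(self, orm):
    "Write your backwards methods here."
    pass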
def testVerifyFromTagasauris(self):
    job = Job.objects.create_active(
        account=self.user.get_profile(),
        gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
        same_domain_allowed=2,
        no_of_urls=10,
    )
    worker_id = '1234'

    # Verifying first url (and adding it).
    newest_url = 'google.com/1'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('added', resp_dict['result'])
    self.assertEqual(False, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 1)

    # This time verification should fail because of too many urls from the
    # same domain.
    newest_url = 'google.com/2'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('domain duplicate', resp_dict['result'])
    self.assertEqual(False, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 0)

    # This time verification should fail because of a duplicated url (see
    # the gold sample).
    newest_url = 'google.com'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('duplicate', resp_dict['result'])
    self.assertEqual(False, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 1)
def testVerifyFromTagasaurisErrors(self):
    job = Job.objects.create_active(
        account=self.user.get_profile(),
        gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
        same_domain_allowed=2,
        no_of_urls=10,
    )
    worker_id = '1234'

    # Error on a non-existent job.
    newest_url = 'google.com/1'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, 1234567), json.dumps(data), "text/json")
    self.assertNotEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('error' in resp_dict.keys())
    self.assertEqual(Sample.objects.filter(
        url=Sample.sanitize_url(newest_url)).count(), 0)

    # Error on wrong post data (not json).
    newest_url = 'google.com/1'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), data)
    self.assertNotEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('error' in resp_dict.keys())
    self.assertEqual(Sample.objects.filter(
        url=Sample.sanitize_url(newest_url)).count(), 0)

    # Error on wrong post data (missing parameters).
    newest_url = 'google.com/1'
    data = {
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertNotEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('error' in resp_dict.keys())
    self.assertEqual(Sample.objects.filter(
        url=Sample.sanitize_url(newest_url)).count(), 0)

    newest_url = 'google.com/1'
    data = {
        'url': newest_url,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertNotEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('error' in resp_dict.keys())
    self.assertEqual(Sample.objects.filter(
        url=Sample.sanitize_url(newest_url)).count(), 0)
def testVerifyFromTagasaurisLimit(self):
    job = Job.objects.create_active(
        account=self.user.get_profile(),
        gold_samples=json.dumps([{'url': 'google.com', 'label': LABEL_YES}]),
        same_domain_allowed=20,
        no_of_urls=2,
    )
    worker_id = '1234'

    # Verifying first url (and adding it). We need one more.
    newest_url = 'google.com/1'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('added', resp_dict['result'])
    self.assertEqual(False, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 1)

    # Verifying second url. Gathering should be completed.
    newest_url = 'google.com/2'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('added', resp_dict['result'])
    self.assertEqual(True, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 1)
    self.assertEqual(job.get_urls_collected(), job.no_of_urls)

    # Verifying third url. Gathering should be completed but the url won't
    # be added.
    newest_url = 'google.com/3'
    data = {
        'url': newest_url,
        'worker_id': worker_id,
    }
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job.id), json.dumps(data), "text/json")
    self.assertEqual(resp.status_code, 200)

    resp_dict = json.loads(resp.content)
    self.assertTrue('result' in resp_dict.keys())
    self.assertTrue('all' in resp_dict.keys())
    self.assertEqual('', resp_dict['result'])
    self.assertEqual(True, resp_dict['all'])
    self.assertEqual(Sample.objects.filter(
        job=job, url=Sample.sanitize_url(newest_url)).count(), 0)
    self.assertEqual(job.get_urls_collected(), job.no_of_urls)
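# The three tests above repeat the same POST-and-decode pattern; a small
# helper like this (hypothetical, not part of the existing test suite) would
# keep them shorter while exercising the same endpoint:
def _add_tagasauris_sample(self, job_id, data, expect_ok=True):
    resp = self.c.post('%ssample/add/tagasauris/%s/?format=json'
        % (self.api_url, job_id), json.dumps(data), "text/json")
    if expect_ok:
        self.assertEqual(resp.status_code, 200)
    else:
        self.assertNotEqual(resp.status_code, 200)
    return json.loads(resp.content)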