Esempio n. 1
0
    def run(self, *args, **kwargs):
        """
        Looks up HITs for active jobs that have a key stored but no HIT yet.

        For each such job the HIT is requested through `get_hit`. When a
        non-empty value different from the stored one comes back, it is
        written to the job's TagasaurisJobs row and `self.event_name` is sent.
        """
        lookup = {
            'tagasaurisjobs__%s' % self.hit_name: None,
            'tagasaurisjobs__%s__isnull' % self.key_name: False,
        }
        candidates = Job.objects.select_related('tagasaurisjobs').filter(
            status=JOB_STATUS_ACTIVE,
            tagasaurisjobs__isnull=False,
            **lookup
        )
        jobs = candidates.iterator()
        client = make_tagapi_client()

        for job in jobs:
            tag_jobs = job.tagasaurisjobs
            old_hit = getattr(tag_jobs, self.hit_name)
            key = getattr(tag_jobs, self.key_name)
            new_hit = get_hit(client, key, job.get_hit_type())

            # Only act when a HIT was returned and it actually changed.
            if new_hit and old_hit != new_hit:
                TagasaurisJobs.objects.filter(urlannotator_job=job).update(
                    **{self.hit_name: new_hit})
                send_event(
                    self.event_name,
                    job_id=job.id,
                    old_hit=old_hit,
                    new_hit=new_hit,
                )
Esempio n. 2
0
def web_content_extraction(sample_id, url=None, *args, **kwargs):
    """ Links/lynx required. Generates html output from those browsers.

    Fetches the text of `url` with `get_web_text`, stores it on the sample
    and sends "EventSampleContentDone". A failing extraction subprocess is
    reported with "EventSampleContentFail".

    :param sample_id: id of the Sample whose text is stored.
    :param url: address to fetch; when None the sample's own url is used.
    :return: True when the text was stored, False when the url is not a
        proper url or the extraction failed.
    """
    sample = None
    if url is None:
        sample = Sample.objects.get(id=sample_id)
        url = sample.url

    if not is_proper_url(url):
        return False

    # Reuse the row loaded above instead of querying for it a second time.
    if sample is None:
        sample = Sample.objects.get(id=sample_id)

    try:
        text = get_web_text(url)

        Sample.objects.filter(id=sample_id).update(text=text)
        send_event(
            "EventSampleContentDone",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except subprocess.CalledProcessError as e:
        # Something wrong has happened to links. Couldn't find documentation on
        # error codes - assume bad stuff has happened that retrying won't fix.
        send_event(
            'EventSampleContentFail',
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.returncode
        )
        return False

    # create_sample only accepts extraction results that are exactly True,
    # so success has to be reported explicitly rather than as None.
    return True
Esempio n. 3
0
    def _train(self, samples=[], set_id=0):
        """
        Starts training on the writer returned by `self.update_self()` and
        polls its training status until it reports done.

        The pause between polls grows by CLASS247_TRAIN_STEP each round and is
        capped at CLASS247_MAX_WAIT. A ClassifierTrainingError raised while
        polling is reported as "EventClassifierTrainError" and polling goes on.

        NOTE(review): `samples` is not forwarded - the writer is always given
        an empty list together with `set_id`; confirm this is intended.
        """
        writer, _reader = self.update_self()

        # Trains the subclassifier
        writer.train(samples=[], turn_off=False, set_id=set_id)

        job_id = ClassifierModel.objects.get(id=self.id).job_id
        delay = 0

        while True:
            time.sleep(min(delay, CLASS247_MAX_WAIT))

            try:
                status = writer.get_train_status()
            except ClassifierTrainingError as e:
                # Swallow retry-safe training errors.
                status = CLASS_TRAIN_STATUS_ERROR
                send_event(
                    "EventClassifierTrainError",
                    job_id=job_id,
                    message=e.message,
                )

            if status == CLASS_TRAIN_STATUS_DONE:
                break
            delay += CLASS247_TRAIN_STEP
Esempio n. 4
0
    def create_by_worker(self, *args, **kwargs):
        """
        Creates a BTM sample submitted by a Tagasauris worker.

        `kwargs` must carry `url`, `job` and `worker_id`; the worker id is
        moved into `source_val`. If a Sample for the same job and url already
        exists it is attached and "EventNewBTMSample" is sent, otherwise
        creation of the Sample is requested via `Sample.objects.create_by_btm`.
        Returns the object produced by `self.create`.
        """
        kwargs['url'] = sanitize_url(kwargs['url'])
        kwargs['source_type'] = SAMPLE_TAGASAURIS_WORKER
        kwargs['source_val'] = kwargs.pop('worker_id')

        try:
            existing = Sample.objects.get(job=kwargs['job'], url=kwargs['url'])
        except Sample.DoesNotExist:
            pass
        else:
            kwargs['sample'] = existing

        btm_sample = self.create(**kwargs)

        if 'sample' not in kwargs:
            # No sample for this url yet - have one created.
            Sample.objects.create_by_btm(
                job_id=kwargs['job'].id,
                url=kwargs['url'],
                source_val=kwargs['source_val'],
                create_classified=False,
            )
        else:
            # If sample exists, step immediately to classification
            send_event(
                'EventNewBTMSample',
                sample_id=kwargs['sample'].id,
                job_id=kwargs['job'].id,
            )

        return btm_sample
Esempio n. 5
0
    def testVotesProcess(self):
        """
        After processing votes the newest training set holds one sample:
        labelled YES after three YES votes, NO after four further NO votes.
        """

        def cast_votes(worker_indexes, label):
            WorkerQualityVote.objects.bulk_create(
                WorkerQualityVote(
                    sample=self.sample,
                    worker=self.workers[idx],
                    label=label,
                )
                for idx in worker_indexes
            )

        def check_single_training_sample(expected_label):
            ts = TrainingSet.objects.newest_for_job(self.job)
            self.assertEqual(len(ts.training_samples.all()), 1)

            training_sample = ts.training_samples.all()[0]
            self.assertEqual(training_sample.label, expected_label)

        cast_votes(xrange(3), LABEL_YES)
        send_event('EventProcessVotes')
        check_single_training_sample(LABEL_YES)

        cast_votes(xrange(3, 7), LABEL_NO)
        send_event('EventProcessVotes')
        check_single_training_sample(LABEL_NO)
Esempio n. 6
0
    def test_suppression(self):
        """Sending 'TestEvent' must not result in the target file existing."""
        target = "test_file_name"

        send_event('TestEvent', fname=target, content="success")

        self.assertFalse(os.path.isfile(target))
Esempio n. 7
0
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Labels one ClassifiedSample with its job's classifier.

        Does nothing when the sample already has a label or the job's
        classifier is not trained. A `None` label is logged and a retry of
        the current task is requested. On completion the label is stored and
        'EventSampleClassified' is sent.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    if class_sample.label:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    if not job.is_classifier_trained():
        return

    label = classifier_factory.create_classifier(job.id).classify(class_sample)

    if label is None:
        # Something went wrong
        message = (
            '[Classification] Got None label for sample %d. Retrying.'
            % class_sample.id
        )
        log.warning(message)
        # Exponential backoff, restarting every 6 retries, capped at 1 hour.
        backoff = 60 * 2 ** (current.request.retries % 6)
        current.retry(countdown=min(backoff, 60 * 60 * 1), max_retries=None)

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    send_event(
        'EventSampleClassified',
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
Esempio n. 8
0
    def run(self, *args, **kwargs):
        """
        Checks active jobs whose Tagasauris key is set but whose HIT is not.

        The HIT is fetched with `get_hit`; a returned value that differs from
        the stored one is saved on the TagasaurisJobs row and announced
        through `self.event_name`.
        """
        conditions = {
            'tagasaurisjobs__%s' % self.hit_name: None,
            'tagasaurisjobs__%s__isnull' % self.key_name: False,
        }
        pending = Job.objects.select_related('tagasaurisjobs').filter(
            status=JOB_STATUS_ACTIVE, tagasaurisjobs__isnull=False,
            **conditions
        ).iterator()
        client = make_tagapi_client()

        for job in pending:
            related = job.tagasaurisjobs
            old_hit = getattr(related, self.hit_name)
            new_hit = get_hit(
                client,
                getattr(related, self.key_name),
                job.get_hit_type(),
            )

            unchanged = not new_hit or old_hit == new_hit
            if unchanged:
                continue

            changes = {self.hit_name: new_hit}
            TagasaurisJobs.objects.filter(urlannotator_job=job).update(**changes)
            send_event(
                self.event_name,
                job_id=job.id,
                old_hit=old_hit,
                new_hit=new_hit,
            )
Esempio n. 9
0
    def train_lock(self, func, turn_off=True, *args, **kwargs):
        """
        Locks the classifier during training.

        Calls `self.sync247.modified_lock()` and then runs
        `func(*args, **kwargs)`. Both training error types are caught,
        reported as events carrying the classifier entry's job id and the
        error message, and end the method.

        NOTE(review): `turn_off` is not used in the visible part of this
        method, and the outer `try` has no visible `except`/`finally` clause -
        the block appears truncated; confirm against the full file.
        """
        self.sync247.modified_lock()
        try:

            # Loaded up front so that the error handlers can report the job id.
            entry = ClassifierModel.objects.get(id=self.id)
            try:
                func(*args, **kwargs)
            except ClassifierTrainingError, e:
                # Retry-safe error has been propagated up to here whilst
                # it should've been handled in the `func`. Log it and abort.
                send_event(
                    "EventClassifierTrainError",
                    job_id=entry.job_id,
                    message=e.message,
                )
                return
            except ClassifierTrainingCriticalError, e:
                # Really bad things have happened during training. Log it and
                # abort.
                send_event(
                    "EventClassifierCriticalTrainError",
                    job_id=entry.job_id,
                    message=e.message,
                )
                return
Esempio n. 10
0
def web_screenshot_extraction(sample_id, url=None, *args, **kwargs):
    """ Captures a screenshot of the page and stores it on the sample.

    The screenshot is obtained with `get_web_screenshot`; on success
    "EventSampleScreenshotDone" is sent, a BaseWebkitException is reported
    with "EventSampleScreenshotFail".

    :param sample_id: id of the Sample whose screenshot is stored.
    :param url: address to capture; when None the sample's own url is used.
    :return: True when the screenshot was stored, False when the url is not
        a proper url or the capture failed.
    """
    sample = None
    if url is None:
        sample = Sample.objects.get(id=sample_id)
        url = sample.url

    if not is_proper_url(url):
        return False

    # Reuse the row loaded above instead of querying for it a second time.
    if sample is None:
        sample = Sample.objects.get(id=sample_id)

    try:
        screenshot = get_web_screenshot(url)
        Sample.objects.filter(id=sample_id).update(screenshot=screenshot)

        send_event(
            "EventSampleScreenshotDone",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
        )
    except BaseWebkitException as e:
        send_event(
            "EventSampleScreenshotFail",
            sample_id=sample_id,
            sample_url=sample.url,
            job_id=sample.job_id,
            error_code=e.status_code,
        )
        return False

    # create_sample only accepts extraction results that are exactly True,
    # so success has to be reported explicitly rather than as None.
    return True
Esempio n. 11
0
def classify(sample_id, from_name='', *args, **kwargs):
    """
        Runs the job's classifier on a single ClassifiedSample.

        Returns early for samples that are already labelled and for jobs
        whose classifier is not trained. When the classifier yields `None`
        a warning is logged and the current task is asked to retry.
    """
    class_sample = ClassifiedSample.objects.get(id=sample_id)
    already_labelled = bool(class_sample.label)
    if already_labelled:
        return

    job = class_sample.job

    # If classifier is not trained, return - it will be reclassified if
    # the classifier finishes training
    trained = job.is_classifier_trained()
    if not trained:
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(class_sample)

    if label is None:
        # Something went wrong
        warning = '[Classification] Got None label for sample %d. Retrying.'
        log.warning(warning % class_sample.id)
        one_hour = 60 * 60 * 1
        countdown = min(60 * 2 ** (current.request.retries % 6), one_hour)
        current.retry(countdown=countdown, max_retries=None)

    ClassifiedSample.objects.filter(id=sample_id).update(label=label)

    event_data = dict(
        job_id=job.id,
        class_id=class_sample.id,
        sample_id=class_sample.sample.id,
    )
    send_event('EventSampleClassified', **event_data)
Esempio n. 12
0
    def handle(self, event_name, event_kwargs=None, *args, **options):
        """
        Sends `event_name`; `event_kwargs`, when given, is a JSON object whose
        members become the event's keyword arguments.
        """
        payload = json.loads(event_kwargs) if event_kwargs else {}

        # We don't use args with send_event!
        send_event(event_name, **payload)
Esempio n. 13
0
def odesk_complete(request):
    """
    Completes the oDesk OAuth flow and ties the oDesk identity to an account.

    * Logged-in user without an oDesk uid: the oDesk data is stored on the
      user's profile; always redirects to 'index'.
    * Anonymous user with a known oDesk uid: logs the matching user in and
      redirects to 'index' (or to 'login' when authentication fails).
    * Anonymous user with an unknown oDesk uid: a new user is registered,
      logged in and redirected to 'settings'.

    In both association paths a Worker entry is created when none exists for
    the oDesk profile key.
    """
    oauth_client = request.session['odesk_client']
    token, secret = oauth_client.auth.get_access_token(
        request.GET['oauth_verifier'])
    api = odesk.Client(
        settings.ODESK_SERVER_KEY,
        settings.ODESK_SERVER_SECRET,
        oauth_access_token=token,
        oauth_access_token_secret=secret,
        auth='oauth',
    )
    info = api.hr.get_user('me')
    cipher = info['profile_key']

    if request.user.is_authenticated():
        # The original fetched the profile on every access and relied on
        # get_profile() handing back the same object; bind it once instead.
        profile = request.user.get_profile()
        if profile.odesk_uid == '':
            profile.odesk_id = info['id']
            profile.odesk_uid = cipher
            profile.odesk_token = token
            profile.odesk_secret = secret
            profile.save()

            send_event("EventNewOdeskAssoc", user_id=request.user.id)
            # Add Worker model on odesk account association
            if not Worker.objects.filter(external_id=cipher):
                profile.worker_entry = Worker.objects.create_odesk(
                    external_id=cipher)
                profile.save()
            request.session['success'] = 'You have successfully logged in.'
        return redirect('index')

    try:
        assoc = Account.objects.get(odesk_uid=cipher)
        known_user = authenticate(username=assoc.user.username,
                                  password='******')
        if not known_user:
            request.session['error'] = 'Such account already exists.'
            return redirect('login')
        login(request, known_user)
        return redirect('index')
    except Account.DoesNotExist:
        u = User.objects.create_user(
            email=info['email'],
            username='******'.join(['odesk', cipher]),
            password='******',
        )
        new_profile = u.get_profile()
        new_profile.odesk_id = info['id']
        new_profile.odesk_uid = cipher
        new_profile.odesk_token = token
        new_profile.odesk_secret = secret
        send_event("EventNewOdeskAssoc", user_id=u.id)
        new_profile.full_name = '%s %s' % (info['first_name'],
                                           info['last_name'])
        new_profile.save()
        u = authenticate(username=u.username, password='******')
        login(request, u)

        # Create Worker model on odesk account registration
        if not Worker.objects.filter(external_id=cipher):
            worker = Worker.objects.create_odesk(external_id=cipher)
            logged_profile = u.get_profile()
            logged_profile.worker_entry = worker
            logged_profile.save()
        request.session['success'] = 'You have successfuly registered'
        return redirect('settings')
Esempio n. 14
0
 def handle_succeeded(self, **event_data):
     """
     Reacts to a successful charge: initializes the job for a base-job
     charge, activates BTM for a BTM charge, then sends
     "EventJobChargeSucceeded".
     """
     charge = self._get_charge(**event_data)
     kind = charge.charge_type

     if kind == charge.Type.BASE_JOB:
         charge.job.initialize()
     elif kind == charge.Type.BTM_JOB:
         charge.job.activate_btm()

     send_event(
         "EventJobChargeSucceeded",
         job_id=charge.job_id,
         charge_id=charge.id,
     )
Esempio n. 15
0
    def test_altering(self):
        """
        After 'TestEvent' is sent, the file named by `fname` must contain the
        content reversed.
        """
        event_name, file_name, file_content = \
            'TestEvent', "test_file_name", "success"

        send_event(event_name, fname=file_name, content=file_content)

        with open(file_name, 'r') as f:
            first_line = f.readline()
        # Remove the file before asserting: a failed assertion must not leave
        # it behind for tests that expect this file name to be absent.
        os.remove(file_name)

        self.assertEqual(file_content[::-1], first_line)
Esempio n. 16
0
    def test_suppression(self):
        """After 'TestEvent' is sent, no file named `fname` may exist."""
        event_kwargs = {'fname': "test_file_name", 'content': "success"}

        send_event('TestEvent', **event_kwargs)

        file_exists = os.path.isfile(event_kwargs['fname'])
        self.assertFalse(file_exists)
Esempio n. 17
0
    def test_proper_matching(self):
        """
        After 'TestEvent' is sent, the file named by `fname` must contain the
        content unchanged.
        """
        event_name, file_name, file_content = \
            'TestEvent', "test_file_name", "success"

        send_event(event_name, fname=file_name, content=file_content)

        # due to eager celery task evaluation this should work
        with open(file_name, 'r') as f:
            first_line = f.readline()
        # Remove the file before asserting: a failed assertion must not leave
        # it behind for tests that expect this file name to be absent.
        os.remove(file_name)

        self.assertEqual(file_content, first_line)
Esempio n. 18
0
def create_sample(extraction_result, sample_id, job_id, url,
        source_type, source_val='', domain='', label=None, silent=False,
        vote_sample=True, btm_sample=False, training=True, *args, **kwargs):
    """
    Finalises a sample after the web extraction tasks have finished.

    `extraction_result` should be [True, True] - otherwise chaining failed.

    On success the Sample row is updated with the source/flag values and,
    unless `silent`, an event is sent: "EventNewGoldSample" (after creating a
    GoldSample) when `label` is given, otherwise "EventNewBTMSample" or
    "EventNewSample" depending on `btm_sample`.

    On failure the Sample row is deleted and, for a gold sample (`label`
    given), the job's `gold_left` counter is decremented.

    :return: tuple `(extracted, sample_id)` where `extracted` tells whether
        every extraction result was exactly True.
    :raises Job.DoesNotExist: when no job with `job_id` exists.
    """
    extracted = all(x is True for x in extraction_result)

    job = Job.objects.get(id=job_id)

    if not extracted:
        # Extraction failed, cleanup.
        Sample.objects.filter(id=sample_id).delete()
        if label is not None:
            # Decrement only while the counter is positive: a negative
            # gold_left never equals 0 again, which is the value
            # watch_gold_status waits for before it stops retrying.
            Job.objects.filter(id=job.id, gold_left__gt=0)\
                .update(gold_left=F('gold_left') - 1)
        return (extracted, sample_id)

    # All previous tasks succeeded - proper sample entry.
    Sample.objects.filter(id=sample_id).update(
        source_type=source_type,
        source_val=source_val,
        domain=domain,
        vote_sample=vote_sample,
        btm_sample=btm_sample,
        training=training,
    )
    sample = Sample.objects.get(id=sample_id)

    if not silent:
        if label is not None:
            # Golden sample - GoldSample created successfully, pushing event.
            gold = GoldSample(
                sample=sample,
                label=label
            )
            gold.save()
            send_event(
                "EventNewGoldSample",
                job_id=job.id,
                gold_id=gold.id,
            )
        else:
            # Ordinary sample created successfully - pushing event.
            send_event(
                "EventNewBTMSample" if btm_sample else "EventNewSample",
                job_id=job.id,
                sample_id=sample_id,
            )

    return (extracted, sample_id)
Esempio n. 19
0
    def test_altering(self):
        """
        The file produced for 'TestEvent' must hold the content reversed.
        """
        event_name, file_name, file_content = \
            'TestEvent', "test_file_name", "success"

        send_event(event_name,
            fname=file_name,
            content=file_content)

        with open(file_name, 'r') as f:
            written = f.readline()
        # Clean up first so that a failing assertion cannot leave the file
        # behind for tests that expect this file name to be absent.
        os.remove(file_name)

        self.assertEqual(file_content[::-1], written)
Esempio n. 20
0
    def reclassify(self, force=False):
        """
            Requests classification of this sample again.

            The request is only made for pending samples, unless `force` is
            True. Returns True when the request was sent, False otherwise.

            This call is asynchronous.
        """
        if not (self.is_pending() or force):
            return False

        send_event("EventNewClassifySample", sample_id=self.id)
        return True
Esempio n. 21
0
    def test_proper_matching(self):
        """
        The file produced for 'TestEvent' must hold the content unchanged.
        """
        event_name, file_name, file_content = \
            'TestEvent', "test_file_name", "success"

        send_event(event_name,
            fname=file_name,
            content=file_content)

        # due to eager celery task evaluation this should work
        with open(file_name, 'r') as f:
            written = f.readline()
        # Clean up first so that a failing assertion cannot leave the file
        # behind for tests that expect this file name to be absent.
        os.remove(file_name)

        self.assertEqual(file_content, written)
Esempio n. 22
0
def create_classify_sample(result, source_type, create_classified=True,
        label='', source_val='', *args, **kwargs):
    """
        Creates a ClassifiedSample for an already existing sample, so no web
        extraction is needed.

        `result` is a tuple (extraction result, sample id). Returns False
        when the extraction failed and the sample id when `label` is already
        set; otherwise nothing is returned. Database errors cause the task
        to be retried.
    """
    # If extraction failed - return
    if not result[0]:
        return False
    sample_id = result[1]

    # Don't classify already classified samples
    if label:
        return sample_id

    if not create_classified:
        return

    try:
        sample = Sample.objects.get(id=sample_id)

        # Proper sample entry. `label` is falsy at this point, so an empty
        # string is what gets stored.
        class_sample = ClassifiedSample.objects.create(
            job=sample.job,
            url=sample.url,
            sample=sample,
            label=label or '',
            source_type=source_type,
            source_val=source_val,
        )

        worker = Sample.get_worker(
            source_type=source_type,
            source_val=source_val,
        )
        if worker:
            # Update cache
            worker.get_urls_collected_count_for_job(sample.job, cache=False)

        # Sample created successfully - pushing event.
        send_event("EventNewClassifySample", sample_id=class_sample.id)

    except DatabaseError as e:
        # Retry process on db error, such as 'Database is locked'
        delay = min(60 * 2 ** current.request.retries, 60 * 60 * 24)
        create_classify_sample.retry(exc=e, countdown=delay)
Esempio n. 23
0
    def new_vote(self, *args, **kwargs):
        """
        Adds a vote through `self._add_vote` and announces it with
        'EventNewVoteAdded'. Returns the vote, or None when no vote was added.
        """
        vote = self._add_vote(**kwargs)

        if vote:
            worker, sample = kwargs['worker'], kwargs['sample']
            send_event(
                'EventNewVoteAdded',
                worker_id=worker.id,
                sample_id=sample.id,
            )
            return vote

        return None
Esempio n. 24
0
    def testLongTraining(self):
        """
        A job's classifier must report itself trained about a second after
        'EventTrainingSetCompleted' has been sent for its training set.
        """
        profile = self.u.get_profile()
        job = Job.objects.create(account=profile, status=JOB_STATUS_ACTIVE)
        training_set = TrainingSet.objects.create(job=job)
        JobFactory().create_classifier(job)

        send_event(
            'EventTrainingSetCompleted',
            set_id=training_set.id,
            job_id=job.id,
        )
        time.sleep(1)

        # Refresh our job object
        refreshed = Job.objects.get(id=job.id)
        self.assertTrue(refreshed.is_classifier_trained())
Esempio n. 25
0
    def reclassify(self, force=False):
        """
            Sends this sample for classification once more.

            Nothing is sent for a sample that is not pending unless `force`
            is True. Returns True when the event was sent, False otherwise.

            This call is asynchronous.
        """
        if not self.is_pending() and not force:
            return False

        send_event('EventNewClassifySample', sample_id=self.id)
        return True
Esempio n. 26
0
    def updateBTMStatus(self, save=True):
        """
            Recalculates the BTM status, stores it in `btm_status` and
            updates the points for it.

            With `save` set the object is saved and, when the status is
            BTM_HUMAN, 'EventBTMSendToHuman' is sent.
        """
        self.btm_status = status = self.calculate_status()
        self.update_points(status)

        if not save:
            return

        self.save()
        needs_human = status == self.BTM_HUMAN
        if needs_human:
            send_event('EventBTMSendToHuman', sample_id=self.id)
Esempio n. 27
0
    def testEventSamplesVoting(self):
        """
        'EventSamplesVoting' must create three Tagasauris sample mappings and
        a voting key; the voting HIT appears after VotingHITMonitor has run.
        """

        def first_tag_job():
            return TagasaurisJobs.objects.all()[0]

        self.assertEqual(TagasaurisJobs.objects.count(), 1)
        before = first_tag_job()
        self.assertEqual(before.voting_key, None)
        self.assertEqual(before.voting_hit, None)

        send_event('EventSamplesVoting')

        self.assertEqual(SampleMapping.objects.count(), 3)

        mapping = SampleMapping.objects.all()[0]
        self.assertEqual(mapping.crowscourcing_type, SampleMapping.TAGASAURIS)
        self.assertEqual(len(SampleMapping.objects.all()[0].external_id), 32)

        self.assertEqual(TagasaurisJobs.objects.count(), 1)
        self.assertEqual(len(first_tag_job().voting_key), 32)
        VotingHITMonitor.delay()
        self.assertEqual(len(first_tag_job().voting_hit), 32)
Esempio n. 28
0
def train(set_id):
    """
    Trains the classifier of the job owning training set `set_id` with that
    set's training samples; sends "EventClassifierTrained" when the job then
    reports a trained classifier.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    job_id = training_set.job.id

    classifier = classifier_factory.create_classifier(job_id)

    samples = (entry for entry in training_set.training_samples.all())
    classifier.train(samples, set_id=set_id)

    # Reload the job so that the trained state is read fresh.
    job = Job.objects.get(id=job_id)
    if not job.is_classifier_trained():
        return

    send_event("EventClassifierTrained", job_id=job.id)
Esempio n. 29
0
def update_btm_sample(sample_id, *args, **kwargs):
    """
        Attaches the given sample to BTM samples of the same job and url
        that have no sample yet, then sends "EventNewClassifyBTMSample" for
        each unlabelled BTM sample bound to it.
    """
    sample = Sample.objects.get(id=sample_id)
    same_page = dict(job=sample.job, url=sample.url)

    BeatTheMachineSample.objects.filter(
        sample=None, **same_page
    ).update(sample=sample)

    unlabelled = BeatTheMachineSample.objects.filter(
        sample=sample, label='', **same_page
    )
    for btm_sample in unlabelled:
        send_event(
            "EventNewClassifyBTMSample",
            sample_id=btm_sample.id,
            from_name='update_classified',
        )
Esempio n. 30
0
def train(set_id):
    """
    Feeds the training samples of set `set_id` to the classifier of the
    set's job and sends "EventClassifierTrained" once the job reports its
    classifier as trained.
    """
    training_set = TrainingSet.objects.get(id=set_id)
    owner = training_set.job

    classifier = classifier_factory.create_classifier(owner.id)
    classifier.train(
        (item for item in training_set.training_samples.all()),
        set_id=set_id,
    )

    # Fetch the job again to see the state after training.
    refreshed = Job.objects.get(id=owner.id)
    if refreshed.is_classifier_trained():
        send_event("EventClassifierTrained", job_id=refreshed.id)
Esempio n. 31
0
    def testEventSamplesVoting(self):
        """
        Sending 'EventSamplesVoting' yields three Tagasauris mappings and a
        voting key; VotingHITMonitor then fills in the voting HIT.
        """
        tag_jobs = TagasaurisJobs.objects
        mappings = SampleMapping.objects

        self.assertEqual(tag_jobs.count(), 1)
        self.assertEqual(tag_jobs.all()[0].voting_key, None)
        self.assertEqual(tag_jobs.all()[0].voting_hit, None)

        send_event('EventSamplesVoting')

        self.assertEqual(mappings.count(), 3)
        self.assertEqual(
            mappings.all()[0].crowscourcing_type,
            SampleMapping.TAGASAURIS,
        )
        self.assertEqual(len(mappings.all()[0].external_id), 32)

        self.assertEqual(tag_jobs.count(), 1)
        self.assertEqual(len(tag_jobs.all()[0].voting_key), 32)
        VotingHITMonitor.delay()
        self.assertEqual(len(tag_jobs.all()[0].voting_hit), 32)
Esempio n. 32
0
    def handle_failed(self, **event_data):
        """
        Reacts to a failed charge: stops the job (or its BTM part, depending
        on the charge type), logs a warning and sends "EventJobChargeFailed".
        """
        charge = self._get_charge(**event_data)
        kind = charge.charge_type

        if kind == charge.Type.BASE_JOB:
            charge.job.stop()
        elif kind == charge.Type.BTM_JOB:
            charge.job.stop_btm()

        message = (
            'Stripe callback: Charge {charge_id} ({charge_type}) for '
            'job {job_id} has failed! The job has been stopped.'
        ).format(
            charge_id=charge.charge_id,
            job_id=charge.job_id,
            charge_type=charge.charge_type,
        )
        log.warning(message)

        send_event(
            "EventJobChargeFailed",
            job_id=charge.job_id,
            charge_id=charge.id,
        )
Esempio n. 33
0
def update_classified_sample(sample_id, *args, **kwargs):
    """
        Attaches the given sample to classify requests of the same job and
        url that have no sample yet, then sends "EventNewClassifySample" for
        each unlabelled request bound to it.
    """
    sample = Sample.objects.get(id=sample_id)
    same_page = dict(job=sample.job, url=sample.url)

    ClassifiedSample.objects.filter(
        sample=None, **same_page
    ).update(sample=sample)

    unlabelled = ClassifiedSample.objects.filter(
        sample=sample, label='', **same_page
    )
    for class_sample in unlabelled:
        send_event(
            "EventNewClassifySample",
            sample_id=class_sample.id,
            from_name='update_classified',
        )
Esempio n. 34
0
def update_btm_sample(sample_id, *args, **kwargs):
    """
        Binds sample `sample_id` to BTM samples with the same job and url
        that lack a sample, and requests BTM classification (via
        "EventNewClassifyBTMSample") of those that have no label yet.
    """
    sample = Sample.objects.get(id=sample_id)
    btm_samples = BeatTheMachineSample.objects
    match = {'job': sample.job, 'url': sample.url}

    btm_samples.filter(sample=None, **match).update(sample=sample)

    for btm_sample in btm_samples.filter(sample=sample, label='', **match):
        send_event(
            "EventNewClassifyBTMSample",
            sample_id=btm_sample.id,
            from_name='update_classified',
        )
Esempio n. 35
0
    def create_by_owner(self, *args, **kwargs):
        """
        Creates a classified sample on behalf of the job owner.

        When a Sample for the same job and url already exists it is attached
        and "EventNewClassifySample" is sent right away; otherwise creation
        of the Sample is requested via `Sample.objects.create_by_owner`.
        Returns the object produced by `self.create`.
        """
        self._sanitize(args, kwargs)
        kwargs["source_type"] = SAMPLE_SOURCE_OWNER
        kwargs["source_val"] = ""

        try:
            existing = Sample.objects.get(job=kwargs["job"], url=kwargs["url"])
        except Sample.DoesNotExist:
            pass
        else:
            kwargs["sample"] = existing

        classified_sample = self.create(**kwargs)

        if "sample" not in kwargs:
            # No sample for this url yet - have one created.
            Sample.objects.create_by_owner(
                job_id=kwargs["job"].id,
                url=kwargs["url"],
                create_classified=False,
                vote_sample=False,
            )
        else:
            # If sample exists, step immediately to classification
            send_event("EventNewClassifySample", sample_id=classified_sample.id)

        return classified_sample
Esempio n. 36
0
def update_classified_sample(sample_id, *args, **kwargs):
    """
        Binds sample `sample_id` to classify requests with the same job and
        url that lack a sample, and requests classification (via
        "EventNewClassifySample") of those that have no label yet.
    """
    sample = Sample.objects.get(id=sample_id)
    requests = ClassifiedSample.objects
    match = {'job': sample.job, 'url': sample.url}

    requests.filter(sample=None, **match).update(sample=sample)

    for class_sample in requests.filter(sample=sample, label='', **match):
        send_event(
            "EventNewClassifySample",
            sample_id=class_sample.id,
            from_name='update_classified',
        )
Esempio n. 37
0
def watch_gold_status(job_id):
    """
    Waits until a job's gold samples are all processed, then marks them done
    and sends "EventTrainingSetCompleted" for the job's newest training set.

    While `gold_left` is not 0 the task asks to be retried every 2 minutes,
    without a retry limit.
    """
    job = Job.objects.get(id=job_id)

    if job.is_gold_samples_done():
        log.info('watch_gold_status: Job %d has gold samples done.' % job_id)
        return

    golds_in_progress = job.gold_left != 0
    if golds_in_progress:
        # If some golds are in progress, retry in 2 minutes. Indefinitely
        watch_gold_status.retry(countdown=2 * 60, max_retries=None)

    # Job samples are not done and job.gold_left == 0
    job.set_gold_samples_done()

    newest_set = TrainingSet.objects.newest_for_job(job)
    send_event(
        "EventTrainingSetCompleted",
        set_id=newest_set.id,
        job_id=job.id,
    )
Esempio n. 38
0
    def testBTMSampleIsNoVoting(self):
        """
        Two BTM samples created by workers raise the mapping count from 3 to
        5, and a further 'EventSamplesVoting' leaves it at 5.
        """

        def check_sample_counts(regular, btm):
            self.assertEqual(
                Sample.objects.filter(btm_sample=False).count(), regular)
            self.assertEqual(
                Sample.objects.filter(btm_sample=True).count(), btm)

        check_sample_counts(3, 0)

        send_event('EventSamplesVoting')

        # Only 3 gold samples! No BTM Samples!
        self.assertEqual(SampleMapping.objects.count(), 3)

        for url, worker_id in (('google.com/1', 1234), ('google.com/2', 12345)):
            BeatTheMachineSample.objects.create_by_worker(
                job=self.job,
                url=url,
                label='',
                expected_output=LABEL_YES,
                worker_id=worker_id
            )

        check_sample_counts(3, 2)

        # BTM samples should be considered as BTM_HUMAN - sent to verification
        self.assertEqual(SampleMapping.objects.count(), 5)

        Sample.objects.filter(btm_sample=True).update(vote_sample=True)

        # Sample must have screenshot
        screenshot_url = (
            "http://www.10clouds.com/media/v1334047194.07/10c/images/10c_logo.png"
        )
        Sample.objects.all().update(screenshot=screenshot_url)

        send_event('EventSamplesVoting')

        # 5 - include added BTM Samples.
        self.assertEqual(SampleMapping.objects.count(), 5)
Esempio n. 39
0
    def testBTMVotesProcess(self):
        """
        Processing three YES BTM votes must add one training set and leave
        the single BTM sample with status BTM_HOLE.
        """
        votes = (
            WorkerQualityVote(
                sample=self.btm_sample.sample,
                worker=self.workers[idx],
                label=LABEL_YES,
                btm_vote=True
            )
            for idx in xrange(3)
        )
        WorkerQualityVote.objects.bulk_create(votes)

        sets_before = TrainingSet.objects.count()
        send_event('EventProcessVotes')

        # BTM's sample has `training` == True so we can train on this BTMSample
        self.assertEqual(TrainingSet.objects.count(), sets_before + 1)
        self.assertEqual(BeatTheMachineSample.objects.count(), 1)

        btm = BeatTheMachineSample.objects.all()[0]
        self.assertEqual(btm.btm_status, BeatTheMachineSample.BTM_HOLE)
Esempio n. 40
0
    def testBTMSampleIsNoVoting(self):
        """
        Mapping count is 3 after the first 'EventSamplesVoting', 5 once two
        worker BTM samples exist, and still 5 after voting is sent again.
        """
        samples = Sample.objects
        mappings = SampleMapping.objects

        self.assertEqual(samples.filter(btm_sample=False).count(), 3)
        self.assertEqual(samples.filter(btm_sample=True).count(), 0)

        send_event('EventSamplesVoting')

        # Only 3 gold samples! No BTM Samples!
        self.assertEqual(mappings.count(), 3)

        submissions = [
            {'url': 'google.com/1', 'worker_id': 1234},
            {'url': 'google.com/2', 'worker_id': 12345},
        ]
        for submission in submissions:
            BeatTheMachineSample.objects.create_by_worker(
                job=self.job,
                url=submission['url'],
                label='',
                expected_output=LABEL_YES,
                worker_id=submission['worker_id'])

        self.assertEqual(samples.filter(btm_sample=False).count(), 3)
        self.assertEqual(samples.filter(btm_sample=True).count(), 2)

        # BTM samples should be considered as BTM_HUMAN - sent to verification
        self.assertEqual(mappings.count(), 5)

        samples.filter(btm_sample=True).update(vote_sample=True)

        # Sample must have screenshot
        logo = "http://www.10clouds.com/media/v1334047194.07/10c/images/10c_logo.png"
        samples.all().update(screenshot=logo)

        send_event('EventSamplesVoting')

        # 5 - include added BTM Samples.
        self.assertEqual(mappings.count(), 5)
Esempio n. 41
0
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the BeatTheMachineSample with the given id.

        Does nothing when the sample already has a label. When the job's
        classifier is not trained yet, or the classifier returns None, the
        task is retried with a countdown that doubles on every attempt
        (starting at 60) and is capped at 60 * 60 * 24.

        On success the label is stored, the sample's BTM status is updated and
        an `EventSampleBTM` event is sent.

        `sample_id` is the id of the BeatTheMachineSample (not of the
        underlying Sample). `from_name` and extra arguments are unused here.
    """
    log.info(
        '[BTMClassification] Got sample %d for classification.' % sample_id
    )
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        # Already classified.
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(countdown=min(60 * 2 ** current.request.retries,
            60 * 60 * 24))
        # retry() normally raises, but it can also return (e.g. eager
        # execution). Never fall through to classification in that case.
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)
    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.'
                % btm_sample.id
        )
        current.retry(countdown=min(60 * 2 ** current.request.retries,
            60 * 60 * 24))
        # Same as above: do not store a None label if retry() returned.
        return

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    # NOTE(review): `btm_sample` was loaded before the update above, so its
    # in-memory label is whatever classify() left on it - confirm that
    # updateBTMStatus() does not depend on a freshly loaded label.
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
Esempio n. 42
0
    def create_by_owner(self, *args, **kwargs):
        """
            Creates an entry through this manager on behalf of the job owner.

            Expects `job` and `url` in kwargs. When a Sample with that job and
            url already exists it is attached to the new entry and
            `EventNewClassifySample` is sent right away; otherwise creation of
            the Sample is requested via `Sample.objects.create_by_owner`.

            Returns the newly created entry.
        """
        self._sanitize(args, kwargs)
        kwargs.update(source_type=SAMPLE_SOURCE_OWNER, source_val='')

        try:
            existing = Sample.objects.get(job=kwargs['job'],
                                          url=kwargs['url'])
        except Sample.DoesNotExist:
            pass
        else:
            kwargs['sample'] = existing

        new_entry = self.create(**kwargs)

        if 'sample' not in kwargs:
            # No sample yet - ask for one to be created for this url.
            Sample.objects.create_by_owner(
                job_id=kwargs['job'].id,
                url=kwargs['url'],
                create_classified=False,
                vote_sample=False,
            )
            return new_entry

        # Sample is already there, step immediately to classification.
        send_event('EventNewClassifySample', sample_id=new_entry.id)
        return new_entry
Esempio n. 43
0
def classify_btm(sample_id, from_name='', *args, **kwargs):
    """
        Classifies the BeatTheMachineSample identified by `sample_id`.

        A sample that already carries a label is left untouched. If the job's
        classifier is not trained, or classification yields None, the task is
        retried; the countdown is 60 * 2 ** retries, capped at 60 * 60 * 24.

        After a successful classification the label is saved, the BTM status
        is refreshed and `EventSampleBTM` is sent. `from_name` and any extra
        arguments are not used by this function.
    """
    log.info('[BTMClassification] Got sample %d for classification.' %
             sample_id)
    btm_sample = BeatTheMachineSample.objects.get(id=sample_id)
    if btm_sample.label:
        # Nothing to do - the sample has been classified before.
        return

    job = btm_sample.job

    # If classifier is not trained, retry later
    if not job.is_classifier_trained():
        current.retry(countdown=min(60 * 2**current.request.retries, 60 * 60 *
                                    24))
        # retry() usually raises; when it returns instead (e.g. the task is
        # run eagerly) stop here rather than use an untrained classifier.
        return

    classifier = classifier_factory.create_classifier(job.id)
    label = classifier.classify(btm_sample)
    if label is None:
        # Something went wrong
        log.warning(
            '[BTMClassification] Got None label for sample %d. Retrying.' %
            btm_sample.id)
        current.retry(countdown=min(60 * 2**current.request.retries, 60 * 60 *
                                    24))
        # Stop here as well so that a None label is never written.
        return

    BeatTheMachineSample.objects.filter(id=sample_id).update(label=label)
    # NOTE(review): the update above bypasses the `btm_sample` instance -
    # verify updateBTMStatus() does not need the label reloaded first.
    btm_sample.updateBTMStatus()

    send_event(
        'EventSampleBTM',
        job_id=job.id,
        btm_id=btm_sample.id,
        sample_id=btm_sample.sample.id,
    )
Esempio n. 44
0
def odesk_complete(request):
    """
        Completes the oDesk OAuth flow started earlier in this session.

        Exchanges the `oauth_verifier` GET parameter for an access token using
        the client stored in the session under `odesk_client`, then reads the
        current oDesk user's info.

        * Authenticated user without an oDesk association: the oDesk id,
          profile key, token and secret are stored on the profile,
          `EventNewOdeskAssoc` is sent, and a Worker is created when none
          exists for that profile key. Always redirects to `index`.
        * Anonymous user with an already associated account: logs that account
          in and redirects to `index` (or to `login` with a session error when
          authentication fails).
        * Anonymous user without an account: registers a new user, stores the
          oDesk data on its profile, sends `EventNewOdeskAssoc`, logs the user
          in, creates a Worker when needed and redirects to `settings`.
    """
    client = request.session['odesk_client']
    token, secret = client.auth.get_access_token(request.GET['oauth_verifier'])
    client = odesk.Client(
        settings.ODESK_SERVER_KEY,
        settings.ODESK_SERVER_SECRET,
        oauth_access_token=token,
        oauth_access_token_secret=secret,
        auth='oauth',
    )
    info = client.hr.get_user('me')
    cipher = info['profile_key']

    if request.user.is_authenticated():
        # Fetch the profile once, so every change below lands on the same
        # instance regardless of how get_profile() caches its result.
        profile = request.user.get_profile()
        if profile.odesk_uid == '':
            profile.odesk_id = info['id']
            profile.odesk_uid = cipher
            profile.odesk_token = token
            profile.odesk_secret = secret
            profile.save()

            send_event("EventNewOdeskAssoc", user_id=request.user.id)
            # Add Worker model on odesk account association
            if not Worker.objects.filter(external_id=cipher).exists():
                profile.worker_entry = Worker.objects.create_odesk(
                    external_id=cipher)
                profile.save()
            request.session['success'] = 'You have successfully logged in.'
        return redirect('index')
    else:
        try:
            assoc = Account.objects.get(odesk_uid=cipher)
            u = authenticate(username=assoc.user.username, password='******')
            if not u:
                request.session['error'] = 'Such account already exists.'
                return redirect('login')
            login(request, u)
            return redirect('index')
        except Account.DoesNotExist:
            u = User.objects.create_user(email=info['email'],
                                         username='******'.join(['odesk', cipher]),
                                         password='******')
            profile = u.get_profile()
            profile.odesk_id = info['id']
            profile.odesk_uid = cipher
            profile.odesk_token = token
            profile.odesk_secret = secret
            profile.full_name = '%s %s' % (info['first_name'],
                                           info['last_name'])
            profile.save()
            # Announce the association only once it has been saved, matching
            # the order used for already authenticated users above.
            send_event("EventNewOdeskAssoc", user_id=u.id)

            u = authenticate(username=u.username, password='******')
            login(request, u)

            # Create Worker model on odesk account registration
            if not Worker.objects.filter(external_id=cipher).exists():
                w = Worker.objects.create_odesk(external_id=cipher)
                # `u` was rebound by authenticate(), so take its profile anew.
                logged_in_profile = u.get_profile()
                logged_in_profile.worker_entry = w
                logged_in_profile.save()
            request.session['success'] = 'You have successfuly registered'
            return redirect('settings')
Esempio n. 45
0
    def run(*args, **kwargs):
        """
            Builds a training set for every job that has new worker quality
            votes.

            For each such job a TrainingSet is created and filled from the
            quality algorithm's decisions, the job's gold samples and its BTM
            decisions. `EventTrainingSetCompleted` is sent when at least one
            training sample was added; otherwise the set is deleted again.
        """
        jobs_with_new_votes = Job.objects.filter(
            sample__workerqualityvote__is_new=True,
        ).annotate(Count('sample__workerqualityvote__is_new'))

        for job in jobs_with_new_votes:
            algorithm = quality_factory.create_algorithm(job)
            vote_decisions = algorithm.extract_decisions()
            training_set = TrainingSet.objects.create(job=job)
            trainable = False

            if vote_decisions:
                log.info(
                    'ProcessVotesManager: Creating training set for job %d.' %
                    job.id)

                label_by_sample = dict(vote_decisions)
                # Samples voted as broken never make it into the training set.
                usable_ids = [
                    voted_id for voted_id, voted_label in vote_decisions
                    if voted_label != LABEL_BROKEN
                ]
                voted_samples = Sample.objects.filter(
                    id__in=usable_ids, training=True).defer('id')

                for sample in voted_samples:
                    TrainingSample.objects.create(
                        set=training_set,
                        sample=sample,
                        label=label_by_sample[sample.id],
                    )
                    trainable = True

                # Gold samples always carry their gold label, replacing any
                # label derived from votes above.
                gold_backed = Sample.objects.filter(
                    job=job, goldsample__isnull=False,
                ).select_related('goldsample')
                for sample in gold_backed.iterator():
                    entry, is_new = TrainingSample.objects.get_or_create(
                        set=training_set,
                        sample=sample,
                    )
                    trainable = True

                    if not is_new:
                        log.info(
                            'ProcessVotesManager: Overridden gold sample %d.' %
                            sample.id)
                    entry.label = sample.goldsample.label
                    entry.save()

            btm_decisions = algorithm.extract_btm_decisions()
            if btm_decisions:
                log.info('ProcessVotesManager: Processing btm decisions %d.' %
                         job.id)

                for sample_id, label in btm_decisions:
                    if label == LABEL_BROKEN:
                        log.info(
                            'ProcessVotesManager: Omitted broken label of btm sample %d.'
                            % sample_id)
                        continue

                    btm = BeatTheMachineSample.objects.get(
                        sample__id=sample_id)
                    btm.recalculate_human(label)

                    if not btm.sample.training:
                        continue

                    trainable = True
                    TrainingSample.objects.create(
                        set=training_set,
                        sample=btm.sample,
                        label=label,
                    )

            if not trainable:
                # Nothing to train on - drop the empty set.
                training_set.delete()
                continue

            send_event(
                'EventTrainingSetCompleted',
                set_id=training_set.id,
                job_id=job.id,
            )
Esempio n. 46
0
def copy_sample_to_job(sample_id, job_id, source_type, label='', source_val='',
        btm_sample=False, *args, **kwargs):
    """
        Copies an existing sample (url, text, screenshot) into another job.

        The copy is announced with `EventSampleScreenshotDone` and
        `EventSampleContentDone`, since its screenshot and content are taken
        from the original sample.

        When `label` is not None a GoldSample is created for the copy and
        `EventNewGoldSample` is sent. Note that the default `label=''` is not
        None, so callers must pass `label=None` explicitly to get an ordinary
        sample (`EventNewSample`, or `EventNewBTMSample` for BTM samples).

        BTM samples are created with `vote_sample` and `training` set to
        False; all other samples get True for both.

        Returns the id of the already existing sample when an IntegrityError
        is hit, otherwise None. On any other DatabaseError the task is retried
        with a countdown of 60 * 2 ** retries, capped at 60 * 60 * 24.
    """
    try:
        old_sample = Sample.objects.get(id=sample_id)
        job = Job.objects.get(id=job_id)

        # Only non-BTM samples take part in voting and training right away.
        is_regular = not btm_sample

        new_sample = Sample.objects.create(
            job=job,
            url=old_sample.url,
            text=old_sample.text,
            screenshot=old_sample.screenshot,
            source_type=source_type,
            source_val=source_val,
            btm_sample=btm_sample,
            vote_sample=is_regular,
            training=is_regular,
        )

        send_event(
            "EventSampleScreenshotDone",
            sample_id=new_sample.id,
            sample_url=new_sample.url,
            job_id=new_sample.job_id,
        )
        send_event(
            "EventSampleContentDone",
            sample_id=new_sample.id,
            sample_url=new_sample.url,
            job_id=new_sample.job_id,
        )

        # Golden sample
        if label is not None:
            # GoldSample created successfully - pushing event.
            gold = GoldSample(
                sample=new_sample,
                label=label
            )
            gold.save()
            send_event(
                "EventNewGoldSample",
                job_id=job.id,
                gold_id=gold.id,
            )

        # Ordinary sample
        else:
            # Sample created successfully - pushing event.
            send_event(
                "EventNewBTMSample" if btm_sample else "EventNewSample",
                job_id=job.id,
                sample_id=new_sample.id,
            )

    except IntegrityError:
        # Such sample has been created in the meantime, don't do anything
        return Sample.objects.get(job=job, url=old_sample.url).id
    except DatabaseError as e:
        # Retry process on db error, such as 'Database is locked'
        copy_sample_to_job.retry(exc=e,
            countdown=min(60 * 2 ** current.request.retries, 60 * 60 * 24))