Ejemplo n.º 1
0
    def _extract_decisions(self, all_votes):
        """
            Reduces raw worker votes to one majority decision per object.

            `all_votes` is an iterable of (worker_id, object_id, label)
            tuples. Each label is normalized with `make_label`; a vote whose
            label is not recognized is logged and skipped. For every counted
            vote the matching WorkerQualityVote rows are flagged
            `is_new=False`.

            Returns a list of (object_id, label) tuples holding the label
            with the highest vote count for each object. The same list is
            handed to `calculate_workers_quality` before returning.
        """
        votes = {}
        for worker_id, object_id, raw_label in all_votes:
            label = make_label(raw_label)
            if not label:
                # Report the value that was actually received; the normalized
                # label is always falsy on this path and carries no
                # information.
                log.warning(
                    '[MajorityVoting] Got unrecognized sample %s'
                    % (raw_label,)
                )
                continue

            # Create the per-object tally only when the first valid vote for
            # that object shows up.
            if object_id not in votes:
                votes[object_id] = {
                    LABEL_YES: 0,
                    LABEL_NO: 0,
                    LABEL_BROKEN: 0,
                }
            counts = votes[object_id]
            counts[label] = counts.get(label, 0) + 1

            WorkerQualityVote.objects.filter(
                worker=worker_id,
                sample=object_id,
            ).update(is_new=False)

        # max() returns the first maximal key in iteration order, which is the
        # same tie-breaking the item-based lookup had.
        decisions = [(object_id, max(counts, key=counts.get))
                     for object_id, counts in votes.items()]
        self.calculate_workers_quality(data=decisions)

        return decisions
Ejemplo n.º 2
0
    def _extract_decisions(self, all_votes):
        """
            Reduces raw worker votes to one majority decision per object.

            `all_votes` is an iterable of (worker_id, object_id, label)
            tuples. Each label is normalized with `make_label`; a vote whose
            label is not recognized is logged and skipped. For every counted
            vote the matching WorkerQualityVote rows are flagged
            `is_new=False`.

            Returns a list of (object_id, label) tuples holding the label
            with the highest vote count for each object. The same list is
            handed to `calculate_workers_quality` before returning.
        """
        votes = {}
        for worker_id, object_id, raw_label in all_votes:
            label = make_label(raw_label)
            if not label:
                # Report the value that was actually received; the normalized
                # label is always falsy on this path and carries no
                # information.
                log.warning(
                    '[MajorityVoting] Got unrecognized sample %s'
                    % (raw_label,)
                )
                continue

            # Create the per-object tally only when the first valid vote for
            # that object shows up.
            if object_id not in votes:
                votes[object_id] = {
                    LABEL_YES: 0,
                    LABEL_NO: 0,
                    LABEL_BROKEN: 0,
                }
            counts = votes[object_id]
            counts[label] = counts.get(label, 0) + 1

            WorkerQualityVote.objects.filter(
                worker=worker_id,
                sample=object_id,
            ).update(is_new=False)

        # max() returns the first maximal key in iteration order, which is the
        # same tie-breaking the item-based lookup had.
        decisions = [(object_id, max(counts, key=counts.get))
                     for object_id, counts in votes.items()]
        self.calculate_workers_quality(data=decisions)

        return decisions
Ejemplo n.º 3
0
    def _papi_classify(self, sample):
        """
            Classifies the given sample through a Google Prediction API call.

            Returns None when no model is set, when the response contains a
            label that `make_label` does not recognize, or when any exception
            is raised along the way (the exception is logged). Otherwise
            returns a dict with:
              'label'              - the output label exactly as returned by
                                     the API,
              'labels_probability' - the result of
                                     `get_default_probabilities()` updated
                                     with each returned score rounded to 3
                                     places,
              'result'             - the full API response.
        """
        if self.model is None:
            return None

        try:
            request_body = {'input': {'csvInstance': [sample.text]}}
            prediction = self.papi.predict(body=request_body, id=self.model)
            response = prediction.execute()

            probabilities = self.get_default_probabilities()
            for entry in response['outputMulti']:
                rounded_score = round(entry['score'], 3)
                known_label = make_label(entry['label'])
                if not known_label:
                    # A single unknown label invalidates the whole response.
                    log.warning(
                        '[GooglePrediction] Got unrecognized label %s'
                        % entry['label']
                    )
                    return None
                probabilities[known_label] = rounded_score

            output_label = response['outputLabel']
            if not make_label(output_label):
                log.warning(
                    '[GooglePrediction] Got unrecognized label %s'
                    % output_label
                )
                return None

            # The normalized label is only used for validation; callers get
            # the label as the API reported it.
            return {
                'label': output_label,
                'labels_probability': probabilities,
                'result': response,
            }
        except Exception:
            log.exception(
                '[GooglePrediction] Error while classifying sample %d' % sample.id
            )
            return None
Ejemplo n.º 4
0
    def recalculate_human(self, cat_h):
        """ Re-derives the btm status of this sample once a human has voted
        on an unsure sample.

        `cat_h` is the human-assigned category; it is compared
        case-insensitively with the classifier label (`self.label`) and with
        `self.expected_output`.

        Does nothing (besides logging a warning) when the sample is frozen.
        Returns `self.BTM_NO_STATUS` without saving when either label is
        LABEL_BROKEN. Otherwise updates `btm_status` for high and medium
        confidence, passes the status to `update_points`, stores the
        normalized human label and saves the sample.
        """
        if self.frozen:
            log.warning(
                "Tried to update BTMSample %s which is frozen(paid)." % self.id
            )
            return

        confidence = self.confidence_level(self.confidence)

        human = cat_h.lower()
        classifier = self.label.lower()

        broken = LABEL_BROKEN.lower()
        if classifier == broken or human == broken:
            return self.BTM_NO_STATUS

        expected = self.expected_output.lower()
        classifier_hit = classifier == expected
        human_hit = human == expected

        if confidence == self.CONF_HIGH:
            if classifier_hit:
                # A confident, correct classifier needs no human input.
                self.btm_status = self.BTM_KNOWN
            elif human_hit:
                self.btm_status = self.BTM_HOLE
            else:
                self.btm_status = self.BTM_NOT_NONX
        elif confidence == self.CONF_MEDIUM:
            if classifier_hit == human_hit:
                # Classifier and human are either both right or both wrong.
                self.btm_status = self.BTM_KNOWN_UNSURE
            elif classifier_hit:
                self.btm_status = self.BTM_X_CORRECTED
            else:
                self.btm_status = self.BTM_NOTX_CORRECTED
        # Any other confidence level leaves btm_status as it was.

        self.update_points(self.btm_status)
        self.human_label = make_label(human)
        self.save()
Ejemplo n.º 5
0
def project_wizard(request):
    acc = request.user.get_profile()
    max_jobs = acc.job_limits.get('max_jobs', settings.USER_MAX_JOBS)
    jobs_count = acc.job_set.all().count()
    can_create = request.user.is_superuser or not max_jobs \
        or max_jobs > jobs_count

    context = {
        'stripe_key': settings.STRIPE_PUBLISHABLE,
        'free_sources': ', '.join(imap(lambda x: str(x), JOB_FREE_SOURCES)),
    }

    if request.method == "GET":
        context.update({
            'topic_form': WizardTopicForm(),
            'attributes_form': WizardAttributesForm(),
            'additional_form': WizardAdditionalForm()
        })
        if not can_create:
            context['wizard_error'] = ('You have reached a limit of maximum '
                                       'jobs created')
    else:
        topic_form = WizardTopicForm(request.POST)
        attr_form = WizardAttributesForm(request.POST)
        addt_form = WizardAdditionalForm(request.POST, request.FILES)
        stripe_token = request.POST.get('stripeToken', None)

        context.update({
            'topic_form': topic_form,
            'attributes_form': attr_form,
            'additional_form': addt_form
        })

        if not can_create:
            context['wizard_error'] = ('You have reached a limit of maximum '
                                       'jobs created')

        if (addt_form.is_valid() and attr_form.is_valid()
                and topic_form.is_valid() and can_create):
            params = {
                'account': request.user.get_profile(),
                'title': topic_form.cleaned_data['topic'],
                'description': topic_form.cleaned_data['topic_desc'],
                'data_source': attr_form.cleaned_data['data_source'],
                'no_of_urls': attr_form.cleaned_data['no_of_urls'],
                'same_domain_allowed': addt_form.cleaned_data['same_domain'],
                'add_filler_samples':
                addt_form.cleaned_data['add_filler_samples']
            }

            if not params['no_of_urls']:
                context['wizard_error'] = (
                    'You need to specify number of urls to collect.')
                return render(request, 'main/project/wizard.html',
                              RequestContext(request, context))

            max_urls = acc.job_limits.get('max_urls_per_job',
                                          settings.USER_MAX_URLS_PER_JOB)

            if (not request.user.is_superuser and max_urls
                    and params['no_of_urls'] > max_urls):
                context['wizard_error'] = (
                    'You have entered too many urls to'
                    ' gather. Youre allowed to collect at max %d urls.' %
                    max_urls)
                return render(request, 'main/project/wizard.html',
                              RequestContext(request, context))

            # Gold urls file is required in the form. Since it's valid by now
            # - the file is present
            gold_file = request.FILES.get('file_gold_urls', None)
            gold_positive = [
                (x, 'yes') for x in
                addt_form.cleaned_data['gold_urls_positive'].splitlines()
            ]
            gold_negative = [
                (x, 'no') for x in
                addt_form.cleaned_data['gold_urls_negative'].splitlines()
            ]

            url_set = set()
            label_set = set()
            try:
                if gold_file:
                    urls = csv.reader(gold_file)
                else:
                    urls = gold_positive + gold_negative

                gold_samples = []
                for url, label in urls:
                    if url in url_set:
                        continue

                    label = make_label(label)
                    if not label:
                        log.warning(
                            'Got wrong label when parsing gold samples: %s' %
                            label)
                        continue
                    url_set.add(url)
                    label_set.add(label)
                    gold_samples.append({'url': url, 'label': label})

                if len(url_set) < 6:
                    context['wizard_error'] = (
                        'You have to provide at least 6 different '
                        'gold samples.')
                    return render(request, 'main/project/wizard.html',
                                  RequestContext(request, context))

                if len(label_set) < 2:
                    context['wizard_error'] = (
                        'You have to provide at least 2 different labels.')
                    return render(request, 'main/project/wizard.html',
                                  RequestContext(request, context))

                params['gold_samples'] = json.dumps(gold_samples)

            except csv.Error, e:
                request.session['error'] = e
                return redirect('index')
            except:
Ejemplo n.º 6
0
def project_wizard(request):
    acc = request.user.get_profile()
    max_jobs = acc.job_limits.get('max_jobs',
        settings.USER_MAX_JOBS)
    jobs_count = acc.job_set.all().count()
    can_create = request.user.is_superuser or not max_jobs \
        or max_jobs > jobs_count

    context = {
        'stripe_key': settings.STRIPE_PUBLISHABLE,
        'free_sources': ', '.join(imap(lambda x: str(x), JOB_FREE_SOURCES)),
    }

    if request.method == "GET":
        context.update({
            'topic_form': WizardTopicForm(),
            'attributes_form': WizardAttributesForm(),
            'additional_form': WizardAdditionalForm()
        })
        if not can_create:
            context['wizard_error'] = ('You have reached a limit of maximum '
                'jobs created')
    else:
        topic_form = WizardTopicForm(request.POST)
        attr_form = WizardAttributesForm(request.POST)
        addt_form = WizardAdditionalForm(request.POST, request.FILES)
        stripe_token = request.POST.get('stripeToken', None)

        context.update({
            'topic_form': topic_form,
            'attributes_form': attr_form,
            'additional_form': addt_form
        })

        if not can_create:
            context['wizard_error'] = ('You have reached a limit of maximum '
                'jobs created')

        if (addt_form.is_valid() and
                attr_form.is_valid() and
                topic_form.is_valid() and
                can_create):
            params = {
                'account': request.user.get_profile(),
                'title': topic_form.cleaned_data['topic'],
                'description': topic_form.cleaned_data['topic_desc'],
                'data_source': attr_form.cleaned_data['data_source'],
                'no_of_urls': attr_form.cleaned_data['no_of_urls'],
                'same_domain_allowed': addt_form.cleaned_data['same_domain'],
                'add_filler_samples': addt_form.cleaned_data['add_filler_samples']
            }

            if not params['no_of_urls']:
                context['wizard_error'] = (
                    'You need to specify number of urls to collect.'
                )
                return render(request, 'main/project/wizard.html',
                    RequestContext(request, context))

            max_urls = acc.job_limits.get('max_urls_per_job',
                settings.USER_MAX_URLS_PER_JOB)

            if (not request.user.is_superuser and
                    max_urls and params['no_of_urls'] > max_urls):
                context['wizard_error'] = ('You have entered too many urls to'
                    ' gather. Youre allowed to collect at max %d urls.'
                    % max_urls)
                return render(request, 'main/project/wizard.html',
                    RequestContext(request, context))

            # Gold urls file is required in the form. Since it's valid by now
            # - the file is present
            gold_file = request.FILES.get('file_gold_urls', None)
            gold_positive = [(x, 'yes') for x in
                addt_form.cleaned_data['gold_urls_positive'].splitlines()]
            gold_negative = [(x, 'no') for x in
                addt_form.cleaned_data['gold_urls_negative'].splitlines()]

            url_set = set()
            label_set = set()
            try:
                if gold_file:
                    urls = csv.reader(gold_file)
                else:
                    urls = gold_positive + gold_negative

                gold_samples = []
                for url, label in urls:
                    if url in url_set:
                        continue

                    label = make_label(label)
                    if not label:
                        log.warning(
                            'Got wrong label when parsing gold samples: %s'
                            % label
                        )
                        continue
                    url_set.add(url)
                    label_set.add(label)
                    gold_samples.append({'url': url, 'label': label})

                if len(url_set) < 6:
                    context['wizard_error'] = (
                        'You have to provide at least 6 different '
                        'gold samples.'
                    )
                    return render(request, 'main/project/wizard.html',
                        RequestContext(request, context))

                if len(label_set) < 2:
                    context['wizard_error'] = (
                        'You have to provide at least 2 different labels.'
                    )
                    return render(request, 'main/project/wizard.html',
                        RequestContext(request, context))

                params['gold_samples'] = json.dumps(gold_samples)

            except csv.Error, e:
                request.session['error'] = e
                return redirect('index')
            except: