def _extract_decisions(self, all_votes):
    """
        Picks the majority label for every object that received votes.

        `all_votes` is an iterable of (worker_id, object_id, label) triples.
        Each label is normalised with make_label; votes whose label is not
        recognised are logged and skipped. Every counted vote is also
        flagged as processed (is_new=False) on WorkerQualityVote.

        The resulting list of (object_id, winning_label) pairs is passed to
        self.calculate_workers_quality and then returned. Objects that only
        received unrecognised votes do not appear in the result.
    """
    votes = {}
    for worker_id, object_id, raw_label in all_votes:
        label = make_label(raw_label)
        if not label:
            # Report the value as it was received - the normalised label
            # is empty at this point and would tell us nothing.
            log.warning(
                '[MajorityVoting] Got unrecognized sample %s' % raw_label
            )
            continue

        # Counters are created lazily, on the first valid vote for an object.
        counts = votes.setdefault(object_id, {
            LABEL_YES: 0,
            LABEL_NO: 0,
            LABEL_BROKEN: 0,
        })
        counts[label] = counts.get(label, 0) + 1

        WorkerQualityVote.objects.filter(
            worker=worker_id,
            sample=object_id,
        ).update(is_new=False)

    # Ties are resolved by whichever label max() meets first, i.e. by the
    # iteration order of the per-object counter dict.
    decisions = [
        (object_id, max(counts.items(), key=lambda item: item[1])[0])
        for object_id, counts in votes.items()
    ]
    self.calculate_workers_quality(data=decisions)
    return decisions
def _papi_classify(self, sample):
    """
        Asks the Google Prediction API to classify the given sample.

        Returns None when no model is configured, when the response contains
        a label that make_label does not recognise, or when anything raises
        while talking to the API (the exception is logged). Otherwise
        returns a dict holding the response's 'outputLabel' under 'label',
        the per-label scores (rounded to 3 places) under
        'labels_probability' and the untouched response under 'result'.
    """
    if self.model is None:
        return None

    try:
        request_body = {'input': {'csvInstance': [sample.text]}}
        response = self.papi.predict(
            body=request_body,
            id=self.model,
        ).execute()

        # Start from the defaults and overwrite with what the API reported.
        probabilities = self.get_default_probabilities()
        for entry in response['outputMulti']:
            rounded_score = round(entry['score'], 3)
            known_label = make_label(entry['label'])
            if not known_label:
                log.warning(
                    '[GooglePrediction] Got unrecognized label %s'
                    % entry['label']
                )
                return None
            probabilities[known_label] = rounded_score

        # The winning label is only validated here; the value handed back
        # to the caller is the one the API returned.
        if not make_label(response['outputLabel']):
            log.warning(
                '[GooglePrediction] Got unrecognized label %s'
                % response['outputLabel']
            )
            return None

        return {
            'label': response['outputLabel'],
            'labels_probability': probabilities,
            'result': response,
        }
    except Exception:
        log.exception(
            '[GooglePrediction] Error while classifying sample %d'
            % sample.id
        )
        return None
def recalculate_human(self, cat_h):
    """
        Re-evaluates the btm status once a human has voted on the sample.

        `cat_h` is the human-assigned label. Frozen samples are left alone
        (a warning is logged and None is returned). If either the stored
        label or the human label is the broken label, self.BTM_NO_STATUS is
        returned and nothing is saved. In every other case the status is
        derived from the confidence level and from whether each label
        matches the expected output; then points are updated, the human
        label is stored and the sample is saved.
    """
    if self.frozen:
        log.warning(
            "Tried to update BTMSample %s which is frozen(paid)." % self.id
        )
        return

    level = self.confidence_level(self.confidence)
    human = cat_h.lower()
    classifier = self.label.lower()

    broken = LABEL_BROKEN.lower()
    if classifier == broken or human == broken:
        return self.BTM_NO_STATUS

    expected = self.expected_output.lower()
    classifier_ok = classifier == expected
    human_ok = human == expected

    # Any other confidence level leaves the current btm_status untouched.
    if level == self.CONF_HIGH:
        if classifier_ok:
            self.btm_status = self.BTM_KNOWN
        elif human_ok:
            self.btm_status = self.BTM_HOLE
        else:
            self.btm_status = self.BTM_NOT_NONX
    elif level == self.CONF_MEDIUM:
        if classifier_ok:
            if human_ok:
                self.btm_status = self.BTM_KNOWN_UNSURE
            else:
                self.btm_status = self.BTM_X_CORRECTED
        else:
            if human_ok:
                self.btm_status = self.BTM_NOTX_CORRECTED
            else:
                self.btm_status = self.BTM_KNOWN_UNSURE

    self.update_points(self.btm_status)
    self.human_label = make_label(human)
    self.save()
def project_wizard(request): acc = request.user.get_profile() max_jobs = acc.job_limits.get('max_jobs', settings.USER_MAX_JOBS) jobs_count = acc.job_set.all().count() can_create = request.user.is_superuser or not max_jobs \ or max_jobs > jobs_count context = { 'stripe_key': settings.STRIPE_PUBLISHABLE, 'free_sources': ', '.join(imap(lambda x: str(x), JOB_FREE_SOURCES)), } if request.method == "GET": context.update({ 'topic_form': WizardTopicForm(), 'attributes_form': WizardAttributesForm(), 'additional_form': WizardAdditionalForm() }) if not can_create: context['wizard_error'] = ('You have reached a limit of maximum ' 'jobs created') else: topic_form = WizardTopicForm(request.POST) attr_form = WizardAttributesForm(request.POST) addt_form = WizardAdditionalForm(request.POST, request.FILES) stripe_token = request.POST.get('stripeToken', None) context.update({ 'topic_form': topic_form, 'attributes_form': attr_form, 'additional_form': addt_form }) if not can_create: context['wizard_error'] = ('You have reached a limit of maximum ' 'jobs created') if (addt_form.is_valid() and attr_form.is_valid() and topic_form.is_valid() and can_create): params = { 'account': request.user.get_profile(), 'title': topic_form.cleaned_data['topic'], 'description': topic_form.cleaned_data['topic_desc'], 'data_source': attr_form.cleaned_data['data_source'], 'no_of_urls': attr_form.cleaned_data['no_of_urls'], 'same_domain_allowed': addt_form.cleaned_data['same_domain'], 'add_filler_samples': addt_form.cleaned_data['add_filler_samples'] } if not params['no_of_urls']: context['wizard_error'] = ( 'You need to specify number of urls to collect.') return render(request, 'main/project/wizard.html', RequestContext(request, context)) max_urls = acc.job_limits.get('max_urls_per_job', settings.USER_MAX_URLS_PER_JOB) if (not request.user.is_superuser and max_urls and params['no_of_urls'] > max_urls): context['wizard_error'] = ( 'You have entered too many urls to' ' gather. 
Youre allowed to collect at max %d urls.' % max_urls) return render(request, 'main/project/wizard.html', RequestContext(request, context)) # Gold urls file is required in the form. Since it's valid by now # - the file is present gold_file = request.FILES.get('file_gold_urls', None) gold_positive = [ (x, 'yes') for x in addt_form.cleaned_data['gold_urls_positive'].splitlines() ] gold_negative = [ (x, 'no') for x in addt_form.cleaned_data['gold_urls_negative'].splitlines() ] url_set = set() label_set = set() try: if gold_file: urls = csv.reader(gold_file) else: urls = gold_positive + gold_negative gold_samples = [] for url, label in urls: if url in url_set: continue label = make_label(label) if not label: log.warning( 'Got wrong label when parsing gold samples: %s' % label) continue url_set.add(url) label_set.add(label) gold_samples.append({'url': url, 'label': label}) if len(url_set) < 6: context['wizard_error'] = ( 'You have to provide at least 6 different ' 'gold samples.') return render(request, 'main/project/wizard.html', RequestContext(request, context)) if len(label_set) < 2: context['wizard_error'] = ( 'You have to provide at least 2 different labels.') return render(request, 'main/project/wizard.html', RequestContext(request, context)) params['gold_samples'] = json.dumps(gold_samples) except csv.Error, e: request.session['error'] = e return redirect('index') except:
def project_wizard(request): acc = request.user.get_profile() max_jobs = acc.job_limits.get('max_jobs', settings.USER_MAX_JOBS) jobs_count = acc.job_set.all().count() can_create = request.user.is_superuser or not max_jobs \ or max_jobs > jobs_count context = { 'stripe_key': settings.STRIPE_PUBLISHABLE, 'free_sources': ', '.join(imap(lambda x: str(x), JOB_FREE_SOURCES)), } if request.method == "GET": context.update({ 'topic_form': WizardTopicForm(), 'attributes_form': WizardAttributesForm(), 'additional_form': WizardAdditionalForm() }) if not can_create: context['wizard_error'] = ('You have reached a limit of maximum ' 'jobs created') else: topic_form = WizardTopicForm(request.POST) attr_form = WizardAttributesForm(request.POST) addt_form = WizardAdditionalForm(request.POST, request.FILES) stripe_token = request.POST.get('stripeToken', None) context.update({ 'topic_form': topic_form, 'attributes_form': attr_form, 'additional_form': addt_form }) if not can_create: context['wizard_error'] = ('You have reached a limit of maximum ' 'jobs created') if (addt_form.is_valid() and attr_form.is_valid() and topic_form.is_valid() and can_create): params = { 'account': request.user.get_profile(), 'title': topic_form.cleaned_data['topic'], 'description': topic_form.cleaned_data['topic_desc'], 'data_source': attr_form.cleaned_data['data_source'], 'no_of_urls': attr_form.cleaned_data['no_of_urls'], 'same_domain_allowed': addt_form.cleaned_data['same_domain'], 'add_filler_samples': addt_form.cleaned_data['add_filler_samples'] } if not params['no_of_urls']: context['wizard_error'] = ( 'You need to specify number of urls to collect.' ) return render(request, 'main/project/wizard.html', RequestContext(request, context)) max_urls = acc.job_limits.get('max_urls_per_job', settings.USER_MAX_URLS_PER_JOB) if (not request.user.is_superuser and max_urls and params['no_of_urls'] > max_urls): context['wizard_error'] = ('You have entered too many urls to' ' gather. 
Youre allowed to collect at max %d urls.' % max_urls) return render(request, 'main/project/wizard.html', RequestContext(request, context)) # Gold urls file is required in the form. Since it's valid by now # - the file is present gold_file = request.FILES.get('file_gold_urls', None) gold_positive = [(x, 'yes') for x in addt_form.cleaned_data['gold_urls_positive'].splitlines()] gold_negative = [(x, 'no') for x in addt_form.cleaned_data['gold_urls_negative'].splitlines()] url_set = set() label_set = set() try: if gold_file: urls = csv.reader(gold_file) else: urls = gold_positive + gold_negative gold_samples = [] for url, label in urls: if url in url_set: continue label = make_label(label) if not label: log.warning( 'Got wrong label when parsing gold samples: %s' % label ) continue url_set.add(url) label_set.add(label) gold_samples.append({'url': url, 'label': label}) if len(url_set) < 6: context['wizard_error'] = ( 'You have to provide at least 6 different ' 'gold samples.' ) return render(request, 'main/project/wizard.html', RequestContext(request, context)) if len(label_set) < 2: context['wizard_error'] = ( 'You have to provide at least 2 different labels.' ) return render(request, 'main/project/wizard.html', RequestContext(request, context)) params['gold_samples'] = json.dumps(gold_samples) except csv.Error, e: request.session['error'] = e return redirect('index') except: