def processSteps(self,
                 trainDocuments, trainLabels,
                 testDocuments, testLabels,
                 split, active):
    """Incrementally train the module-level classifier `clf` in steps.

    Parameters:
        trainDocuments, trainLabels: parallel training data.
        testDocuments, testLabels: parallel held-out evaluation data.
        split: iterable of step sizes (number of training examples
            consumed per step).
        active: when True, reorder the remaining training data via
            uncertainty sampling before each step (active learning).

    Side effects: trains `clf`, saves every trained document, and writes
    all per-step predictions to a file named 'results' in the current
    working directory.
    """
    trainDoc = trainDocuments
    trainLab = trainLabels
    results = []
    exampleCtr = 0
    stepCtr = 0
    for step in split:
        stepCtr += 1
        if active:
            # Put the most uncertain remaining examples first.
            trainDoc, trainLab = sel.uncertainty_sampling(trainDoc, trainLab)
        # Train on the first `step` remaining examples.
        for idx in range(step):
            exampleCtr += 1
            print('Step ' + str(stepCtr) + ' Example ' + str(exampleCtr))
            clf.online_train(trainDoc[idx], trainLab[idx])
            trainDoc[idx].save()
        # Drop the examples that were just trained on.
        trainDoc = trainDoc[step:]
        trainLab = trainLab[step:]
        # Predict every test document with the current model.
        preds = [clf.predict(doc) for doc in testDocuments]
        results.append(preds)  # append instead of quadratic list concat
    with open('results', 'wt') as resultFile:
        pprint(results, stream=resultFile)
    # NOTE(review): run() calls self.evaluate(preds, testLabels); this call
    # passes only the labels -- confirm evaluate's signature.
    print(self.evaluate(testLabels))
# Example #2
# 0
 def run(self, trainDocs, trainLabels, testDocs, testLabels, options, active=False):
     trainD = trainDocs
     trainL = trainLabels
     instanceCtr = 0
     stepCtr = 0
     export = {}
     for step in options['split']:
         stepCtr += 1
         if active:
             [trainD, trainL] = sel.uncertainty_sampling(trainD, trainL, saveScores=False)
             #
         for i in range(step):
             instanceCtr += 1
             print 'Step ' + str(stepCtr) + ' Example ' +str(instanceCtr)
             clf.online_train(trainD[i], [trainL[i]])
             #
         preds = map(lambda t: clf.predict(t, saveScores=False), testDocs)
         accuracy = self.evaluate(preds, testLabels)
         #
         export.update({stepCtr: {"step": step,
                                  "preds": preds,
                                  "accuracy": accuracy,
                                  "runOrder": map(lambda i: trainD[i].doc_id, range(step))}})
         # remove preciding training instances
         trainD = trainD[step:]
         trainL = trainL[step:]
         #
     # clear database from all data produced by the run (WARNING: also clears data not produced by the run)
     call_command('wipeDB', 'label')
     return export
# Example #3
# 0
def training(request):
    """Django view for the manual training page.

    GET: render the training form.
    POST with 'classifySubmit': classify the submitted document and show
    scores/proposals. POST otherwise: store the document with its labels
    and train the classifier on it.
    """
    context = {}  # content handed to the template
    labels = Label.objects.all()
    context['labels'] = labels
    if request.method == 'POST':
        form = TrainingForm(labels, request.POST)
        # check whether it's valid:
        if form.is_valid():
            # Depending on whether the classify button (classifySubmit)
            # or the train button (trainSubmit) was pressed, different
            # actions are performed.
            if 'classifySubmit' in form.data:
                document = Document(document=form.data.get('trainDocument'),
                                    doc_id='doc to be classified',
                                    preprocessed=' '.join(clf.preprocessing(
                                        form.data.get('trainDocument'))),
                                    trainInstance=True)
                # Only if there is a document and it contains words will
                # it be classified (predict returns falsy otherwise).
                scores = clf.predict(document, saveScores=False)
                if scores:
                    context['scores'] = scores
                    proposals = clf.predict_label(document, scores=scores)
                    if proposals:
                        context['proposals'] = map(lambda l: l.pk, [proposals])
                    else:
                        raise ValidationError(
                            'There is no predictive model yet. Please train at least one document before classifing',
                            code='invalid')
                    context['classifiedDocument'] = document.document
            else:
                document = Document(document=form.data.get('trainDocument'),
                                    doc_id=str(datetime.datetime.now()),
                                    preprocessed=' '.join(clf.preprocessing(
                                        form.data.get('trainDocument'))),
                                    trainInstance=True)
                document.save()
                annotation = Annotation(document=document,
                                        user=request.user,
                                        duration=-1)
                annotation.save()
                # Add all selected labels at once (consistent with the
                # index view) instead of a side-effect list comprehension.
                annotation.labels.add(*form.cleaned_data['labels'])
                clf.online_train(document, form.cleaned_data['labels'])

            return render(request,
                          'annotation/training.html',
                          context)
        else:
            raise Http404("Ups something went wrong.")
    else:
        return render(request,
                      'annotation/training.html',
                      context)
# Example #4
# 0
 def handle(self, *args, **options):
     """Management command: import a TSV training file and train `clf`.

     Expects options['filename'] to point at a file whose lines look like
     <doc_id>\t<text>\t<label>, with label '1' mapping to the 'Pos' Label
     and anything else to 'Neg'. Each document is truncated to
     options['maxTokenCount'] tokens ('all' keeps everything), saved, and
     then used for online training.
     """
     with open(options['filename']) as trainfile:
         train_content = trainfile.read()
     # Labels are the tab-prefixed trailing numbers of each line.
     raw_labels = ''.join(re.findall(
         r'\t[\d\-]+?\n', train_content)).replace('\t', '').splitlines()
     relevant = Label.objects.filter(label='Pos').first()
     irrelevant = Label.objects.filter(label='Neg').first()
     labels = [
         relevant if label == '1' else irrelevant for label in raw_labels
     ]
     # Document ids are the tab-terminated leading fields of each line.
     doc_ids = ''.join(re.findall(r'\n.+?\t',
                                  train_content)).replace('\t',
                                                          '').splitlines()
     document_texts = re.findall(r'\t.+?\t', train_content)
     if options['maxTokenCount'] == 'all':
         # Bug fix: the original read `len(fullTokens)` here, but
         # `fullTokens` is undefined (NameError). A None slice bound
         # keeps every token, which is what 'all' means.
         maxTokenCount = None
     else:
         maxTokenCount = int(options['maxTokenCount'])
     # decode/encode round trip validates the text is well-formed UTF-8
     # before truncating to maxTokenCount tokens.
     document_texts = map(
         lambda d: ' '.join(
             d.decode('utf-8').encode('utf-8').split(' ')[:maxTokenCount]),
         document_texts)
     documents = [
         Document(document=document_texts[idx],
                  doc_id=doc_ids[idx],
                  preprocessed=' '.join(
                      clf.preprocessing(document_texts[idx])),
                  trainInstance=True) for idx in range(len(document_texts))
     ]
     for document in documents:
         document.save()
     for idx in range(len(documents)):
         clf.online_train(documents[idx], [labels[idx]])
# Example #5
# 0
def index(request):
    """Django view for the annotation page.

    POST: store the submitted annotation, train the classifier on the
    annotated document, delete the queue element that served it, and
    present the next document to annotate.
    GET: present the next document to annotate.
    """
    context = {}  # content handed to the template
    labels = Label.objects.all()
    context['labels'] = labels
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = AnnotationForm(labels, request.POST)
        # check whether it's valid:
        if form.is_valid():
            # create the new annotation
            old_pk = int(form.data.get('old_pk'))
            old_doc = Document.objects.get(pk=old_pk)
            annotation = Annotation(document=old_doc,
                                    user=request.user,
                                    duration=form.data.get('duration'),
                                    proposalFlag=form.data.get('oldProposalFlag'))
            annotation.save()  # save the annotation to the DB
            annotation.labels.add(*form.cleaned_data['labels'])
            oldProposals = form.data.get('oldProposals')
            if oldProposals:
                oldProposals = map(int, re.findall(r'\d+', form.data.get('oldProposals')))
            else:
                # -1 marks "no proposals were shown"
                oldProposals = [-1]
            annotation.proposals.add(*oldProposals)
            clf.online_train(old_doc, form.cleaned_data['labels'])
            # After we trained the newly annotated document, the
            # corresponding QueueElement can be deleted.
            oQE = form.data.get('oldQueueElement')
            oldQueueElement = QueueElement.objects.filter(pk=oQE).first()
            if oldQueueElement:
                oldQueueElement.delete()
            # Select and present the next document to annotate.
            document, proposalFlag, queueElement = sel.selectDocument(request.user)
            proposals = selectProposal(document, proposalFlag, onlineProposal=True)
            context['proposals'] = proposals
            context['document'] = document
            context['sessionStart'] = form.data['sessionStart']
            if queueElement:
                context['oldQueueElement'] = queueElement
                context['oldProposalFlag'] = queueElement.proposalFlag
            form = AnnotationForm(labels)
            context['form'] = form
            return render(request, 'annotation/index.html', context)
        # Bug fix: the original fell off the end of the POST branch when
        # the form was invalid, returning None (no HttpResponse, which is
        # an error in Django). Re-render the page with the bound form so
        # its validation errors can be shown.
        context['form'] = form
        return render(request, 'annotation/index.html', context)
    else:
        document, proposalFlag, queueElement = sel.selectDocument(request.user)
        context['proposals'] = selectProposal(document, proposalFlag, onlineProposal=True)
        context['document'] = document
        if queueElement:
            context['oldQueueElement'] = queueElement
            context['oldProposalFlag'] = queueElement.proposalFlag
        form = AnnotationForm(labels)
        context['form'] = form
        return render(request, 'annotation/index.html', context)
# Example #6
# 0
(trainDocuments, trainLabels) = readTSV(trainFile)
(testDocuments, testLabels) = readTSV(testFile)

# Fraction of the training set consumed in each step.
split = [0.1, 0.1, 0.2, 0.3, 0.3]
N = len(trainDocuments)
# Cumulative example counts marking the step boundaries; the final
# boundary is forced to N so rounding never drops examples.
stepCount = reduce(lambda seq, step:
                   seq + [int(round(N * step)) + seq[-1]],
                   split, [0])[:-1] + [N]
rlt = []   # per-step confusion matrices
rlta = []  # per-step accuracy scores
# create the steps
for (fromStep, toStep) in zip(stepCount[:-1], stepCount[1:]):
    # train on the examples belonging to this step
    for idx in range(fromStep, toStep):
        print(str(idx) + ' ' + str((fromStep, toStep)))
        clf.online_train(trainDocuments[idx], [trainLabels[idx]])
        trainDocuments[idx].save()
    # make predictions on the full test set with the current model
    preds = [clf.predict(doc) for doc in testDocuments]
    pprint(preds)
    # convert true labels to a binary vector and collect the metrics
    true = map(lambda p: 1 if p == Label(label='relevant') else 0, testLabels)
    # NOTE(review): preds holds raw clf.predict output, not the binary
    # labels the commented-out predict_label version produced -- confirm
    # confusion_matrix/accuracy_score receive binary predictions.
    rlt.append(confusion_matrix(true, preds))  # append, not quadratic concat
    rlta.append(accuracy_score(true, preds))
print(rlt)
print(rlta)