Example #1
def training(request):
    context = {} # a dict with content used in the template
    labels = Label.objects.all()
    context['labels'] = labels
    if request.method == 'POST':
        form = TrainingForm(labels, request.POST)
        # check whether it's valid:
        if form.is_valid():
            # depending on whether the classify button
            # (classifySubmit) or the train button (trainSubmit) was
            # pressed, different actions are performed
            if 'classifySubmit' in form.data:
                document = Document(document=form.data.get('trainDocument'),
                                    doc_id='doc to be classified',
                                    preprocessed=' '.join(clf.preprocessing(
                                        form.data.get('trainDocument'))),
                                    trainInstance=True)
                # the document is classified only if it exists and
                # contains words
                scores = clf.predict(document, saveScores=False)
                if scores:
                    context['scores'] = scores
                    proposals = clf.predict_label(document, scores=scores)
                    if proposals:
                        context['proposals'] = map(lambda l: l.pk, [proposals])
                    else:
                        raise ValidationError(
                            'There is no predictive model yet. Please train at least one document before classifying.',
                            code='invalid')
                    context['classifiedDocument'] = document.document

            else:
                document = Document(document=form.data.get('trainDocument'),
                                    doc_id=str(datetime.datetime.now()),
                                    preprocessed=' '.join(clf.preprocessing(
                                        form.data.get('trainDocument'))),
                                    trainInstance=True)
                document.save()
                annotation = Annotation(document=document,
                                        user=request.user,
                                        duration=-1)
                annotation.save()
                for label in form.cleaned_data['labels']:
                    annotation.labels.add(label)
                clf.online_train(document, form.cleaned_data['labels'])

            return render(request,
                          'annotation/training.html',
                          context)
        else:
            raise Http404("Ups something went wrong.")
    else:
        return render(request,
                      'annotation/training.html',
                      context)
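
All of the examples construct Document objects with the same four keyword arguments, and Example #1 also touches Label and Annotation. For orientation, here is a minimal sketch of what the underlying Django models might look like, inferred only from the fields the examples use; the field types, lengths, and defaults are assumptions rather than the project's actual definitions.

from django.conf import settings
from django.db import models


class Label(models.Model):
    # The examples filter on `label` and pass `option='radio'`.
    label = models.CharField(max_length=100)
    option = models.CharField(max_length=50, default='radio')


class Document(models.Model):
    # Raw text, an external identifier, the preprocessed token string,
    # and a flag marking training instances.
    document = models.TextField()
    doc_id = models.CharField(max_length=200)
    preprocessed = models.TextField()
    trainInstance = models.BooleanField(default=False)


class Annotation(models.Model):
    # Example #1 links an annotation to a document, a user, a duration,
    # and a set of labels.
    document = models.ForeignKey(Document, on_delete=models.CASCADE)
    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    duration = models.IntegerField()
    labels = models.ManyToManyField(Label)
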
Example #2
 def processFile(self, filename, options):
     print 'Add documents from ' + filename
     with open(filename) as csvfile:
         rows = csv.reader(csvfile, delimiter=options['delimiter'])
         count = 1
         for row in rows:
             if row:
                 print count
                 count += 1
                 iD = row[0].decode('utf-8').encode('utf-8')
                 fullTokens = row[1].decode('utf-8').encode('utf-8').split(
                     ' ')
                 if options['maxTokenCount'] == 'all':
                     maxTokenCount = len(fullTokens)
                 else:
                     maxTokenCount = int(options['maxTokenCount'])
                 text = ' '.join(fullTokens[:maxTokenCount])
                 trainInstance = False
                 if options['train']:
                     trainInstance = True
                     #
                 document = Document(document=text,
                                     doc_id=iD,
                                     preprocessed=' '.join(
                                         clf.preprocessing(text)),
                                     trainInstance=trainInstance)
                 document.save()
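
A brief usage sketch for processFile: the owning class is not shown, so `command` below is a placeholder, and the CSV layout (id in column 0, space-separated tokens in column 1) is inferred from how the row is indexed.

# Hypothetical call; only the option keys the method reads are filled in.
options = {
    'delimiter': ',',        # column separator passed to csv.reader
    'maxTokenCount': '100',  # or 'all' to keep every token
    'train': True,           # mark the imported rows as training instances
}
command.processFile('documents.csv', options)
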
Example #3
 def readTSV(self, filename):
     with open(filename) as trainfile:
         train_content = trainfile.read().decode('utf-8')
         #
     # Extract the raw label representation from the total text.
     # raw_labels = ''.join(re.findall(r'\t[\d\-]+?\n',train_content)).replace('\t', '').splitlines()
     raw_labels = ''.join(re.findall(r'(\t[\-1]{1,2})*?\n',train_content)).split('\t')[1:]
     # Get the Labels from the db.
     relevant = Label.objects.filter(label='Relevant').first()
     irrelevant = Label.objects.filter(label='Irrelevant').first()
     # Transform label representation into the true Labels.
     labels = [relevant if label=='1' else irrelevant
               for label in raw_labels]
     doc_ids = ''.join(re.findall(r'\n.+?\t', train_content)).replace('\t', '').splitlines()
     document_texts = re.findall(r'\t.+?\t', train_content)
     documents = [Document(document=document_texts[idx],
                           doc_id=doc_ids[idx],
                           preprocessed=' '.join(clf.preprocessing(
                               document_texts[idx])),
                           trainInstance=True)
                  for idx in range(len(document_texts))]
     for document in documents:
         document.save()
     rlt = (documents, map(lambda l: [l], labels))
     pprint(map(lambda r: (r[0].document[:20],r[1][0].label),zip(rlt[0], rlt[1])))
     return rlt
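
The regular expressions in this example (and the near-identical Example #5) appear to assume a tab-separated file with a document id, the document text, and a 1/-1 relevance label on each line. An illustrative, made-up record; the exact header and whitespace conventions are an assumption:

# doc_id <TAB> text <TAB> 1|-1, one record per line (made-up data)
sample_line = "d001\tthe first document text\t1\n"
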
Example #4
 def objectTransformation(self, texts, classes, labelMap):
     documents = [Document(document=texts[i],
                           doc_id=i,
                           preprocessed=' '.join(clf.preprocessing(texts[i])),
                           trainInstance=True)
                  for i in range(len(texts))]
     if not set(classes)-set(labelMap.keys()):
         labels = map(lambda t: Label(label=labelMap[t],
                                      option='radio'), classes)
         return documents, labels
     else:
         print 'Labels found in the input classes: '+str(set(classes))
         print 'Labels found in label_mapping: '+str(set(labelMap.keys()))
         return None, None
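
A hedged sketch of the inputs objectTransformation expects: parallel sequences of texts and raw class values plus a mapping from each class value to a label name. The concrete values and the `command` receiver are made up.

texts = ['first document', 'second document']
classes = ['1', '-1']
labelMap = {'1': 'Relevant', '-1': 'Irrelevant'}

# Returns Document objects plus Label objects built from labelMap,
# or (None, None) when a class value is missing from the mapping.
documents, labels = command.objectTransformation(texts, classes, labelMap)
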
Example #5
def readTSV(filename):
    with open(filename) as trainfile:
        train_content = trainfile.read().decode('utf-8')
        #
    # Extract the raw label representation from the total text.
    # raw_labels = ''.join(re.findall(r'\t[\d\-]+?\n',train_content)).replace('\t', '').splitlines()
    raw_labels = ''.join(re.findall(r'(\t[\-1]{1,2})*?\n',train_content)).split('\t')[1:]
    # Get the Labels from the db.
    relevant = Label.objects.filter(label='relevant').first()
    irrelevant = Label.objects.filter(label='irrelevant').first()
    # Transform label representation into the true Labels.
    labels = [relevant if label=='1' else irrelevant
              for label in raw_labels]
    doc_ids = ''.join(re.findall(r'\n.+?\t', train_content)).replace('\t', '').splitlines()
    document_texts = re.findall(r'\t.+?\t', train_content)
    documents = [Document(document=document_texts[idx],
                          doc_id=doc_ids[idx],
                          preprocessed=' '.join(clf.preprocessing(
                              document_texts[idx])),
                          trainInstance=True)
                 for idx in range(len(document_texts))]
    return (documents, labels)
Example #6
 def predict_labels(self, options):
     if options['delimiter'] == 't':
         options['delimiter'] = '\t'
         #
     # read test instances from file
     with open(options['testfile']) as csvfile:
         rows = list(csv.reader(csvfile, delimiter=options['delimiter']))
         #
     if len(rows[0]) == 2:
         doc_ids, texts = zip(*rows)
         trueLabels = []
     else:
         doc_ids, texts, trueLabels = zip(*rows)
         #
     # convert the file content to Document objects
     documents = map(
         lambda (doc_id, text): Document(document=text,
                                         doc_id=doc_id,
                                         preprocessed=' '.join(
                                             clf.preprocessing(text)),
                                         trainInstance=True),
         zip(doc_ids, texts))
     # predict labels
     return doc_ids, texts, map(clf.predict_label, documents), trueLabels
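
A usage sketch for predict_labels; the `command` receiver and file name are placeholders, and the test file is expected to hold either two columns (id, text) or three (id, text, true label).

# Hypothetical call; 't' is expanded to a real tab character by the method.
options = {'delimiter': 't', 'testfile': 'test_set.tsv'}
doc_ids, texts, predictions, trueLabels = command.predict_labels(options)
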
Example #7
 def handle(self, *args, **options):
     with open(options['filename']) as trainfile:
         train_content = trainfile.read()
         #
     raw_labels = ''.join(re.findall(
         r'\t[\d\-]+?\n', train_content)).replace('\t', '').splitlines()
     relevant = Label.objects.filter(label='Pos').first()
     irrelevant = Label.objects.filter(label='Neg').first()
     labels = [
         relevant if label == '1' else irrelevant for label in raw_labels
     ]
     doc_ids = ''.join(re.findall(r'\n.+?\t',
                                  train_content)).replace('\t',
                                                          '').splitlines()
     document_texts = re.findall(r'\t.+?\t', train_content)
     if options['maxTokenCount'] == 'all':
         # slicing with None keeps every token in the document
         maxTokenCount = None
     else:
         maxTokenCount = int(options['maxTokenCount'])
     document_texts = map(
         lambda d: ' '.join(
             d.decode('utf-8').encode('utf-8').split(' ')[:maxTokenCount]),
         document_texts)
     documents = [
         Document(document=document_texts[idx],
                  doc_id=doc_ids[idx],
                  preprocessed=' '.join(
                      clf.preprocessing(document_texts[idx])),
                  trainInstance=True) for idx in range(len(document_texts))
     ]
     #
     for document in documents:
         document.save()
         #
     for idx in range(len(documents)):
         clf.online_train(documents[idx], [labels[idx]])
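
Example #7 is the handle method of a Django management command, so it would normally be run through manage.py or call_command; the command name below is a placeholder, and only the two options the method actually reads are passed.

from django.core.management import call_command

# Hypothetical command name; 'filename' and 'maxTokenCount' are the only
# options the handle method above uses.
call_command('import_training_data', filename='train.tsv', maxTokenCount='all')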