Beispiel #1
0
 def updateData(self, *args):
     """Persist the reviewed label for the current document.

     Copies the toggle widget's value into this step's data, marks the
     toggle green, and — when the document is flagged as reviewed — records
     the value both in the master's reviewed_docs cache and in the
     Annotation table (updating the existing row for this doc/task pair or
     inserting a new one).

     Args:
         *args: ignored; present so this can be wired as a widget callback.
     """
     self.data = self.toggle.value
     # 'success' (green) signals the current doc's label has been captured.
     self.toggle.button_style = 'success'
     if self.reviewed:
         self.master.reviewed_docs[self.master.docs[
             self.pos_id].DOC_ID] = self.toggle.value
         with self.master.workflow.dao.create_session() as session:
             logMsg(('update data:', self.pos_id, len(self.master.docs)))
             # Look up an existing annotation for this document + task pair.
             anno = session.query(Annotation).filter(
                 and_(
                     Annotation.DOC_ID == self.master.docs[
                         self.pos_id].DOC_ID, Annotation.TASK_ID ==
                     self.master.workflow.task_id)).first()
             if anno is not None:
                 anno.REVIEWED_TYPE = self.toggle.value
                 anno.TYPE = self.prediction
             else:
                 anno = Annotation(
                     TASK_ID=self.master.workflow.task_id,
                     DOC_ID=self.master.docs[self.pos_id].DOC_ID,
                     TYPE=self.prediction,
                     REVIEWED_TYPE=self.toggle.value)
                 session.add(anno)
             # Cache a detached copy so later reads don't need a session.
             self.master.annos[self.master.docs[
                 self.pos_id].DOC_ID] = anno.clone()
     pass
    def classify(self, doc, doc_name='t_m_p.txt'):
        """Classify a single document with the pyConText rule pipeline.

        Args:
            doc: the document text to classify.
            doc_name: identifier used to key saved markups.

        Returns:
            The document-level conclusion, or the document inferencer's
            default conclusion when pyConText raises during markup.
        """
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            logMsg(
                'DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n'
                +
                'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,'
                + 'targets_file) to set them up.')
        try:
            context_doc = self.markup_context_document(doc, self.modifiers,
                                                       self.targets)
            if self.save_markups and doc_name is not None and len(
                    context_doc.getDocumentGraph().nodes()) > 0:
                self.saved_markups_map[doc_name] = context_doc
            markups = get_document_markups(context_doc)

            annotations, relations, doc_txt = convertMarkups2DF(markups)
            matched_conclusion_types = self.feature_inferencer.process(
                annotations, relations)
            doc_conclusion = self.document_inferencer.process(
                matched_conclusion_types)
        except Exception:
            # pyConText may throw in some cases; fall back to the default
            # conclusion instead of crashing the review loop. Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            doc_conclusion = self.document_inferencer.default_conclusion
        return doc_conclusion
    def __init__(self,
                 targets=None,
                 modifiers=None,
                 feature_inference_rule=None,
                 document_inference_rule=None,
                 rush_rule=None,
                 expected_values=None,
                 save_markups=True):
        """Set up the rule-based document classifier.

        Args:
            targets: target rules — a file path (csv/tsv/txt/yml) or a
                pre-built rule object; may also be an inline 'Lex\t' string.
            modifiers: modifier rules, same accepted forms as targets.
            feature_inference_rule: rule set for the feature inferencer.
            document_inference_rule: rule set for the document inferencer.
            rush_rule: path to RuSH sentence-segmentation rules; falls back
                to the configured 'rush_rules_path' when missing.
            expected_values: optional list of expected values (lowercased).
                Was a mutable default `[]`; now None to avoid sharing.
            save_markups: whether to cache context documents per doc name.
        """
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        self.expected_values = [
            value.lower() for value in (expected_values or [])
        ]
        self.saved_markups_map = dict()
        self.pyrush = None
        if rush_rule is None or not os.path.isfile(rush_rule):
            rush_rule = ConfigReader.getValue('rush_rules_path')
        if rush_rule is not None and os.path.isfile(rush_rule):
            self.pyrush = RuSH(rush_rule)
        else:
            # rush_rule may still be None when the config lookup fails;
            # guard abspath, which raises TypeError on None.
            logMsg(("File not found",
                    os.path.abspath(rush_rule)
                    if rush_rule is not None else rush_rule))
        self.last_doc_name = ''

        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                if (modifiers.endswith('.csv') or modifiers.endswith('.tsv') or modifiers.endswith(
                        '.txt') or modifiers.endswith('.yml')) \
                        and (targets.endswith('.csv') or targets.endswith('.tsv') or targets.endswith(
                    '.txt') or targets.endswith('.yml') or targets.startswith('Lex\t')):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)
        RBDocumentClassifier.instance = self
Beispiel #4
0
    def initNextStepWhileReviewing(self):
        """Queue the next word-embedding extension step.

        Pops the next (word, type) pair off the extension queue, looks up
        embedding neighbors, and appends a selection step for them; words
        with no usable candidates are skipped recursively.
        """
        if len(self.workflow.to_ext_words) == 0:
            return
        word, type_name = self.workflow.to_ext_words.pop(0)
        candidates = []
        try:
            candidates = GloveModel.glove_model.similar_by_word(word.lower())
            candidates = KeywordsUMLSExtender.filterExtended(
                [pair[0] for pair in candidates], type_name,
                self.master.workflow.filters, self.workflow.extended)
        except KeyError:
            logMsg(("word '%s' not in vocabulary" % word.lower()))

        if len(candidates) > 0:
            selection_step = RepeatWEMultipleSelection(
                description=KeywordsEmbeddingExtender.description % word,
                options=list(candidates),
                master=self.master,
                type_name=type_name)
            selection_step.setCompleteStep(self.branch_buttons[2].linked_step)
            self.workflow.append(selection_step)
        else:
            # Nothing to show for this word; move on to the next queued one.
            self.initNextStepWhileReviewing()
    def eval(self, gold_docs):
        """Evaluate this classifier against a reference-standard corpus.

        Args:
            gold_docs: mapping of doc_name -> gold document; each value is
                expected to expose ``positive_label`` (0/1) and ``text``.

        Returns:
            A 4-tuple: (false-negative doc names, false-positive doc names,
            newline-joined precision/recall/F1 summary string, confusion
            matrix DataFrame with rows/columns ordered [1, 0]).
        """
        import sklearn
        import pandas as pd
        fn_docs = []
        fp_docs = []
        prediction_metrics = []
        gold_labels = [x.positive_label for x in gold_docs.values()]
        pred_labels = []
        logMsg('Start to evaluate against reference standards...')
        for doc_name, gold_doc in gold_docs.items():
            gold_label = gold_doc.positive_label
            pred_label = self.predict(gold_doc.text, doc_name)
            pred_labels.append(pred_label)
            #       Differentiate false positive and false negative error
            if gold_label == 0 and pred_label == 1:
                fp_docs.append(doc_name)
            elif gold_label == 1 and pred_label == 0:
                fn_docs.append(doc_name)

        precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
        recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
        f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
        # Let's use Pandas to make a confusion matrix for us
        confusion_matrix_df = pd.crosstab(
            pd.Series(gold_labels, name='Actual'),
            pd.Series(pred_labels, name='Predicted'))
        prediction_metrics.append('Precision : {0:.3f}'.format(precision))
        prediction_metrics.append('Recall :    {0:.3f}'.format(recall))
        prediction_metrics.append('F1:         {0:.3f}'.format(f1))

        # Reorder so the positive class (1) comes first in both axes.
        # NOTE(review): this indexing raises KeyError when predictions or
        # gold labels contain only one class — confirm callers guarantee
        # both classes appear.
        return fn_docs, fp_docs, '\n'.join(
            prediction_metrics), confusion_matrix_df[[1, 0]].reindex([1, 0])
    def initiateRepeatStep(self):
        """Pop the next word from the UMLS extension queue and offer its
        synonym candidates; complete the loop when the queue is empty."""
        if len(self.loop_workflow.to_ext_words) == 0:
            self.complete()
            return
        word, type_name = self.loop_workflow.to_ext_words.pop(0)
        candidates = []
        try:
            candidates = KeywordsUMLSExtender.umls.search(word)
            # loop_workflow.extended remembers every candidate already shown
            # (selected or not), so the same synonym is never offered twice.
            candidates = filterExtended(candidates, type_name,
                                        self.workflow.filters,
                                        self.loop_workflow.extended)
        except KeyError:
            logMsg(("not synonym found for word '%s'" % word.lower()))

        if len(candidates) > 0:
            self.appendRepeatStep(
                RepeatMultipleSelection(
                    description=KeywordsUMLSExtender.description % word,
                    options=list(candidates),
                    master=self,
                    type_name=type_name))
        else:
            self.initiateRepeatStep()
Beispiel #7
0
 def initTraining(self):
     """Train the ML classifier on every reviewed document, then refresh
     the model's predictions for the remaining documents."""
     reviewed_count = len(self.reviewed_docs)
     texts = [doc.TEXT for doc in self.docs[:reviewed_count]]
     labels = list(self.reviewed_docs.values())
     logMsg(('start ML training: ', type(self.ml_classifier), 'x=',
             len(texts), 'y=', len(labels)))
     self.ml_classifier.train(texts, labels)
     logMsg('training finished, start to predict...')
     self.initPrediction()
 def navigate(self, b):
     """Record the current selections into the master's type filter, then
     delegate the actual navigation to the parent class."""
     self.data = self.selections.value
     if self.master is not None and self.data is not None:
         logMsg(self.data)
         self.master.workflow.filters[self.type_name].addAll(self.data)
     super().navigate(b)
Beispiel #9
0
 def readData(self):
     """Pull docs/annos from the previous workflow step and derive the
     subset of documents that already carry a reviewed label."""
     previous_step = self.workflow.steps[self.pos_id - 1]
     self.data = previous_step.data
     self.docs = self.data['docs']
     self.annos = self.data['annos']
     self.reviewed_docs = {}
     for doc_id, anno in self.annos.items():
         if anno.REVIEWED_TYPE is not None:
             self.reviewed_docs[doc_id] = anno.REVIEWED_TYPE
     logMsg(('self.docs', len(self.docs)))
     logMsg(('self.annos', len(self.annos)))
Beispiel #10
0
 def restSampling(self):
     """Discard previous sampling and reviewed data, start a new sampling."""
     logMsg('reset sampling')
     self.data['docs'].clear()
     self.data['annos'].clear()
     # Remove every persisted annotation belonging to this task.
     with self.workflow.dao.create_session() as session:
         for anno in session.query(Annotation).filter(
                 Annotation.TASK_ID == self.workflow.task_id):
             session.delete(anno)
         session.commit()
Beispiel #11
0
 def genDiv(self, doc):
     """Generate a scrollable div displaying the document text with the
     matched keywords highlighted."""
     logMsg(('self.div_height:', self.div_height))
     spacy_doc = self.nlp(doc.TEXT)
     keyword_matches = self.matcher(spacy_doc)
     highlighted = self.genHighlightTex(spacy_doc, keyword_matches)
     opening_tag = ('<div id="d1" style="overflow-y: scroll; height:'
                    + self.div_height
                    + ';border:1px solid;border-color:#e5e8e8; ">')
     return opening_tag + highlighted + '</div>'
Beispiel #12
0
 def navigate(self, b):
     """Save the current data, then either jump to the button's linked
     step, or move forward/backward based on its navigate_direction."""
     clear_output(True)
     self.updateData(b)
     logMsg((b, hasattr(b, "linked_step")))
     linked = getattr(b, 'linked_step', None)
     if linked is not None:
         linked.start()
         return
     # A missing navigate_direction means "forward" by convention.
     if getattr(b, 'navigate_direction', 1) == 1:
         self.complete()
     else:
         self.goBack()
Beispiel #13
0
 def start(self):
     """At running time, display this sample step in the notebook output cell."""
     logMsg(('start step id/total steps', self.pos_id,
             len(self.workflow.steps)))
     # Inject the master's JavaScript helpers (highlight scrolling), if any.
     if len(self.master.js) > 0:
         display(widgets.HTML(self.master.js))
     self.toggle.button_style = 'success'
     self.progress.value = self.pos_id + 1
     self.progress.description = 'Progress: {}/{}'.format(
         self.progress.value, self.progress.max)
     clear_output(True)
     display(self.box)
     # Prepare the following document while the current one is on screen.
     self.initNextDoc()
Beispiel #14
0
 def initPrediction(self):
     """Predict labels for the not-yet-reviewed documents of this task.

     Only roughly learning_pace * 1.5 documents are classified per call so
     the background refresh stays cheap.
     """
     counter = 0
     with self.workflow.dao.create_session() as session:
         # Fixed: the original filtered on `Annotation.REVIEWED_TYPE is
         # None`, a Python identity check that evaluates to False and
         # therefore matched no rows; `.is_(None)` emits `IS NULL` in SQL.
         # Also renamed `iter`, which shadowed the builtin.
         query_iter = session.query(Annotation, Document).join(
             Document, Document.DOC_ID == Annotation.DOC_ID).filter(
                 and_(Annotation.TASK_ID == self.workflow.task_id,
                      Annotation.REVIEWED_TYPE.is_(None)))
         for anno, doc in query_iter:
             if counter >= self.learning_pace * 1.5:
                 # don't need to process all the rest document for efficiency concerns
                 break
             logMsg(('predict doc: ', doc.DOC_ID, anno.TYPE))
             anno.TYPE = self.ml_classifier.classify(doc.TEXT)
             counter += 1
Beispiel #15
0
    def updateData(self, *args):
        """Push the user-entered parameter values onto the ML classifier
        class when moving on to the next step.

        Args:
            *args: ignored; present so this can be wired as a callback.
        """
        for name, widget in self.parameter_inputs.items():
            new_value = widget.value
            self.parameters[name] = new_value
            logMsg(("update settings: ", self.ml_classifier_cls, name,
                    new_value))
            # Class-level assignment so every future instance sees the value.
            setattr(self.ml_classifier_cls, name, new_value)
Beispiel #16
0
    def complete(self):
        """Write the word-embedding-extended keywords for each type back
        into this task's Filter rows, then finish the step."""
        logMsg('update word embedding extended keywords into database')
        with self.workflow.dao.create_session() as session:
            orig_filters = session.query(Filter) \
                .filter(Filter.task_id == self.workflow.task_id) \
                .filter(Filter.type == 'orig')
            for record in orig_filters:
                merged = '\n'.join(
                    self.workflow.filters[record.type_name]).strip()
                record.keyword = merged

        super().complete()
Beispiel #17
0
 def __init__(self,
              word2vec_file='models/saved/glove/glove.42B.300d.bin',
              vocab=1900000,
              vect=300):
     """Load (once) and cache the glove word-embedding model class-wide.

     Loads the binary model when present; otherwise converts the sibling
     .txt model to binary first. The loaded model is shared through
     GloveModel.glove_model, guarded by GloveModel.status so concurrent
     inits don't load twice.

     Args:
         word2vec_file: path to the binary model. Assumed to end in a
             3-char extension ('.bin'); the text model path is derived by
             swapping the last 3 chars for 'txt'.
         vocab: vocabulary-size header to prepend when converting raw txt.
         vect: vector-dimension header to prepend when converting raw txt.
     """
     glove_model = None
     if GloveModel.glove_model is None and GloveModel.status == NotInitiated:
         if path.isfile(word2vec_file):
             GloveModel.status = Initiating
             logMsg('Load glove model in the backend...')
             print('Load glove model in the backend...')
             if word2vec_file.endswith('.bin'):
                 glove_model = KeyedVectors.load_word2vec_format(
                     word2vec_file, binary=True)
                 GloveModel.status = Initiated
             else:
                 glove_model = KeyedVectors.load_word2vec_format(
                     word2vec_file, binary=False)
                 logMsg('convert txt model to binary model...')
                 glove_model.save_word2vec_format(word2vec_file[:-3] +
                                                  '.bin',
                                                  binary=True)
                 GloveModel.status = Initiated
         elif path.isfile(word2vec_file[:-3] + 'txt'):
             GloveModel.status = Initiating
             logMsg('Load glove model in the backend...')
             print('Load glove model in the backend...')
             txt_model = word2vec_file[:-3] + 'txt'
             self.addDimensions(txt_model,
                                line_to_prepend=str(vocab) + ' ' +
                                str(vect))
             glove_model = KeyedVectors.load_word2vec_format(txt_model,
                                                             binary=False)
             logMsg('convert txt model to binary model...')
             glove_model.save_word2vec_format(word2vec_file, binary=True)
             GloveModel.status = Initiated
         else:
             # Fixed message: this branch is reached when NEITHER file
             # exists — the original said "exists.", inverting the meaning.
             logMsg(("Neither ", path.abspath(word2vec_file), ' nor ',
                     path.abspath(word2vec_file[:-3] + 'txt'), ' exists.'))
             print(("Neither ", path.abspath(word2vec_file), ' nor ',
                    path.abspath(word2vec_file[:-3] + 'txt'), ' exists.'))
         GloveModel.glove_model = glove_model
Beispiel #18
0
    def initiateRepeatStep(self):
        """Pop the next word from the embedding-extension queue and offer
        its nearest-neighbor candidates; complete when the queue is empty."""
        if len(self.loop_workflow.to_ext_words) == 0:
            self.complete()
            return
        word, type_name = self.loop_workflow.to_ext_words.pop(0)
        candidates = []
        try:
            candidates = GloveModel.glove_model.similar_by_word(word.lower())
            candidates = KeywordsUMLSExtender.filterExtended(
                [pair[0] for pair in candidates], type_name,
                self.workflow.filters, self.loop_workflow.extended)
        except KeyError:
            logMsg(("word '%s' not in vocabulary" % word.lower()))

        if len(candidates) > 0:
            self.appendRepeatStep(
                RepeatWEMultipleSelection(
                    description=KeywordsEmbeddingExtender.description % word,
                    options=list(candidates),
                    master=self,
                    type_name=type_name))
        else:
            # No usable neighbors for this word; try the next queued one.
            self.initiateRepeatStep()
Beispiel #19
0
 def initNextDoc(self):
     """While displaying the current sample, prepare for the next sample.

     Appends a ReviewRB repeat step for the next document, pre-filling it
     with either the stored reviewed label or a fresh rule-based
     classification. When the rule-based pre-annotation limit
     (master.threshold) or the end of the sample list is reached, links
     this step to the master's next step instead; if the threshold is
     already exceeded, auto-navigates to the completion branch button.
     """
     if self.workflow is None:
         return
     if self.master is None:
         return
     if self.next_step is None:
         # if reach the limit of rule-base preannotation max documents or the end of samples, jump to complete
         if self.pos_id < len(
                 self.master.docs
         ) - 1 and self.pos_id < self.master.threshold - 1:
             doc = self.master.docs[self.pos_id + 1]
             logMsg(('Initiate next doc', len(self.master.docs),
                     'current pos_id:', self.pos_id))
             content = self.master.genContent(doc)
             reviewed = False
             # Prefer the human-reviewed label over a fresh classification.
             if doc.DOC_ID in self.master.annos and self.master.annos[
                     doc.DOC_ID].REVIEWED_TYPE is not None:
                 prediction = self.master.annos[doc.DOC_ID].REVIEWED_TYPE
                 reviewed = True
             else:
                 prediction = ReviewRBLoop.rb_classifier.classify(
                     doc.TEXT, doc.DOC_NAME)
             repeat_step = ReviewRB(
                 description=content,
                 options=self.master.workflow.types,
                 value=prediction,
                 js=self.js,
                 master=self.master,
                 reviewed=reviewed,
                 button_style='success' if reviewed else 'info')
             self.master.appendRepeatStep(repeat_step)
         else:
             logMsg(('Initiate next step', len(self.master.docs),
                     'current pos_id:', self.pos_id, 'master\'s next step',
                     self.master.next_step))
             self.next_step = self.master.next_step
             self.branch_buttons[1].linked_step = self.master.next_step
     elif self.pos_id >= self.master.threshold - 1:
         # Threshold already reached: jump straight to the complete branch.
         self.navigate(self.branch_buttons[2])
     pass
Beispiel #20
0
 def complete(self):
     """Finish the ML-configuration step: persist the entered settings,
     then hand control to next_step (which must be a Step)."""
     clear_output(True)
     self.updateData()
     if self.next_step is None:
         print("next step hasn't been set.")
         return
     logMsg((self, 'ML configuration complete'))
     if not isinstance(self.next_step, Step):
         raise TypeError(
             'Type error for ' + self.name +
             '\'s next_step. Only Step can be the next_step, where its next_step is '
             + str(type(self.next_step)))
     if self.workflow is not None:
         self.workflow.updateStatus(self.next_step.pos_id)
     self.next_step.start()
Beispiel #21
0
 def complete(self):
     """Finalize sampling: optionally reset previous samples, draw new
     ones, publish them on the workflow, then hand over to next_step."""
     clear_output(True)
     if self.toggle.value == sample_options[0]:
         self.restSampling()
     if sum(self.sample_sizes.values()) > 0:
         self.getSampledDocs()
     self.workflow.samples = self.data
     if self.next_step is None:
         print("next step hasn't been set.")
         return
     logMsg((self, 'workflow complete'))
     if not isinstance(self.next_step, Step):
         raise TypeError(
             'Type error for ' + self.name +
             '\'s next_step. Only Step can be the next_step, where its next_step is '
             + str(type(self.next_step)))
     if self.workflow is not None:
         self.workflow.updateStatus(self.next_step.pos_id)
     self.next_step.start()
Beispiel #22
0
 def navigate(self, b):
     """Route a button click: follow the button's own linked step, fall
     back to branch button 1's linked step, complete the repeat loop, or
     go back, depending on what is wired up."""
     clear_output(True)
     self.updateData(b)
     logMsg(('navigate to b: ', b, hasattr(b, "linked_step")))
     logMsg(('navigate to branchbutton 1',
             hasattr(self.branch_buttons[1],
                     'linked_step'), self.branch_buttons[1].linked_step))
     if hasattr(b, 'linked_step') and b.linked_step is not None:
         # 'Complete' jumps out of the repeat loop to the master step.
         if b.description == 'Complete':
             self.master.complete()
         else:
             b.linked_step.start()
         return
     fallback = getattr(self.branch_buttons[1], 'linked_step', None)
     if fallback is not None:
         fallback.start()
     elif getattr(b, 'navigate_direction', 1) == 1:
         logMsg(
             'Button ' + str(b) +
             '\'s linked_step is not set. Assume complete the Repeat loop.'
         )
         self.master.complete()
     else:
         self.goBack()
Beispiel #23
0
    def init_real_time(self):
        """Initialize the ML review loop for real-time (re)training.

        Instantiates the ML classifier for the current task, wires up the
        shared nlp/matcher from ReviewRBInit, starts background training if
        the model has never been trained, and — when the loop has no steps
        yet — pre-populates one ReviewML repeat step per reviewed document
        plus one unreviewed document (when any remain).
        """
        self.ml_classifier = self.ml_classifier_cls(
            task_name=self.workflow.task_name)
        self.learning_pace = ConfigReader.getValue("review/ml_learning_pace")
        self.loop_workflow.filters = self.workflow.filters
        self.readData()
        if self.ml_classifier_cls.status == NotTrained:
            self.backgroundTraining()

        self.nlp = ReviewRBInit.nlp
        self.matcher = ReviewRBInit.matcher

        logMsg([doc.DOC_ID for doc in self.docs])
        if self.docs is not None and len(
                self.docs) > 0 and (self.loop_workflow is None
                                    or len(self.loop_workflow.steps) == 0):
            # Show all reviewed docs plus one unreviewed doc, clamped so we
            # never index past the end of self.docs.
            last_doc_pos = len(self.reviewed_docs) + 1 if len(
                self.reviewed_docs) < len(self.docs) else len(
                    self.reviewed_docs)
            for i in range(0, last_doc_pos):
                doc = self.docs[i]
                content = self.genContent(doc)
                reviewed = False
                # Prefer stored reviewed labels over a fresh prediction.
                if doc.DOC_ID in self.annos and self.annos[
                        doc.DOC_ID].REVIEWED_TYPE is not None:
                    prediction = self.annos[doc.DOC_ID].REVIEWED_TYPE
                    reviewed = True
                else:
                    prediction = self.getPrediction(doc)
                repeat_step = ReviewML(
                    description=content,
                    options=self.workflow.types,
                    value=prediction,
                    js=self.js,
                    master=self,
                    reviewed=reviewed,
                    button_style=('success' if reviewed else 'info'))
                self.appendRepeatStep(repeat_step)
        pass
Beispiel #24
0
    def __init__(self, name=str(Step.global_id + 1), **kwargs):
        super().__init__([], name=name)
        self.docs = []
        self.data = dict()
        self.annos = dict()
        self.reviewed_docs = dict()
        self.threshold = ConfigReader.getValue('review/rb_model_threshold')
        self.nlp = None
        self.js = '''<script>
function setFocusToTextBox(){
    var spans = document.getElementsByClassName("highlighter");
    var id=document.getElementById('d1').pos
    if (id===undefined){
      id=0
    }          
    if (id>=spans.length){
        id=0
    }
    var topPos = spans[id].offsetTop;    
    dv=document.getElementById('d1')
    dv.scrollTop = topPos-20;
    dv.pos=id+1;
}
</script>'''
        self.end_js = '''<script>document.getElementById('d1').pos=0;topPos=0;</script>'''
        self.matcher = None
        self.metaColumns = ConfigReader().getValue("review/meta_columns")
        self.div_height = ConfigReader().getValue("review/div_height")
        logMsg(('self.div_height:', self.div_height))
        self.show_meta_name = ConfigReader().getValue("review/show_meta_name")
        self.hightligh_span_tag = ' <span class="highlighter" style="background-color:  %s ">' % ConfigReader(
        ).getValue("review/highlight_color")
        if 'rush_rule' in kwargs:
            self.rush_rule = kwargs['rush_rule']
        else:
            self.rush_rule = ConfigReader.getValue('rush_rules_path')

        pass
Beispiel #25
0
    def init_real_time(self):
        """Build the rule-based classifier and queue review repeat steps.

        Creates the shared rule-based classifier from the workflow filters,
        loads docs/annos from the previous step, wires the shared
        nlp/matcher, and — when the loop has no steps yet — appends one
        ReviewRB step per reviewed document plus one unreviewed document.
        Completes immediately once more than `threshold` docs are reviewed.
        """
        ReviewRBLoop.rb_classifier = RBDocumentClassifierFactory.genDocumentClassifier(
            self.workflow.filters, rush_rule=self.rush_rule)
        self.loop_workflow.filters = self.workflow.filters
        self.readData()
        self.nlp = ReviewRBInit.nlp
        self.matcher = ReviewRBInit.matcher
        if len(self.reviewed_docs) > self.threshold:
            self.complete()
            return

        if self.docs is not None and len(
                self.docs) > 0 and (self.loop_workflow is None
                                    or len(self.loop_workflow.steps) == 0):
            # Fixed: clamp to len(self.docs) — the original iterated to
            # len(reviewed_docs) + 1 unconditionally, which raised
            # IndexError when every sampled doc was already reviewed (the
            # ML variant of this method already clamps the same way).
            last_doc_pos = min(len(self.reviewed_docs) + 1, len(self.docs))
            for i in range(0, last_doc_pos):
                doc = self.docs[i]
                content = self.genContent(doc)
                reviewed = False
                if doc.DOC_ID in self.annos and self.annos[
                        doc.DOC_ID].REVIEWED_TYPE is not None:
                    prediction = self.annos[doc.DOC_ID].REVIEWED_TYPE
                    reviewed = True
                else:
                    prediction = ReviewRBLoop.rb_classifier.classify(
                        doc.TEXT, doc.DOC_NAME)
                logMsg((i, doc.DOC_ID, reviewed))
                repeat_step = ReviewRB(
                    description=content,
                    options=self.workflow.types,
                    value=prediction,
                    js=self.js,
                    master=self,
                    reviewed=reviewed,
                    button_style=('success' if reviewed else 'info'))
                self.appendRepeatStep(repeat_step)
Beispiel #26
0
 def initNextDoc(self):
     """While displaying the current sample, prepare the next one.

     Appends a ReviewML repeat step for the following document, pre-filled
     with the stored reviewed label when available or a fresh prediction
     otherwise; at the end of the sample list, links to the master's next
     step instead.
     """
     if self.workflow is None or self.master is None:
         return
     if self.next_step is not None:
         return
     if self.pos_id < len(self.master.docs) - 1:
         doc = self.master.docs[self.pos_id + 1]
         logMsg(('Initiate next doc', len(self.master.docs),
                 'current pos_id:', self.pos_id))
         content = self.master.genContent(doc)
         reviewed = False
         anno = self.master.annos.get(doc.DOC_ID)
         if anno is not None and anno.REVIEWED_TYPE is not None:
             prediction = anno.REVIEWED_TYPE
             logError(('reviewed: ', prediction))
             reviewed = True
         else:
             prediction = self.master.getPrediction(doc)
             logError(('predicted: ', prediction))
         self.master.appendRepeatStep(
             ReviewML(description=content,
                      options=self.master.workflow.types,
                      value=prediction,
                      js=self.js,
                      master=self.master,
                      reviewed=reviewed,
                      button_style='success' if reviewed else 'info'))
     else:
         logMsg(('Initiate next step', len(self.master.docs),
                 'current pos_id:', self.pos_id, 'master\'s next step',
                 self.master.next_step))
         self.next_step = self.master.next_step
         self.branch_buttons[1].linked_step = self.master.next_step
Beispiel #27
0
 def getPrediction(self, doc):
     """Return a predicted label for *doc* (a db.ORMs.Document instance).

     Preference order: previously stored annotation from the database,
     then the trained ML model, then the rule-based classifier. Also
     triggers a background retrain every learning_pace calls (when the
     model is ready).
     """
     self.step_counter += 1
     if self.step_counter >= self.learning_pace:
         if self.ml_classifier_cls.status == ReadyTrained:
             # reset counter
             self.step_counter = 0
             self.backgroundTraining()
             logMsg("Start retraining the ML model: " +
                    str(self.ml_classifier))
         else:
             logMsg("ML model: " + str(self.ml_classifier) +
                    " is not ready yet, postpone the re-training process.")
     res = None
     source = ''
     if doc.DOC_ID in self.annos:
         # A prediction was made by a previous model; re-read it from the
         # db to avoid thread conflicts when manipulating lists.
         with self.workflow.dao.create_session() as session:
             for stored in session.query(Annotation).filter(
                     and_(Annotation.TASK_ID == self.workflow.task_id,
                          Annotation.DOC_ID == doc.DOC_ID)):
                 res = stored.TYPE
                 source = 'last classification'
                 break
     if res is None:
         if self.ml_classifier_cls.status == ReadyTrained:
             res = self.ml_classifier.classify(doc.TEXT)
             source = 'current classification'
         else:
             # Fall back to the rule-based model as the default.
             res = ReviewRBLoop.rb_classifier.classify(doc.TEXT)
             source = 'rule-base classification'
     logMsg("Get classification from: " + source)
     return res
Beispiel #28
0
    def train(self, x, y):
        """Fit the bag-of-words logistic-regression model on (x, y).

        Skips training (with a log message) when any class has fewer
        examples than the cross-validation fold count — checked on the
        whole data set and again on each side of the train/test split.

        Args:
            x: list of document texts.
            y: list of class labels, parallel to x.
        """
        logMsg('training...')

        stats = Counter(y)
        for classname, count in stats.items():
            if count < self.cv:
                logMsg(
                    'The whole annotated Data does not have enough examples for all classes.  Skipping training for '
                    'class : {}'.format(classname))
                return

        # Before running a search, do an 80-20 split (CV/validation) even
        # if we do not have a lot of data to work with.
        X_text_train, X_text_test, y_train, y_test = train_test_split(
            x, y, train_size=self.train_size, random_state=self.random_state)
        train_classes, train_y_indices = np.unique(y_train,
                                                   return_inverse=True)
        test_classes, test_y_indices = np.unique(y_test, return_inverse=True)
        train_minority_instances = np.min(np.bincount(train_y_indices))
        # Fixed: originally computed from train_y_indices (copy-paste),
        # which made the TEST guard below a duplicate of the TRAIN guard.
        test_minority_instances = np.min(np.bincount(test_y_indices))
        print(
            'Train minority class instance count : {0}.  Test minority class instance count : {1}'
            .format(train_minority_instances, test_minority_instances))
        if train_minority_instances <= self.cv:
            # Fixed message: format with the task name (the original
            # interpolated the leaked loop variable `classname`).
            logMsg(
                'TRAIN data does not have enough examples (require {} cases) for all classes ({} cases) .  Skipping '
                'training for task : {}'.format(self.cv,
                                                train_minority_instances,
                                                self.task_name))
            return

        if test_minority_instances <= self.cv:
            logMsg(
                'TEST data does not have enough examples (require {} cases) for all classes ({} cases) .  Skipping '
                'training for task : {}'.format(self.cv,
                                                test_minority_instances,
                                                self.task_name))
            return

        # now we can train a model
        logMsg('Fitting model now for iterations = {}'.format(self.iterations))

        LogisticBOWClassifiers.status = InTraining
        self.model.fit(X_text_train, y_train)

        # print performances when debug logging is enabled
        if logging.getLogger().isEnabledFor(logging.DEBUG):
            logMsg('Best params for the model : {}'.format(
                self.model.best_params_))

            logMsg('REPORT for TRAINING set and task : {}'.format(
                self.task_name))
            print(
                metrics.classification_report(y_train,
                                              self.model.predict(X_text_train),
                                              target_names=train_classes))

            logMsg('REPORT for TEST set and task : {}'.format(self.task_name))
            # NOTE(review): target_names=train_classes here looks like it
            # should be test_classes — confirm before changing, since the
            # label sets usually coincide after the minority-class guards.
            print(
                metrics.classification_report(y_test,
                                              self.model.predict(X_text_test),
                                              target_names=train_classes))
        LogisticBOWClassifiers.status = ReadyTrained
 def goNext(b):
     """Button callback: log the click and complete the enclosing step.

     NOTE(review): `self` is not a parameter here — this appears to be a
     closure defined inside a method, capturing `self` from the enclosing
     scope; confirm against the full source file.
     """
     logMsg('next clicked')
     self.complete()
     pass