def updateData(self, *args):
    """save the reviewed data"""
    # Record the reviewer's choice locally and mark the toggle as confirmed.
    self.data = self.toggle.value
    self.toggle.button_style = 'success'
    if self.reviewed:
        # Keep the in-memory review cache in sync with the widget value.
        self.master.reviewed_docs[self.master.docs[
            self.pos_id].DOC_ID] = self.toggle.value
    # Upsert the Annotation row for this (doc, task) pair.
    # NOTE(review): reconstructed from a whitespace-mangled source; the
    # ``with`` block is assumed to run regardless of ``self.reviewed`` —
    # confirm against version control.
    with self.master.workflow.dao.create_session() as session:
        logMsg(('update data:', self.pos_id, len(self.master.docs)))
        anno = session.query(Annotation).filter(
            and_(
                Annotation.DOC_ID == self.master.docs[
                    self.pos_id].DOC_ID,
                Annotation.TASK_ID == self.master.workflow.task_id)).first()
        if anno is not None:
            # Existing row: store the human decision and the latest prediction.
            anno.REVIEWED_TYPE = self.toggle.value
            anno.TYPE = self.prediction
        else:
            # No row yet: create one carrying both prediction and review.
            anno = Annotation(
                TASK_ID=self.master.workflow.task_id,
                DOC_ID=self.master.docs[self.pos_id].DOC_ID,
                TYPE=self.prediction,
                REVIEWED_TYPE=self.toggle.value)
            session.add(anno)
        # Cache a copy (project-defined clone()) so later steps don't need
        # the now-closed session.
        self.master.annos[self.master.docs[
            self.pos_id].DOC_ID] = anno.clone()
    pass
def classify(self, doc, doc_name='t_m_p.txt'):
    """Classify a document's text and return the document-level conclusion.

    :param doc: document text to classify
    :param doc_name: name used as key when caching pyConText markups
    :return: the conclusion produced by the document inferencer, or its
        default conclusion if pyConText processing fails
    """
    self.last_doc_name = doc_name
    if self.modifiers is None or self.targets is None:
        logMsg(
            'DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n'
            + 'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,'
            + 'targets_file) to set them up.')
    try:
        context_doc = self.markup_context_document(doc, self.modifiers,
                                                   self.targets)
        # Optionally cache the markups for later inspection/visualization.
        if self.save_markups and doc_name is not None and len(
                context_doc.getDocumentGraph().nodes()) > 0:
            self.saved_markups_map[doc_name] = context_doc
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        matched_conclusion_types = self.feature_inferencer.process(
            annotations, relations)
        doc_conclusion = self.document_inferencer.process(
            matched_conclusion_types)
    except Exception:
        # Bug fix: the original bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt. pyConText might throw errors in some cases
        # (per the original note); fall back to the default conclusion for
        # ordinary exceptions only.
        doc_conclusion = self.document_inferencer.default_conclusion
    return doc_conclusion
def __init__(self,
             targets=None,
             modifiers=None,
             feature_inference_rule=None,
             document_inference_rule=None,
             rush_rule=None,
             expected_values=None,
             save_markups=True):
    """Rule-based document classifier built on pyConText + RuSH.

    Bug fixes: the mutable default argument ``expected_values=[]`` is
    replaced by the ``None`` sentinel (shared-list pitfall), and the
    "File not found" log no longer crashes with ``os.path.abspath(None)``
    when no rush rule path is configured at all.
    """
    self.document_inferencer = DocumentInferencer(document_inference_rule)
    self.feature_inferencer = FeatureInferencer(feature_inference_rule)
    self.conclusions = []
    self.modifiers = modifiers
    self.targets = targets
    self.save_markups = save_markups
    if expected_values is None:
        expected_values = []
    self.expected_values = [value.lower() for value in expected_values]
    self.saved_markups_map = dict()
    self.pyrush = None
    # Fall back to the configured rule path when none is given or readable.
    if rush_rule is None or not os.path.isfile(rush_rule):
        rush_rule = ConfigReader.getValue('rush_rules_path')
    if rush_rule is not None and os.path.isfile(rush_rule):
        self.pyrush = RuSH(rush_rule)
    else:
        # Guard: abspath(None) would raise TypeError.
        logMsg(("File not found",
                os.path.abspath(rush_rule) if rush_rule else rush_rule))
    self.last_doc_name = ''
    if modifiers is not None and targets is not None:
        if isinstance(modifiers, str) and isinstance(targets, str):
            # String inputs: file paths (by extension) or an inline 'Lex\t'
            # lexicon string are loaded from files; anything else is treated
            # as pre-built rule objects.
            # NOTE(review): the else-pairing below is reconstructed from a
            # whitespace-mangled source — confirm against version control.
            if (modifiers.endswith('.csv') or modifiers.endswith('.tsv')
                    or modifiers.endswith('.txt')
                    or modifiers.endswith('.yml')) \
                    and (targets.endswith('.csv') or targets.endswith('.tsv')
                         or targets.endswith('.txt')
                         or targets.endswith('.yml')
                         or targets.startswith('Lex\t')):
                self.setModifiersTargetsFromFiles(modifiers, targets)
        else:
            self.setModifiersTargets(modifiers, targets)
    RBDocumentClassifier.instance = self
def initNextStepWhileReviewing(self):
    """Queue the next word-embedding extension step while reviewing.

    Pops the next (word, type) pair, looks up embedding neighbors, filters
    out words already shown, and appends a selection step; recurses to the
    next word when nothing new is found.
    """
    if len(self.workflow.to_ext_words) == 0:
        return
    word, type_name = self.workflow.to_ext_words.pop(0)
    candidates = []
    try:
        hits = GloveModel.glove_model.similar_by_word(word.lower())
        candidates = KeywordsUMLSExtender.filterExtended(
            [pair[0] for pair in hits], type_name,
            self.master.workflow.filters, self.workflow.extended)
    except KeyError:
        logMsg(("word '%s' not in vocabulary" % word.lower()))
    if len(candidates) > 0:
        next_step = RepeatWEMultipleSelection(
            description=KeywordsEmbeddingExtender.description % word,
            options=list(candidates),
            master=self.master,
            type_name=type_name)
        next_step.setCompleteStep(self.branch_buttons[2].linked_step)
        self.workflow.append(next_step)
    else:
        # Nothing new to show for this word; try the next one.
        self.initNextStepWhileReviewing()
def eval(self, gold_docs):
    """Evaluate predictions against gold-standard documents.

    :param gold_docs: mapping of doc name -> gold document (must expose
        ``positive_label`` and ``text``)
    :return: (false-negative doc names, false-positive doc names,
        metrics text, confusion-matrix DataFrame with the positive class
        listed first on both axes)

    Bug fix: ``import sklearn`` does not guarantee the ``sklearn.metrics``
    submodule is bound as an attribute; import it explicitly.
    """
    from sklearn import metrics as sk_metrics
    import pandas as pd
    fn_docs = []
    fp_docs = []
    prediction_metrics = []
    gold_labels = [x.positive_label for x in gold_docs.values()]
    pred_labels = []
    logMsg('Start to evaluate against reference standards...')
    for doc_name, gold_doc in gold_docs.items():
        gold_label = gold_doc.positive_label
        pred_label = self.predict(gold_doc.text, doc_name)
        pred_labels.append(pred_label)
        # Differentiate false positive and false negative error
        if gold_label == 0 and pred_label == 1:
            fp_docs.append(doc_name)
        elif gold_label == 1 and pred_label == 0:
            fn_docs.append(doc_name)
    precision = sk_metrics.precision_score(gold_labels, pred_labels)
    recall = sk_metrics.recall_score(gold_labels, pred_labels)
    f1 = sk_metrics.f1_score(gold_labels, pred_labels)
    # Let's use Pandas to make a confusion matrix for us
    confusion_matrix_df = pd.crosstab(
        pd.Series(gold_labels, name='Actual'),
        pd.Series(pred_labels, name='Predicted'))
    prediction_metrics.append('Precision : {0:.3f}'.format(precision))
    prediction_metrics.append('Recall : {0:.3f}'.format(recall))
    prediction_metrics.append('F1: {0:.3f}'.format(f1))
    # [[1, 0]] + reindex([1, 0]) orders the positive class first.
    return fn_docs, fp_docs, '\n'.join(
        prediction_metrics), confusion_matrix_df[[1, 0]].reindex([1, 0])
def initiateRepeatStep(self):
    """Pop the next word, extend it via UMLS, and queue a selection step.

    When no words remain, the loop completes; when a word yields no new
    synonyms, recurse to the next word.
    """
    if len(self.loop_workflow.to_ext_words) == 0:
        self.complete()
        return
    word, type_name = self.loop_workflow.to_ext_words.pop(0)
    candidates = []
    try:
        hits = KeywordsUMLSExtender.umls.search(word)
        # self.loop_workflow.extended saved all the extended words that will be
        # displayed, no matter will be selected or not, so that the same
        # extended word won't be shown twice asking for selection
        candidates = filterExtended(hits, type_name, self.workflow.filters,
                                    self.loop_workflow.extended)
    except KeyError:
        logMsg(("not synonym found for word '%s'" % word.lower()))
    if candidates:
        self.appendRepeatStep(
            RepeatMultipleSelection(
                description=KeywordsUMLSExtender.description % word,
                options=list(candidates),
                master=self,
                type_name=type_name))
    else:
        self.initiateRepeatStep()
def initTraining(self):
    """Train the ML classifier on the reviewed documents, then predict.

    Assumes the first ``len(self.reviewed_docs)`` entries of ``self.docs``
    line up with the reviewed labels.
    """
    texts = [doc.TEXT for doc in self.docs[:len(self.reviewed_docs)]]
    labels = list(self.reviewed_docs.values())
    logMsg(('start ML training: ', type(self.ml_classifier), 'x=',
            len(texts), 'y=', len(labels)))
    self.ml_classifier.train(texts, labels)
    logMsg('training finished, start to predict...')
    self.initPrediction()
def navigate(self, b):
    """Record the current selection into the type's filter, then delegate
    navigation to the parent class."""
    self.data = self.selections.value
    if not (self.master is None or self.data is None):
        logMsg(self.data)
        self.master.workflow.filters[self.type_name].addAll(self.data)
    super().navigate(b)
def readData(self):
    """Pull docs/annotations from the previous step and index the subset
    that already carries a human review."""
    previous_step = self.workflow.steps[self.pos_id - 1]
    self.data = previous_step.data
    self.docs = self.data['docs']
    self.annos = self.data['annos']
    self.reviewed_docs = {}
    for doc_id, anno in self.annos.items():
        if anno.REVIEWED_TYPE is not None:
            self.reviewed_docs[doc_id] = anno.REVIEWED_TYPE
    logMsg(('self.docs', len(self.docs)))
    logMsg(('self.annos', len(self.annos)))
def restSampling(self):
    """discard previous sampling and reviewed data, start a new sampling"""
    logMsg('reset sampling')
    self.data['docs'].clear()
    self.data['annos'].clear()
    # Drop every annotation row belonging to this task.
    with self.workflow.dao.create_session() as session:
        task_annos = session.query(Annotation).filter(
            Annotation.TASK_ID == self.workflow.task_id)
        for anno in task_annos:
            session.delete(anno)
        session.commit()
def genDiv(self, doc):
    """generate scrollable div to display the text content with keywords highlighted"""
    logMsg(('self.div_height:', self.div_height))
    spacy_doc = self.nlp(doc.TEXT)
    matches = self.matcher(spacy_doc)
    highlighted = self.genHighlightTex(spacy_doc, matches)
    # Assemble the scrollable container around the highlighted text.
    pieces = [
        '<div id="d1" style="overflow-y: scroll; height:',
        self.div_height,
        ';border:1px solid;border-color:#e5e8e8; ">',
        highlighted,
        '</div>',
    ]
    return ''.join(pieces)
def navigate(self, b):
    """Persist this step's data, then move according to the clicked button:
    its linked step if any, otherwise forward (complete) or backward."""
    clear_output(True)
    self.updateData(b)
    logMsg((b, hasattr(b, "linked_step")))
    linked = getattr(b, 'linked_step', None)
    if linked is not None:
        linked.start()
    elif getattr(b, 'navigate_direction', 1) == 1:
        # Missing direction defaults to "forward".
        self.complete()
    else:
        self.goBack()
def start(self):
    """In running time, start to display a sample in the notebook output cell"""
    logMsg(('start step id/total steps', self.pos_id,
            len(self.workflow.steps)))
    # Inject the master's JS helper (highlight scrolling) when present.
    if len(self.master.js) > 0:
        display(widgets.HTML(self.master.js))
    self.toggle.button_style = 'success'
    self.progress.value = self.pos_id + 1
    self.progress.description = ('Progress: ' + str(self.progress.value) +
                                 '/' + str(self.progress.max))
    clear_output(True)
    display(self.box)
    # While this sample is shown, pre-build the next one.
    self.initNextDoc()
def initPrediction(self):
    """Pre-compute ML predictions for documents not yet reviewed.

    Processes at most ``learning_pace * 1.5`` documents so the UI stays
    responsive.

    Bug fix: the original filter used ``Annotation.REVIEWED_TYPE is None``
    — a Python identity test on the Column object, evaluated to ``False``
    when the query was built, so the filter silently matched no rows.
    ``.is_(None)`` emits the intended SQL ``IS NULL`` comparison.
    """
    counter = 0
    with self.workflow.dao.create_session() as session:
        pending = session.query(Annotation, Document).join(
            Document, Document.DOC_ID == Annotation.DOC_ID).filter(
                and_(Annotation.TASK_ID == self.workflow.task_id,
                     Annotation.REVIEWED_TYPE.is_(None)))
        for anno, doc in pending:
            if counter >= self.learning_pace * 1.5:
                # don't need to process all the rest document for efficiency concerns
                break
            logMsg(('predict doc: ', doc.DOC_ID, anno.TYPE))
            anno.TYPE = self.ml_classifier.classify(doc.TEXT)
            counter += 1
def updateData(self, *args):
    """data related operations when click a button to move on to next step"""
    # Push each widget's current value into this step's parameter dict and
    # onto the classifier class itself.
    for name, widget in self.parameter_inputs.items():
        current = widget.value
        self.parameters[name] = current
        # directly change the value of class variables
        logMsg(("update settings: ", self.ml_classifier_cls, name, current))
        setattr(self.ml_classifier_cls, name, current)
def complete(self):
    """Persist the (embedding-extended) keywords of each type back into the
    task's Filter rows, then finish the step."""
    logMsg('update word embedding extended keywords into database')
    with self.workflow.dao.create_session() as session:
        orig_filters = (session.query(Filter)
                        .filter(Filter.task_id == self.workflow.task_id)
                        .filter(Filter.type == 'orig'))
        for record in orig_filters:
            words = self.workflow.filters[record.type_name]
            record.keyword = '\n'.join(words).strip()
    super().complete()
def __init__(self, word2vec_file='models/saved/glove/glove.42B.300d.bin',
             vocab=1900000, vect=300):
    """Lazily load a GloVe model into the class-level singleton.

    Loads the binary model when present; otherwise converts a sibling
    ``.txt`` model into binary format for faster future loads.
    NOTE(review): the ``[:-3] + 'txt'`` fallback assumes the path ends with
    a three-character extension ('.bin') — confirm for other paths.

    Bug fix: the final branch runs when NEITHER file exists, but the
    original message claimed "Either ... exists."; it now reads
    "Neither ... nor ... exists."
    """
    glove_model = None
    if GloveModel.glove_model is None and GloveModel.status == NotInitiated:
        if path.isfile(word2vec_file):
            GloveModel.status = Initiating
            logMsg('Load glove model in the backend...')
            print('Load glove model in the backend...')
            if word2vec_file.endswith('.bin'):
                glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=True)
                GloveModel.status = Initiated
            else:
                glove_model = KeyedVectors.load_word2vec_format(
                    word2vec_file, binary=False)
                logMsg('convert txt model to binary model...')
                glove_model.save_word2vec_format(
                    word2vec_file[:-3] + '.bin', binary=True)
                GloveModel.status = Initiated
        elif path.isfile(word2vec_file[:-3] + 'txt'):
            GloveModel.status = Initiating
            logMsg('Load glove model in the backend...')
            print('Load glove model in the backend...')
            txt_model = word2vec_file[:-3] + 'txt'
            # Prepend the "<vocab> <dim>" header the word2vec format requires.
            self.addDimensions(txt_model,
                               line_to_prepend=str(vocab) + ' ' + str(vect))
            glove_model = KeyedVectors.load_word2vec_format(txt_model,
                                                            binary=False)
            logMsg('convert txt model to binary model...')
            glove_model.save_word2vec_format(word2vec_file, binary=True)
            GloveModel.status = Initiated
        else:
            logMsg(("Neither ", path.abspath(word2vec_file), ' nor ',
                    path.abspath(word2vec_file[:-3] + 'txt'), ' exists.'))
            print(("Neither ", path.abspath(word2vec_file), ' nor ',
                   path.abspath(word2vec_file[:-3] + 'txt'), ' exists.'))
        # Assign inside the guard so a second construction cannot clobber an
        # already-loaded model with None.
        GloveModel.glove_model = glove_model
def initiateRepeatStep(self):
    """Pop the next word, extend it via word embeddings, and queue a
    selection step; complete the loop when no words remain."""
    if len(self.loop_workflow.to_ext_words) == 0:
        self.complete()
        return
    word, type_name = self.loop_workflow.to_ext_words.pop(0)
    candidates = []
    try:
        hits = GloveModel.glove_model.similar_by_word(word.lower())
        candidates = KeywordsUMLSExtender.filterExtended(
            [pair[0] for pair in hits], type_name, self.workflow.filters,
            self.loop_workflow.extended)
    except KeyError:
        logMsg(("word '%s' not in vocabulary" % word.lower()))
    if candidates:
        self.appendRepeatStep(
            RepeatWEMultipleSelection(
                description=KeywordsEmbeddingExtender.description % word,
                options=list(candidates),
                master=self,
                type_name=type_name))
    else:
        # Nothing new for this word; move on to the next one.
        self.initiateRepeatStep()
def initNextDoc(self):
    """while displaying the current sample, prepare for the next sample"""
    if self.workflow is None:
        return
    if self.master is None:
        return
    if self.next_step is None:
        # if reach the limit of rule-base preannotation max documents or the end of samples, jump to complete
        if self.pos_id < len(
                self.master.docs
        ) - 1 and self.pos_id < self.master.threshold - 1:
            doc = self.master.docs[self.pos_id + 1]
            logMsg(('Initiate next doc', len(self.master.docs),
                    'current pos_id:', self.pos_id))
            content = self.master.genContent(doc)
            reviewed = False
            # Prefer an existing human review over a fresh rule-based prediction.
            if doc.DOC_ID in self.master.annos and self.master.annos[
                    doc.DOC_ID].REVIEWED_TYPE is not None:
                prediction = self.master.annos[doc.DOC_ID].REVIEWED_TYPE
                reviewed = True
            else:
                prediction = ReviewRBLoop.rb_classifier.classify(
                    doc.TEXT, doc.DOC_NAME)
            repeat_step = ReviewRB(
                description=content,
                options=self.master.workflow.types,
                value=prediction,
                js=self.js,
                master=self.master,
                reviewed=reviewed,
                button_style='success' if reviewed else 'info')
            self.master.appendRepeatStep(repeat_step)
        else:
            # Out of docs (or over threshold): chain into the master's next step.
            logMsg(('Initiate next step', len(self.master.docs),
                    'current pos_id:', self.pos_id, 'master\'s next step',
                    self.master.next_step))
            self.next_step = self.master.next_step
            self.branch_buttons[1].linked_step = self.master.next_step
    elif self.pos_id >= self.master.threshold - 1:
        # A next step already exists and the threshold is reached: auto-advance
        # via the third branch button.
        self.navigate(self.branch_buttons[2])
    pass
def complete(self):
    """Save the ML configuration, then advance to the next step."""
    clear_output(True)
    self.updateData()
    if self.next_step is None:
        print("next step hasn't been set.")
        return
    logMsg((self, 'ML configuration complete'))
    if not isinstance(self.next_step, Step):
        raise TypeError(
            'Type error for ' + self.name +
            '\'s next_step. Only Step can be the next_step, where its next_step is '
            + str(type(self.next_step)))
    if self.workflow is not None:
        self.workflow.updateStatus(self.next_step.pos_id)
    self.next_step.start()
def complete(self):
    """Finish the sampling step: optionally reset, draw new samples, store
    them on the workflow, then advance."""
    clear_output(True)
    if self.toggle.value == sample_options[0]:
        self.restSampling()
    if sum(self.sample_sizes.values()) > 0:
        self.getSampledDocs()
    self.workflow.samples = self.data
    if self.next_step is None:
        print("next step hasn't been set.")
        return
    logMsg((self, 'workflow complete'))
    if not isinstance(self.next_step, Step):
        raise TypeError(
            'Type error for ' + self.name +
            '\'s next_step. Only Step can be the next_step, where its next_step is '
            + str(type(self.next_step)))
    if self.workflow is not None:
        self.workflow.updateStatus(self.next_step.pos_id)
    self.next_step.start()
def navigate(self, b):
    """Handle a navigation click inside the repeat loop.

    Priority order: the clicked button's linked step (with 'Complete'
    ending the whole loop), then branch button 1's linked step, then
    complete-or-go-back based on ``navigate_direction``.
    """
    clear_output(True)
    self.updateData(b)
    logMsg(('navigate to b: ', b, hasattr(b, "linked_step")))
    logMsg(('navigate to branchbutton 1',
            hasattr(self.branch_buttons[1], 'linked_step'),
            self.branch_buttons[1].linked_step))
    if hasattr(b, 'linked_step') and b.linked_step is not None:
        if b.description == 'Complete':
            # The 'Complete' button ends the whole loop, not just this step.
            self.master.complete()
        else:
            b.linked_step.start()
    else:
        # Fall back to the forward branch button's linked step when the
        # clicked button has none of its own.
        if hasattr(self.branch_buttons[1], 'linked_step'
                   ) and self.branch_buttons[1].linked_step is not None:
            self.branch_buttons[1].linked_step.start()
        elif not hasattr(
                b, 'navigate_direction') or b.navigate_direction == 1:
            logMsg(
                'Button ' + str(b) +
                '\'s linked_step is not set. Assume complete the Repeat loop.'
            )
            self.master.complete()
        else:
            self.goBack()
    pass
def init_real_time(self):
    """Set up the ML-assisted review loop: instantiate the classifier, read
    the previous step's data, kick off training if needed, and pre-build a
    review step per already-reviewed doc plus the next unreviewed one."""
    self.ml_classifier = self.ml_classifier_cls(
        task_name=self.workflow.task_name)
    # Number of reviews between retraining rounds (see getPrediction).
    self.learning_pace = ConfigReader.getValue("review/ml_learning_pace")
    self.loop_workflow.filters = self.workflow.filters
    self.readData()
    if self.ml_classifier_cls.status == NotTrained:
        # Train in the background so the UI stays responsive.
        self.backgroundTraining()
    self.nlp = ReviewRBInit.nlp
    self.matcher = ReviewRBInit.matcher
    logMsg([doc.DOC_ID for doc in self.docs])
    if self.docs is not None and len(
            self.docs) > 0 and (self.loop_workflow is None
                                or len(self.loop_workflow.steps) == 0):
        # One step per reviewed doc, plus one more when unreviewed docs remain.
        last_doc_pos = len(self.reviewed_docs) + 1 if len(
            self.reviewed_docs) < len(self.docs) else len(
                self.reviewed_docs)
        for i in range(0, last_doc_pos):
            doc = self.docs[i]
            content = self.genContent(doc)
            reviewed = False
            # Prefer an existing human review over a fresh prediction.
            if doc.DOC_ID in self.annos and self.annos[
                    doc.DOC_ID].REVIEWED_TYPE is not None:
                prediction = self.annos[doc.DOC_ID].REVIEWED_TYPE
                reviewed = True
            else:
                prediction = self.getPrediction(doc)
            repeat_step = ReviewML(
                description=content,
                options=self.workflow.types,
                value=prediction,
                js=self.js,
                master=self,
                reviewed=reviewed,
                button_style=('success' if reviewed else 'info'))
            self.appendRepeatStep(repeat_step)
    pass
def __init__(self, name=None, **kwargs):
    """Review-loop init step: holds docs/annotation state plus the notebook
    JS used to scroll between keyword highlights.

    :param name: step name. Bug fix: the original default
        ``name=str(Step.global_id + 1)`` was evaluated once at function
        definition time, so every unnamed instance shared the same stale
        name; it is now computed per instance.
    :param kwargs: may carry ``rush_rule`` to override the configured
        sentence-segmentation rule path.
    """
    if name is None:
        name = str(Step.global_id + 1)
    super().__init__([], name=name)
    self.docs = []
    self.data = dict()
    self.annos = dict()
    self.reviewed_docs = dict()
    self.threshold = ConfigReader.getValue('review/rb_model_threshold')
    self.nlp = None
    # JS helper: each call scrolls #d1 to the next .highlighter span,
    # cycling back to the first when past the end.
    self.js = '''<script>
function setFocusToTextBox(){
    var spans = document.getElementsByClassName("highlighter");
    var id=document.getElementById('d1').pos
    if (id===undefined){
        id=0
    }
    if (id>=spans.length){
        id=0
    }
    var topPos = spans[id].offsetTop;
    dv=document.getElementById('d1')
    dv.scrollTop = topPos-20;
    dv.pos=id+1;
}
</script>'''
    # Resets the highlight cursor when a doc view is (re)rendered.
    self.end_js = '''<script>document.getElementById('d1').pos=0;topPos=0;</script>'''
    self.matcher = None
    # NOTE(review): the mix of ConfigReader.getValue (class-level) and
    # ConfigReader().getValue (instance) calls mirrors the original —
    # confirm which form the project intends.
    self.metaColumns = ConfigReader().getValue("review/meta_columns")
    self.div_height = ConfigReader().getValue("review/div_height")
    logMsg(('self.div_height:', self.div_height))
    self.show_meta_name = ConfigReader().getValue("review/show_meta_name")
    self.hightligh_span_tag = ' <span class="highlighter" style="background-color: %s ">' % ConfigReader(
    ).getValue("review/highlight_color")
    # Allow a per-step RuSH rule override; otherwise use the configured path.
    if 'rush_rule' in kwargs:
        self.rush_rule = kwargs['rush_rule']
    else:
        self.rush_rule = ConfigReader.getValue('rush_rules_path')
def init_real_time(self):
    """Build the rule-based review loop up to the first unreviewed doc.

    Bug fix: the loop bound ``len(self.reviewed_docs) + 1`` could exceed
    ``len(self.docs)`` when every sampled doc was already reviewed,
    raising IndexError; it is now clamped to the number of docs.
    """
    ReviewRBLoop.rb_classifier = RBDocumentClassifierFactory.genDocumentClassifier(
        self.workflow.filters, rush_rule=self.rush_rule)
    self.loop_workflow.filters = self.workflow.filters
    self.readData()
    self.nlp = ReviewRBInit.nlp
    self.matcher = ReviewRBInit.matcher
    if len(self.reviewed_docs) > self.threshold:
        # Enough documents reviewed already; skip straight to completion.
        self.complete()
        return
    if self.docs is not None and len(
            self.docs) > 0 and (self.loop_workflow is None
                                or len(self.loop_workflow.steps) == 0):
        last_doc_pos = min(len(self.reviewed_docs) + 1, len(self.docs))
        for i in range(0, last_doc_pos):
            doc = self.docs[i]
            content = self.genContent(doc)
            reviewed = False
            # Prefer an existing human review over a fresh rule-based prediction.
            if doc.DOC_ID in self.annos and self.annos[
                    doc.DOC_ID].REVIEWED_TYPE is not None:
                prediction = self.annos[doc.DOC_ID].REVIEWED_TYPE
                reviewed = True
            else:
                prediction = ReviewRBLoop.rb_classifier.classify(
                    doc.TEXT, doc.DOC_NAME)
            logMsg((i, doc.DOC_ID, reviewed))
            repeat_step = ReviewRB(
                description=content,
                options=self.workflow.types,
                value=prediction,
                js=self.js,
                master=self,
                reviewed=reviewed,
                button_style=('success' if reviewed else 'info'))
            self.appendRepeatStep(repeat_step)
def initNextDoc(self):
    """while displaying the current sample, prepare for the next sample"""
    if self.workflow is None:
        return
    if self.master is None:
        return
    if self.next_step is None:
        if self.pos_id < len(self.master.docs) - 1:
            doc = self.master.docs[self.pos_id + 1]
            logMsg(('Initiate next doc', len(self.master.docs),
                    'current pos_id:', self.pos_id))
            content = self.master.genContent(doc)
            reviewed = False
            # Prefer an existing human review over a fresh ML prediction.
            if doc.DOC_ID in self.master.annos and self.master.annos[
                    doc.DOC_ID].REVIEWED_TYPE is not None:
                prediction = self.master.annos[doc.DOC_ID].REVIEWED_TYPE
                # NOTE(review): logError is used here for a non-error trace —
                # possibly intended as logMsg; confirm.
                logError(('reviewed: ', prediction))
                reviewed = True
            else:
                prediction = self.master.getPrediction(doc)
                logError(('predicted: ', prediction))
            repeat_step = ReviewML(
                description=content,
                options=self.master.workflow.types,
                value=prediction,
                js=self.js,
                master=self.master,
                reviewed=reviewed,
                button_style='success' if reviewed else 'info')
            self.master.appendRepeatStep(repeat_step)
        else:
            # Ran out of sampled docs: hand off to the master's next step.
            logMsg(('Initiate next step', len(self.master.docs),
                    'current pos_id:', self.pos_id, 'master\'s next step',
                    self.master.next_step))
            self.next_step = self.master.next_step
            self.branch_buttons[1].linked_step = self.master.next_step
    pass
def getPrediction(self, doc):
    """doc is an instance of db.ORMs.Document

    Return a predicted type for ``doc``, preferring (1) a classification
    already stored in the database, then (2) the trained ML model, then
    (3) the rule-based classifier. Also triggers background retraining
    every ``learning_pace`` calls when the model is ready.
    """
    self.step_counter += 1
    if self.step_counter >= self.learning_pace:
        if self.ml_classifier_cls.status == ReadyTrained:
            # reset counter
            self.step_counter = 0
            self.backgroundTraining()
            logMsg("Start retraining the ML model: " +
                   str(self.ml_classifier))
        else:
            # Model still busy: keep counting and retry on a later call.
            logMsg("ML model: " + str(self.ml_classifier) +
                   " is not ready yet, postpone the re-training process.")
    res = None
    source = ''
    if doc.DOC_ID in self.annos:
        # if prediction has been made by previous model
        # just re-read from db to avoid thread conflict when manipulating lists, may improve later
        with self.workflow.dao.create_session() as session:
            anno_iter = session.query(Annotation).filter(
                and_(Annotation.TASK_ID == self.workflow.task_id,
                     Annotation.DOC_ID == doc.DOC_ID))
            for anno in anno_iter:
                res = anno.TYPE
                source = 'last classification'
                break
    if res is None:
        if self.ml_classifier_cls.status == ReadyTrained:
            # if model is trained
            res = self.ml_classifier.classify(doc.TEXT)
            source = 'current classification'
        else:
            # try rule-based model as default
            res = ReviewRBLoop.rb_classifier.classify(doc.TEXT)
            source = 'rule-base classification'
    logMsg("Get classification from: " + source)
    return res
def train(self, x, y):
    """Fit the bag-of-words model on texts ``x`` with labels ``y``.

    Performs a train/test split for CV/validation and skips training
    entirely (returning early) when any class lacks enough examples.

    Bug fixes: ``test_minority_instances`` was computed from
    ``train_y_indices`` (copy/paste), masking undersized TEST classes —
    it now uses ``test_y_indices`` — and the TEST warning string literal
    (corrupted in the source) is reconstructed to mirror the TRAIN one.
    """
    logMsg('training...')
    stats = Counter(y)
    for classname, count in stats.items():
        if count < self.cv:
            logMsg(
                'The whole annotated Data does not have enoguh examples for all classes. Skipping training for '
                'class : {}'.format(classname))
            return
    # before we run a search, let's do an 80-20 split for (CV/Validation )
    # even if we do not have a lot of data to work with
    X_text_train, X_text_test, y_train, y_test = train_test_split(
        x, y, train_size=self.train_size, random_state=self.random_state)
    train_classes, train_y_indices = np.unique(y_train, return_inverse=True)
    test_classes, test_y_indices = np.unique(y_test, return_inverse=True)
    train_minority_instances = np.min(np.bincount(train_y_indices))
    # Fixed: previously used train_y_indices here.
    test_minority_instances = np.min(np.bincount(test_y_indices))
    print(
        'Train minority class instance count : {0}. Test minority class instance count : {1}'
        .format(train_minority_instances, test_minority_instances))
    # NOTE(review): ``classname`` below is the leftover loop variable from
    # the Counter check above, not the undersized class — confirm intent.
    if train_minority_instances <= self.cv:
        logMsg(
            'TRAIN data does not have enoguh examples (require {} cases) for all classes ({} cases) . Skipping '
            'training for task : {}'.format(self.cv,
                                            train_minority_instances,
                                            classname))
        return
    if test_minority_instances <= self.cv:
        logMsg(
            'TEST data does not have enoguh examples (require {} cases) for all classes ({} cases) . Skipping '
            'training for task : {}'.format(self.cv,
                                            test_minority_instances,
                                            classname))
        return
    # now we can train a model
    logMsg('Fitting model now for iterations = {}'.format(self.iterations))
    LogisticBOWClassifiers.status = InTraining
    self.model.fit(X_text_train, y_train)
    # print performances
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logMsg('Best params for the model : {}'.format(
            self.model.best_params_))
        logMsg('REPORT for TRAINING set and task : {}'.format(
            self.task_name))
        print(
            metrics.classification_report(y_train,
                                          self.model.predict(X_text_train),
                                          target_names=train_classes))
        logMsg('REPORT for TEST set and task : {}'.format(self.task_name))
        print(
            metrics.classification_report(y_test,
                                          self.model.predict(X_text_test),
                                          target_names=train_classes))
    LogisticBOWClassifiers.status = ReadyTrained
def goNext(b):
    # NOTE(review): ``self`` is not a parameter here — this appears to be a
    # nested callback defined inside a method, capturing ``self`` from the
    # enclosing scope (ipywidgets button handlers receive the clicked button
    # as ``b``). Confirm against the enclosing definition.
    logMsg('next clicked')
    self.complete()
    pass