def process(self, doc_text):
    """Run the full pyContextNLP pipeline over *doc_text*.

    Returns a 4-tuple: (doc_class, context_doc, annotations, relations).
    """
    context_doc = pyConTextGraph.ConTextDocument()
    # Mark up one sentence at a time and accumulate into the document graph.
    for span in self.sentence_segmenter.segToSentenceSpans(doc_text):
        lowered = doc_text[span.begin:span.end].lower()
        marked = markup_sentence(lowered, modifiers=self.modifiers, targets=self.targets)
        context_doc.addMarkup(marked)
        context_doc.getSectionMarkups()
    # Flatten the graph markups into dataframes.
    annotations, relations, _doc_txt = convertMarkups2DF(get_document_markups(context_doc))
    # Infer feature-level types from modifier/target relations, then classify.
    inferenced_types = self.feature_inferencer.process(annotations, relations)
    doc_class = self.document_inferencer.process(inferenced_types)
    return doc_class, context_doc, annotations, relations
def classify(self, doc, doc_name='t_m_p.txt'):
    """Classify *doc* and return the document-level conclusion string.

    If pyConText raises during markup/inference, falls back to the
    document inferencer's default conclusion instead of propagating.
    """
    self.last_doc_name = doc_name
    if self.modifiers is None or self.targets is None:
        logMsg(
            'DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n' +
            'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,' +
            'targets_file) to set them up.')
    try:
        context_doc = self.markup_context_document(doc, self.modifiers, self.targets)
        # Optionally cache the markup graph for later inspection/visualization.
        if self.save_markups and doc_name is not None and len(
                context_doc.getDocumentGraph().nodes()) > 0:
            self.saved_markups_map[doc_name] = context_doc
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        matched_conclusion_types = self.feature_inferencer.process(annotations, relations)
        doc_conclusion = self.document_inferencer.process(matched_conclusion_types)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. pyConText may throw errors in some cases; fall
        # back to the default conclusion until the root cause is fixed.
        doc_conclusion = self.document_inferencer.default_conclusion
    return doc_conclusion
def classify_doc(self, doc, doc_name='t_m_p.txt'):
    """Classify a single document and return the conclusion string."""
    self.last_doc_name = doc_name
    if self.modifiers is None or self.targets is None:
        print('DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n' +
              'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,' +
              'targets_file) to set them up.')
    context_doc = markup_context_document(doc, self.modifiers, self.targets)
    # Keep the markup graph around (keyed by doc name) when caching is on
    # and the document actually produced any nodes.
    has_nodes = len(context_doc.getDocumentGraph().nodes()) > 0
    if doc_name is not None and self.save_markups and has_nodes:
        self.saved_markups_map[doc_name] = context_doc
    annotations, relations, doc_txt = convertMarkups2DF(get_document_markups(context_doc))
    matched_types = self.feature_inferencer.process(annotations, relations)
    return self.document_inferencer.process(matched_types)
def mark_document_with_html(doc, colors=None, default_color="black"):
    """takes a ConTextDocument object and returns an HTML paragraph with marked
    phrases in the object highlighted with the colors coded in colors

    doc: ConTextDocument
    colors: dictionary keyed by ConText category with values valid HTML colors;
        defaults to {"name": "red", "pet": "blue"} when omitted
    default_color: color used for categories not present in *colors*
    """
    from pyConTextNLP.display.html import __sort_by_span
    from pyConTextNLP.utils import get_document_markups
    # Fixed: the original used a mutable dict as the default argument value,
    # which is shared across calls; use a None sentinel instead.
    if colors is None:
        colors = {"name": "red", "pet": "blue"}
    return """<p> {0} </p>""".format(
        " ".join(
            mark_text_custom(m.graph['__txt'],
                             __sort_by_span(m.nodes()),
                             colors=colors,
                             default_color=default_color)
            for m in get_document_markups(doc)))
# See what the document was splitted into for sentence in sentences: print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end, input[sentence.begin:sentence.end])) print('\n' + '-' * 100 + '\n') # initiate a pyConTextGraph to hold the pyConText output context_doc = pyConTextGraph.ConTextDocument() for sentence in sentences: sentence_text = input[sentence.begin:sentence.end].lower() # Process every sentence by adding markup m = markup_sentence(sentence_text, modifiers=modifiers, targets=targets) context_doc.addMarkup(m) context_doc.getSectionMarkups() print(m) # convert graphic markups into dataframe markups = get_document_markups(context_doc) annotations, relations, doc_txt = convertMarkups2DF(markups) head(annotations) # apply inferences for document classication inferenced_types = feature_inferencer.process(annotations, relations) print('After inferred from modifier values, we got these types:\n ' + str(inferenced_types)) doc_class = document_inferencer.process(inferenced_types) print('\nDocument classification: ' + doc_class)
def gen_html_from_context_doc(self, doc, filter_no_markup_txt=True):
    """Render a ConTextDocument as HTML.

    Returns (html, doc_txt, annotation_count).
    """
    markups = get_document_markups(doc)
    annotations, relations, doc_txt = convertMarkups2DF(markups)
    # Skip the first doc_txt row when building the HTML body.
    html = self.gen_html_from_dfs(doc_txt[1:], annotations, relations)
    return html, doc_txt, len(annotations)
class DocumentClassifier(object):
    """Rule-based document classifier built on pyConTextNLP.

    Marks up documents with modifiers/targets, runs feature- and
    document-level inference, and can evaluate predictions against a
    reference standard.
    """

    def __init__(self, targets=None, modifiers=None, feature_inference_rule=None,
                 document_inference_rule=None, expected_values=None, save_markups=True):
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        # Fixed: the original crashed with the documented default
        # (expected_values=None); treat None as "no expected values".
        self.expected_values = [] if expected_values is None \
            else [value.lower() for value in expected_values]
        self.saved_markups_map = dict()
        self.last_doc_name = ''
        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                # Strings are treated as rule-file paths only when both carry
                # a recognized extension; otherwise nothing is set (as before).
                rule_exts = ('.csv', '.tsv', '.txt', '.yml')
                if modifiers.endswith(rule_exts) and targets.endswith(rule_exts):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)

    def setModifiersTargets(self, modifiers, targets):
        """Set already-loaded modifier/target item data directly."""
        self.modifiers = modifiers
        self.targets = targets

    def setModifiersTargetsFromFiles(self, modifiers_file, targets_file):
        """Load modifier/target item data from rule files."""
        self.targets = get_item_data(targets_file)
        self.modifiers = get_item_data(modifiers_file)

    def reset_saved_predictions(self):
        """Clear cached markups and re-enable markup saving."""
        self.saved_markups_map = {}
        self.save_markups = True
        # NOTE(review): 'expected_value' (singular) looks like a typo for
        # 'expected_values'; kept as-is to preserve existing behavior —
        # confirm intent before renaming.
        self.expected_value = None

    def predict(self, doc, doc_name='t_m_p.txt'):
        """Return 1 if the document's conclusion is an expected value, else 0."""
        self.last_doc_name = doc_name
        doc_conclusion = self.classify_doc(doc, doc_name)
        return 1 if doc_conclusion in self.expected_values else 0

    def eval(self, gold_docs):
        """Evaluate predictions against gold documents.

        gold_docs: dict mapping doc_name -> object with .text and
        .positive_label attributes.

        Returns (fn_docs, fp_docs, metrics_str, confusion_matrix_df).
        """
        # Fixed: `import sklearn` alone does not guarantee the metrics
        # submodule is loaded; import it explicitly.
        import sklearn.metrics
        import pandas as pd
        fn_docs = []
        fp_docs = []
        prediction_metrics = []
        gold_labels = [x.positive_label for x in gold_docs.values()]
        pred_labels = []
        print('Start to evaluate against reference standards...')
        for doc_name, gold_doc in gold_docs.items():
            gold_label = gold_doc.positive_label
            pred_label = self.predict(gold_doc.text, doc_name)
            pred_labels.append(pred_label)
            # Differentiate false positive and false negative error
            if gold_label == 0 and pred_label == 1:
                fp_docs.append(doc_name)
            elif gold_label == 1 and pred_label == 0:
                fn_docs.append(doc_name)
        precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
        recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
        f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
        # Let's use Pandas to make a confusion matrix for us
        confusion_matrix_df = pd.crosstab(pd.Series(gold_labels, name='Actual'),
                                          pd.Series(pred_labels, name='Predicted'))
        prediction_metrics.append('Precision : {0:.3f}'.format(precision))
        prediction_metrics.append('Recall : {0:.3f}'.format(recall))
        prediction_metrics.append('F1: {0:.3f}'.format(f1))
        # Fixed: the original return had an unbalanced trailing ')'.
        return fn_docs, fp_docs, '\n'.join(prediction_metrics), confusion_matrix_df

    def predict_against(self, doc, expected_values, doc_name='t_m_p.txt'):
        """Like predict(), but against a caller-supplied expected-value set."""
        doc_conclusion = self.classify_doc(doc, doc_name)
        return 1 if doc_conclusion in expected_values else 0

    def classify_doc(self, doc, doc_name='t_m_p.txt'):
        """Mark up *doc*, run inference, and return the conclusion string."""
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            print('DocumentClassifier\'s "modifiers" and/or "targets" has not been set yet.\n' +
                  'Use function: setModifiersTargets(modifiers, targets) or setModifiersTargetsFromFiles(modifiers_file,' +
                  'targets_file) to set them up.')
        context_doc = markup_context_document(doc, self.modifiers, self.targets)
        # Cache the markup graph only when it contains nodes.
        if doc_name is not None and self.save_markups and \
                len(context_doc.getDocumentGraph().nodes()) > 0:
            self.saved_markups_map[doc_name] = context_doc
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        matched_conclusion_types = self.feature_inferencer.process(annotations, relations)
        doc_conclusion = self.document_inferencer.process(matched_conclusion_types)
        return doc_conclusion