Example #1
    def process(self, doc_text):
        """PyContextNLP, return doc_class, context_doc, annotations, relations"""

        context_doc = pyConTextGraph.ConTextDocument()
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)

        for sentence in sentences:

            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            # Process every sentence by adding markup
            m = markup_sentence(sentence_text,
                                modifiers=self.modifiers,
                                targets=self.targets)
            context_doc.addMarkup(m)
            context_doc.getSectionMarkups()
            # print(m)
            # print(context_doc.getXML())

        # convert graphic markups into dataframe
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        # display(annotations)
        # display(relations)

        # apply inferences for document classification
        inferenced_types = self.feature_inferencer.process(
            annotations, relations)
        # print('After inferred from modifier values, we got these types:\n '+str(inferenced_types))
        doc_class = self.document_inferencer.process(inferenced_types)
        # print('\nDocument classification: '+ doc_class )

        return doc_class, context_doc, annotations, relations
    def classify(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            logMsg(
                'DocumentClassifier\'s "modifiers" and/or "targets" have not been set yet.\n'
                'Use setModifiersTargets(modifiers, targets) or '
                'setModifiersTargetsFromFiles(modifiers_file, targets_file) to set them up.')
        try:
            context_doc = self.markup_context_document(doc, self.modifiers,
                                                       self.targets)
            if self.save_markups and doc_name is not None and len(
                    context_doc.getDocumentGraph().nodes()) > 0:
                self.saved_markups_map[doc_name] = context_doc
            markups = get_document_markups(context_doc)

            annotations, relations, doc_txt = convertMarkups2DF(markups)
            matched_conclusion_types = self.feature_inferencer.process(
                annotations, relations)
            doc_conclusion = self.document_inferencer.process(
                matched_conclusion_types)
        except Exception:
            # pyConText may throw errors in some cases; fall back to the default conclusion
            doc_conclusion = self.document_inferencer.default_conclusion
        return doc_conclusion
Example #3
    def classify_doc(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            print('DocumentClassifier\'s "modifiers" and/or "targets" have not been set yet.\n'
                  'Use setModifiersTargets(modifiers, targets) or '
                  'setModifiersTargetsFromFiles(modifiers_file, targets_file) to set them up.')
        context_doc = markup_context_document(doc, self.modifiers, self.targets)
        if doc_name is not None and self.save_markups and len(context_doc.getDocumentGraph().nodes()) > 0:
            self.saved_markups_map[doc_name] = context_doc
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        matched_conclusion_types = self.feature_inferencer.process(annotations, relations)
        doc_conclusion = self.document_inferencer.process(matched_conclusion_types)
        return doc_conclusion
Example #4
def mark_document_with_html(doc, colors={"name": "red", "pet": "blue"}, default_color="black"):
    """takes a ConTextDocument object and returns an HTML paragraph with marked phrases in the
    object highlighted with the colors coded in colors

    doc: ConTextDocument
    colors: dictionary keyed by ConText category with values valid HTML colors

    """
    from pyConTextNLP.display.html import __sort_by_span
    from pyConTextNLP.utils import get_document_markups
    return """<p> {0} </p>""".format(" ".join([mark_text_custom(m.graph['__txt'],
                                                                __sort_by_span(m.nodes()),
                                                                colors=colors,
                                                                default_color=default_color) for m in
                                               get_document_markups(doc)]))
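
In a notebook, the returned HTML string can be rendered directly. A minimal usage sketch, assuming a context_doc built as in Example #5 and that the mark_text_custom helper used above is defined in the same module; the category names in colors are only illustrative:

from IPython.display import HTML, display

# context_doc is a pyConTextGraph.ConTextDocument that has already been marked up;
# the category keys below are placeholders, not required pyConText categories.
html_paragraph = mark_document_with_html(
    context_doc,
    colors={"evidence_of_pneumonia": "blue", "definite_negated_existence": "red"},
    default_color="black")
display(HTML(html_paragraph))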
Example #5
# See what the document was split into
for sentence in sentences:
    print("Sentence({}-{}):\t{}".format(sentence.begin, sentence.end,
                                        input[sentence.begin:sentence.end]))
    print('\n' + '-' * 100 + '\n')

# initiate a pyConTextGraph to hold the pyConText output
context_doc = pyConTextGraph.ConTextDocument()

for sentence in sentences:
    sentence_text = input[sentence.begin:sentence.end].lower()
    # Process every sentence by adding markup
    m = markup_sentence(sentence_text, modifiers=modifiers, targets=targets)
    context_doc.addMarkup(m)
    context_doc.getSectionMarkups()
    print(m)

# convert graphic markups into dataframe
markups = get_document_markups(context_doc)
annotations, relations, doc_txt = convertMarkups2DF(markups)

annotations.head()

# apply inferences for document classification
inferenced_types = feature_inferencer.process(annotations, relations)
print('After inferring from modifier values, we got these types:\n ' +
      str(inferenced_types))
doc_class = document_inferencer.process(inferenced_types)
print('\nDocument classification: ' + doc_class)
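
The per-sentence loop above is essentially what Examples #3 and #6 hide behind markup_context_document. A minimal sketch of such a helper, assuming the same markup_sentence function and a segmenter exposing segToSentenceSpans as in Example #1; the sentence_segmenter parameter is an assumption of this sketch, not part of the three-argument signature used in the other examples:

def markup_context_document(doc_text, sentence_segmenter, modifiers, targets):
    # Hypothetical helper; wraps the loop shown above into one call.
    context_doc = pyConTextGraph.ConTextDocument()
    # any segmenter whose segToSentenceSpans(text) yields spans with .begin/.end works here
    for sentence in sentence_segmenter.segToSentenceSpans(doc_text):
        sentence_text = doc_text[sentence.begin:sentence.end].lower()
        m = markup_sentence(sentence_text, modifiers=modifiers, targets=targets)
        context_doc.addMarkup(m)
        context_doc.getSectionMarkups()
    return context_doc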
Example #6
    def gen_html_from_context_doc(self, doc, filter_no_markup_txt=True):
        annotations, relations, doc_txt = convertMarkups2DF(
            get_document_markups(doc))
        html = self.gen_html_from_dfs(doc_txt[1:], annotations, relations)
        return html, doc_txt, len(annotations)
class DocumentClassifier(object):
    def __init__(self, targets=None, modifiers=None, feature_inference_rule=None, document_inference_rule=None,
                 expected_values=None, save_markups=True):
        self.document_inferencer = DocumentInferencer(document_inference_rule)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.conclusions = []
        self.modifiers = modifiers
        self.targets = targets
        self.save_markups = save_markups
        self.expected_values = [] if expected_values is None else [value.lower() for value in expected_values]
        self.saved_markups_map = dict()
        self.last_doc_name = ''

        if modifiers is not None and targets is not None:
            if isinstance(modifiers, str) and isinstance(targets, str):
                file_extensions = ('.csv', '.tsv', '.txt', '.yml')
                if modifiers.endswith(file_extensions) and targets.endswith(file_extensions):
                    self.setModifiersTargetsFromFiles(modifiers, targets)
            else:
                self.setModifiersTargets(modifiers, targets)

    def setModifiersTargets(self, modifiers, targets):
        self.modifiers = modifiers
        self.targets = targets

    def setModifiersTargetsFromFiles(self, modifiers_file, targets_file):
        self.targets = get_item_data(targets_file)
        self.modifiers = get_item_data(modifiers_file)

    def reset_saved_predictions(self):
        self.saved_markups_map = {}
        self.save_markups = True
        self.expected_value = None

    def predict(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        doc_conclusion = self.classify_doc(doc, doc_name)
        if doc_conclusion in self.expected_values:
            return 1
        return 0

    def eval(self, gold_docs):
        import sklearn.metrics
        import pandas as pd
        fn_docs = []
        fp_docs = []
        prediction_metrics = []
        gold_labels = [x.positive_label for x in gold_docs.values()]
        pred_labels = []
        print('Start to evaluate against reference standards...')
        for doc_name, gold_doc in gold_docs.items():
            gold_label = gold_doc.positive_label
            pred_label = self.predict(gold_doc.text, doc_name)
            pred_labels.append(pred_label)
            # Differentiate false positive and false negative errors
            if gold_label == 0 and pred_label == 1:
                fp_docs.append(doc_name)
            elif gold_label == 1 and pred_label == 0:
                fn_docs.append(doc_name)

        precision = sklearn.metrics.precision_score(gold_labels, pred_labels)
        recall = sklearn.metrics.recall_score(gold_labels, pred_labels)
        f1 = sklearn.metrics.f1_score(gold_labels, pred_labels)
        # Let's use Pandas to make a confusion matrix for us
        confusion_matrix_df = pd.crosstab(pd.Series(gold_labels, name='Actual'),
                                          pd.Series(pred_labels, name='Predicted'))
        prediction_metrics.append('Precision : {0:.3f}'.format(precision))
        prediction_metrics.append('Recall :    {0:.3f}'.format(recall))
        prediction_metrics.append('F1:         {0:.3f}'.format(f1))

        return fn_docs, fp_docs, '\n'.join(prediction_metrics), confusion_matrix_df

    def predict_against(self, doc, expected_values, doc_name='t_m_p.txt'):
        doc_conclusion = self.classify_doc(doc, doc_name)
        if doc_conclusion in expected_values:
            return 1
        return 0

    def classify_doc(self, doc, doc_name='t_m_p.txt'):
        self.last_doc_name = doc_name
        if self.modifiers is None or self.targets is None:
            print('DocumentClassifier\'s "modifiers" and/or "targets" have not been set yet.\n'
                  'Use setModifiersTargets(modifiers, targets) or '
                  'setModifiersTargetsFromFiles(modifiers_file, targets_file) to set them up.')
        context_doc = markup_context_document(doc, self.modifiers, self.targets)
        if doc_name is not None and self.save_markups and len(context_doc.getDocumentGraph().nodes()) > 0:
            self.saved_markups_map[doc_name] = context_doc
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups)
        matched_conclusion_types = self.feature_inferencer.process(annotations, relations)
        doc_conclusion = self.document_inferencer.process(matched_conclusion_types)
        return doc_conclusion
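
For context, a minimal usage sketch of the DocumentClassifier above; the file names, rule arguments, report text, and expected value are placeholders, and the rule arguments must be in whatever format FeatureInferencer and DocumentInferencer accept:

# hypothetical files and values, shown only to illustrate the call pattern
classifier = DocumentClassifier(targets='targets.tsv',
                                modifiers='modifiers.tsv',
                                feature_inference_rule='feature_inference_rules.csv',
                                document_inference_rule='document_inference_rules.csv',
                                expected_values=['positive_doc'],
                                save_markups=True)

report = 'Impression: no evidence of pneumonia.'
label = classifier.predict(report)            # 1 if the conclusion matches expected_values, else 0
conclusion = classifier.classify_doc(report)  # the raw document-level conclusion string
print(label, conclusion)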