Beispiel #1
0
def main():
    """Section raw documents, run NER on them and write readable results.

    Reads ``datadir`` and ``model_type`` from the local-predict command
    line arguments, runs ``uw_sectioner`` over the data directory, loads
    the sectioned XML, tags every document with ``NERExtraction``, passes
    the tagged documents through ``remove_negated_concepts`` and hands the
    result to ``HTMLPrinter``. The elapsed wall-clock time is printed at
    the end.
    """
    # time.clock() was removed in Python 3.8 and reported CPU time (not
    # elapsed time) on Unix; time.time() is wall-clock on every version.
    start = time.time()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_local_predict_args()
    data_dir = args.datadir
    model_type = args.model_type

    # Section raw documents
    sectioner_out_dir = uw_sectioner(data_dir)

    # Load sectioned docs
    xml_dl = SectionerXMLDataLoader(xml_dir=sectioner_out_dir,
                                    clean_tmp_files=True)
    docs = xml_dl.load()

    # Perform NER on sectioned docs
    extractor = NERExtraction(docs, model_algo=model_type)
    tagged_documents = extractor.tag_all()
    tagged_documents = extractor.remove_negated_concepts(tagged_documents)

    # Print full docs
    dp = HTMLPrinter()
    dp.write_readable_prediction_results(
        tagged_documents,
        "/home/wlane/PycharmProjects/HutchNER/HutchNER/NERResults",
        model_algo=model_type)

    # Report elapsed time as whole minutes plus remaining whole seconds
    minutes, seconds = divmod(time.time() - start, 60)
    print("##################################")
    print(" \tTime Elapsed: " + str(int(minutes)) +
          " minutes and " + str(int(seconds)) + " seconds.")
    print("##################################")
Beispiel #2
0
def main(documents, model_type, models):
    """Tag the given documents and return the extractor's JSON output.

    ``documents`` is handed to ``JSONDataLoader``; preprocessing uses the
    entry stored under ``models['spacy']``. The whole ``models`` mapping
    is then forwarded to ``NERExtraction.tag_all`` and the tagged
    documents are converted with ``docs2json``.
    """
    loader = JSONDataLoader(documents=documents)
    preprocessed = loader.preprocess(spacy_model=models['spacy'])

    ner = NERExtraction(preprocessed, model_algo=model_type)
    tagged = ner.tag_all(models)
    return ner.docs2json(tagged)
Beispiel #3
0
 def setUp(self):
     """Create the negater, the data loader and the tagged documents.

     Stores a ``HutchNegEx`` instance, a ``SectionerXMLDataLoader``
     pointed at the sectioned test-case data, and the result of
     ``NERExtraction.tag_all`` on the preprocessed documents.
     """
     self.negater = HutchNegEx()
     self.dl = SectionerXMLDataLoader(
         "/home/wlane/PycharmProjects/HutchNER_API/NERResources/TestCaseData_sectioned",
         clean_tmp_files=False)

     docs = self.dl.preprocess(spacy_model=spacy_model)
     # The instance is discarded; presumably it is constructed for its
     # effect on ``docs`` -- confirm against the preprocessor class.
     UnformattedDocumentPreprocessor(docs, spacy_model=spacy_model)

     self.loaded_docs = NERExtraction(docs).tag_all()
Beispiel #4
0
def main():
    """Entry point to HutchNER1: tag annotated documents and evaluate.

    Reads the testing command line arguments, loads the documents with
    the i2b2 loader when ``anno_type`` is ``'i2b2'`` and the brat loader
    otherwise, tags them with ``NERExtraction`` and writes the output
    labels plus the "exact" and "overlap" evaluation results, all marked
    with one shared timestamp string. The elapsed wall-clock time is
    printed at the end.
    """
    # time.clock() was removed in Python 3.8 and reported CPU time (not
    # elapsed time) on Unix; time.time() is wall-clock on every version.
    start = time.time()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_testing_args()
    text_dir = args.textdir
    local_annotations = args.annots
    model_name = args.model
    model_type = args.model_type
    anno_type = args.anno_type
    print('model_name:')
    print(model_name)

    # Load the documents
    if anno_type == 'i2b2':
        text_dl = i2b2DataLoader(txt_dir=text_dir,
                                 annotation_dir=local_annotations)
    else:
        text_dl = bratDataLoader(txt_dir=text_dir,
                                 annotation_dir=local_annotations)
    docs = text_dl.load()

    # Run NER driver with models and data provided in dirs
    extractor = NERExtraction(docs, model_name, model_type)
    # NOTE(review): `models` is not defined inside this function; it has
    # to exist at module level or this line raises NameError -- confirm.
    tagged_documents = extractor.tag_all(models=models)
    # The return value is not used below; the call is kept in case it has
    # side effects on the extractor or the documents.
    extractor.remove_negated_concepts(tagged_documents)

    # Evaluate the performance on TAGGED DOCUMENTS (not the negated ones)
    labels = extractor.possible_labels
    ev = NEREvaluator(tagged_documents, labels)

    # use timestamp to link output labels and files to output results numbers
    time_stamp = time.time()
    string_timestamp = datetime.datetime.fromtimestamp(time_stamp).strftime(
        '%Y-%m-%d_%H.%M.%S')

    ev.output_labels("OutputLabels", tagged_documents, model_name,
                     string_timestamp)
    for strictness in ("exact", "overlap"):
        ev.write_results("EvalResults",
                         strictness=strictness,
                         model_name=model_name,
                         string_timestamp=string_timestamp)

    # Print time elapsed to console as whole minutes plus whole seconds
    minutes, seconds = divmod(time.time() - start, 60)
    print("##################################")
    print(" \tTime Elapsed: " + str(int(minutes)) +
          " minutes and " + str(int(seconds)) + " seconds.")
    print("##################################")
Beispiel #5
0
def main():
    """Entry point to HutchNER1: tag i2b2 documents, print and evaluate.

    Reads the testing command line arguments, loads the documents with
    ``i2b2DataLoader``, tags them with ``NERExtraction``, writes the
    documents returned by ``remove_negated_concepts`` through
    ``HTMLPrinter`` and writes "exact" and "overlap" evaluation results
    computed on the tagged documents. The elapsed wall-clock time is
    printed at the end.
    """
    # time.clock() was removed in Python 3.8 and reported CPU time (not
    # elapsed time) on Unix; time.time() is wall-clock on every version.
    start = time.time()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_testing_args()
    data_dir = args.datadir
    local_annotations = args.annots

    # Load the documents
    text_dl = i2b2DataLoader(txt_dir=data_dir,
                             annotation_dir=local_annotations)
    docs = text_dl.load()

    # Run NER driver with models and data provided in dirs
    extractor = NERExtraction(docs)
    tagged_documents = extractor.tag_all()
    neg_documents = extractor.remove_negated_concepts(tagged_documents)

    # Create DocumentPrinter object; print/write document objects in desired format
    dp = HTMLPrinter()
    dp.write_readable_prediction_results(
        neg_documents,
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NERResults")

    # Evaluate the performance on TAGGED DOCUMENTS (not the negated ones)
    labels = extractor.possible_labels
    ev = NEREvaluator(tagged_documents, labels)
    ev.write_results(
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NEREvaluation/EvalResults",
        strictness="exact")
    ev.write_results(
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NEREvaluation/EvalResults",
        strictness="overlap")

    # Print time elapsed to console. Single-argument print(...) calls
    # behave the same under Python 2 and Python 3, unlike the print
    # statement, which is a SyntaxError on Python 3.
    minutes, seconds = divmod(time.time() - start, 60)
    print("##################################")
    print(" \tTime Elapsed: " + str(int(minutes)) +
          " minutes and " + str(int(seconds)) + " seconds.")
    print("##################################")
Beispiel #6
0
    def test_get_section_tokens(self):
        """Exercise ``get_section_tokens`` on tagged documents.

        The first three calls are only executed and their results are not
        asserted. The final call requests sections from doc15 and must
        return a value equal to an empty ``defaultdict(list)``.
        """
        docs = self.dl.preprocess(spacy_model=spacy_model)
        UnformattedDocumentPreprocessor(docs, spacy_model=spacy_model)
        tagged = NERExtraction(docs).tag_all()

        # standard use case (results are not checked, only that the calls run)
        tagged['NERTraining.b0.doc13'].get_section_tokens(
            ['Past Surgical History'])
        tagged['NERTraining.b0.doc13'].get_section_tokens(
            ['Past Surgical History', 'Social History'])
        tagged['NERTraining.b0.doc14'].get_section_tokens(
            ['Allergies', 'Social History'])

        # testing method when section requested is not present
        tokens_for_missing = tagged['NERTraining.b0.doc15'].get_section_tokens(
            ['Allergies', 'Social History'])
        self.assertEqual(tokens_for_missing, defaultdict(list))