def main():
    """Entry point: section raw documents, run NER, and write readable results.

    Reads the data directory and model type from the command line, runs the
    UW sectioner over the raw documents, tags the sectioned documents with
    the NER extractor (dropping negated concepts), and writes human-readable
    HTML prediction results to disk. Prints elapsed wall time when done.
    """
    # start timer -- time.perf_counter() replaces time.clock(), which was
    # deprecated in 3.3 and removed in Python 3.8
    start = time.perf_counter()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_local_predict_args()
    data_dir = args.datadir
    model_type = args.model_type

    # Section raw documents
    sectioner_out_dir = uw_sectioner(data_dir)

    # Load sectioned docs
    xml_dl = SectionerXMLDataLoader(xml_dir=sectioner_out_dir,
                                    clean_tmp_files=True)
    docs = xml_dl.load()

    # Perform NER on sectioned docs
    extractor = NERExtraction(docs, model_algo=model_type)
    tagged_documents = extractor.tag_all()
    tagged_documents = extractor.remove_negated_concepts(tagged_documents)

    # Print full docs
    dp = HTMLPrinter()
    dp.write_readable_prediction_results(
        tagged_documents,
        "/home/wlane/PycharmProjects/HutchNER/HutchNER/NERResults",
        model_algo=model_type)

    end = time.perf_counter()
    print("##################################")
    print(" \tTime Elapsed: " + str(int((end - start) / 60)) +
          " minutes and " + str(int((end - start) % 60)) + " seconds.")
    print("##################################")
def main(documents, model_type, models):
    """Tag incoming documents with NER and return the results as JSON.

    Preprocesses *documents* with the spaCy model found under
    ``models['spacy']``, runs the NER extractor configured for *model_type*,
    and serializes the tagged documents into the JSON response payload.
    """
    loader = JSONDataLoader(documents=documents)
    preprocessed = loader.preprocess(spacy_model=models['spacy'])
    ner = NERExtraction(preprocessed, model_algo=model_type)
    tagged = ner.tag_all(models)
    return ner.docs2json(tagged)
def setUp(self):
    """Build the negation handler and a set of pre-tagged test documents.

    Loads the sectioned test-case corpus from disk, preprocesses it with the
    module-level spaCy model, and stores the NER-tagged documents on the
    test instance for the individual test methods to share.
    """
    self.negater = HutchNegEx()
    self.dl = SectionerXMLDataLoader(
        "/home/wlane/PycharmProjects/HutchNER_API/NERResources/TestCaseData_sectioned",
        clean_tmp_files=False)
    docs = self.dl.preprocess(spacy_model=spacy_model)
    UnformattedDocumentPreprocessor(docs, spacy_model=spacy_model)
    extractor = NERExtraction(docs)
    self.loaded_docs = extractor.tag_all()
def main():
    """ Entry point to HutchNER1: Concept NERExtraction Training

    Loads gold-annotated documents (i2b2 or brat format), runs NER tagging,
    and evaluates the predictions, writing labels and exact/overlap results
    to timestamped output files. Prints elapsed wall time when done.
    """
    # start timer -- time.perf_counter() replaces time.clock(), which was
    # deprecated in 3.3 and removed in Python 3.8
    start = time.perf_counter()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_testing_args()
    text_dir = args.textdir
    local_annotations = args.annots
    labkey_ini_section = args.section
    model_name = args.model
    model_type = args.model_type
    anno_type = args.anno_type
    print('model_name:')
    print(model_name)

    # Load the documents; annotation format decides the loader
    if anno_type == 'i2b2':
        text_dl = i2b2DataLoader(txt_dir=text_dir,
                                 annotation_dir=local_annotations)
    else:
        text_dl = bratDataLoader(txt_dir=text_dir,
                                 annotation_dir=local_annotations)
    docs = text_dl.load()

    # Run NER driver with models and data provided in dirs
    # NOTE(review): `models` is not defined in this function -- presumably a
    # module-level global loaded elsewhere; verify before running standalone.
    extractor = NERExtraction(docs, model_name, model_type)
    tagged_documents = extractor.tag_all(models=models)
    neg_documents = extractor.remove_negated_concepts(tagged_documents)

    # Evaluate the performance on TAGGED DOCUMENTS (not the negated ones)
    labels = extractor.possible_labels
    ev = NEREvaluator(tagged_documents, labels)

    # use timestamp to link output labels and files to output results numbers
    time_stamp = time.time()
    string_timestamp = datetime.datetime.fromtimestamp(time_stamp).strftime(
        '%Y-%m-%d_%H.%M.%S')
    ev.output_labels("OutputLabels", tagged_documents, model_name,
                     string_timestamp)
    ev.write_results("EvalResults", strictness="exact",
                     model_name=model_name,
                     string_timestamp=string_timestamp)
    ev.write_results("EvalResults", strictness="overlap",
                     model_name=model_name,
                     string_timestamp=string_timestamp)

    # Print time elapsed to console
    end = time.perf_counter()
    print("##################################")
    print(" \tTime Elapsed: " + str(int((end - start) / 60)) +
          " minutes and " + str(int((end - start) % 60)) + " seconds.")
    print("##################################")
def main():
    """ Entry point to HutchNER1: Concept NERExtraction Training

    Loads i2b2-annotated documents, runs NER tagging with negation removal,
    writes human-readable HTML results, and evaluates the (non-negated)
    tagged documents with exact and overlap strictness. Prints elapsed wall
    time when done.
    """
    # start timer -- time.perf_counter() replaces time.clock(), which was
    # deprecated in 3.3 and removed in Python 3.8
    start = time.perf_counter()

    # Parse incoming cmd line arguments
    args = ArgumentParsingSettings.get_testing_args()
    data_dir = args.datadir
    model_dir = args.model_dir
    local_annotations = args.annots
    labkey_ini_section = args.section

    # Load the documents
    text_dl = i2b2DataLoader(txt_dir=data_dir,
                             annotation_dir=local_annotations)
    docs = text_dl.load()

    # Run NER driver with models and data provided in dirs
    extractor = NERExtraction(docs)
    tagged_documents = extractor.tag_all()
    neg_documents = extractor.remove_negated_concepts(tagged_documents)

    # Create DocumentPrinter object; print/write document objects in desired format
    dp = HTMLPrinter()
    dp.write_readable_prediction_results(
        neg_documents,
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NERResults")

    # Evaluate the performance on TAGGED DOCUMENTS (not the negated ones)
    labels = extractor.possible_labels
    ev = NEREvaluator(tagged_documents, labels)
    ev.write_results(
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NEREvaluation/EvalResults",
        strictness="exact")
    ev.write_results(
        "/home/wlane/PycharmProjects/HutchNER1/HutchNER1/NEREvaluation/EvalResults",
        strictness="overlap")

    # Print time elapsed to console -- converted from Python 2 `print`
    # statements to the print() function used by the rest of the file
    end = time.perf_counter()
    print("##################################")
    print(" \tTime Elapsed: " + str(int((end - start) / 60)) +
          " minutes and " + str(int((end - start) % 60)) + " seconds.")
    print("##################################")
def test_get_section_tokens(self):
    """Exercise get_section_tokens for both present and absent sections."""
    docs = self.dl.preprocess(spacy_model=spacy_model)
    UnformattedDocumentPreprocessor(docs, spacy_model=spacy_model)
    extractor = NERExtraction(docs)
    docs = extractor.tag_all()

    # standard use case: request sections that exist in the document
    surg_history_section = docs[
        'NERTraining.b0.doc13'].get_section_tokens(
            ['Past Surgical History'])
    surg_and_soc_history_sections = docs[
        'NERTraining.b0.doc13'].get_section_tokens(
            ['Past Surgical History', 'Social History'])
    allergies_and_soc_history = docs[
        'NERTraining.b0.doc14'].get_section_tokens(
            ['Allergies', 'Social History'])

    # testing method when section requested is not present: expect an
    # empty defaultdict rather than an error
    allergies_and_soc_history = docs[
        'NERTraining.b0.doc15'].get_section_tokens(
            ['Allergies', 'Social History'])
    self.assertEqual(allergies_and_soc_history, defaultdict(list))