def annotate_text(data):
    """Run a single text through the Deduce pipeline.

    Args:
        data: Mapping of keyword arguments for ``deduce.annotate_text``.
            An optional ``'id'`` entry is not forwarded to Deduce; it is
            echoed back in the result instead. The mapping itself is not
            modified.

    Returns:
        A dict holding the de-identified text under ``'text'`` and, when
        an ID other than ``None`` was passed along, that ID under ``'id'``.
    """
    # Work on a shallow copy so the caller's mapping keeps its 'id' entry.
    kwargs = dict(data)
    record_id = kwargs.pop('id', None)

    # Run Deduce pipeline
    try:
        annotated_text = deduce.annotate_text(**kwargs)
    except IndexError:
        # Temporary workaround for https://github.com/vmenger/deduce/issues/44:
        # retry with date detection switched off. The flag is merged into
        # the dict (instead of being passed next to **kwargs) so that a
        # caller-supplied 'dates' entry is overridden rather than causing a
        # "multiple values for keyword argument" TypeError.
        annotated_text = deduce.annotate_text(**{**kwargs, 'dates': False})
    deidentified_text = deduce.deidentify_annotations(annotated_text)

    # Format result
    result = {'text': deidentified_text}

    # Add the ID if it was passed along
    if record_id is not None:
        result['id'] = record_id
    return result
def main(argv):
    """Annotate the text read from stdin and print the result.

    ``argv`` is unpacked, in order, into the patient's first names,
    initials, surname and given name, which are handed to
    ``deduce.annotate_text`` together with the text. When ``argv`` does not
    contain exactly ``ARGVLEN`` items the program exits with ``USAGE``.
    """
    if len(argv) != ARGVLEN:
        sys.exit(USAGE)

    patient_first_names, patient_initials, patient_surname, patient_given_name = argv
    input_text = readTextFromStdin()

    # Every annotation category is requested explicitly, as is flattening.
    options = {
        "names": True,
        "locations": True,
        "institutions": True,
        "dates": True,
        "ages": True,
        "patient_numbers": True,
        "phone_numbers": True,
        "urls": True,
        "flatten": True,
    }
    annotated = deduce.annotate_text(
        input_text,
        patient_first_names,
        patient_initials,
        patient_surname,
        patient_given_name,
        **options
    )
    printResults(annotated)
def test_annotate_text(self):
    """Check the tags ``deduce.annotate_text`` adds to a Dutch sample text.

    The expected output marks the patient, another person, a URL/e-mail,
    a phone number, an age, a location, a date and an institution.
    """
    # NOTE(review): "[email protected]" in both literals below looks like
    # an e-mail address that was mangled by an obfuscation filter - confirm
    # the intended address before relying on this test.
    expected = (
        "Dit is stukje tekst met daarin de naam <PATIENT Jan Jansen>. De <PATIENT patient J. Jansen> "
        "(e: <URL [email protected]>, t: <TELEFOONNUMMER 06-12345678>) is <LEEFTIJD 64> jaar oud en "
        "woonachtig in <LOCATIE Utrecht>. Hij werd op <DATUM 10 oktober> door arts "
        "<PERSOON Peter de Visser> ontslagen van de kliniek van het <INSTELLING UMCU>."
    )
    sample = (
        u"Dit is stukje tekst met daarin de naam Jan Jansen. De patient J. Jansen "
        u"(e: [email protected], t: 06-12345678) is 64 jaar oud en woonachtig in Utrecht. Hij werd op 10 "
        u"oktober door arts Peter de Visser ontslagen van de kliniek van het UMCU."
    )

    # Only the patient's first name and surname are supplied to Deduce.
    actual = deduce.annotate_text(
        sample,
        patient_first_names="Jan",
        patient_surname="Jansen",
    )

    self.assertEqual(expected, actual)
import pandas as pd

import deduce  # pip install git+https://github.com/vmenger/deduce.git

# De-identify every letter text in the input workbook with Deduce and write
# the original columns plus a 'deidentified' column to a new workbook.
data = pd.read_excel(r'Data\ICD10 letters age sex_SENSITIVE.xlsx')

deidentified_texts = []
# Iterate over the column values directly rather than looking each one up
# by row number.
for text in data['UitgaandeBriefTekst_DOC']:
    annotated = deduce.annotate_text(
        text,                     # The text to be annotated
        patient_first_names="",   # First names (separated by whitespace)
        patient_initials="",
        patient_surname="",
        patient_given_name="",    # Given name
        names=True,               # Person names, including initials
        locations=True,           # Geographical locations
        institutions=True,
        dates=True,
        ages=True,
        patient_numbers=True,
        phone_numbers=True,
        urls=True,                # Urls and e-mail addresses
        flatten=True,             # Debug option
    )
    deidentified_texts.append(deduce.deidentify_annotations(annotated))

df2 = pd.DataFrame(deidentified_texts, columns=['deidentified'])
result = pd.concat([data, df2], axis=1)

# Raw string: the backslashes in this Windows path are literal characters.
# Written as a normal string, "\G", "\I" and "\l" are invalid escape
# sequences that newer Python versions warn about.
result.to_excel(r"D:\Github\ICD10 Classification\l_anonym.xlsx")
def __init__(self, text):
    """Store ``text`` and its Deduce annotation on the instance.

    Sets ``self.text`` to the given text and ``self.annotated_text`` to
    the result of ``deduce.annotate_text`` applied to the stored text.
    """
    self.text = text
    # Annotate the stored attribute, exactly as read back from the instance.
    annotated = deduce.annotate_text(self.text)
    self.annotated_text = annotated