def read_annotation(path):
    """Parse the feature table of a GenBank-style flat file.

    Lines are scanned from the top of the file until a line that reads
    ``ORIGIN`` once blanks are removed (the start of the sequence data).
    A feature starts on a line with five leading blanks followed by a
    non-blank character: the feature key is read from the first 21 columns
    and the location from the remainder of the line, plus any continuation
    lines that precede the first ``/qualifier`` line.

    For each feature a fresh ``Annotation()`` is filled in with:

    * ``type``      - feature key with blanks removed
    * ``strand``    - ``'-'`` if ``complement`` occurs on the feature's first
      line, otherwise ``'+'``
    * ``start`` / ``end`` - one integer appended per location segment
      (assumes ``Annotation()`` initialises both as empty lists - TODO confirm)
    * ``attribute`` - the feature line and all of its continuation lines,
      each terminated by a newline

    Partial-location markers (``<`` / ``>``) are dropped, and a single-base
    location ``N`` is stored as start ``N`` and end ``N``.

    :param path: Path to the annotation file
    :return: Tuple ``(annotations, genome_size)``. ``annotations`` holds every
        feature except those of type ``region`` or ``source``.
        ``genome_size`` is the first end coordinate of the last ``source``
        feature seen, or ``None`` when the file has no ``source`` feature.
    :raises ValueError: If a location segment is not made of integers
    """
    with open(path) as f:
        data = f.read().splitlines()

    def is_continuation(line):
        # Continuation lines (location overflow, qualifiers) are blank up to
        # and including column 6; short or empty lines end the feature.
        return len(line) > 5 and line[5] == ' '

    annotations = []
    # Stays None when no 'source' feature is present, instead of failing
    # with an unbound local at the return statement.
    genome_size = None
    i = 0
    # Stop when it starts to read sequences
    while i < len(data) and data[i].replace(' ', '') != 'ORIGIN':
        line = data[i]
        # A feature key starts in column 6, after exactly five blanks.
        if not (len(line) > 6 and line[0:5] == ' ' * 5 and line[5] != ' '):
            i += 1
            continue

        A = Annotation()
        A.type = line[:21].replace(' ', '')
        A.strand = '-' if 'complement' in line else '+'

        # The location may wrap over several lines; it ends at the first
        # qualifier line (starting with '/') or at the next feature.
        location = line[21:].replace(' ', '')
        j = i + 1
        while j < len(data) and is_continuation(data[j]):
            extra = data[j].replace(' ', '')
            if not extra or extra[0] == '/':
                break
            location += extra
            j += 1

        # Reduce e.g. "complement(join(1..5,<8..10))" to "1..5,8..10".
        for token in ('complement', 'join', '(', ')', '<', '>'):
            location = location.replace(token, '')
        for seg in location.split(','):
            first, _, last = seg.partition('..')
            A.start.append(int(first))
            # A single-base location has no '..'; use it for both ends.
            A.end.append(int(last or first))

        # Collect the raw text of the feature: its key line plus every
        # continuation line that follows.
        info = [line]
        i += 1
        while i < len(data) and is_continuation(data[i]):
            info.append(data[i])
            i += 1
        A.attribute = ''.join(entry + '\n' for entry in info)

        # These two types are not very informative.
        if A.type not in ('region', 'source'):
            annotations.append(A)
        if A.type == 'source':
            genome_size = A.end[0]

    return (annotations, genome_size)
def CreateAnnotationDictionary(annotation_file_path):
    """
    Create Annotations from raw documents

    Every regular file directly inside annotation_file_path is read line by
    line and each line is passed to ``Annotation(line)``. Sub-directories are
    skipped. The process working directory is never changed.

    :param annotation_file_path: Path to directory where annotation documents are located
    :return: Dictionary of lists of Annotation objects keyed on document name stripped of extension
    """
    #Create a dictionary of documents
    docDictionary = {}

    #Iterate over documents in the annotation_file_path directory
    for document in os.listdir(annotation_file_path):
        #Build the full path rather than changing directory, so an error
        #while reading cannot leave the process in another directory
        document_path = os.path.join(annotation_file_path, document)

        #Only regular files are documents; opening a directory would fail
        if not os.path.isfile(document_path):
            continue

        #Create an Annotation obj for each line of the document
        with open(document_path, "r") as doc:
            annotationList = [Annotation(line) for line in doc]

        #Strip the extension from the file to get the document name
        docName = os.path.splitext(document)[0]

        #Add the AnnotationList to the dictionary
        docDictionary[docName] = annotationList

    #Return the dictionary
    return docDictionary
def ProcessAnnotations(metamap_path, ann_path, output_path, tTest, tTreatment,
                       tProblem, tests, treatments, problems):
    """
    Uses MetaMap to corroborate annotations and records the outcome counts.

    Every file in ann_path is read line by line into Annotation objects, the
    annotation concepts are sent to MetaMap, and each annotation is classed
    as silver (MetaMap agrees with the label), ambiguous (MetaMap's
    prediction is 'none') or failed (anything else). One CSV row with the
    counts is appended to 'exp_results.txt' in output_path:

        tProblem, tTest, tTreatment,
        total, silver, ambiguous, incorrect,
        then total/silver/ambiguous/incorrect for the 'problem', 'test' and
        'treatment' labels, in that order.

    A label with no annotations contributes zeros. The working directory is
    switched to ann_path while documents are processed and is restored
    afterwards, even if processing raises.

    @param metamap_path: Path to MetaMap installation
    @param ann_path: Path to annotation directory
    @param output_path: Existing directory that receives 'exp_results.txt'
    @param tTest: Written verbatim to the result row (presumably the setting
        used for tests - confirm against caller)
    @param tTreatment: Written verbatim to the result row
    @param tProblem: Written verbatim to the result row
    @param tests: Passed through to metamap_helpers.CheckAnnotationAgainstSemTypes
    @param treatments: Passed through to metamap_helpers.CheckAnnotationAgainstSemTypes
    @param problems: Passed through to metamap_helpers.CheckAnnotationAgainstSemTypes
    """
    # Per-label lists, in the column order used by the result row
    list_names = ('annotationList', 'silverList', 'ambiguousList',
                  'failedList')

    # Maps each annotation label to its four lists of original lines
    labelDict = {}

    # Change to annotation directory
    cwd = os.getcwd()
    os.chdir(ann_path)
    try:
        # Iterate over documents in the ann_path directory
        onlyFiles = [f for f in os.listdir() if os.path.isfile(f)]
        fileCount = len(onlyFiles)
        for current, document in enumerate(onlyFiles, start=1):
            print(f'Processing document {current}/{fileCount}, {document}')

            # Create an Annotation object for each line in the document
            with open(document, 'r') as doc:
                annotationList = [Annotation(line) for line in doc]

            # Run pymetamap over annotations and return semantic types
            annotated_concepts = [a.concept for a in annotationList]
            mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(
                metamap_path, annotated_concepts)

            # Check MetaMap prediction vs annotation label
            for ix, annotation in enumerate(annotationList):
                isSilver, prediction = metamap_helpers.CheckAnnotationAgainstSemTypes(
                    annotation, mmSemTypes[ix], tests, treatments, problems)

                # Instantiate lists for each label type on first sight
                lists = labelDict.setdefault(
                    annotation.label, {name: [] for name in list_names})

                # Track totals per label
                lists['annotationList'].append(annotation.original)

                # If metamap and annotation file agree, add to silver
                # standard list
                if isSilver:
                    lists['silverList'].append(annotation.original)
                elif prediction == 'none':
                    lists['ambiguousList'].append(annotation.original)
                else:
                    lists['failedList'].append(annotation.original)
    finally:
        # Return to the original directory, also when processing failed
        os.chdir(cwd)

    # Count per label; a label that never occurred counts as all zeros
    # instead of raising KeyError.
    no_lists = {name: [] for name in list_names}
    per_label = []
    for label in ('problem', 'test', 'treatment'):
        lists = labelDict.get(label, no_lists)
        per_label.append([len(lists[name]) for name in list_names])

    # Column-wise sums give total, silver, ambiguous, incorrect
    overall = [sum(column) for column in zip(*per_label)]

    row = [tProblem, tTest, tTreatment] + overall
    for counts in per_label:
        row.extend(counts)

    output_file = os.path.join(output_path, 'exp_results.txt')
    with open(output_file, 'a+', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(row)
def ProcessAnnotations(metamap_path, ann_path, output_dir):
    """
    Tallies, per MetaMap semantic type, how often it co-occurs with each
    annotation label.

    Every file in ann_path is read line by line into Annotation objects and
    the annotation concepts are sent to MetaMap. For each semantic type
    returned for an annotation, the counter for that annotation's label
    ('test', 'treatment' or 'problem') is incremented. The result is written
    as a tab-separated table to 'output.txt' in output_dir, replacing any
    previous file.

    The working directory is switched to ann_path for the whole run, so a
    relative output_dir is resolved inside ann_path. The original working
    directory is restored afterwards, even if processing raises.

    @param metamap_path: Path to MetaMap installation
    @param ann_path: Path to annotation directory
    @param output_dir: Directory for 'output.txt'; created if missing
    @raise KeyError: If an annotation carries a label other than 'test',
        'treatment' or 'problem'
    """
    labels = ['test', 'treatment', 'problem']

    cwd = os.getcwd()
    os.chdir(ann_path)
    try:
        # semantic type -> {label: count}
        semtypes = {}

        #Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        #Iterate over documents in the ann_path directory
        for document in [f for f in os.listdir() if os.path.isfile(f)]:
            #Create an Annotation object for each line in the document
            with open(document, "r") as doc:
                annotationList = [Annotation(line) for line in doc]

            #Run pymetamap over annotations and return semantic types
            annotated_concepts = [a.concept for a in annotationList]
            mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(
                metamap_path, annotated_concepts)

            #Count each returned semantic type under the annotation's label
            for ix, annotation in enumerate(annotationList):
                for semtype in mmSemTypes[ix]:
                    if semtype not in semtypes:
                        semtypes[semtype] = {key: 0 for key in labels}
                    semtypes[semtype][annotation.label] += 1

        #Write output to a new file
        new_gold_file = os.path.join(output_dir, "output.txt")
        print(new_gold_file)
        with open(new_gold_file, 'w+') as f:
            f.write('semantictype\ttest\ttreatment\tproblem\n')
            for key, value in semtypes.items():
                f.write('%s\t%d\t%d\t%d\n' % (key, value['test'],
                                              value['treatment'],
                                              value['problem']))
    finally:
        #Return to the original directory, also when processing failed
        os.chdir(cwd)
def ProcessAnnotations(metamap_path, ann_path, silver_ann_path, save_failed = False):
    """
    Uses MetaMap to corroborate annotations. Annotations where the label on the annotation and the
    label predicted by MetaMap are in agreement are saved to a '.con' file with the same document
    name in the directory specified by silver_ann_path.

    Annotations that are not silver are classed as ambiguous (MetaMap's prediction is 'none') or
    incorrect (any other prediction). When save_failed is True they are written to
    '<document>_ambiguous.con' and '<document>_incorrect.con' in a 'failed' sub-directory of
    silver_ann_path; incorrect lines are prefixed with the MetaMap prediction and a space.

    Totals and their share of all annotations are printed at the end. All counts are tracked
    regardless of save_failed, and the shares are reported as 0.0 when no annotations were read.

    The working directory is switched to ann_path for the run, so a relative silver_ann_path is
    resolved inside ann_path. The original working directory is restored afterwards, even if
    processing raises.

    @param metamap_path: Path to MetaMap installation
    @param ann_path: Path to annotation directory
    @param silver_ann_path: Path path for newly identified silver standard annotations
    @param save_failed: Save failed annotations to their own separate file for reference, defaults to false
    """
    #Variables for tracking effectiveness
    totalAnnotations = 0
    totalSilver = 0
    totalAmbiguous = 0
    totalIncorrect = 0

    cwd = os.getcwd()
    os.chdir(ann_path)
    try:
        #Create output directory for newly identified silver standard annotations if it doesn't exist
        os.makedirs(silver_ann_path, exist_ok=True)

        #Create output directory for non-silver annotations if save_failed parameter is True
        failed_path = os.path.join(silver_ann_path, "failed")
        if save_failed:
            os.makedirs(failed_path, exist_ok=True)

        #Iterate over documents in the ann_path directory
        onlyFiles = [f for f in os.listdir() if os.path.isfile(f)]
        fileCount = len(onlyFiles)
        for current, document in enumerate(onlyFiles, start=1):
            #Strip the extension from the file to get the document name
            docName = os.path.splitext(document)[0]
            print(f'Processing document {current}/{fileCount}, {document}')

            #Create an Annotation object for each line in the document
            with open(document, "r") as doc:
                annotationList = [Annotation(line) for line in doc]

            #Run pymetamap over annotations and return semantic types
            annotated_concepts = [a.concept for a in annotationList]
            mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(metamap_path, annotated_concepts)

            silverList = []
            failedList = []
            ambiguousList = []
            #Counted separately from failedList so the metric does not depend on save_failed
            incorrectCount = 0

            #Check MetaMap prediction vs annotation label
            for ix, annotation in enumerate(annotationList):
                isSilver, prediction = metamap_helpers.CheckAnnotationAgainstSemTypes(annotation, mmSemTypes[ix])

                #If metamap and annotation file agree, add to silver standard list
                if isSilver:
                    silverList.append(annotation.original)
                elif prediction == 'none':
                    ambiguousList.append(annotation.original)
                else:
                    incorrectCount += 1
                    #The output line is only needed when it will be written
                    if save_failed:
                        failedList.append(prediction + ' ' + annotation.original)

            #Write new silver standard annotations to a new file
            new_silver_file = os.path.join(silver_ann_path, docName + ".con")
            with open(new_silver_file, 'w') as f:
                f.writelines(silverList)

            #Write non-silver annotations to a new file if save_failed = True
            if save_failed:
                new_failed_file = os.path.join(failed_path, docName + "_incorrect.con")
                with open(new_failed_file, 'w') as f:
                    f.writelines(failedList)

                new_ambiguous_file = os.path.join(failed_path, docName + "_ambiguous.con")
                with open(new_ambiguous_file, 'w') as f:
                    f.writelines(ambiguousList)

            #Evaluation metrics
            totalAnnotations += len(annotationList)
            totalSilver += len(silverList)
            totalAmbiguous += len(ambiguousList)
            totalIncorrect += incorrectCount
    finally:
        #Return to the original directory, also when processing failed
        os.chdir(cwd)

    def share(count):
        #Avoid dividing by zero when no annotations were read
        return count / totalAnnotations if totalAnnotations else 0.0

    print("Total Annotations: ", str(totalAnnotations))
    print("Total Silver: ", str(totalSilver), str(share(totalSilver)))
    print("Total Ambiguous: ", str(totalAmbiguous), str(share(totalAmbiguous)))
    print("Total Incorrect: ", str(totalIncorrect), str(share(totalIncorrect)))