def read_annotation(path):
    """Parse the feature table of a GenBank flat file into Annotation objects."""
    with open(path) as f:
        data = f.read().splitlines()
    annotations = []
    genome_size = None  # set from the 'source' feature below, if present
    i = 0
    # Stop when the sequence section ('ORIGIN') starts
    while i < len(data) and data[i].replace(' ', '') != 'ORIGIN':
        # Feature lines start with exactly five spaces followed by a feature key
        if len(data[i]) > 6 and data[i][0:5] == '     ' and data[i][5] != ' ':
            A = Annotation()
            A.type = data[i][:21].replace(' ', '')
            A.strand = '-' if 'complement' in data[i] else '+'
            number = data[i][21:].replace('complement', '')
            # A location may continue onto the next line when that line is not a qualifier
            if i + 1 < len(data) and data[i + 1].replace(' ', '')[:1] != '/':
                number += data[i + 1].replace(' ', '')
            # Strip location syntax: parentheses, join(...), and partial markers
            for token in ('(', ')', 'join', '<', '>'):
                number = number.replace(token, '')
            for seg in number.split(','):
                pair = seg.split('..')
                A.start.append(int(pair[0]))
                # Single-base locations have no '..' separator
                A.end.append(int(pair[1]) if len(pair) > 1 else int(pair[0]))
            info = data[i] + '\n'
            i += 1
            # Collect the indented qualifier block that follows the feature line
            while i < len(data) and len(data[i]) > 5 and data[i][5] == ' ':
                info += data[i] + '\n'
                i += 1
            A.attribute = info
            # 'region' and 'source' features are not very informative
            if A.type != 'region' and A.type != 'source':
                annotations.append(A)
            if A.type == 'source':
                genome_size = A.end[0]
        else:
            i += 1
    return (annotations, genome_size)
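read_annotation assumes an Annotation container with type, strand, start, end, and attribute fields; the class is not shown in this example, so the sketch below reconstructs it from usage, and the input file name is hypothetical:

class Annotation:
    """Minimal container matching the fields read_annotation populates."""
    def __init__(self):
        self.type = ''       # GenBank feature key, e.g. 'gene' or 'CDS'
        self.strand = '+'    # '-' when the location is a complement(...)
        self.start = []      # one start coordinate per joined segment
        self.end = []        # one end coordinate per joined segment
        self.attribute = ''  # raw qualifier block, e.g. /gene="..."

annotations, genome_size = read_annotation('example.gb')  # hypothetical file
for a in annotations:
    print(a.type, a.strand, a.start, a.end)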
def CreateAnnotationDictionary(annotation_file_path):
	"""
	Create Annotations from raw documents
	
	:param annotation_file_path: Path to directory where annotation documents are located
	:return: Dictionary of lists of Annotation objects keyed on document name stripped of extension
	"""
	
	#Create a dictionary of documents
	docDictionary = {}

	# cd into annotation file directory
	cwd = os.getcwd()
	os.chdir(annotation_file_path)

	#Iterate over documents in the annotation_file_path directory
	for document in [f for f in os.listdir() if os.path.isfile(f)]:

		#Instantiate a list to hold Annotations for each document
		annotationList = []

		#Open the document and create an Annotation obj for each line
		with open(document, "r") as doc:
			for line in doc.readlines():
				an = Annotation(line)
				annotationList.append(an)

		#Strip the extension from the file to get the document name
		docName = os.path.splitext(document)[0]

		#Add the AnnotationList to the dictionary
		docDictionary[docName] = annotationList

		
	#Return to the original directory
	os.chdir(cwd)

	#Return the dictionary
	return docDictionary
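A short usage sketch (directory name hypothetical). Note that this example assumes an Annotation constructed from a single raw line, unlike the field-based container used by read_annotation above:

ann_dict = CreateAnnotationDictionary('annotations')  # hypothetical directory
for doc_name, anns in ann_dict.items():
    print(doc_name, len(anns))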
def ProcessAnnotations(metamap_path, ann_path, output_path, tTest, tTreatment,
                       tProblem, tests, treatments, problems):
    """
    Uses MetaMap to corroborate annotations. Annotations where the label on the annotation
    and the label predicted by MetaMap are in agreement are saved to a file with the same name
    in the directory specified by  output_path

    @param metamap_path: Path to MetaMap installation
    @param ann_path: Path to annotation directory
    @param  output_path: Path path for newly identified silver standard annotations
    """
    # Dictionary tracking totals and outcomes (silver/ambiguous/failed) per label
    labelDict = {}

    # Change to annotation directory
    cwd = os.getcwd()
    os.chdir(ann_path)

    # Iterate over documents in the ann_path directory
    onlyFiles = [f for f in os.listdir() if os.path.isfile(f)]
    current = 0
    fileCount = len(onlyFiles)
    for document in onlyFiles:
        current += 1
        print(f'Processing document {current}/{fileCount}, {document}')

        # Create an Annotation object for each line in the document and append the concepts to a list
        with open(document, 'r') as doc:
            annotationList = []
            for line in doc.readlines():
                an = Annotation(line)
                annotationList.append(an)

        # Run pymetamap over annotations and return semantic types
        annotated_concepts = [a.concept for a in annotationList]
        mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(
            metamap_path, annotated_concepts)

        # Check MetaMap prediction vs annotation label
        for ix, annotation in enumerate(annotationList):
            isSilver, prediction = metamap_helpers.CheckAnnotationAgainstSemTypes(
                annotation, mmSemTypes[ix], tests, treatments, problems)

            # Instantiate lists for each label type
            if annotation.label not in labelDict:
                labelDict[annotation.label] = {}
                labelDict[annotation.label]['annotationList'] = []
                labelDict[annotation.label]['silverList'] = []
                labelDict[annotation.label]['failedList'] = []
                labelDict[annotation.label]['ambiguousList'] = []

            # Track totals per label
            labelDict[annotation.label]['annotationList'].append(
                annotation.original)

            # If metamap and annotation file agree, add to silver standard list
            if isSilver:
                labelDict[annotation.label]['silverList'].append(
                    annotation.original)
            elif prediction == 'none':
                labelDict[annotation.label]['ambiguousList'].append(
                    annotation.original)
            else:
                labelDict[annotation.label]['failedList'].append(
                    annotation.original)

    # Return to the original directory
    os.chdir(cwd)

    # Summarize counts per label (labels absent from the data count as zero)
    def counts(label):
        d = labelDict.get(label, {})
        return (len(d.get('annotationList', [])),
                len(d.get('silverList', [])),
                len(d.get('ambiguousList', [])),
                len(d.get('failedList', [])))

    pTotal, pSilver, pAmbiguous, pIncorrect = counts('problem')
    teTotal, teSilver, teAmbiguous, teIncorrect = counts('test')
    trTotal, trSilver, trAmbiguous, trIncorrect = counts('treatment')

    total = pTotal + teTotal + trTotal
    silver = pSilver + teSilver + trSilver
    ambiguous = pAmbiguous + teAmbiguous + trAmbiguous
    incorrect = pIncorrect + teIncorrect + trIncorrect

    output_file = os.path.join(output_path, 'exp_results.txt')
    with open(output_file, 'a+', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            tProblem, tTest, tTreatment, total, silver, ambiguous, incorrect,
            pTotal, pSilver, pAmbiguous, pIncorrect, teTotal, teSilver,
            teAmbiguous, teIncorrect, trTotal, trSilver, trAmbiguous,
            trIncorrect
        ])
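A hedged invocation sketch for this variant; the paths and UMLS semantic-type sets are illustrative assumptions, and the tTest/tTreatment/tProblem values are simply echoed into the results row:

ProcessAnnotations(
    metamap_path='/opt/public_mm',  # hypothetical MetaMap install path
    ann_path='annotations',         # hypothetical annotation directory
    output_path='results',          # exp_results.txt is appended here
    tTest=0.5, tTreatment=0.5, tProblem=0.5,
    tests={'lbpr', 'lbtr'},         # Laboratory Procedure / Lab or Test Result
    treatments={'topp'},            # Therapeutic or Preventive Procedure
    problems={'dsyn', 'sosy'})      # Disease or Syndrome / Sign or Symptom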
def ProcessAnnotations(metamap_path, ann_path, output_dir):
    """
    Uses MetaMap to corroborate annotations. Annotations where the label on the annotation
    and the label predicted by MetaMap are in agreement are saved to a file with the same name
    in the directory specified by gold_ann_path

    @param metamap_path: Path to MetaMap installation
    @param ann_path: Path to annotation directory
    @param gold_ann_path: Path path for newly identified gold standard annotations
    @param save_failed: Save failed annotations to their own separate file for reference, defaults to false
    """

    cwd = os.getcwd()
    os.chdir(ann_path)

    semtypes = {}

    #Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    #Iterate over documents in the ann_path directory
    for document in [f for f in os.listdir() if os.path.isfile(f)]:


        #Instantiate a list to hold Annotations for each document
        annotationList = []

        #Create an Annotation object for each line in the document and append the concepts to a list
        with open(document, "r") as doc:
            for line in doc.readlines():
                an = Annotation(line)
                annotationList.append(an)

        #Run pymetamap over annotations and return semantic types
        annotated_concepts = [a.concept for a in annotationList]
        mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(
            metamap_path, annotated_concepts)

        #Count how many times MetaMap assigned each semantic type to each label
        for ix, annotation in enumerate(annotationList):
            for semtype in mmSemTypes[ix]:
                if semtype not in semtypes:
                    semtypes[semtype] = {
                        key: 0
                        for key in ['test', 'treatment', 'problem']
                    }
                semtypes[semtype][annotation.label] += 1

    #Write the semantic-type tally to a new file
    output_file = os.path.join(output_dir, "output.txt")
    print(output_file)
    with open(output_file, 'w+') as f:
        f.write('semantictype\ttest\ttreatment\tproblem\n')
        for key, value in semtypes.items():
            f.write('%s\t%d\t%d\t%d\n' %
                    (key, value['test'], value['treatment'], value['problem']))

    #Return to the original directory
    os.chdir(cwd)
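This variant produces a corpus-wide profile instead of per-document output; a minimal call (paths hypothetical):

ProcessAnnotations('/opt/public_mm', 'annotations', 'semtype_counts')
# semtype_counts/output.txt will contain one tab-separated row per semantic
# type with its test, treatment, and problem counts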
def ProcessAnnotations(metamap_path, ann_path, silver_ann_path, save_failed = False):
	"""
	Uses MetaMap to corroborate annotations. Annotations where the label on the annotation
	and the label predicted by MetaMap are in agreement are saved to a file with the same name
	in the directory specified by silver_ann_path

	@param metamap_path: Path to MetaMap installation
	@param ann_path: Path to annotation directory
	@param silver_ann_path: Path for newly identified silver standard annotations
	@param save_failed: Save failed annotations to their own separate file for reference, defaults to False
	"""

	#Variables for tracking effectiveness
	totalAnnotations = 0
	totalSilver = 0
	totalAmbiguous = 0
	totalIncorrect = 0

	cwd = os.getcwd()
	os.chdir(ann_path)

	#Create output directory for newly identified silver standard annotations if it doesn't exist
	if not os.path.exists(silver_ann_path):
		os.makedirs(silver_ann_path)

	#Create output directory for non-silver annotations if save_failed parameter is True
	failed_path = os.path.join(silver_ann_path, "failed")
	if save_failed and not os.path.exists(failed_path):
		os.makedirs(failed_path)

	#Iterate over documents in the ann_path directory
	onlyFiles = [f for f in os.listdir() if os.path.isfile(f)]
	current = 0
	fileCount = len(onlyFiles)
	for document in onlyFiles:

		#Strip the extension from the file to get the document name
		docName = os.path.splitext(document)[0]
		current += 1
		print(f'Processing document {current}/{fileCount}, {document}')		

		#Instantiate a list to hold Annotations for each document
		annotationList = []
		silverList = []
		failedList = []
		ambiguousList = []

		#Create an Annotation object for each line in the document and append the concepts to a list
		with open(document, "r") as doc:
			for line in doc.readlines():
				an = Annotation(line)
				annotationList.append(an)

		#Run pymetamap over annotations and return semantic types
		annotated_concepts = [a.concept for a in annotationList]
		mmSemTypes = metamap_helpers.GetMetaMapSemanticTypes(metamap_path, annotated_concepts)

		#Check MetaMap prediction vs annotation label
		for ix, annotation in enumerate(annotationList):
			isSilver, prediction = metamap_helpers.CheckAnnotationAgainstSemTypes(annotation, mmSemTypes[ix])
			
			#If metamap and annotation file agree, add to silver standard list
			if isSilver:
				silverList.append(annotation.original)
			elif prediction == 'none':
				ambiguousList.append(annotation.original)
			else:
				#Always track disagreements so the totals below are accurate;
				#the failed file is only written when save_failed is True
				failedList.append(prediction + ' ' + annotation.original)
		
		#Write new silver standard annotations to a new file
		new_silver_file = os.path.join(silver_ann_path, docName + ".con")
		with open(new_silver_file, 'w') as f:
			for item in silverList:
				f.write(item)

		#Write non-silver annotations to a new file if save_failed = True
		if save_failed:
			new_failed_file = os.path.join(failed_path, docName + "_incorrect.con")
			with open(new_failed_file, 'w') as f:
				for item in failedList:
					f.write(item)
			new_ambiguous_file = os.path.join(failed_path, docName + "_ambiguous.con")
			with open(new_ambiguous_file, 'w') as f:
				for item in ambiguousList:
					f.write(item)
	
		#Evaluation metrics
		totalAnnotations += len(annotationList)
		totalSilver += len(silverList)
		totalAmbiguous += len(ambiguousList)
		totalIncorrect += len(failedList)

	print("Total Annotations: ", str(totalAnnotations))
	print("Total Silver: ", str(totalSilver), str(totalSilver/totalAnnotations))
	print("Total Ambiguous: ", str(totalAmbiguous), str(totalAmbiguous/totalAnnotations))
	print("Total Incorrect: ", str(totalIncorrect), str(totalIncorrect/totalAnnotations))
	
	#Return to the original directory
	os.chdir(cwd)
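Finally, a usage sketch for the silver-standard variant (paths hypothetical); with save_failed=True the disagreeing and ambiguous annotations are also written under a failed/ subdirectory for inspection:

ProcessAnnotations(
    metamap_path='/opt/public_mm',   # hypothetical MetaMap install path
    ann_path='annotations',          # hypothetical .con annotation directory
    silver_ann_path='silver',        # corroborated annotations are written here
    save_failed=True)                # also writes *_incorrect.con and *_ambiguous.con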