def main():
    """Annotate every passage of a BioC collection with its stemmed tokens.

    The input path is the first command-line argument when one is given,
    otherwise BIOC_IN.  Each token of each passage text is stemmed and
    attached to the passage as its own annotation, numbered consecutively
    across the whole collection starting at 1.  The resulting collection
    is written to standard output and then saved through the writer.
    """
    # A path given on the command line overrides the BIOC_IN default
    source_path = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the input BioC XML; DTD_FILE is handed over for validation
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)

    # Writer that will emit the annotated data
    writer = BioCWriter(BIOC_OUT)

    # Porter stemmer used for all tokens
    porter = PorterStemmer()

    # Load the input document into the reader
    reader.read()

    # The writer works on the very collection the reader produced
    writer.collection = reader.collection

    last_id = 0
    for document in writer.collection.documents:
        for passage in document:
            # Stem the complete passage first, then annotate in token order
            stems = list(map(porter.stem, wordpunct_tokenize(passage.text)))

            # Continue numbering right after the last id handed out so far;
            # an empty passage leaves the counter untouched
            for last_id, stem in enumerate(stems, start=last_id + 1):
                # One annotation per token, carrying the stemmed form
                stem_annotation = BioCAnnotation()
                stem_annotation.text = stem
                stem_annotation.id = str(last_id)
                stem_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(stem_annotation)

    # Dump to the screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(writer))

    # Persist to disk
    writer.write()
def test_should_get_gene_names_normalised(self, list_dict, expected_genes):
    """Verify get_gene_names_normalised against an expected set of genes.

    :param list_dict: infon dictionaries; each one becomes the infons of
        one annotation in a single passage
    :param expected_genes: iterable of gene names, compared as a set
    """
    # Arrange: one document holding one passage with an annotation per dict
    sut = BiocAnnotationGenes()
    document = BioCDocument()
    passage = BioCPassage()
    document.add_passage(passage)
    for infons in list_dict:
        gene_annotation = BioCAnnotation()
        gene_annotation.infons = infons
        passage.add_annotation(gene_annotation)

    # Act
    result = sut.get_gene_names_normalised(document)

    # Assert
    self.assertEqual(set(expected_genes), result)
def brat2bioc_entity(bratentity: BratEntity) -> BioCAnnotation:
    """Build a BioCAnnotation from a brat entity.

    The id and text are copied over, the entity type is stored under the
    'type' infon, and every span in ``bratentity.locations`` becomes a
    BioCLocation whose offset is the span begin and whose length is
    ``end - begin``.
    """
    annotation = BioCAnnotation()
    annotation.id = bratentity.id
    annotation.text = bratentity.text
    annotation.infons['type'] = bratentity.type

    for span in bratentity.locations:
        # brat spans are (begin, end); BioC locations are (offset, length)
        span_length = span.end - span.begin
        annotation.add_location(BioCLocation(span.begin, span_length))

    return annotation
def to_bioc(self):
    """Return this entity as a BioCAnnotation.

    The annotation carries the entity type under the 'type' infon, the
    entity text, the id converted to a string, and a single location that
    starts at ``self.start`` and spans ``len(self.text)`` characters.
    """
    annotation = BioCAnnotation()
    annotation.infons['type'] = self.type
    annotation.text = self.text
    annotation.id = str(self.id)

    # The location length is derived from the text, not stored separately
    text_length = len(self.text)
    annotation.add_location(BioCLocation(self.start, text_length))
    return annotation
def __parse_annotation(self, tree):
    """Build a BioCAnnotation from an annotation element.

    Reads the 'id' attribute, the infons (via ``__parse_infons``), the
    content of the 'text' child and one BioCLocation per 'location' child,
    whose 'offset' and 'length' attributes are converted to int.

    Raises KeyError when the 'id', 'offset' or 'length' attribute is absent.
    """
    result = BioCAnnotation()
    result.id = tree.attrib['id']
    result.infons = self.__parse_infons(tree)
    result.text = tree.findtext('text')

    for location_elem in tree.findall('location'):
        attributes = location_elem.attrib
        offset = int(attributes['offset'])
        length = int(attributes['length'])
        result.add_location(BioCLocation(offset, length))

    return result
def test_should_get_gene_names_to_normalised_dict(
        self, list_gene_dict_in_passage, expected_dict):
    """Verify get_gene_names_to_normalised_dict against an expected mapping.

    :param list_gene_dict_in_passage: one list of gene dictionaries per
        passage; each dictionary must have a "text" key and is also used
        as the infons of the annotation built from it
    :param expected_dict: mapping the method under test should return
    """
    # Arrange: one passage per inner list, one annotation per dictionary
    sut = BiocAnnotationGenes()
    document = BioCDocument()
    for gene_dicts in list_gene_dict_in_passage:
        passage = BioCPassage()
        document.add_passage(passage)
        for gene_info in gene_dicts:
            gene_annotation = BioCAnnotation()
            gene_annotation.text = gene_info["text"]
            gene_annotation.infons = gene_info
            passage.add_annotation(gene_annotation)

    # Act
    result = sut.get_gene_names_to_normalised_dict(document)

    # Assert
    self.assertEqual(expected_dict, result)
def main():
    """Read a BioC file, add one stem annotation per token, and write it out.

    Input comes from ``sys.argv[1]`` if present, else from BIOC_IN.  The
    annotated collection is printed to standard output and then written
    through a BioCWriter created for BIOC_OUT.
    """
    # Fall back to BIOC_IN unless a file was named on the command line
    arguments = sys.argv
    input_file = arguments[1] if len(arguments) >= 2 else BIOC_IN

    # Holds the input BioC XML document; DTD_FILE is passed for validation
    bioc_reader = BioCReader(input_file, dtd_valid_file=DTD_FILE)
    # Writes out the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)
    # Stemmer applied to every token
    stemmer = PorterStemmer()

    # Parse the input file
    bioc_reader.read()

    # Hand the parsed collection over to the writer
    bioc_writer.collection = bioc_reader.collection

    # Ids run over the whole collection, starting at 1
    next_id = 1
    for doc in bioc_writer.collection.documents:
        for passage in doc:
            tokens = wordpunct_tokenize(passage.text)
            stemmed_tokens = [stemmer.stem(tok) for tok in tokens]

            # Annotations are appended in token order
            for stemmed in stemmed_tokens:
                token_annotation = BioCAnnotation()
                token_annotation.text = stemmed
                token_annotation.id = str(next_id)
                token_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(token_annotation)
                next_id += 1

    # Show the result on screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))

    # Save to disk
    bioc_writer.write()
def add_annotation(triple, annotation_id):
    """Create a 'MeSH term' BioCAnnotation from an (offset, length, term) triple.

    :param triple: indexable where ``triple[0]`` is the start offset,
        ``triple[1]`` the term length and ``triple[2]`` the term text
    :param annotation_id: identifier for the annotation; stored as a string
    :return: the populated BioCAnnotation with a single location
    """
    offset, length, term = triple[0], triple[1], triple[2]

    # Where the term was found; both fields are stored as strings
    span = BioCLocation()
    span.length = str(length)
    span.offset = str(offset)

    # The annotation itself: original term text, id and type infon
    mesh_annotation = BioCAnnotation()
    mesh_annotation.text = term
    mesh_annotation.id = str(annotation_id)
    mesh_annotation.put_infon('type', 'MeSH term')
    mesh_annotation.add_location(span)

    return mesh_annotation
def __read_annotation(self, start_elem):
    """Consume parser events until the current annotation element ends.

    :param start_elem: the opening annotation element; its 'id' attribute
        becomes the annotation id
    :return: a BioCAnnotation filled from the 'text', 'infon' and
        'location' elements seen before the closing 'annotation' tag
    :raises RuntimeError: if the events run out before that closing tag
    """
    annotation = BioCAnnotation()
    annotation.id = start_elem.get('id')

    while self.__has_next():
        event, elem = self.__next_event()

        # Start events are ignored; only end events are acted upon
        if event != 'end':
            continue

        tag = elem.tag
        if tag == 'annotation':
            return annotation

        if tag == 'text':
            annotation.text = elem.text
        elif tag == 'infon':
            annotation.infons[elem.get('key')] = elem.text
        elif tag == 'location':
            offset = int(elem.get('offset'))
            length = int(elem.get('length'))
            annotation.add_location(BioCLocation(offset, length))

    raise RuntimeError("should not reach here")  # pragma: no cover
def bioconcepts2pubtator_annotations(tag, index):
    """Bioconcepts to Annotations

    Specifically for bioconcepts2pubtator and converts each annotation
    into an annotation object that BioC can parse.

    Keyword Arguments:
    tag -- the parsed annotation line, indexed by key; the keys read here
           are "type", "tag_id", "start" and "term"
    index -- the id of each document specific annotation

    Returns:
    A BioCAnnotation whose id is ``str(index)``, whose text is the term,
    whose infons hold the type plus the concept identifier(s), and which
    has one location (offset and length stored as strings).

    Identifier infons are chosen from the annotation type:
    - "Gene"                    -> "NCBI Gene"
    - "Species"                 -> "NCBI Species"
    - contains "Mutation"/"SNP" -> "tmVar"
    - anything else             -> keyed by the identifier's own prefix
      (the part before the first ":"), or "MESH" when it has no prefix;
      "Unknown" when there is no identifier at all.  Several identifiers
      separated by "|" are written as "<prefix> <n>" with n counting
      from 0.
    """
    annt = BioCAnnotation()
    annt.id = str(index)
    annt.infons["type"] = tag["type"]

    # A missing type is treated as "" so the substring checks below work
    tag_type = tag["type"] or ""
    tag_id = tag["tag_id"]

    if tag_type == "Gene":
        annt.infons["NCBI Gene"] = tag_id
    elif tag_type == "Species":
        annt.infons["NCBI Species"] = tag_id
    elif "Mutation" in tag_type or "SNP" in tag_type:
        annt.infons["tmVar"] = tag_id
    elif not tag_id:
        # No identifier was supplied for this annotation
        annt.infons["MESH"] = "Unknown"
    elif "|" in tag_id:
        # Several identifiers: write each one out under its own numbered key
        for tag_num, single_id in enumerate(tag_id.split("|")):
            # partition() splits on the first ":" only, so an identifier
            # that itself contains ":" no longer raises ValueError
            term_type, separator, term_id = single_id.partition(":")
            if separator:
                annt.infons["{} {}".format(term_type, tag_num)] = term_id
            else:
                # Some ids lack the MESH:#### form; store this id alone,
                # not the whole "|"-joined string
                annt.infons["MESH {}".format(tag_num)] = single_id
    else:
        term_type, separator, term_id = tag_id.partition(":")
        if separator:
            annt.infons[term_type] = term_id
        else:
            # Some ids lack the MESH:#### form
            annt.infons["MESH"] = tag_id

    # The length is derived from the term text rather than read from the tag
    location = BioCLocation()
    location.offset = str(tag["start"])
    location.length = str(len(tag["term"]))
    annt.locations.append(location)

    annt.text = tag["term"]
    return annt