def main():
    """Stem all passage tokens of a BioC file and store them as annotations.

    The input path is the first command-line argument when one is given,
    otherwise BIOC_IN.  The collection is loaded through a BioCReader
    (created with DTD_FILE as ``dtd_valid_file``), every passage text is
    tokenized and stemmed, and one annotation per stem is appended to the
    passage.  The resulting collection is written to stdout and then
    written out by a BioCWriter created for BIOC_OUT.
    """
    # A path on the command line overrides the BIOC_IN default
    bioc_in = BIOC_IN
    if len(sys.argv) > 1:
        bioc_in = sys.argv[1]

    # Reader for the input XML, writer for the annotated output,
    # and the NLTK Porter stemmer doing the actual stemming
    reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Load the input and hand the whole collection over to the writer
    reader.read()
    writer.collection = reader.collection

    # Annotation ids run continuously across all documents and passages
    last_id = 0
    for doc in writer.collection.documents:
        for passage in doc:
            # Stem the complete passage before attaching any annotation
            stemmed = [
                porter.stem(word)
                for word in wordpunct_tokenize(passage.text)
            ]

            # enumerate() resumes numbering right after the last id used;
            # with no stems, last_id is left untouched
            for last_id, stem in enumerate(stemmed, last_id + 1):
                # One annotation per token, carrying the stem as its text
                annotation = BioCAnnotation()
                annotation.text = stem
                annotation.id = str(last_id)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Dump the XML to stdout without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(writer))

    # And persist it via the writer
    writer.write()
def main():
    """Annotate each passage of a BioC collection with its stemmed tokens.

    Reads the file named by the first command-line argument (BIOC_IN when
    no argument is given), stems the tokens of every passage and adds one
    annotation per stem, then emits the collection on stdout and through
    the BioCWriter set up for BIOC_OUT.
    """

    def stem_annotation(stem, number):
        # Build the annotation describing a single stemmed token
        note = BioCAnnotation()
        note.text = stem
        note.id = str(number)
        note.put_infon('surface form', 'stemmed token')
        return note

    # Default input unless the caller names another file
    bioc_in = BIOC_IN
    if len(sys.argv) >= 2:
        bioc_in = sys.argv[1]

    # Set up reader, writer and the NLTK Porter stemmer
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    bioc_writer = BioCWriter(BIOC_OUT)
    stemmer = PorterStemmer()

    # Read the input, then share its collection with the writer
    bioc_reader.read()
    bioc_writer.collection = bioc_reader.collection

    # A single counter numbers the annotations over the whole collection
    counter = 0
    for document in bioc_writer.collection.documents:
        for passage in document:
            tokens = wordpunct_tokenize(passage.text)
            stems = [stemmer.stem(token) for token in tokens]

            # Annotations are added in token order
            for stem in stems:
                counter += 1
                passage.add_annotation(stem_annotation(stem, counter))

    # Show the result on stdout (no trailing newline), e.g. for
    # redirecting into output_bioc.xml
    sys.stdout.write(str(bioc_writer))

    # Write the result out as well
    bioc_writer.write()
def main():
    """Read a BioC file and write its collection back out unchanged.

    The collection is loaded from ``test_file`` by a BioCReader created
    with ``dtd_file`` as ``dtd_valid_file``, handed to a BioCWriter for
    'output_bioc.xml', written, and finally printed.
    """
    reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    reader.read()

    # Disabled debugging aid: print the offset of each sentence in the
    # first passage of the first document.
    #
    #   sentences = bioc_reader.collection.documents[0].passages[0].sentences
    #   for sentence in sentences:
    #       print sentence.offset

    writer = BioCWriter('output_bioc.xml')
    writer.collection = reader.collection
    writer.write()

    print(writer)
def write_text_bioc(self, output_path):
    """Write the texts held in ``self.abstract_dict`` as a BioC collection.

    Every value of ``self.abstract_dict`` becomes the text of one passage.
    All passages are added to a single new document, the document to a new
    collection, and the collection is written through a ``BioCWriter``
    created for ``output_path``.

    Args:
        output_path: Destination handed to ``BioCWriter``.
    """
    bioc_writer = BioCWriter(output_path)
    bioc_collection = BioCCollection()

    # Insert option for either writing text only or annotations?
    # to keep document as it is:
    # collection.add_document(self.document)
    bioc_document = BioCDocument()

    # Only the texts are needed, so iterate the values directly instead
    # of looking each key up again.
    for text in self.abstract_dict.values():
        bioc_passage = BioCPassage()
        bioc_passage.text = text
        bioc_document.add_passage(bioc_passage)

    bioc_collection.add_document(bioc_document)

    # Single pre-formatted argument: prints the same line under Python 2
    # and Python 3 (the former print statement is a SyntaxError on 3).
    print('BioC output path %s' % (output_path,))

    bioc_writer.collection = bioc_collection
    bioc_writer.write()
occurrences = [] for mesh in mesh_terms: #search for start and end positions with case insensitivity #if there is no abstract (no passage.text) in the document, the regex search will result in an error - use try-except block try: positions = [(a.start(), a.end()) for a in list(re.finditer(mesh.lower(), passage.text.lower()))] if positions: for tuples in positions: #passage.text[tuples[0]:tuples[0]+len(mesh)] would return the identified term # the triple is (start position, term length, MeSH term) occurrences.append((tuples[0], len(mesh), mesh)) except: print doc.id, mesh, "Mesh term not found." # sorted by start positions occurrences.sort() for triple in occurrences: # ToDo: add a condition for nested tags? # add annotation with infon, location, and text bioc_annotation = add_annotation(triple,str(annotation_id)+annotation_type) passage.add_annotation(bioc_annotation) # increment annotation_id annotation_id += 1 # write XML format to output file bioc_writer.write() # debug: #print(bioc_writer) # disconnect from database cursor.close() connection.close()
positions = [(a.start(), a.end()) for a in list( re.finditer(mesh.lower(), passage.text.lower()))] if positions: for tuples in positions: #passage.text[tuples[0]:tuples[0]+len(mesh)] would return the identified term # the triple is (start position, term length, MeSH term) occurrences.append( (tuples[0], len(mesh), mesh)) except: print doc.id, mesh, "Mesh terms not found." # sorted by start positions occurrences.sort() for triple in occurrences: # ToDo: add a condition for nested tags? # add annotation with infon, location, and text bioc_annotation = add_annotation( triple, str(annotation_id) + annotation_type) passage.add_annotation(bioc_annotation) # increment annotation_id annotation_id += 1 # write XML format to output file bioc_writer.write() # debug: #print(bioc_writer) # disconnect from database cursor.close() connection.close()