def main():
    """Annotate every passage of a BioC file with its Porter-stemmed tokens.

    The input path is the first command-line argument when one is given,
    otherwise ``BIOC_IN``. Each stem becomes one annotation on its passage;
    annotation ids are consecutive integers (as strings) counted across the
    whole collection. The result is written to stdout and then to the file
    the writer was created for (``BIOC_OUT``).
    """
    # Use the file named by BIOC_IN unless a path was passed on the command line.
    source_path = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the example BioC XML document; DTD_FILE is handed to it
    # as the DTD to validate against.
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    # Writer that will emit the annotated data.
    writer = BioCWriter(BIOC_OUT)
    # The NLTK Porter stemmer does the stemming.
    stemmer = PorterStemmer()

    # Parse the input and hand the collection over to the writer.
    reader.read()
    writer.collection = reader.collection

    next_id = 0
    for document in writer.collection.documents:
        for passage in document:
            # Stem all tokens of the passage up front, keeping their order.
            stems = [
                stemmer.stem(token)
                for token in wordpunct_tokenize(passage.text)
            ]
            # One annotation per token, carrying the stemmed surface form.
            # (The annotations are collectively added following the
            # passage's <text> tag.)
            for stem in stems:
                next_id += 1
                annotation = BioCAnnotation()
                annotation.text = stem
                annotation.id = str(next_id)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Print the document to the screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml) ...
    sys.stdout.write(str(writer))
    # ... and write it to disk.
    writer.write()
def main():
    """Read a BioC document, add one stem annotation per token, and emit it.

    Input comes from ``sys.argv[1]`` if present, else from ``BIOC_IN``.
    Output goes to stdout and to the writer's target file (``BIOC_OUT``).
    """
    # Default input; overridden by the first command-line argument.
    argv = sys.argv
    bioc_in = BIOC_IN
    if len(argv) >= 2:
        bioc_in = argv[1]

    # Set up reader (with DTD_FILE as its DTD), writer and the NLTK
    # Porter stemmer, in that order.
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    bioc_writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Load the input file and pass its data over to the writer.
    bioc_reader.read()
    bioc_writer.collection = bioc_reader.collection

    # Running counter used as the annotation id across all documents.
    counter = 0
    for doc in bioc_writer.collection.documents:
        for passage in doc:
            tokens = wordpunct_tokenize(passage.text)
            stemmed = [porter.stem(tok) for tok in tokens]

            # Attach the stems to the passage in token order; each
            # annotation provides the surface form of a 'stemmed token'.
            for text in stemmed:
                counter += 1
                note = BioCAnnotation()
                note.text = text
                note.id = str(counter)
                note.put_infon('surface form', 'stemmed token')
                passage.add_annotation(note)

    # Dump to the screen without a trailing newline (so it can be
    # redirected into a file, e.g. output_bioc.xml), then write to disk.
    sys.stdout.write(str(bioc_writer))
    bioc_writer.write()
def main():
    """Read ``test_file`` as BioC, write it back out unchanged and print it.

    The collection is written to ``output_bioc.xml`` and the writer is
    then printed to stdout.
    """
    reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    reader.read()

    # Debugging aid kept from the original (disabled): iterate over
    # reader.collection.documents[0].passages[0].sentences and print each
    # sentence's offset.

    writer = BioCWriter('output_bioc.xml')
    writer.collection = reader.collection
    writer.write()
    print(writer)
def process_bioc_file(bioc_file, output_dot_dir, options=None, args=None):
    """Read a BioC file and process each of its documents separately.

    For every document in the collection a graph is built with
    ``bioc_document_to_graph`` and handed to ``write_graph`` together with
    ``output_dot_dir``; the document id is used as the document name.

    Args:
        bioc_file: path of the BioC file passed to ``BioCReader``.
        output_dot_dir: output directory forwarded to ``write_graph``.
        options: forwarded unchanged to ``write_graph``.
        args: forwarded unchanged to ``write_graph``.
    """
    bioc_reader = BioCReader(bioc_file)
    bioc_reader.read()

    for one_document in bioc_reader.collection.documents:
        doc_id = one_document.id
        # Single-argument print(...) produces the same line on Python 2
        # and Python 3 (the former print statement was Python 2 only).
        print('Document found %s' % (doc_id,))

        bioc_graph = bioc_document_to_graph(one_document)
        write_graph(bioc_graph, output_dot_dir, doc_id,
                    options=options, args=args)
def _load(self, d):
    """Load every ``*.xml`` BioC file found directly in directory *d*.

    The parsed collections are stored, in glob order, in
    ``self.biocDocs`` (reset on each call). Progress is reported on
    stderr depending on ``self.verbose``: any truthy value announces the
    directory and prints a running count every 20 files; a value above 1
    prints each file name instead.

    Raises:
        BioIDScoreError: if a file cannot be read or parsed.
    """
    self.biocDocs = []
    if self.verbose:
        sys.stderr.write("Reading %s\n" % (d,))

    filesRead = 0
    for xmlFile in glob.glob(os.path.join(d, "*.xml")):
        try:
            r = BioCReader(xmlFile, dtd_valid_file=BIOC_DTD)
            r.read()
        except Exception as e:
            # Any reader failure is reported as a scoring error.
            raise BioIDScoreError(
                "encountered error reading collection: %s" % str(e))

        # The collection contains a document for each caption.
        # Each caption contains a single passage.
        self.biocDocs.append(r.collection)
        filesRead += 1

        if self.verbose > 1:
            sys.stderr.write("Read %s\n" % (xmlFile,))
        elif self.verbose and (filesRead % 20 == 0):
            sys.stderr.write("... %s\n" % (filesRead,))
# Connection settings for the PostgreSQL database; the database name comes
# from the command-line options.
postgres_password = "******"
postgres_host = "localhost"
postgres_port = "5432"
postgres_db = options.d
connection = psycopg2.connect("dbname='"+postgres_db+"' user='******' host='"+postgres_host+"' password='******' port='"+postgres_port+"'")
# DictCursor is requested as the cursor factory for this cursor.
cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)

# save file names in an extra variable
input_file = options.i
dtd_file = options.b
output_file = options.o

# open input files
try:
    bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file)
    bioc_reader.read()
except Exception:
    # Only real errors are caught here; a bare "except:" would also trap
    # KeyboardInterrupt/SystemExit and report them as an input problem.
    ## debug:
    #raise
    sys.exit("Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written.")

# the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations)
# define output file
bioc_writer = BioCWriter(output_file)
# initialization for reading input file
bioc_writer.collection = bioc_reader.collection
# get documents (one PubMed-ID with title and text equals one document)
docs = bioc_writer.collection.documents
# different annotation IDs can be confusing - add a type to the iterating number
annotation_type = "_MeSH"
# iteration over PubMed abstracts with ID, title, and text
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
    Copyright (c) 2015, Kersten Doering <*****@*****.**>

    This script reads annotations from a given XML document in BioC format and prints them to command-line.
"""
# Makes print() a function on Python 2 as well, so the script runs
# unchanged (and prints the same lines) on Python 2 and Python 3.
from __future__ import print_function

# PyBioC API
from bioc import BioCReader

# open BioC XML file and DTD file with XML structure definitions
bioc_reader = BioCReader("text_PubTator.xml", dtd_valid_file="BioC.dtd")
# read files
bioc_reader.read()
# get documents from BioC XML file (PubMed abstracts)
docs = bioc_reader.collection.documents

# iterate over documents
for doc in docs:
    # show document ID (PubMed ID)
    print("PubMed ID:", doc.id)
    # iterate over passages - PubMed titles and abstracts
    for passage in doc.passages:
        # show passage type
        print("Text type:", passage.infons['type'])
        # iterate over annotations for each passage and show information
        for annotation in passage.annotations:
            print("Annotation ID:", annotation.id)
            print("Annotation Type:", annotation.infons['type'])
            print("Annotation Text:", annotation.text)
def parse(self):
    """Read ``self.filename`` with a ``BioCReader`` and return its collection.

    ``BioC_DTD`` is passed to the reader as the DTD file.
    """
    reader = BioCReader(self.filename, dtd_valid_file=BioC_DTD)
    reader.read()
    return reader.collection
class BioCCollectionHandler(object):
    """Wrap the documents of one BioC file and write them out one by one.

    Instance attributes set by ``__init__``:
        bioc_reader: the ``BioCReader`` used to read the file.
        raw_collection: the collection read from the file.
        raw_documents: the documents of that collection.
        document_list: one ``BioCAbstractHandler`` per raw document.
        id_list: the ids of the handlers in ``document_list``.
        pmid_abstracts_dict: mapping of handler id -> handler.
    """

    def __init__(self, bioc_file_path, options=None, args=None):
        """Read *bioc_file_path* and build the handler list and lookups.

        Raises:
            Exception: see ``get_documents``.
        """
        self.bioc_reader = BioCReader(bioc_file_path)
        self.bioc_reader.read()
        self.raw_collection = self.bioc_reader.collection
        self.raw_documents = self.bioc_reader.collection.documents
        self.document_list = self.get_documents(options=options, args=args)
        self.id_list = self.get_ids()
        # NOTE: from here on the instance attribute shadows the method of
        # the same name; callers use ``pmid_abstracts_dict`` as a dict.
        self.pmid_abstracts_dict = self.pmid_abstracts_dict()

    def get_documents(self, options=None, args=None):
        """Return a ``BioCAbstractHandler`` for each raw document.

        Raises:
            Exception: if ``options.filename`` is set, the file holds more
                than one document and ``options.pmid`` is not set (one
                fixed file name cannot receive several documents).
        """
        document_list = [BioCAbstractHandler(one_document)
                         for one_document in self.raw_documents]
        try:
            ambiguous_target = (options.filename
                                and len(document_list) > 1
                                and not options.pmid)
        except AttributeError:
            # No options object (or one without these attributes):
            # nothing to validate.
            ambiguous_target = False
        if ambiguous_target:
            raise Exception('More than one document in BioC file. Remove --filename option!')
        return document_list

    def pmid_abstracts_dict(self):
        """Return a dict mapping each handler's id to the handler."""
        return dict((one_doc.id, one_doc) for one_doc in self.document_list)

    def get_document(self, pmid):
        """Return the handler stored under *pmid* (KeyError if unknown)."""
        return self.pmid_abstracts_dict[pmid]

    def get_ids(self):
        """Return the ids of all handlers, in ``document_list`` order."""
        return [one_doc.id for one_doc in self.document_list]

    @staticmethod
    def _option(options, name):
        """Return ``options.<name>``, or None if options is falsy or lacks it."""
        if not options:
            return None
        return getattr(options, name, None)

    def _handlers_for(self, pmid):
        """Return the handler for *pmid*, or all handlers when pmid is falsy."""
        if not pmid:
            return list(self.pmid_abstracts_dict.values())
        try:
            return [self.pmid_abstracts_dict[pmid]]
        except KeyError:
            raise Exception('Target Pubmed ID could not be found in BioC collection')

    @staticmethod
    def _resolve_output_path(output_dir, filename, default_name, announce=False):
        """Build the output path for one document.

        Without a *filename* the path is ``output_dir/default_name``. If
        *filename* contains *output_dir*, only its last '/'-separated
        component is kept (``split('/')[-1]`` returns the whole string when
        there is no '/'); otherwise *filename* is appended as given.
        """
        if not filename:
            return output_dir + '/' + default_name
        if output_dir in filename:
            filename = filename.split('/')[-1]
            if announce:
                print('FILENAME %s' % (filename,))
        return output_dir + '/' + filename

    def write_og_xml_files(self, output_dir, options=None, args=None):
        """Write ``<id>_og.xml`` (or ``options.filename``) via ``OG_XMLWriter``.

        With ``options.pmid`` set only that document is written, otherwise
        all documents are. ``options`` may be None, in which case every
        document is written under its default name.

        Raises:
            Exception: if ``options.pmid`` is not in the collection.
        """
        pmid = self._option(options, 'pmid')
        filename = self._option(options, 'filename')
        for abstract_handler in self._handlers_for(pmid):
            output_path = self._resolve_output_path(
                output_dir, filename, abstract_handler.id + '_og.xml',
                announce=bool(pmid))
            og_writer = OG_XMLWriter(abstract_handler, output_path)
            og_writer.write()

    def write_bioc_xml_files(self, output_dir, options=None, args=None):
        """Write ``<id>.bioc`` (or ``options.filename``) for the documents.

        With ``options.pmid`` set only that document is written, otherwise
        all documents are. Each handler's ``write_text_bioc`` is called
        with the resolved output path.

        Raises:
            Exception: if ``options.pmid`` is not in the collection.
        """
        pmid = self._option(options, 'pmid')
        filename = self._option(options, 'filename')
        for abstract_handler in self._handlers_for(pmid):
            output_path = self._resolve_output_path(
                output_dir, filename, abstract_handler.id + '.bioc',
                announce=bool(pmid))
            abstract_handler.write_text_bioc(output_path)