def main():
    """Stem every token of a BioC collection and store the stems as annotations.

    The input file is the first command-line argument when one is given,
    otherwise BIOC_IN. The annotated collection is echoed to stdout and
    then written to disk through a BioCWriter created for BIOC_OUT.
    """
    # A command-line argument overrides the module-level default.
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the input BioC XML; the BioC DTD is used for validation.
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)

    # Writer that will emit the annotated data.
    bioc_writer = BioCWriter(BIOC_OUT)

    # NLTK Porter stemmer.
    stemmer = PorterStemmer()

    # Parse the input, then hand the parsed collection to the writer.
    bioc_reader.read()
    bioc_writer.collection = bioc_reader.collection

    # Annotation ids are numbered continuously across all documents and
    # passages, starting at 1.
    annotation_id = 0
    for document in bioc_writer.collection.documents:
        for passage in document:
            stems = [
                stemmer.stem(token)
                for token in wordpunct_tokenize(passage.text)
            ]

            # One annotation per token, in token order. The enumerate start
            # continues from the last id used; when a passage has no tokens
            # the counter is left untouched.
            for annotation_id, stem in enumerate(stems, annotation_id + 1):
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Echo the document to stdout without a trailing newline (so it can be
    # redirected into a file, e.g. output_bioc.xml) ...
    sys.stdout.write(str(bioc_writer))

    # ... and write it to disk.
    bioc_writer.write()
def main():
    """Annotate each passage of a BioC collection with its stemmed tokens.

    Reads the file named by the first command-line argument (falling back
    to BIOC_IN), adds one annotation per token holding the token's Porter
    stem, prints the result to stdout and writes it via BioCWriter(BIOC_OUT).
    """

    def make_stem_annotation(number, stem):
        # Build the annotation for one stemmed token; `number` becomes the
        # annotation id (as a string).
        annotation = BioCAnnotation()
        annotation.text = stem
        annotation.id = str(number)
        annotation.put_infon('surface form', 'stemmed token')
        return annotation

    # Use BIOC_IN unless a path was given on the command line.
    if len(sys.argv) >= 2:
        source_path = sys.argv[1]
    else:
        source_path = BIOC_IN

    # Reader (validating against the BioC DTD), writer and stemmer.
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Load the input and pass the collection over to the writer.
    reader.read()
    writer.collection = reader.collection

    # Running id shared by every annotation in the collection.
    last_id = 0
    for document in writer.collection.documents:
        for passage in document:
            tokens = wordpunct_tokenize(passage.text)
            stems = [porter.stem(token) for token in tokens]

            # Annotations are appended in token order.
            for stem in stems:
                last_id += 1
                passage.add_annotation(make_stem_annotation(last_id, stem))

    # Print to screen without a trailing newline (can be redirected into a
    # file, e.g. output_bioc.xml).
    sys.stdout.write(str(writer))

    # Write to disk.
    writer.write()
def main():
    """Read the BioC test file and write its collection back out.

    The collection parsed from ``test_file`` is handed to a BioCWriter
    targeting 'output_bioc.xml' without being modified here; the writer is
    printed after the file has been written.
    """
    reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    reader.read()

    # Disabled debugging aid kept for reference:
    #   sentences = reader.collection.documents[0].passages[0].sentences
    #   for sentence in sentences:
    #       print sentence.offset

    writer = BioCWriter('output_bioc.xml')
    writer.collection = reader.collection
    writer.write()

    print(writer)
def write_text_bioc(self, output_path):
    """Write the texts held in ``self.abstract_dict`` to a BioC XML file.

    Every value of ``self.abstract_dict`` becomes the text of one passage.
    All passages are placed in a single new document, which is the only
    document of a new collection.

    Arguments:
    output_path -- path passed to BioCWriter for the output file
    """
    bioc_writer = BioCWriter(output_path)
    bioc_collection = BioCCollection()

    # Insert option for either writing text only or annotations?
    # To keep the document as it is:
    #   collection.add_document(self.document)
    bioc_document = BioCDocument()

    # Only the texts are needed, so iterate the values directly instead of
    # looking each key up again.
    for passage_text in self.abstract_dict.values():
        bioc_passage = BioCPassage()
        bioc_passage.text = passage_text
        bioc_document.add_passage(bioc_passage)

    bioc_collection.add_document(bioc_document)

    # Single-argument print call: emits the same line on Python 2 and 3
    # (the former print statement was a SyntaxError on Python 3).
    print('BioC output path %s' % (output_path,))

    bioc_writer.collection = bioc_collection
    bioc_writer.write()
output_file = options.o # open input files try: bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file) bioc_reader.read() except: ## debug: #raise sys.exit("Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written.") # the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations) # define output file bioc_writer = BioCWriter(output_file) # initialization for reading input file bioc_writer.collection = bioc_reader.collection # get documents (one PubMed-ID with title and text equals one document) docs = bioc_writer.collection.documents # different annotation IDs can be confusing - add a type to the iterating number annotation_type = "_MeSH" # iteration over PubMed abstracts with ID, title, and text for doc in docs: # get MeSH terms from PostgreSQL database mesh_terms = get_MeSH_terms(doc.id) # use annotation IDs starting at 0 for each document annotation_id = 0 # if at least one MeSH term was found for this PubMed-ID (doc.id) if mesh_terms: # abstract title is one passage and abstract text is one passage for passage in doc: # first save all occurrences of terms for a document passage with position and length, sort them afterwards, and then create infons with incremental annotation ID
def convert_pubtator(input_path, output_path):
    """Convert pubtators annotation list to BioC XML

    Documents are written to the output one at a time instead of being
    collected in memory first.

    Keyword Arguments:
    input_path -- the path of pubtators annotation file
    output_path -- the path to output the BioC XML file
    """

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because hangs otherwise.
        # Serialize the (still empty) collection and split it into
        # everything before the last line and the last line itself, which
        # is written only after all documents.
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)

        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a character position, not the passage text.
            # NOTE(review): assumes the PubTator convention that the
            # abstract starts one separator character after the title --
            # confirm against the annotation offsets in the input.
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # Annotation ids run continuously over the title annotations
            # and then the abstract annotations.
            id_index = 0
            tagged_passages = (
                (title_passage, article["title_annot"]),
                (abstract_passage, article["abstract_annot"]),
            )
            for passage, tags in tagged_passages:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway <collection> parent so
            # that only this document's element is serialized.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
def convert_pubtator(input_file, output_file=None):
    """Convert pubtators annotation list to BioC XML

    Documents are written to the output one at a time instead of being
    collected in memory first.

    Keyword Arguments:
    input_file -- the path of pubtators annotation file
    output_file -- the path to output the converted text
                   (defaults to "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:

        # Have to manually do this because hangs otherwise.
        # Serialize the (still empty) collection and cut off the closing
        # tag, which is written only after all documents.
        # The tail is a bytes literal because the file is opened in binary
        # mode and the serialized header is sliced by its length; a text
        # string fails on Python 3 (bytes and str are the same on Python 2).
        xml_header = writer.tostring('UTF-8')
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)

        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["Title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a character position, not the passage text.
            # NOTE(review): assumes the PubTator convention that the
            # abstract starts one separator character after the title --
            # confirm against the annotation offsets in the input.
            abstract_passage.offset = str(len(article["Title"]) + 1)
            abstract_passage.text = article["Abstract"]

            # Annotation ids run continuously over the title annotations
            # and then the abstract annotations.
            id_index = 0
            tagged_passages = (
                (title_passage, article["Title_Annot"]),
                (abstract_passage, article["Abstract_Annot"]),
            )
            for passage, tags in tagged_passages:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway <collection> parent so
            # that only this document's element is serialized.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)
# open input files try: bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file) bioc_reader.read() except: ## debug: #raise sys.exit( "Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written." ) # the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations) # define output file bioc_writer = BioCWriter(output_file) # initialization for reading input file bioc_writer.collection = bioc_reader.collection # get documents (one PubMed-ID with title and text equals one document) docs = bioc_writer.collection.documents # different annotation IDs can be confusing - add a type to the iterating number annotation_type = "_MeSH" # iteration over PubMed abstracts with ID, title, and text for doc in docs: # get MeSH terms from PostgreSQL database mesh_terms = get_MeSH_terms(doc.id) # use annotation IDs starting at 0 for each document annotation_id = 0 # if at least one MeSH term was found for this PubMed-ID (doc.id) if mesh_terms: # abstract title is one passage and abstract text is one passage for passage in doc: # first save all occurrences of terms for a document passage with position and length, sort them afterwards, and then create infons with incremental annotation ID