Exemple #1
0
def main():
    """Annotate every token of a BioC collection with its Porter stem.

    The input file is the first command-line argument when one is given,
    otherwise BIOC_IN. Each passage text is tokenized, every token is
    stemmed, and one annotation per stem is added to the passage with a
    running id that is shared across all documents. The result is written
    to standard output and to the file the writer was created with
    (BIOC_OUT).
    """
    # Command-line argument wins over the built-in default
    bioc_in = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the input (given the DTD file), writer for the result
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)
    bioc_writer = BioCWriter(BIOC_OUT)

    # NLTK Porter stemmer
    stemmer = PorterStemmer()

    # Load the input and hand the collection over to the writer, so the
    # annotations added below end up in the output
    bioc_reader.read()
    bioc_writer.collection = bioc_reader.collection

    annotation_id = 0
    for document in bioc_writer.collection.documents:
        for passage in document:
            # Stem the whole passage first, then annotate in token order
            stems = [
                stemmer.stem(token)
                for token in wordpunct_tokenize(passage.text)
            ]

            # enumerate() continues the running id; after the loop
            # annotation_id holds the last id handed out (or is left
            # unchanged when the passage produced no tokens)
            for annotation_id, stem in enumerate(stems, annotation_id + 1):
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Dump to the screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))

    # Persist to disk
    bioc_writer.write()
Exemple #2
0
def main():
    """Add a 'stemmed token' annotation for every token in a BioC file.

    Reads the file named by the first command-line argument (BIOC_IN when
    no argument is given), stems each token of each passage with the NLTK
    Porter stemmer and attaches the stems as consecutively numbered
    annotations. The annotated collection is echoed to standard output and
    written to disk by the writer created for BIOC_OUT.
    """

    def build_annotation(number, stem):
        # One annotation per token; its text is the stem itself
        annotation = BioCAnnotation()
        annotation.text = stem
        annotation.id = str(number)
        annotation.put_infon('surface form', 'stemmed token')
        return annotation

    # Default input unless a path was supplied on the command line
    source_path = BIOC_IN
    if len(sys.argv) >= 2:
        source_path = sys.argv[1]

    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    writer = BioCWriter(BIOC_OUT)
    porter = PorterStemmer()

    # Read the input, then let the writer work on the same collection
    reader.read()
    writer.collection = reader.collection

    # Annotation ids keep counting across passages and documents
    counter = 0
    for document in writer.collection.documents:
        for passage in document:
            tokens = wordpunct_tokenize(passage.text)
            stems = [porter.stem(token) for token in tokens]
            for stem in stems:
                counter += 1
                passage.add_annotation(build_annotation(counter, stem))

    # Screen output has no trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(writer))

    # File output
    writer.write()
Exemple #3
0
def main():
    """Read ``test_file`` and write its collection back out unchanged.

    The collection read from ``test_file`` (with ``dtd_file`` passed as the
    DTD) is attached to a writer for 'output_bioc.xml', written to disk
    and finally printed.
    """
    reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    reader.read()

    # Debugging hint: sentence offsets can be inspected via
    # reader.collection.documents[0].passages[0].sentences

    writer = BioCWriter('output_bioc.xml')
    writer.collection = reader.collection
    writer.write()
    print(writer)
Exemple #4
0
def main():
    """Round-trip a BioC document from ``test_file`` to 'output_bioc.xml'.

    Nothing is modified in between: the writer is given the very
    collection the reader produced, writes it to disk, and is then
    printed.
    """
    source = BioCReader(test_file, dtd_valid_file=dtd_file)
    source.read()

    sink = BioCWriter('output_bioc.xml')
    sink.collection = source.collection

    # Write to disk first, then show the writer on standard output
    sink.write()
    print(sink)
def process_bioc_file(bioc_file, output_dot_dir, options=None, args=None):
    '''Read a BioC file and process each of its documents separately.

    For every document in the collection, a graph is built with
    bioc_document_to_graph() and handed to write_graph(), using the
    document id as the document name.

    bioc_file      -- path of the BioC file to read
    output_dot_dir -- passed through to write_graph()
    options, args  -- passed through to write_graph() unchanged
    '''

    bioc_reader = BioCReader(bioc_file)
    bioc_reader.read()

    for one_document in bioc_reader.collection.documents:
        doc_id = one_document.id
        # Single-argument print() emits the same line on Python 2 and 3
        print('Document found %s' % (doc_id,))

        bioc_graph = bioc_document_to_graph(one_document)

        write_graph(bioc_graph, output_dot_dir, doc_id, options=options, args=args)
Exemple #6
0
 def _load(self, d):
     """Read every ``*.xml`` file in directory *d* into ``self.biocDocs``.

     Each file is read with BioCReader (given BIOC_DTD as the DTD file)
     and its collection is appended to ``self.biocDocs``, which is reset
     first. Progress goes to standard error depending on
     ``self.verbose``: any true value announces the directory and
     reports every 20th file, a value above 1 reports every file.

     Raises BioIDScoreError when a file cannot be read.
     """
     self.biocDocs = []
     if self.verbose:
         sys.stderr.write("Reading %s\n" % (d,))
     filesRead = 0
     for xmlFile in glob.glob(os.path.join(d, "*.xml")):
         try:
             r = BioCReader(xmlFile, dtd_valid_file=BIOC_DTD)
             r.read()
         except Exception as e:
             # Any reader failure is reported as a scoring error
             raise BioIDScoreError(
                 "encountered error reading collection: %s" % str(e))
         # The collection contains a document for each caption.
         # Each caption contains a single passage.
         self.biocDocs.append(r.collection)
         filesRead += 1
         if self.verbose > 1:
             sys.stderr.write("Read %s\n" % (xmlFile,))
         elif self.verbose and (filesRead % 20 == 0):
             sys.stderr.write("... %s\n" % (filesRead,))
 def __init__(self, bioc_file_path, options=None, args=None):
     """Read *bioc_file_path* and derive the document lookups from it.

     options and args are passed on to get_documents() unchanged.
     """
     self.bioc_reader = BioCReader(bioc_file_path)
     self.bioc_reader.read()

     # Keep the raw collection and its documents for later access
     collection = self.bioc_reader.collection
     self.raw_collection = collection
     self.raw_documents = collection.documents

     # Order matters: each step relies on the attribute set before it
     self.document_list = self.get_documents(options=options, args=args)
     self.id_list = self.get_ids()

     # From here on the instance attribute shadows the method of the
     # same name
     self.pmid_abstracts_dict = self.pmid_abstracts_dict()
    # PostgreSQL connection settings; only the database name comes from the
    # command-line options (options.d).
    # NOTE(review): user and password are written inline in the connection
    # string below, so postgres_user / postgres_password are not used in the
    # lines visible here - confirm and consider moving credentials out of
    # the source.
    postgres_user       = "******"
    postgres_password   = "******"
    postgres_host       = "localhost" 
    postgres_port       = "5432"
    postgres_db         = options.d
    connection = psycopg2.connect("dbname='"+postgres_db+"' user='******' host='"+postgres_host+"' password='******' port='"+postgres_port+"'")
    # DictCursor is requested as the cursor factory
    cursor = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # keep the file names from the command-line options in local variables
    input_file  = options.i
    dtd_file    = options.b
    output_file = options.o

    # open and read the input file, passing the DTD file to the reader
    # NOTE(review): the bare except turns every failure (including e.g. a
    # missing input file) into the "empty passage" message below - consider
    # narrowing it.
    try:
        bioc_reader = BioCReader(input_file, dtd_valid_file=dtd_file)
        bioc_reader.read()
    except:
        ## debug: re-raise here to see the original error
        #raise
        sys.exit("Probably, your input file contains an empty passage. Maybe one of the PubMed-IDs does not have an abstract. Please, remove empty passage and document tags. No output file was written.")

    # the elements <date> and <key> will not be changed or updated by this script (it only adds (MeSH) annotations)
    # writer for the output file
    bioc_writer = BioCWriter(output_file)
    # the writer works on the collection that was just read
    bioc_writer.collection = bioc_reader.collection
    # get documents (one PubMed-ID with title and text equals one document)
    docs = bioc_writer.collection.documents
    # different annotation IDs can be confusing - add a type to the iterating number
    annotation_type = "_MeSH"
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
    Copyright (c) 2015, Kersten Doering <*****@*****.**>

    This script reads annotations from a given XML document in BioC format and prints them to command-line.
"""

# PyBioC API
from bioc import BioCReader

# Open the BioC XML file; the DTD file holds the XML structure definitions
bioc_reader = BioCReader("text_PubTator.xml", dtd_valid_file="BioC.dtd")
# Read the file
bioc_reader.read()
# Documents of the BioC XML file (PubMed abstracts)
docs = bioc_reader.collection.documents

# Each print below uses a single pre-formatted argument so the script emits
# the same lines on Python 2 and Python 3.
for doc in docs:
    # Document ID (PubMed ID)
    print("PubMed ID: %s" % (doc.id,))
    # Passages are the PubMed titles and abstracts
    for passage in doc.passages:
        # Passage type
        print("Text type: %s" % (passage.infons['type'],))
        # Show id, type and text of every annotation of the passage
        for annotation in passage.annotations:
            print("Annotation ID: %s" % (annotation.id,))
            print("Annotation Type: %s" % (annotation.infons['type'],))
            print("Annotation Text: %s" % (annotation.text,))
Exemple #10
0
    sys.exit(1)


# Without at least one input file there is nothing to validate
if len(sys.argv) < 2:
    Usage()

THIS_ROOT = os.path.dirname(os.path.abspath(__file__))
PKG_ROOT = os.path.dirname(THIS_ROOT)

INPUTS = sys.argv[1:]

# Put this script's directory first on the import path; the bioc imports
# below must stay after this line.
sys.path.insert(0, THIS_ROOT)

from bioc import BioCReader
from bioc import BioCWriter

# The DTD is looked up one directory above this script
dtd_file = os.path.join(PKG_ROOT, "BioC.dtd")
print("DTD is %s" % (dtd_file,))

validated = 0

# A file counts as validated when it can be read without an exception;
# failures are reported on standard error and do not stop the run.
for test_file in INPUTS:
    try:
        bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file)
        bioc_reader.read()
        validated += 1
    except Exception as e:
        sys.stderr.write("For %s : %s\n" % (test_file, str(e)))

print("%d of %d validated." % (validated, len(INPUTS)))
Exemple #11
0
 def parse(self):
     """Read ``self.filename`` with BioCReader and return its collection.

     BioC_DTD is passed to the reader as the DTD file.
     """
     reader = BioCReader(self.filename, dtd_valid_file=BioC_DTD)
     reader.read()
     return reader.collection
Exemple #12
0
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
    Copyright (c) 2015, Kersten Doering <*****@*****.**>

    This script reads annotations from a given XML document in BioC format and writes the annotations tab-separated to a CSV file.
"""

# PyBioC API
from bioc import BioCReader

# open BioC XML file and DTD file with XML structure definitions
bioc_reader = BioCReader("pancreatic_cancer_BioC_DNorm.xml",
                         dtd_valid_file="BioC_DNorm.dtd")
# read the XML file
bioc_reader.read()
# get documents from BioC XML file (PubMed abstracts)
docs = bioc_reader.collection.documents

# output file for the tab-separated annotations
# NOTE(review): opened without a with-block; confirm it is closed further
# down in the script.
out = open("DNorm_formatted.csv", "w")

# iterate over documents (each document ID is a PubMed ID)
for index, doc in enumerate(docs):
    # debug: show status - prints the running index for every 1,000th document
    if index % 1000 == 0:
        print index
    # iterate over passages (titles and texts)
    for passage in doc.passages:
        # iterate over annotations for each passage and write them to file
        for annotation in passage.annotations:
 def parse(self):
     """Return the BioC collection stored in ``self.filename``.

     The file is read through a BioCReader that is given BioC_DTD as its
     DTD file.
     """
     collection_reader = BioCReader(self.filename, dtd_valid_file=BioC_DTD)
     collection_reader.read()
     return collection_reader.collection
class BioCCollectionHandler(object):
    """Read a BioC collection file and export its documents.

    On construction the file is read with BioCReader and every document is
    wrapped in a BioCAbstractHandler. The handlers can be looked up by id
    and written out either as OG-XML files or as BioC text files.

    Attributes set by __init__:
        bioc_reader         -- the BioCReader used to read the file
        raw_collection      -- the collection produced by the reader
        raw_documents       -- the documents of that collection
        document_list       -- one BioCAbstractHandler per document
        id_list             -- the ids of the handlers, in document order
        pmid_abstracts_dict -- mapping of handler id to handler
    """

    def __init__(self, bioc_file_path, options=None, args=None):
        """Read *bioc_file_path* and build the document lookups.

        options and args are passed on to get_documents() unchanged.
        """
        self.bioc_reader = BioCReader(bioc_file_path)
        self.bioc_reader.read()

        self.raw_collection = self.bioc_reader.collection
        self.raw_documents = self.raw_collection.documents

        self.document_list = self.get_documents(options=options, args=args)
        self.id_list = self.get_ids()

        # Kept for compatibility: the instance attribute replaces the
        # method of the same name once it has been called here.
        self.pmid_abstracts_dict = self.pmid_abstracts_dict()

    def get_documents(self, options=None, args=None):
        """Return one BioCAbstractHandler per raw document.

        Raises Exception when options.filename is set, options.pmid is
        not, and the collection holds more than one document: a single
        file name cannot receive several documents.
        """
        document_list = [BioCAbstractHandler(one_document)
                         for one_document in self.raw_documents]

        try:
            ambiguous = (options.filename and len(document_list) > 1
                         and not options.pmid)
        except AttributeError:
            # options is None or lacks filename/pmid: nothing to check
            ambiguous = False

        if ambiguous:
            raise Exception('More than one document in BioC file. Remove --filename option!')

        return document_list

    def pmid_abstracts_dict(self):
        """Return a dict mapping each handler's id to the handler."""
        return {one_doc.id: one_doc for one_doc in self.document_list}

    def get_document(self, pmid):
        """Return the handler stored under *pmid* (KeyError if unknown)."""
        return self.pmid_abstracts_dict[pmid]

    def get_ids(self):
        """Return the ids of all handlers, in document order."""
        return [one_doc.id for one_doc in self.document_list]

    def _select_handlers(self, options):
        """Return (handlers, single) for the pmid named in *options*.

        With options.pmid set, only that handler is returned and single is
        True; otherwise all handlers are returned and single is False.
        Raises Exception when the requested pmid is not in the collection.
        """
        pmid = getattr(options, 'pmid', None)
        if not pmid:
            return list(self.pmid_abstracts_dict.values()), False
        try:
            return [self.pmid_abstracts_dict[pmid]], True
        except KeyError:
            raise Exception('Target Pubmed ID could not be found in BioC collection')

    def _resolve_output_path(self, output_dir, abstract_handler,
                             default_suffix, options=None, announce=False):
        """Return the output path for one handler.

        Without options.filename the path is
        output_dir/<handler id><default_suffix>. Otherwise the given file
        name is placed in output_dir; when the name already contains
        output_dir, only its last path component is used so the directory
        is not repeated. With announce set, that shortened name is printed.
        """
        filename = getattr(options, 'filename', None)
        if not filename:
            return output_dir + '/' + abstract_handler.id + default_suffix

        if output_dir in filename:
            # split() returns the whole name when it contains no '/', so
            # this also covers a bare file name
            filename = filename.split('/')[-1]
            if announce:
                print('FILENAME %s' % (filename,))

        return output_dir + '/' + filename

    def write_og_xml_files(self, output_dir, options=None, args=None):
        """Write OG-XML files into *output_dir* via OG_XMLWriter.

        With options.pmid only that document is written, otherwise all
        documents are. options.filename overrides the default
        '<id>_og.xml' name. Calling without options writes every document
        under its default name.

        Raises Exception when options.pmid is not in the collection.
        """
        handlers, single = self._select_handlers(options)
        for abstract_handler in handlers:
            output_path = self._resolve_output_path(
                output_dir, abstract_handler, '_og.xml',
                options=options, announce=single)
            og_writer = OG_XMLWriter(abstract_handler, output_path)
            og_writer.write()

    def write_bioc_xml_files(self, output_dir, options=None, args=None):
        """Write BioC text files into *output_dir*.

        With options.pmid only that document is written, otherwise all
        documents are. options.filename overrides the default '<id>.bioc'
        name. Calling without options writes every document under its
        default name.

        Raises Exception when options.pmid is not in the collection.
        """
        # A falsy options object is treated like no options at all
        options = options or None

        handlers, single = self._select_handlers(options)
        for abstract_handler in handlers:
            output_path = self._resolve_output_path(
                output_dir, abstract_handler, '.bioc',
                options=options, announce=single)
            abstract_handler.write_text_bioc(output_path)