Example #1
0
def main():
    """Stem every token of a BioC file and attach the stems as annotations.

    The input path is the first command-line argument when one is given,
    otherwise ``BIOC_IN``.  The annotated collection is written to standard
    output and then saved through a ``BioCWriter`` created for ``BIOC_OUT``.
    """
    # Fall back to the default example file unless a path was passed in
    source_path = sys.argv[1] if len(sys.argv) >= 2 else BIOC_IN

    # Reader for the input document; it is handed the BioC DTD file
    reader = BioCReader(source_path, dtd_valid_file=DTD_FILE)
    # Writer that will emit the annotated data
    writer = BioCWriter(BIOC_OUT)
    # NLTK Porter stemmer that produces the stems
    porter = PorterStemmer()

    # Load the input, then let the writer work on the same collection
    reader.read()
    writer.collection = reader.collection

    # Running annotation number; it keeps counting across passages and
    # documents so every annotation id in the collection is distinct
    annotation_count = 0
    for document in writer.collection.documents:
        for passage in document:
            # Stem the whole passage first, then annotate in token order
            stemmed_tokens = [
                porter.stem(piece)
                for piece in wordpunct_tokenize(passage.text)
            ]
            # enumerate() resumes right after the last number handed out
            # and leaves annotation_count at the newest id
            for annotation_count, stemmed in enumerate(
                    stemmed_tokens, annotation_count + 1):
                # One annotation per token, holding the stem and marked
                # as the surface form of a 'stemmed token'
                annotation = BioCAnnotation()
                annotation.text = stemmed
                annotation.id = str(annotation_count)
                annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(annotation)

    # Dump the result to the screen without a trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml) ...
    sys.stdout.write(str(writer))
    # ... and write it to disk as well
    writer.write()
Example #2
0
def main():
    """Annotate each passage of a BioC document with its stemmed tokens.

    Reads the file named by the first command-line argument (``BIOC_IN``
    when no argument is supplied), adds one annotation per stemmed token to
    every passage, prints the resulting document to standard output and
    finally writes it out via the ``BioCWriter`` set up for ``BIOC_OUT``.
    """
    def build_annotation(number, stem):
        # Wrap a single stem in an annotation that carries the running
        # number as its id and is tagged as a 'stemmed token'.
        result = BioCAnnotation()
        result.text = stem
        result.id = str(number)
        result.put_infon('surface form', 'stemmed token')
        return result

    # A path given on the command line overrides the default input file
    extra_args = sys.argv[1:]
    input_file = extra_args[0] if extra_args else BIOC_IN

    # Set up the reader (with the BioC DTD file), the writer and the
    # NLTK Porter stemmer
    xml_reader = BioCReader(input_file, dtd_valid_file=DTD_FILE)
    xml_writer = BioCWriter(BIOC_OUT)
    porter_stemmer = PorterStemmer()

    # Parse the input and pass the parsed collection on to the writer
    xml_reader.read()
    xml_writer.collection = xml_reader.collection

    # Ids are numbered consecutively over the whole collection
    counter = 0
    for doc in xml_writer.collection.documents:
        for passage in doc:
            tokens = wordpunct_tokenize(passage.text)
            # All tokens are stemmed before any annotation is attached
            stems = [porter_stemmer.stem(tok) for tok in tokens]
            for stem in stems:
                counter += 1
                passage.add_annotation(build_annotation(counter, stem))

    # Show the document on screen (no trailing newline; the output can be
    # redirected into a file such as output_bioc.xml)
    sys.stdout.write(str(xml_writer))

    # Save the document to disk
    xml_writer.write()
Example #3
0
def main():
    """Round-trip a BioC file: read it, write it back out, and print it.

    ``test_file`` is loaded with a ``BioCReader`` that receives ``dtd_file``
    as its ``dtd_valid_file``.  The collection read is handed to a
    ``BioCWriter`` for ``output_bioc.xml``, written, and the writer itself
    is printed afterwards.
    """
    bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    bioc_reader.read()

    # Debugging aid -- list the sentence offsets of the first passage:
    #     first_passage = bioc_reader.collection.documents[0].passages[0]
    #     for sentence in first_passage.sentences:
    #         print(sentence.offset)

    bioc_writer = BioCWriter('output_bioc.xml')
    # The writer emits exactly the collection the reader produced
    bioc_writer.collection = bioc_reader.collection
    bioc_writer.write()
    print(bioc_writer)
Example #4
0
def main():
    """Read ``test_file`` as BioC and re-emit it unchanged.

    The reader is created with ``dtd_file`` as ``dtd_valid_file``.  Its
    collection is assigned to a ``BioCWriter`` targeting
    ``output_bioc.xml``; the writer is then written and printed.
    """
    bioc_reader = BioCReader(test_file, dtd_valid_file=dtd_file)
    bioc_reader.read()

    # To inspect what was read, the sentence offsets of the first passage
    # of the first document can be listed like this:
    #     passage = bioc_reader.collection.documents[0].passages[0]
    #     for sentence in passage.sentences:
    #         print(sentence.offset)

    bioc_writer = BioCWriter('output_bioc.xml')
    # No modification in between: the collection is passed on as read
    bioc_writer.collection = bioc_reader.collection
    bioc_writer.write()
    print(bioc_writer)
 def write_text_bioc(self, output_path):
     """Write the texts in ``self.abstract_dict`` as one BioC document.

     Each value of ``self.abstract_dict`` becomes the text of its own
     passage.  All passages are added to a single new document, which is
     placed in a new collection and written with a ``BioCWriter``.

     Args:
         output_path: Passed to ``BioCWriter``; reported on standard
             output as the BioC output path.
     """
     bioc_writer = BioCWriter(output_path)
     bioc_collection = BioCCollection()
     # TODO: offer a choice between writing text only and writing
     # annotations as well?  Keeping the document as it is would mean
     # adding self.document to the collection instead.
     bioc_document = BioCDocument()
     # Only the stored texts are needed, so iterate over the values
     # directly rather than looking each key up again.
     for passage_text in self.abstract_dict.values():
         bioc_passage = BioCPassage()
         bioc_passage.text = passage_text
         bioc_document.add_passage(bioc_passage)
     bioc_collection.add_document(bioc_document)

     # A single pre-formatted argument prints the same line whether
     # print is a statement (Python 2) or a function (Python 3).
     print('BioC output path %s' % (output_path,))
     bioc_writer.collection = bioc_collection
     bioc_writer.write()
                occurrences = []
                for mesh in mesh_terms:
                    #search for start and end positions with case insensitivity
                    #if there is no abstract (no passage.text) in the document, the regex search will result in an error - use try-except block
                    try: 
                        positions = [(a.start(), a.end()) for a in list(re.finditer(mesh.lower(), passage.text.lower()))]
                        if positions:
                            for tuples in positions:
                                #passage.text[tuples[0]:tuples[0]+len(mesh)] would return the identified term
                                 # the triple is (start position, term length, MeSH term)
                                occurrences.append((tuples[0], len(mesh), mesh))
                    except:
                            print doc.id, mesh, "Mesh term not found."
                # sorted by start positions
                occurrences.sort()
                for triple in occurrences:
                    # ToDo: add a condition for nested tags?
                    # add annotation with infon, location, and text
                    bioc_annotation = add_annotation(triple,str(annotation_id)+annotation_type)
                    passage.add_annotation(bioc_annotation)
                    # increment annotation_id
                    annotation_id += 1
    # write XML format to output file
    bioc_writer.write()
    # debug:
    #print(bioc_writer)

    # disconnect from database
    cursor.close()
    connection.close()
Example #7
0
                        positions = [(a.start(), a.end()) for a in list(
                            re.finditer(mesh.lower(), passage.text.lower()))]
                        if positions:
                            for tuples in positions:
                                #passage.text[tuples[0]:tuples[0]+len(mesh)] would return the identified term
                                # the triple is (start position, term length, MeSH term)
                                occurrences.append(
                                    (tuples[0], len(mesh), mesh))
                    except:
                        print doc.id, mesh, "Mesh terms not found."

                # sorted by start positions
                occurrences.sort()
                for triple in occurrences:
                    # ToDo: add a condition for nested tags?
                    # add annotation with infon, location, and text
                    bioc_annotation = add_annotation(
                        triple,
                        str(annotation_id) + annotation_type)
                    passage.add_annotation(bioc_annotation)
                    # increment annotation_id
                    annotation_id += 1
    # write XML format to output file
    bioc_writer.write()
    # debug:
    #print(bioc_writer)

    # disconnect from database
    cursor.close()
    connection.close()