Example #1
def test_BioCXMLDocumentReader():
    with open(file, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)
        collection = reader.get_collection_info()
        for document in reader:
            collection.add_document(document)
    assert_everything(collection)

    reader = bioc.BioCXMLDocumentReader(str(file))
    collection = reader.get_collection_info()
    for document in reader:
        collection.add_document(document)
    assert_everything(collection)
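
Both variants stream documents rather than loading the whole collection at once; a minimal sketch of the same pattern as a reusable helper (load_collection is a hypothetical name):

import bioc

def load_collection(path):
    # Read collection-level metadata first, then stream documents one at a time.
    with open(path, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)
        collection = reader.get_collection_info()
        for document in reader:
            collection.add_document(document)
    return collection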
Example #2
def bioc2txt(biocFilename, txtHandle, idFilter):
    with bioc.BioCXMLDocumentReader(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                for passage in biocDoc.passages:
                    txtHandle.write(passage.text)
                    txtHandle.write("\n\n")
Example #3
    def negative_instance_collection(self, text_spans):
        """
        This method will extract negative texts given negative texts spans
        of a full-length article. The extracted text will be sentence segmented
        using Textblob and stored into PMID.txt file. Each sentence has info:
        docid|paragraph_offset|section_info(e.g.Abstract)|text_location(e.g. main body)|text.

        :param text_spans: a dictionary of negative text_spans key: (docid,para_offset)| value: [(start,end)]
        :return:FileIO. *.txt files will be stored in :self.output_txt_path
        """
        files = glob.glob(self.bioc_document_xml_path)
        instances = list()
        for filename in files:
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:
                    docid = doc.id
                    section_info = 'paper_title'
                    for passage in doc.passages:
                        section_info = self.resolve_section_info(
                            passage, section_info)
                        if passage.infons['type'] == 'title_1':
                            continue  # section info (e.g. abstract, introduction, results)

                        negative_text_spans = text_spans.get(
                            (docid, passage.offset))
                        negative_text = self.text_span_to_text(
                            passage.text, negative_text_spans)

                        instance = self.form_instance(docid, passage,
                                                      section_info,
                                                      negative_text)
                        instances.append(instance)
        return instances
Example #4
    def bioc2txt(self):
        """
        convert bioc GOA-evidence to string

        :return FileIO
        """
        files = glob.glob(self.bioc_annotation_xml_path)

        for filename in tqdm(files):
            visited = defaultdict(list)
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:
                    docid = doc.id  # extract document id (PMID)
                    for passage in doc.passages:
                        text_type = self.resolve_text_type(passage)
                        passage_offset = passage.offset  # extract passage_offset
                        for annotation in passage.annotations:
                            instance = self.bioc2instance(
                                docid, passage_offset, annotation)

                            if not instance:  # not a GO annotation
                                continue
                            if instance.goa.go_id in visited[
                                    instance.evidence.text]:
                                continue  # duplicate annotation
                            # visited is a defaultdict(list), so append directly
                            visited[instance.evidence.text].append(
                                instance.goa.go_id)
                            instance.evidence.text_type = text_type
                            file_path = self.output_txt_path + docid + '.txt'
                            self.save_positive_to_txt(instance,
                                                      file_path,
                                                      appending=True)
        return True
Example #5
    def resolve_positive_text_spans_index(self, annotation_path):
        """
        This method will extract BC4GO annotations.xml file;
        the annotations will be transformed into a dictionary structure

        :param: annotation_path: file directory of bc4go xml annotation files
        :return: hashmap structure: (key)docid, passage_id
                                    (value)list of merged evidence text_spans
        """
        positive_spans_index = dict()
        annotation_files = glob.glob(annotation_path)
        print("loading {} xml files from: {}".format(
            str(len(annotation_files)), annotation_path))
        for filename in tqdm(annotation_files):
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:  # in BC4GO, only one doc per collection
                    docid = doc.id
                    for passage in doc.passages:
                        doc_spans = list()
                        for annotation in passage.annotations:
                            try:
                                positive_spans = self.parse_bioc_text_span(
                                    passage.offset, annotation)
                            except Exception:  # the current annotation does not contain text span info
                                continue
                            doc_spans += positive_spans
                        positive_spans_index[(
                            docid, passage.offset
                        )] = self.merge_overlapping_text_span(
                            sorted(list(set(doc_spans)), key=lambda x: x[0]))
        print("Found {} TPs (merged)".format(
            str(len(positive_spans_index.values()))))
        return positive_spans_index
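
merge_overlapping_text_span is not shown here; a minimal sketch of the interval merge it presumably performs, assuming spans are (start, end) tuples already sorted by start (as in the sorted(...) call above):

def merge_overlapping_text_span(spans):
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous span: extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged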
Example #6
def process_file(input_filename, concept_ID, converter, term_counter):
    reader = bioc.BioCXMLDocumentReader(input_filename)
    for document in reader:
        for passage in document.passages:
            for annotation in passage.annotations:
                identifier = annotation.infons.get(
                    "identifier") or annotation.infons.get("Identifier")
                if identifier != concept_ID:
                    continue
                converted_text = converter(annotation.text)
                term_counter[converted_text][annotation.text] += 1
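
term_counter must support two levels of counting; a hypothetical setup using a defaultdict of Counters (the file name, concept id, and converter are assumptions):

from collections import Counter, defaultdict

# Maps converted term -> Counter of raw surface forms.
term_counter = defaultdict(Counter)
process_file('annotated.bioc.xml', 'MESH:D003920', str.lower, term_counter)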
Example #7
def mergeBioc(inDir, outBioc):
    inBiocs = sorted([
        os.path.join(inDir, filename) for filename in os.listdir(inDir)
        if filename.lower().endswith('.xml')
        and not filename.lower().endswith('.ga.xml')
    ])

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for inBioc in inBiocs:
            with open(inBioc, 'rb') as f:
                parser = bioc.BioCXMLDocumentReader(f)
                for doc in parser:
                    writer.write_document(doc)
Example #8
def mergeBiocWithMetadata(metaDir, inDir, outBioc):
    filenames = sorted([
        filename for filename in os.listdir(inDir)
        if filename.lower().endswith('.xml')
        and not filename.lower().endswith('.ga.xml')
    ])

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for filename in filenames:
            inBioc = os.path.join(inDir, filename)
            metaBioc = os.path.join(metaDir, filename)

            with open(inBioc, 'rb') as f1, open(metaBioc, 'rb') as f2:
                inParser = bioc.BioCXMLDocumentReader(f1)
                metaParser = bioc.BioCXMLDocumentReader(f2)

                for inDoc, metaDoc in zip(inParser, metaParser):
                    assert len(inDoc.passages) == len(metaDoc.passages)
                    for inP, metaP in zip(inDoc.passages, metaDoc.passages):
                        assert inP.text == metaP.text
                        inP.infons.update(metaP.infons)

                    inDoc.infons.update(metaDoc.infons)
                    writer.write_document(inDoc)
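
A hypothetical call, assuming metadata/ and documents/ contain identically named BioC files whose passages line up:

mergeBiocWithMetadata('metadata/', 'documents/', 'merged.bioc.xml')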
Example #9
def splitBioc(inBioc, outDir, maxLength, stripAnnotations=False):
    assert os.path.isfile(inBioc)
    assert os.path.isdir(outDir)

    pmids = set()

    textLength = 0
    docNumber = 0
    docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
    writer = bioc.BioCXMLDocumentWriter(docName)
    with open(inBioc, 'rb') as f:
        parser = bioc.BioCXMLDocumentReader(f)
        for i, doc in enumerate(parser):
            if 'pmid' in doc.infons:
                if doc.infons['pmid'] in pmids:
                    continue
                pmids.add(doc.infons['pmid'])

            thisDocLength = sum(len(passage.text) for passage in doc.passages)

            assert len(doc.passages) > 0 and thisDocLength > 0, \
                "Corpus file cannot contain empty documents"

            if stripAnnotations:
                for passage in doc.passages:
                    passage.annotations = []
                    passage.relations = []

            if textLength > 0 and maxLength and (textLength +
                                                 thisDocLength) > maxLength:
                textLength = 0
                docNumber += 1
                docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
                writer.close()
                writer = bioc.BioCXMLDocumentWriter(docName)

            textLength += thisDocLength

            writer.write_document(doc)

    writer.close()
    if textLength == 0:
        os.remove(docName)
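
A hypothetical call splitting a corpus into chunks of roughly 10 MB of passage text, dropping annotations (the output directory must already exist):

splitBioc('corpus.bioc.xml', 'chunks/', maxLength=10_000_000, stripAnnotations=True)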
Example #10
def parse_dataset(filename):
    ids = []
    titles = []
    abstracts = []
    labels = []

    with bioc.BioCXMLDocumentReader(filename) as reader:
        collection_info = reader.get_collection_info()
        for document in reader:
            ids.append(document.id)
            relevant = document.infons['relevant']
            labels.append(0 if relevant == 'no' else 1)
            titles.append(document.passages[0].text)
            try:
                abstracts.append(document.passages[1].text)
            except IndexError:
                abstracts.append('')

        return ids, titles, abstracts, labels
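
A hypothetical usage; the four returned lists are parallel, one entry per document:

ids, titles, abstracts, labels = parse_dataset('training.bioc.xml')
assert len(ids) == len(titles) == len(abstracts) == len(labels)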
Example #11
def iterLoad(dataFormat, path, corpusSizeCutoff=500):
    """
    Iteratively load sections of a (presumably large) corpus. This creates a
    generator that yields kindred.Corpus objects that are subsets of the larger
    corpus, lowering memory requirements so that the entire file doesn't need
    to be loaded into memory at one time.

    :param dataFormat: Format of the data files to load (only 'biocxml' is currently supported)
    :param path: Path to data. Can be a directory or an individual file (for bioc, json or simpletag)
    :param corpusSizeCutoff: Approximate maximum number of documents in each corpus subset
    :type dataFormat: str
    :type path: str
    :type corpusSizeCutoff: int
    :return: Subsets of the BioC file
    :rtype: A kindred.Corpus generator
    """
    assert dataFormat == 'biocxml'

    corpus = kindred.Corpus()

    if os.path.isdir(path):
        filenames = [
            os.path.join(path, x) for x in os.listdir(path)
            if x.endswith('bioc.xml')
        ]
    else:
        filenames = [path]

    for filename in filenames:
        with open(filename, 'rb') as f:
            parser = bioc.BioCXMLDocumentReader(f)
            for document in parser:
                if len(corpus.documents) >= corpusSizeCutoff:
                    yield corpus
                    corpus = kindred.Corpus()
                kindredDocs = convertBiocDocToKindredDocs(document)
                for kindredDoc in kindredDocs:
                    corpus.addDocument(kindredDoc)

    if len(corpus.documents) > 0:
        yield corpus
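
A hypothetical consumer of the generator, processing one bounded subset at a time:

for corpusSubset in iterLoad('biocxml', 'corpus.bioc.xml', corpusSizeCutoff=500):
    print(len(corpusSubset.documents), 'documents in this subset')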
Example #12
    def build_vocab_from_raw_data(self, filename, write_down=True):
        ret = dict()

        with open(filename, 'rb') as fp:
            reader = bioc.BioCXMLDocumentReader(fp)
            collection_info = reader.get_collection_info()
            for document in reader:
                print("document: ", document)
                for passage in document.passages:
                    assert (len(passage.relations) == 0)
                    dependency = gen_dependency_tree(passage.text)
                    for token in dependency:
                        ret[normalize(token.text)] = 1

        # Assign each token a 1-based index in insertion order.
        for cnt, key in enumerate(ret, start=1):
            ret[key] = cnt

        if write_down:
            write_dict_down(ret, self.DEFAULT_VOCAB_PATH)

        return ret
Example #13
# from bioc import BioCReader
# from bioc import BioCWriter

INPUT_FILE = "data/chemdner_corpus/training.bioc.xml"
DTD_FILE = "data/chemdner_corpus/BioC.dtd"

# bioc_reader = BioCReader(INPUT_FILE, DTD_FILE)
# bioc_reader.read()

import bioc

with bioc.BioCXMLDocumentReader(INPUT_FILE) as reader:
    collection_info = reader.get_collection_info()
    for document in reader:
        print(document)
Example #14
# Truncated helper: the original def line was cut off. A plausible reconstruction
# (the name makeWordBoundaryPattern is a guess) wraps a mention in regex word boundaries:
def makeWordBoundaryPattern(m):
	return '\\b%s\\b' % m

if __name__ == '__main__':
	parser = argparse.ArgumentParser('Text align PubTator annotations against a BioC file')
	parser.add_argument('--inBioc',required=True,type=str,help='Input BioC file')
	parser.add_argument('--annotations',required=True,type=str,help='Pre-pickled annotations')
	parser.add_argument('--outBioc',required=True,type=str,help='Output BioC file')
	args = parser.parse_args()

	pmids = set()

	print("Loaded PMIDs from corpus file...")

	#with bioc.BioCXMLDocumentReader(args.inBioc) as parser:
	with open(args.inBioc,'rb') as f:
		parser = bioc.BioCXMLDocumentReader(f)
		for i,doc in enumerate(parser):
			if 'pmid' in doc.infons and doc.infons['pmid']:
				pmid = int(doc.infons['pmid'])
				pmids.add(pmid)


	print("Finding relevant annotations for PubMed IDs...")
	
	pmidToAnnotations = defaultdict(list)
	with open(args.annotations) as f:
		for line in f:
			split = line.strip('\n').split('\t')
			pmid,annotationType,conceptid,mentions,database = split
			mentions = mentions.strip()
			pmid = int(pmid)
Example #15
def biocxml2bioc(biocFilename):
    with open(biocFilename, 'rb') as f:
        parser = bioc.BioCXMLDocumentReader(f)
        for biocDoc in parser:
            yield biocDoc
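
Because biocxml2bioc is a generator, it can be chained straight into a writer; a hypothetical round-trip copy:

with bioc.BioCXMLDocumentWriter('copy.bioc.xml') as writer:
    for doc in biocxml2bioc('original.bioc.xml'):
        writer.write_document(doc)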
Example #16
def build_inter_sentence_docs_from_file(path, limit=None):
    """
    Args:
        path: a string denoting a path

    Returns:
        A list of dicts. One dict should contain the following keys:
            - "doc": spacy docs
            - "Disease": a dict containing diseases as keys. The value of each key is
                a list containing one pair of starting end ending token index
            - "Chemical": the same as "disease"
            - "relation": a set of pairs (chemical, disease) denoting one CID relation
    """
    ret = list()
    with open(path, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)

        for document in reader:
            current_dict = dict()
            current_dict['Disease'] = dict()
            current_dict['Chemical'] = dict()

            for passage in document.passages:

                # if passage.infons.get("type") == "title": TODO: ask for elaboration
                #     continue

                assert len(passage.relations) == 0
                doc = gen_dependency_tree(passage.text)
                current_dict['doc'] = doc

                for annotation in passage.annotations:
                    MESH = annotation.infons.get("MESH")
                    typ = annotation.infons.get("type")
                    if MESH not in current_dict[typ]:
                        current_dict[typ][MESH] = list()

                    # convert character location to token location
                    converted_loc = list()
                    for location in annotation.locations:
                        left = -1
                        right = -1

                        for i, token in enumerate(doc):
                            if location.offset - passage.offset >= token.idx and \
                                    location.offset + location.length - passage.offset <= token.idx + len(token):
                                # In case entity is within token. e.g. "carcinogenic" in "Co-carcinogenic"
                                left = i
                                right = i
                                break

                            if token.idx >= location.offset - passage.offset and left == -1:
                                left = i
                            if token.idx < location.offset + location.length - passage.offset:
                                right = i
                        converted_loc.append((left, right))
                    current_dict[typ][MESH] += converted_loc

            current_dict["relation"] = set()
            for relation in document.relations:
                assert len(relation.nodes) == 0
                assert relation.infons.get("relation") == "CID"
                current_dict["relation"].add((relation.infons.get("Chemical"),
                                              relation.infons.get("Disease")))
            ret.append(current_dict)
            if limit is not None and len(ret) == limit:
                return ret

    return ret
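
A hypothetical call, limiting the build to the first ten documents of an assumed CDR-style corpus file:

docs = build_inter_sentence_docs_from_file('cdr_corpus.bioc.xml', limit=10)
print(docs[0]['relation'])  # set of (chemical, disease) identifier pairs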
Example #17
def mergeBioc(biocFilename, outBiocWriter, idFilter):
    with bioc.BioCXMLDocumentReader(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                outBiocWriter.write_document(biocDoc)
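
A hypothetical merge of several corpora into one filtered output file:

keep = {'12345', '67890'}  # hypothetical document ids to keep
with bioc.BioCXMLDocumentWriter('filtered.bioc.xml') as writer:
    for name in ['a.bioc.xml', 'b.bioc.xml']:
        mergeBioc(name, writer, keep)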
Example #18
    # (Opening lines reconstructed from Example #14, which uses the same arguments.)
    parser = argparse.ArgumentParser('Text align PubTator annotations against a BioC file')
    parser.add_argument('--inBioc',
                        required=True,
                        type=str,
                        help='Input BioC file')
    parser.add_argument('--annotations',
                        required=True,
                        type=str,
                        help='Pre-pickled annotations')
    parser.add_argument('--outBioc',
                        required=True,
                        type=str,
                        help='Output BioC file')
    args = parser.parse_args()

    pmids = set()

    with bioc.BioCXMLDocumentReader(args.inBioc) as parser:
        for i, doc in enumerate(parser):
            if 'pmid' in doc.infons and doc.infons['pmid'] != 'None':
                pmid = int(doc.infons['pmid'])
                pmids.add(pmid)

    pmidToAnnotations = defaultdict(list)
    with open(args.annotations) as f:
        for line in f:
            split = line.strip('\n').split('\t')
            pmid, annotationType, conceptid, mentions, database = split
            mentions = mentions.strip()
            pmid = int(pmid)
            if len(mentions) > 0 and pmid in pmids:
                pmidToAnnotations[pmid].append(
                    (annotationType, conceptid, mentions))