Example #1
def test_BioCXMLDocumentReader():
    with open(file, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)
        collection = reader.get_collection_info()
        for document in reader:
            collection.add_document(document)
    assert_everything(collection)

    reader = bioc.BioCXMLDocumentReader(str(file))
    collection = reader.get_collection_info()
    for document in reader:
        collection.add_document(document)
    assert_everything(collection)
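
Both variants stream documents rather than loading the whole collection at once; a minimal sketch of the same pattern as a reusable helper (load_collection is a hypothetical name):

import bioc

def load_collection(path):
    # Read collection-level metadata first, then stream documents one at a time.
    with open(path, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)
        collection = reader.get_collection_info()
        for document in reader:
            collection.add_document(document)
    return collection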
Example #2
def bioc2txt(biocFilename, txtHandle, idFilter):
    with bioc.BioCXMLDocumentReader(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                for passage in biocDoc.passages:
                    txtHandle.write(passage.text)
                    txtHandle.write("\n\n")
Example #3
    def negative_instance_collection(self, text_spans):
        """
        This method will extract negative texts given negative texts spans
        of a full-length article. The extracted text will be sentence segmented
        using Textblob and stored into PMID.txt file. Each sentence has info:
        docid|paragraph_offset|section_info(e.g.Abstract)|text_location(e.g. main body)|text.

        :param text_spans: a dictionary of negative text_spans key: (docid,para_offset)| value: [(start,end)]
        :return:FileIO. *.txt files will be stored in :self.output_txt_path
        """
        files = glob.glob(self.bioc_document_xml_path)
        instances = list()
        for filename in files:
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:
                    docid = doc.id
                    section_info = 'paper_title'
                    for passage in doc.passages:
                        section_info = self.resolve_section_info(
                            passage, section_info)
                        if passage.infons['type'] == 'title_1':
                            continue  # section info (e.g. abstract, introduction, results)

                        negative_text_spans = text_spans.get(
                            (docid, passage.offset))
                        negative_text = self.text_span_to_text(
                            passage.text, negative_text_spans)

                        instance = self.form_instance(docid, passage,
                                                      section_info,
                                                      negative_text)
                        instances.append(instance)
        return instances
Example #4
    def bioc2txt(self):
        """
        convert bioc GOA-evidence to string

        :return FileIO
        """
        files = glob.glob(self.bioc_annotation_xml_path)

        for filename in tqdm(files):
            visited = defaultdict(list)
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:
                    docid = doc.id  # extract document id (PMID)
                    for passage in doc.passages:
                        text_type = self.resolve_text_type(passage)
                        passage_offset = passage.offset  # extract passage_offset
                        for annotation in passage.annotations:
                            instance = self.bioc2instance(
                                docid, passage_offset, annotation)

                            if not instance:  # not a GO annotation
                                continue
                            if instance.goa.go_id in visited[
                                    instance.evidence.text]:
                                continue  # duplicate annotation
                            # visited is a defaultdict(list), so append directly
                            visited[instance.evidence.text].append(
                                instance.goa.go_id)
                            instance.evidence.text_type = text_type
                            file_path = self.output_txt_path + docid + '.txt'
                            self.save_positive_to_txt(instance,
                                                      file_path,
                                                      appending=True)
        return True
Example #5
    def resolve_positive_text_spans_index(self, annotation_path):
        """
        This method will extract BC4GO annotations.xml file;
        the annotations will be transformed into a dictionary structure

        :param: annotation_path: file directory of bc4go xml annotation files
        :return: hashmap structure: (key)docid, passage_id
                                    (value)list of merged evidence text_spans
        """
        positive_spans_index = dict()
        annotation_files = glob.glob(annotation_path)
        print("loading {} xml files from: {}".format(
            str(len(annotation_files)), annotation_path))
        for filename in tqdm(annotation_files):
            with bioc.BioCXMLDocumentReader(filename) as collections:
                for doc in collections:  # in BC4GO, only one doc per collection
                    docid = doc.id
                    for passage in doc.passages:
                        doc_spans = list()
                        for annotation in passage.annotations:
                            try:
                                positive_spans = self.parse_bioc_text_span(
                                    passage.offset, annotation)
                            except Exception:  # the current annotation does not contain text span info
                                continue
                            doc_spans += positive_spans
                        positive_spans_index[(
                            docid, passage.offset
                        )] = self.merge_overlapping_text_span(
                            sorted(list(set(doc_spans)), key=lambda x: x[0]))
        print("Found {} TPs (merged)".format(
            str(len(positive_spans_index.values()))))
        return positive_spans_index
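
merge_overlapping_text_span is not shown here; a minimal sketch of the interval merge it presumably performs, assuming spans are (start, end) tuples already sorted by start (as in the sorted(...) call above):

def merge_overlapping_text_span(spans):
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous span: extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged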
Example #6
def process_file(input_filename, concept_ID, converter, term_counter):
    reader = bioc.BioCXMLDocumentReader(input_filename)
    for document in reader:
        for passage in document.passages:
            for annotation in passage.annotations:
                identifier = annotation.infons.get(
                    "identifier") or annotation.infons.get("Identifier")
                if identifier != concept_ID:
                    continue
                converted_text = converter(annotation.text)
                term_counter[converted_text][annotation.text] += 1
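
term_counter must support two levels of counting; a hypothetical setup using a defaultdict of Counters (the file name, concept id, and converter are assumptions):

from collections import Counter, defaultdict

# Maps converted term -> Counter of raw surface forms.
term_counter = defaultdict(Counter)
process_file('annotated.bioc.xml', 'MESH:D003920', str.lower, term_counter)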
Example #7
def mergeBioc(inDir, outBioc):
    inBiocs = sorted([
        os.path.join(inDir, filename) for filename in os.listdir(inDir)
        if filename.lower().endswith('.xml')
        and not filename.lower().endswith('.ga.xml')
    ])

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for inBioc in inBiocs:
            with open(inBioc, 'rb') as f:
                parser = bioc.BioCXMLDocumentReader(f)
                for doc in parser:
                    writer.write_document(doc)
Example #8
def mergeBiocWithMetadata(metaDir, inDir, outBioc):
    filenames = sorted([
        filename for filename in os.listdir(inDir)
        if filename.lower().endswith('.xml')
        and not filename.lower().endswith('.ga.xml')
    ])

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for filename in filenames:
            inBioc = os.path.join(inDir, filename)
            metaBioc = os.path.join(metaDir, filename)

            with open(inBioc, 'rb') as f1, open(metaBioc, 'rb') as f2:
                inParser = bioc.BioCXMLDocumentReader(f1)
                metaParser = bioc.BioCXMLDocumentReader(f2)

                for inDoc, metaDoc in zip(inParser, metaParser):
                    assert len(inDoc.passages) == len(metaDoc.passages)
                    for inP, metaP in zip(inDoc.passages, metaDoc.passages):
                        assert inP.text == metaP.text
                        inP.infons.update(metaP.infons)

                    inDoc.infons.update(metaDoc.infons)
                    writer.write_document(inDoc)
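
A hypothetical call, assuming metadata/ and documents/ contain identically named BioC files whose passages line up:

mergeBiocWithMetadata('metadata/', 'documents/', 'merged.bioc.xml')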
Example #9
def splitBioc(inBioc, outDir, maxLength, stripAnnotations=False):
    assert os.path.isfile(inBioc)
    assert os.path.isdir(outDir)

    pmids = set()

    textLength = 0
    docNumber = 0
    docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
    writer = bioc.BioCXMLDocumentWriter(docName)
    with open(inBioc, 'rb') as f:
        parser = bioc.BioCXMLDocumentReader(f)
        for i, doc in enumerate(parser):
            if 'pmid' in doc.infons:
                if doc.infons['pmid'] in pmids:
                    continue
                pmids.add(doc.infons['pmid'])

            thisDocLength = sum(len(passage.text) for passage in doc.passages)

            assert len(doc.passages) > 0 and thisDocLength > 0, \
                "Corpus file cannot contain empty documents"

            if stripAnnotations:
                for passage in doc.passages:
                    passage.annotations = []
                    passage.relations = []

            if textLength > 0 and maxLength and (textLength +
                                                 thisDocLength) > maxLength:
                textLength = 0
                docNumber += 1
                docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
                writer.close()
                writer = bioc.BioCXMLDocumentWriter(docName)

            textLength += thisDocLength

            writer.write_document(doc)

    writer.close()
    if textLength == 0:
        os.remove(docName)
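
A hypothetical call splitting a corpus into chunks of roughly 10 MB of passage text, dropping annotations (the output directory must already exist):

splitBioc('corpus.bioc.xml', 'chunks/', maxLength=10_000_000, stripAnnotations=True)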
Example #10
def parse_dataset(filename):
    ids = []
    titles = []
    abstracts = []
    labels = []

    with bioc.BioCXMLDocumentReader(filename) as reader:
        collection_info = reader.get_collection_info()
        for document in reader:
            ids.append(document.id)
            relevant = document.infons['relevant']
            labels.append(0 if relevant == 'no' else 1)
            titles.append(document.passages[0].text)
            try:
                abstracts.append(document.passages[1].text)
            except IndexError:
                abstracts.append('')

        return ids, titles, abstracts, labels
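
A hypothetical usage; the four returned lists are parallel, one entry per document:

ids, titles, abstracts, labels = parse_dataset('training.bioc.xml')
assert len(ids) == len(titles) == len(abstracts) == len(labels)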
Example #11
def iterLoad(dataFormat, path, corpusSizeCutoff=500):
    """
    Iteratively load sections of a (presumably large) corpus. This creates a
    generator that yields kindred.Corpus objects that are subsets of the larger
    corpus, lowering memory requirements so that the entire file doesn't need
    to be loaded into memory at one time.

    :param dataFormat: Format of the data files to load (only 'biocxml' is currently supported)
    :param path: Path to data. Can be a directory or an individual file (for bioc, json or simpletag)
    :param corpusSizeCutoff: Approximate maximum number of documents in each corpus subset
    :type dataFormat: str
    :type path: str
    :type corpusSizeCutoff: int
    :return: Subsets of the BioC file
    :rtype: A kindred.Corpus generator
    """
    assert dataFormat == 'biocxml'

    corpus = kindred.Corpus()

    if os.path.isdir(path):
        filenames = [
            os.path.join(path, x) for x in os.listdir(path)
            if x.endswith('bioc.xml')
        ]
    else:
        filenames = [path]

    for filename in filenames:
        with open(filename, 'rb') as f:
            parser = bioc.BioCXMLDocumentReader(f)
            for document in parser:
                if len(corpus.documents) >= corpusSizeCutoff:
                    yield corpus
                    corpus = kindred.Corpus()
                kindredDocs = convertBiocDocToKindredDocs(document)
                for kindredDoc in kindredDocs:
                    corpus.addDocument(kindredDoc)

    if len(corpus.documents) > 0:
        yield corpus
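
A hypothetical consumer of the generator, processing one bounded subset at a time:

for corpusSubset in iterLoad('biocxml', 'corpus.bioc.xml', corpusSizeCutoff=500):
    print(len(corpusSubset.documents), 'documents in this subset')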
Example #12
    def build_vocab_from_raw_data(self, filename, write_down=True):
        ret = dict()

        with open(filename, 'rb') as fp:
            reader = bioc.BioCXMLDocumentReader(fp)
            collection_info = reader.get_collection_info()
            for document in reader:
                print("document: ", document)
                for passage in document.passages:
                    assert (len(passage.relations) == 0)
                    dependency = gen_dependency_tree(passage.text)
                    for token in dependency:
                        ret[normalize(token.text)] = 1

        # Assign each token a 1-based index in insertion order.
        for cnt, key in enumerate(ret, start=1):
            ret[key] = cnt

        if write_down:
            write_dict_down(ret, self.DEFAULT_VOCAB_PATH)

        return ret
Example #13
# from bioc import BioCReader
# from bioc import BioCWriter

INPUT_FILE = "data/chemdner_corpus/training.bioc.xml"
DTD_FILE = "data/chemdner_corpus/BioC.dtd"

# bioc_reader = BioCReader(INPUT_FILE, DTD_FILE)
# bioc_reader.read()

import bioc

with bioc.BioCXMLDocumentReader(INPUT_FILE) as reader:
    collection_info = reader.get_collection_info()
    for document in reader:
        print(document)
Example #14
# Truncated helper: the original def line was cut off. A plausible reconstruction
# (the name makeWordBoundaryPattern is a guess) wraps a mention in regex word boundaries:
def makeWordBoundaryPattern(m):
	return '\\b%s\\b' % m

if __name__ == '__main__':
	parser = argparse.ArgumentParser('Text align PubTator annotations against a BioC file')
	parser.add_argument('--inBioc',required=True,type=str,help='Input BioC file')
	parser.add_argument('--annotations',required=True,type=str,help='Pre-pickled annotations')
	parser.add_argument('--outBioc',required=True,type=str,help='Output BioC file')
	args = parser.parse_args()

	pmids = set()

	print("Loaded PMIDs from corpus file...")

	#with bioc.BioCXMLDocumentReader(args.inBioc) as parser:
	with open(args.inBioc,'rb') as f:
		parser = bioc.BioCXMLDocumentReader(f)
		for i,doc in enumerate(parser):
			if 'pmid' in doc.infons and doc.infons['pmid']:
				pmid = int(doc.infons['pmid'])
				pmids.add(pmid)


	print("Finding relevant annotations for PubMed IDs...")
	
	pmidToAnnotations = defaultdict(list)
	with open(args.annotations) as f:
		for line in f:
			split = line.strip('\n').split('\t')
			pmid,annotationType,conceptid,mentions,database = split
			mentions = mentions.strip()
			pmid = int(pmid)
Example #15
def biocxml2bioc(biocFilename):
    with open(biocFilename, 'rb') as f:
        parser = bioc.BioCXMLDocumentReader(f)
        for biocDoc in parser:
            yield biocDoc
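
Because biocxml2bioc is a generator, it can be chained straight into a writer; a hypothetical round-trip copy:

with bioc.BioCXMLDocumentWriter('copy.bioc.xml') as writer:
    for doc in biocxml2bioc('original.bioc.xml'):
        writer.write_document(doc)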
Example #16
def build_inter_sentence_docs_from_file(path, limit=None):
    """
    Args:
        path: a string denoting a path

    Returns:
        A list of dicts. One dict should contain the following keys:
            - "doc": spacy docs
            - "Disease": a dict containing diseases as keys. The value of each key is
                a list containing one pair of starting end ending token index
            - "Chemical": the same as "disease"
            - "relation": a set of pairs (chemical, disease) denoting one CID relation
    """
    ret = list()
    with open(path, 'rb') as fp:
        reader = bioc.BioCXMLDocumentReader(fp)

        for document in reader:
            current_dict = dict()
            current_dict['Disease'] = dict()
            current_dict['Chemical'] = dict()

            for passage in document.passages:

                # if passage.infons.get("type") == "title": TODO: ask for elaboration
                #     continue

                assert len(passage.relations) == 0
                doc = gen_dependency_tree(passage.text)
                current_dict['doc'] = doc

                for annotation in passage.annotations:
                    MESH = annotation.infons.get("MESH")
                    typ = annotation.infons.get("type")
                    if MESH not in current_dict[typ]:
                        current_dict[typ][MESH] = list()

                    # convert character location to token location
                    converted_loc = list()
                    for location in annotation.locations:
                        left = -1
                        right = -1

                        for i, token in enumerate(doc):
                            if location.offset - passage.offset >= token.idx and \
                                    location.offset + location.length - passage.offset <= token.idx + len(token):
                                # In case entity is within token. e.g. "carcinogenic" in "Co-carcinogenic"
                                left = i
                                right = i
                                break

                            if token.idx >= location.offset - passage.offset and left == -1:
                                left = i
                            if token.idx < location.offset + location.length - passage.offset:
                                right = i
                        converted_loc.append((left, right))
                    current_dict[typ][MESH] += converted_loc

            current_dict["relation"] = set()
            for relation in document.relations:
                assert len(relation.nodes) == 0
                assert relation.infons.get("relation") == "CID"
                current_dict["relation"].add((relation.infons.get("Chemical"),
                                              relation.infons.get("Disease")))
            ret.append(current_dict)
            if limit is not None and len(ret) == limit:
                return ret

    return ret
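
A hypothetical call, limiting the build to the first ten documents of an assumed CDR-style corpus file:

docs = build_inter_sentence_docs_from_file('cdr_corpus.bioc.xml', limit=10)
print(docs[0]['relation'])  # set of (chemical, disease) identifier pairs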
Example #17
def mergeBioc(biocFilename, outBiocWriter, idFilter):
    with bioc.BioCXMLDocumentReader(biocFilename) as parser:
        for biocDoc in parser:
            if idFilter is None or biocDoc.id in idFilter:
                outBiocWriter.write_document(biocDoc)
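
A hypothetical merge of several corpora into one filtered output file:

keep = {'12345', '67890'}  # hypothetical document ids to keep
with bioc.BioCXMLDocumentWriter('filtered.bioc.xml') as writer:
    for name in ['a.bioc.xml', 'b.bioc.xml']:
        mergeBioc(name, writer, keep)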
Example #18
    # (Opening lines reconstructed from Example #14, which uses the same arguments.)
    parser = argparse.ArgumentParser('Text align PubTator annotations against a BioC file')
    parser.add_argument('--inBioc',
                        required=True,
                        type=str,
                        help='Input BioC file')
    parser.add_argument('--annotations',
                        required=True,
                        type=str,
                        help='Pre-pickled annotations')
    parser.add_argument('--outBioc',
                        required=True,
                        type=str,
                        help='Output BioC file')
    args = parser.parse_args()

    pmids = set()

    with bioc.BioCXMLDocumentReader(args.inBioc) as parser:
        for i, doc in enumerate(parser):
            if 'pmid' in doc.infons and doc.infons['pmid'] != 'None':
                pmid = int(doc.infons['pmid'])
                pmids.add(pmid)

    pmidToAnnotations = defaultdict(list)
    with open(args.annotations) as f:
        for line in f:
            split = line.strip('\n').split('\t')
            pmid, annotationType, conceptid, mentions, database = split
            mentions = mentions.strip()
            pmid = int(pmid)
            if len(mentions) > 0 and pmid in pmids:
                pmidToAnnotations[pmid].append(
                    (annotationType, conceptid, mentions))