Example #1
0
def convert(inFiles, inFormat, outFile, outFormat):
    """Convert a set of input files into one output file.

    Each input file is passed to ``docs2bioc(inFile, inFormat)`` and every
    document it produces is written to the output. For ``'biocxml'`` the
    documents go through a ``bioc.BioCXMLDocumentWriter``; for ``'txt'`` the
    text of each passage is written followed by a blank line.

    :param inFiles: Iterable of input files, handed one by one to docs2bioc
    :param inFormat: Input format, must be in ``acceptedInFormats``
    :param outFile: Destination handed to the writer / ``open``
    :param outFormat: Output format, must be in ``acceptedOutFormats``
    :raises AssertionError: If either format is not accepted
    """
    assert inFormat in acceptedInFormats, "%s is not an accepted input format. Options are: %s" % (
        inFormat, "/".join(acceptedInFormats))
    assert outFormat in acceptedOutFormats, "%s is not an accepted output format. Options are: %s" % (
        outFormat, "/".join(acceptedOutFormats))

    outBiocHandle, outTxtHandle = None, None
    if outFormat == 'biocxml':
        outBiocHandle = bioc.BioCXMLDocumentWriter(outFile)
    elif outFormat == 'txt':
        # The third positional argument of open() is ``buffering`` (an int),
        # so the encoding has to be given by keyword.
        outTxtHandle = open(outFile, 'w', encoding='utf-8')

    try:
        for inFile in inFiles:
            for biocDoc in docs2bioc(inFile, inFormat):
                if outFormat == 'biocxml':
                    outBiocHandle.write_document(biocDoc)
                elif outFormat == 'txt':
                    for passage in biocDoc.passages:
                        outTxtHandle.write(passage.text)
                        outTxtHandle.write("\n\n")
    finally:
        # Release the output even when a conversion step fails part-way.
        if outBiocHandle is not None:
            outBiocHandle.close()
        if outTxtHandle is not None:
            outTxtHandle.close()
Example #2
0
def pubmedxml2bioc(pubmedxmlFilename, biocFilename):
    """Write every record of a PubMed XML file as a BioC document.

    Records come from ``processMedlineFile``. Record metadata is stored in
    the document infons and each title/abstract text becomes one passage
    whose offset is the running total of the preceding passage lengths.

    :param pubmedxmlFilename: Input handed to ``processMedlineFile``
    :param biocFilename: Output handed to ``bioc.BioCXMLDocumentWriter``
    """
    # Record fields copied unchanged into the infons as (infon key, record key)
    datedFields = (('pmid', "pmid"), ('year', "pubYear"),
                   ('month', "pubMonth"), ('day', "pubDay"),
                   ('journal', "journal"), ('journalISO', "journalISO"))
    listedFields = ('chemicals', 'meshHeadings')

    with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
        for record in processMedlineFile(pubmedxmlFilename):
            document = bioc.BioCDocument()
            document.id = record["pmid"]

            infons = document.infons
            infons['title'] = " ".join(record["title"])
            for infonKey, recordKey in datedFields:
                infons[infonKey] = record[recordKey]
            infons['authors'] = ", ".join(record["authors"])
            for key in listedFields:
                infons[key] = record[key]

            position = 0
            for sectionName in ("title", "abstract"):
                for rawText in record[sectionName]:
                    trimmed = trimSentenceLengths(rawText)

                    passage = bioc.BioCPassage()
                    passage.infons['section'] = sectionName
                    passage.text = trimmed
                    passage.offset = position
                    position += len(trimmed)
                    document.add_passage(passage)

            writer.write_document(document)
Example #3
0
def marcxml2bioc(marcxmlFilename, biocFilename):
    """Stream the records of a MARC XML file into a BioC XML file.

    ``pymarc.map_xml`` calls back once per record and each record is passed
    to ``writeMarcXMLRecordToBiocFile`` together with the open writer.

    :param marcxmlFilename: Path of the MARC XML input (opened as binary)
    :param biocFilename: Output handed to ``bioc.BioCXMLDocumentWriter``
    """
    # The input is opened before the writer, so a missing input fails first.
    with open(marcxmlFilename, 'rb') as marcFile:
        with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
            pymarc.map_xml(
                lambda record: writeMarcXMLRecordToBiocFile(record, writer),
                marcFile)
Example #4
0
def splitBioc(inBioc, outDir, maxLength, stripAnnotations=False):
    """Split one BioC XML file into numbered chunk files inside outDir.

    Documents are copied in order into ``00000000.bioc.xml``,
    ``00000001.bioc.xml``, ... A new chunk is started when adding the next
    document would push the accumulated passage text length of the current
    chunk past ``maxLength``. Documents whose ``pmid`` infon was already seen
    are skipped.

    :param inBioc: Path of the BioC XML file to split (must exist)
    :param outDir: Existing directory that receives the chunk files
    :param maxLength: Maximum total text length per chunk; a falsy value
        disables splitting so everything goes into a single file
    :param stripAnnotations: If True, clear annotations and relations of
        every passage before writing
    :raises AssertionError: If inBioc is not a file, outDir is not a
        directory, or a document has no passages / no text
    """
    assert os.path.isfile(inBioc)
    assert os.path.isdir(outDir)

    pmids = set()

    textLength = 0
    docNumber = 0
    docName = os.path.join(outDir, "%08d.bioc.xml" % docNumber)
    writer = bioc.BioCXMLDocumentWriter(docName)
    try:
        with open(inBioc, 'rb') as f:
            for doc in bioc.BioCXMLDocumentReader(f):
                if 'pmid' in doc.infons:
                    if doc.infons['pmid'] in pmids:
                        continue
                    pmids.add(doc.infons['pmid'])

                thisDocLength = sum(
                    len(passage.text) for passage in doc.passages)

                assert len(
                    doc.passages
                ) > 0 and thisDocLength > 0, "Corpus file cannot contain empty documents"

                if stripAnnotations:
                    for passage in doc.passages:
                        passage.annotations = []
                        passage.relations = []

                # Roll over to the next chunk, but never leave a chunk empty.
                if textLength > 0 and maxLength and (
                        textLength + thisDocLength) > maxLength:
                    textLength = 0
                    docNumber += 1
                    docName = os.path.join(outDir,
                                           "%08d.bioc.xml" % docNumber)
                    writer.close()
                    # Forget the closed writer so the cleanup below cannot
                    # close it twice if opening the next chunk fails.
                    writer = None
                    writer = bioc.BioCXMLDocumentWriter(docName)

                textLength += thisDocLength

                writer.write_document(doc)
    finally:
        # Close the current chunk even when reading or writing failed.
        if writer is not None:
            writer.close()

    # Nothing was written to the last chunk, so drop the empty file.
    if textLength == 0:
        os.remove(docName)
Example #5
0
def test_BioCXMLDocumentWriter_io():
    """Round-trip the sample collection through an in-memory byte buffer."""
    source = _get_collection()
    buffer = io.BytesIO()

    writer = bioc.BioCXMLDocumentWriter(buffer)
    writer.write_collection_info(source)
    for doc in source.documents:
        writer.write_document(doc)
    writer.close()

    # Parse what was written and run the shared checks on the result.
    xmlText = buffer.getvalue().decode('utf-8')
    assert_everything(bioc.loads(xmlText))
Example #6
0
def mergeBioc(inDir, outBioc):
    """Concatenate the BioC XML files of a directory into one file.

    Files whose lower-cased name ends in ``.xml`` but not ``.ga.xml`` are
    read in sorted path order and every document is copied to the output.

    :param inDir: Directory that is listed for input files
    :param outBioc: Output handed to ``bioc.BioCXMLDocumentWriter``
    """
    inputPaths = []
    for name in os.listdir(inDir):
        lowered = name.lower()
        if lowered.endswith('.xml') and not lowered.endswith('.ga.xml'):
            inputPaths.append(os.path.join(inDir, name))
    inputPaths.sort()

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for inputPath in inputPaths:
            with open(inputPath, 'rb') as handle:
                for document in bioc.BioCXMLDocumentReader(handle):
                    writer.write_document(document)
Example #7
0
def test_BioCXMLDocumentWriter_file():
    """Round-trip the sample collection through a file on disk."""
    import os  # local import: only needed to build the temporary path

    collection = _get_collection()

    # tempfile.mktemp() is deprecated because another process can claim the
    # name before it is used, and the file it named was never removed. A
    # private temporary directory avoids both problems.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = os.path.join(tmpdir, 'collection.xml')

        with bioc.BioCXMLDocumentWriter(tmp) as writer:
            writer.write_collection_info(collection)
            for document in collection.documents:
                writer.write_document(document)

        with open(tmp, encoding='utf8') as fp:
            collection = bioc.load(fp)

    assert_everything(collection)
Example #8
0
    def write_bioc_collection(filename: str, collection: bioc.BioCCollection):
        """Serialise a BioC collection to an XML file.

        The collection info is written first, followed by each document of
        the collection in order.

        :param filename: name of the file to write
        :param collection: the bioc collection to serialise
        :returns: always 1
        """
        with bioc.BioCXMLDocumentWriter(filename) as xmlWriter:
            xmlWriter.write_collection_info(collection)
            for doc in collection.documents:
                xmlWriter.write_document(doc)

        return 1
Example #9
0
def save(corpus, dataFormat, path):
    """
    Write a corpus to disk in one of the supported formats

    :param corpus: The corpus of documents to save
    :param dataFormat: One of 'standoff', 'biocxml', 'pubannotation' or 'csv'
    :param path: Where to save. An existing directory for 'standoff'; must not be an existing directory for the other formats.
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type path: str
    """

    assert dataFormat in ['standoff', 'biocxml', 'pubannotation', 'csv']

    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'standoff':
        assert os.path.isdir(path), "Path must be an existing directory"

        for index, document in enumerate(corpus.documents):
            # Documents without a source filename are named by position
            if document.sourceFilename is None:
                stem = "%08d" % index
            else:
                stem = document.sourceFilename

            saveDocToSTFormat(document,
                              os.path.join(path, '%s.txt' % stem),
                              os.path.join(path, '%s.a1' % stem),
                              os.path.join(path, '%s.a2' % stem))
        return

    if dataFormat == 'biocxml':
        assert not os.path.isdir(
            path), "Path cannot be an existing directory for 'biocxml'."

        biocCollection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.BioCXMLDocumentWriter(path) as writer:
            for biocDoc in biocCollection.documents:
                writer.write_document(biocDoc)
        return

    if dataFormat == 'pubannotation':
        assert not os.path.isdir(
            path), "Path cannot be an existing directory for 'pubannotation'."

        saveCorpusToPubAnnotationFormat(corpus, path)
        return

    if dataFormat == 'csv':
        assert not os.path.isdir(
            path), "Path cannot be an existing directory for 'csv'."

        saveCorpusToCSVFormat(corpus, path)
Example #10
0
def pmcxml2bioc(pmcxmlFilename, biocFilename):
    """Write every article of a PMC XML file as a BioC document.

    Articles come from ``processPMCFile``. Article metadata is stored in
    the document infons and each text of each text-source group becomes one
    passage. A passage whose normalised text matches an entry of
    ``allowedSubsections`` sets the subsection recorded on it and on the
    following passages of the same group.

    :param pmcxmlFilename: Input handed to ``processPMCFile``
    :param biocFilename: Output handed to ``bioc.BioCXMLDocumentWriter``
    :raises RuntimeError: If an ``etree.ParseError`` occurs while processing
    """
    # Article fields copied unchanged into the infons as (infon key, article key)
    copiedFields = (('pmid', "pmid"), ('pmcid', "pmcid"), ('doi', "doi"),
                    ('year', "pubYear"), ('month', "pubMonth"),
                    ('day', "pubDay"), ('journal', "journal"),
                    ('journalISO', "journalISO"))

    try:
        with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
            for article in processPMCFile(pmcxmlFilename):
                document = bioc.BioCDocument()
                document.id = article["pmid"]

                document.infons['title'] = " ".join(
                    article["textSources"]["title"])
                for infonKey, articleKey in copiedFields:
                    document.infons[infonKey] = article[articleKey]

                position = 0
                for groupName, texts in article["textSources"].items():
                    # The subsection carries over until another heading
                    # is seen within the same group.
                    currentSubsection = None
                    for rawText in texts:
                        trimmed = trimSentenceLengths(rawText)
                        passage = bioc.BioCPassage()

                        heading = trimmed.lower().strip('01234567890. ')
                        if heading in allowedSubsections:
                            currentSubsection = heading

                        passage.infons['section'] = groupName
                        passage.infons['subsection'] = currentSubsection
                        passage.text = trimmed
                        passage.offset = position
                        position += len(trimmed)
                        document.add_passage(passage)

                writer.write_document(document)
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" %
                           pmcxmlFilename)
Example #11
0
def convertFiles(inFiles, inFormat, outFile, outFormat, idFilterfiles=None):
    """Convert several input files into one 'bioc' or 'txt' output file.

    Each input is first converted into a temporary BioC file by the
    converter matching ``inFormat``. That temporary file is then handed,
    together with the open output handle and the optional ID filter, to
    ``mergeBioc`` ('bioc' output) or ``bioc2txt`` ('txt' output).

    :param inFiles: Input file paths
    :param inFormat: 'bioc', 'pubmedxml', 'marcxml', 'pmcxml' or 'uimaxmi'
    :param outFile: Path of the output file
    :param outFormat: 'bioc' or 'txt'
    :param idFilterfiles: Optional list paired with inFiles; each entry is
        None or the path of a file with one ID per line
    :raises RuntimeError: If inFormat or outFormat is unknown (raised while
        processing an input file)
    """
    outBiocHandle, outTxtHandle = None, None

    if outFormat == 'bioc':
        outBiocHandle = bioc.BioCXMLDocumentWriter(outFile)
    elif outFormat == 'txt':
        outTxtHandle = codecs.open(outFile, 'w', 'utf-8')

    try:
        if idFilterfiles is None:
            idFilterfiles = [None for _ in inFiles]

        print("Converting %d files to %s" % (len(inFiles), outFile))
        for inFile, idFilterfile in zip(inFiles, idFilterfiles):
            if idFilterfile is None:
                idFilter = None
            else:
                with open(idFilterfile) as f:
                    idFilter = {line.strip() for line in f}

            with tempfile.NamedTemporaryFile() as temp:
                if inFormat == 'bioc':
                    shutil.copyfile(inFile, temp.name)
                elif inFormat == 'pubmedxml':
                    pubmedxml2bioc(inFile, temp.name)
                elif inFormat == 'marcxml':
                    marcxml2bioc(inFile, temp.name)
                elif inFormat == 'pmcxml':
                    pmcxml2bioc(inFile, temp.name)
                elif inFormat == 'uimaxmi':
                    uimaxmi2bioc(inFile, temp.name)
                else:
                    raise RuntimeError("Unknown input format: %s" % inFormat)

                if outFormat == 'bioc':
                    mergeBioc(temp.name, outBiocHandle, idFilter)
                elif outFormat == 'txt':
                    bioc2txt(temp.name, outTxtHandle, idFilter)
                else:
                    raise RuntimeError("Unknown output format: %s" %
                                       outFormat)
    finally:
        # The output handle was previously never closed, so the output was
        # never explicitly finalised or flushed. Close it on success and on
        # failure alike.
        if outBiocHandle is not None:
            outBiocHandle.close()
        if outTxtHandle is not None:
            outTxtHandle.close()
    print("Output to %s complete" % outFile)
Example #12
0
def convert(in_files, in_format, out_file, out_format, **kwargs):
    """Convert a set of input files into one output file.

    Each input file is passed to ``docs2bioc(in_file, in_format, **kwargs)``
    and every document it produces is written to the output. For
    ``"biocxml"`` the documents go through a ``bioc.BioCXMLDocumentWriter``;
    for ``"txt"`` the text of each passage is written followed by a blank
    line.

    :param in_files: Iterable of input files, handed one by one to docs2bioc
    :param in_format: Input format, must be in ``accepted_in_formats``
    :param out_file: Destination handed to the writer / ``open``
    :param out_format: Output format, must be in ``accepted_out_formats``
    :param kwargs: Extra keyword arguments forwarded to ``docs2bioc``
    :raises AssertionError: If either format is not accepted
    """
    assert (
        in_format in accepted_in_formats
    ), "%s is not an accepted input format. Options are: %s" % (
        in_format,
        "/".join(accepted_in_formats),
    )
    assert (
        out_format in accepted_out_formats
    ), "%s is not an accepted output format. Options are: %s" % (
        out_format,
        "/".join(accepted_out_formats),
    )

    out_bioc_handle, out_txt_handle = None, None
    if out_format == "biocxml":
        out_bioc_handle = bioc.BioCXMLDocumentWriter(out_file)
    elif out_format == "txt":
        out_txt_handle = open(out_file, "w", encoding="utf-8")

    try:
        for in_file in in_files:
            for bioc_doc in docs2bioc(in_file, in_format, **kwargs):
                if out_format == "biocxml":
                    out_bioc_handle.write_document(bioc_doc)
                elif out_format == "txt":
                    for passage in bioc_doc.passages:
                        out_txt_handle.write(passage.text)
                        out_txt_handle.write("\n\n")
    finally:
        # Previously the handle was only closed on success; release it even
        # when a conversion step fails part-way.
        if out_bioc_handle is not None:
            out_bioc_handle.close()
        if out_txt_handle is not None:
            out_txt_handle.close()
Example #13
0
def mergeBiocWithMetadata(metaDir, inDir, outBioc):
    """Merge BioC files with infons taken from same-named metadata files.

    For every file in inDir whose lower-cased name ends in ``.xml`` but not
    ``.ga.xml`` (sorted by name), the file of the same name in metaDir is
    read in parallel. Document and passage infons of the metadata document
    are merged into the input document, which is then written to outBioc.

    :param metaDir: Directory holding the metadata BioC files
    :param inDir: Directory holding the input BioC files
    :param outBioc: Output handed to ``bioc.BioCXMLDocumentWriter``
    :raises AssertionError: If paired documents differ in passage count or
        paired passages differ in text
    """
    names = []
    for name in os.listdir(inDir):
        lowered = name.lower()
        if lowered.endswith('.xml') and not lowered.endswith('.ga.xml'):
            names.append(name)
    names.sort()

    with bioc.BioCXMLDocumentWriter(outBioc) as writer:
        for name in names:
            inputPath = os.path.join(inDir, name)
            metaPath = os.path.join(metaDir, name)

            with open(inputPath, 'rb') as inputFile, open(metaPath, 'rb') as metaFile:
                documentPairs = zip(bioc.BioCXMLDocumentReader(inputFile),
                                    bioc.BioCXMLDocumentReader(metaFile))

                for document, metaDocument in documentPairs:
                    assert len(document.passages) == len(metaDocument.passages)
                    for passage, metaPassage in zip(document.passages,
                                                    metaDocument.passages):
                        assert passage.text == metaPassage.text
                        passage.infons.update(metaPassage.infons)

                    document.infons.update(metaDocument.infons)
                    writer.write_document(document)
Example #14
0
def uimaxmi2bioc(xmiFilename, biocFilename):
    """Convert a UIMA XMI file into a BioC file holding a single document.

    The document title is read from the ``documentTitle`` attribute of the
    DocumentMetaData node and the text from the ``sofaString`` attribute of
    the Sofa node. The text becomes one passage at offset 0.

    :param xmiFilename: XMI input parsed with ``etree.parse``
    :param biocFilename: Output handed to ``bioc.BioCXMLDocumentWriter``
    """
    metadataTag = '{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData'
    sofaTag = '{http:///uima/cas.ecore}Sofa'

    # Everything is read from the input before the output file is opened.
    xmiRoot = etree.parse(xmiFilename).getroot()
    title = xmiRoot.find(metadataTag).attrib['documentTitle']
    text = xmiRoot.find(sofaTag).attrib['sofaString']

    with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
        document = bioc.BioCDocument()
        document.id = None
        document.infons['title'] = title

        article = bioc.BioCPassage()
        article.infons['section'] = 'article'
        article.text = text
        article.offset = 0
        document.add_passage(article)

        writer.write_document(document)
Example #15
0
	
	pmidToAnnotations = defaultdict(list)
	with open(args.annotations) as f:
		for line in f:
			split = line.strip('\n').split('\t')
			pmid,annotationType,conceptid,mentions,database = split
			mentions = mentions.strip()
			pmid = int(pmid)
			if len(mentions) > 0 and pmid in pmids:
				pmidToAnnotations[pmid].append((annotationType,conceptid,mentions))


	print("Starting text alignment...")

	currentID = 1
	writer = bioc.BioCXMLDocumentWriter(args.outBioc)
	#with bioc.BioCXMLDocumentReader(args.inBioc) as parser:
	with open(args.inBioc,'rb') as f:
		parser = bioc.BioCXMLDocumentReader(f)
		for i,doc in enumerate(parser):
			for passage in doc.passages:
				passage.annotations = []

			if 'pmid' in doc.infons and doc.infons['pmid']:
				pmid = int(doc.infons['pmid'])

				#print(now(),i,pmid)
				#sys.stdout.flush()

				for passage in doc.passages:
					candidates = defaultdict(lambda : defaultdict(list))
Example #16
0
import argparse
import bioc

if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Make some minor fixes to BioC files to make them play nicely with some NER tools')
	parser.add_argument('--inBiocXML',type=str,required=True,help='Input BioC XML file')
	parser.add_argument('--outBiocXML',type=str,required=True,help='Output BioC XML file')
	args = parser.parse_args()

	pmids = set()

	textLength = 0

	with open(args.inBiocXML,'rb') as f, bioc.BioCXMLDocumentWriter(args.outBiocXML) as writer:
		parser = bioc.BioCXMLDocumentReader(f)
		for i,doc in enumerate(parser):
			if doc.infons['pmid'] in pmids:
				continue
			pmids.add(doc.infons['pmid'])

			for passage in doc.passages:
				if 'section' in passage.infons:
					passage.infons['type'] = passage.infons['section']
				else:
					passage.infons['type'] = 'unknown'

				passage.text = passage.text.strip()

			thisDocLength = sum( len(passage.text) for passage in doc.passages )

			if len(doc.passages) == 0 or thisDocLength == 0:
Example #17
0
	parser.add_argument('--outFile',required=True,type=str,help='File to save to')
	args = parser.parse_args()

	assert args.format == 'biocxml'

	grouping_file = os.path.join(args.pmcDir,'groupings.json')
	with open(grouping_file) as f:
		block = json.load(f)['groups'][args.block]

	source = os.path.join(args.pmcDir, block['src'])
	files_to_extract = block['group']

	#print(source)
	#print(len(files_to_extract))

	with bioc.BioCXMLDocumentWriter(args.outFile) as writer:
		tar = tarfile.open(source)
		for i,filename in enumerate(files_to_extract):
			#print(i,filename)
			member = tar.getmember(filename)
			#print(member)
			file_handle = tar.extractfile(member)
			#print(file_handle)
			data = file_handle.read().decode('utf-8')
			#print(len(data))
			#print(data[:500])
			for biocDoc in pmcxml2bioc(io.StringIO(data)):
				writer.write_document(biocDoc)

			#break
	print("Saved %d documents to %s" % (len(files_to_extract), args.outFile))
Example #18
0
	parser.add_argument('--outFile',required=True,type=str,help='File to save to')
	parser.add_argument('--db',action='store_true',help="Whether to output as an SQLite database")
	args = parser.parse_args()

	assert args.format == 'biocxml'

	grouping_file = os.path.join(args.pmcDir,'groupings.json')
	with open(grouping_file) as f:
		block = json.load(f)['groups'][args.block]

	source = os.path.join(args.pmcDir, block['src'])
	files_to_extract = block['group']

	with tempfile.NamedTemporaryFile() as tf_out:
		out_file = tf_out.name if args.db else args.outFile
		with bioc.BioCXMLDocumentWriter(out_file) as writer:
			tar = tarfile.open(source)
			for i,filename in enumerate(files_to_extract):
				try:
					member = tar.getmember(filename)
				except KeyError:
					print("WARNING. Didn't find %s in %s. Skipping" % (filename,source))
					continue
				
				file_handle = tar.extractfile(member)
				
				data = file_handle.read().decode('utf-8')

				for bioc_doc in pmcxml2bioc(io.StringIO(data)):
					writer.write_document(bioc_doc)