Ejemplo n.º 1
0
def pubmedxml2bioc(pubmedxmlFilename, biocFilename):
	"""
	Convert a PubMed XML file into a BioC file.

	Every record yielded by processMedlineFile becomes one BioC document
	whose infons carry the record metadata. The title and abstract text
	sources are each trimmed with trimSentenceLengths and stored as
	passages whose offsets are the running total of the lengths of the
	preceding passage texts.

	:param pubmedxmlFilename: Path of the PubMed XML file to read
	:param biocFilename: Path of the BioC file to write
	"""
	# (infon key, record key) pairs that are copied across unchanged
	copiedFields = (
		('pmid', "pmid"),
		('year', "pubYear"),
		('month', "pubMonth"),
		('day', "pubDay"),
		('journal', "journal"),
		('journalISO', "journalISO"),
	)

	with bioc.iterwrite(biocFilename) as writer:
		for record in processMedlineFile(pubmedxmlFilename):
			document = bioc.BioCDocument()
			document.id = record["pmid"]

			# Title pieces and author names are flattened to single strings
			document.infons['title'] = " ".join(record["title"])
			for infonKey, recordKey in copiedFields:
				document.infons[infonKey] = record[recordKey]
			document.infons['authors'] = ", ".join(record["authors"])
			document.infons['chemicals'] = record['chemicals']
			document.infons['meshHeadings'] = record['meshHeadings']

			position = 0
			for sectionName in ("title", "abstract"):
				for rawText in record[sectionName]:
					trimmed = trimSentenceLengths(rawText)

					passage = bioc.BioCPassage()
					passage.infons['section'] = sectionName
					passage.text = trimmed
					passage.offset = position
					# The next passage starts where this one ends
					position += len(trimmed)
					document.add_passage(passage)

			writer.writedocument(document)
Ejemplo n.º 2
0
def marcxml2bioc(marcxmlFilename, biocFilename):
    """
    Convert a MARC XML file into a BioC file.

    The input is opened in binary mode and handed to pymarc.map_xml, which
    invokes the callback for each record; the callback forwards the record
    and the open BioC writer to writeMarcXMLRecordToBiocFile.

    :param marcxmlFilename: Path of the MARC XML file to read
    :param biocFilename: Path of the BioC file to write
    """
    with open(marcxmlFilename, 'rb') as marcHandle:
        with bioc.iterwrite(biocFilename) as writer:
            pymarc.map_xml(
                lambda record: writeMarcXMLRecordToBiocFile(record, writer),
                marcHandle)
Ejemplo n.º 3
0
def pmcxml2bioc(pmcxmlFilename, biocFilename):
    """
    Convert a PMC XML file into a BioC file.

    Every article yielded by processPMCFile becomes one BioC document. Each
    group of text sources is written as a sequence of passages tagged with
    the group name as 'section'. Within a group, a passage whose lower-cased
    text (with leading/trailing digits, dots and spaces stripped) is in
    allowedSubsections sets the 'subsection' for itself and the passages
    that follow it, until another such heading appears or the group ends.

    :param pmcxmlFilename: Path of the PMC XML file to read
    :param biocFilename: Path of the BioC file to write
    :raises RuntimeError: if an etree.ParseError occurs during conversion
    """
    # (infon key, article key) pairs that are copied across unchanged
    copiedFields = (
        ('pmid', "pmid"),
        ('pmcid', "pmcid"),
        ('doi', "doi"),
        ('year', "pubYear"),
        ('month', "pubMonth"),
        ('day', "pubDay"),
        ('journal', "journal"),
        ('journalISO', "journalISO"),
    )

    try:
        with bioc.iterwrite(biocFilename) as writer:
            for article in processPMCFile(pmcxmlFilename):
                document = bioc.BioCDocument()
                document.id = article["pmid"]

                sources = article["textSources"]
                document.infons['title'] = " ".join(sources["title"])
                for infonKey, articleKey in copiedFields:
                    document.infons[infonKey] = article[articleKey]

                position = 0
                for sectionName, sectionTexts in sources.items():
                    # Subsection tracking restarts with every section
                    currentSubsection = None
                    for rawText in sectionTexts:
                        trimmed = trimSentenceLengths(rawText)
                        passage = bioc.BioCPassage()

                        # Drop numbering such as "2. " before matching
                        heading = trimmed.lower().strip('01234567890. ')
                        if heading in allowedSubsections:
                            currentSubsection = heading

                        passage.infons['section'] = sectionName
                        passage.infons['subsection'] = currentSubsection
                        passage.text = trimmed
                        passage.offset = position
                        # The next passage starts where this one ends
                        position += len(trimmed)
                        document.add_passage(passage)

                writer.writedocument(document)
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" %
                           pmcxmlFilename)
Ejemplo n.º 4
0
def uimaxmi2bioc(xmiFilename, biocFilename):
	"""
	Convert a UIMA XMI file into a BioC file containing a single document.

	The document title is read from the 'documentTitle' attribute of the
	DocumentMetaData element and the text from the 'sofaString' attribute
	of the Sofa element. The whole text is stored as one passage at offset
	0 with section 'article'; the document id is set to None.

	:param xmiFilename: Path of the XMI file to read
	:param biocFilename: Path of the BioC file to write
	"""
	metadataTag = '{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData'
	sofaTag = '{http:///uima/cas.ecore}Sofa'

	# The XMI is fully parsed before the output file is opened
	xmiRoot = etree.parse(xmiFilename).getroot()
	title = xmiRoot.find(metadataTag).attrib['documentTitle']
	text = xmiRoot.find(sofaTag).attrib['sofaString']

	with bioc.iterwrite(biocFilename) as writer:
		document = bioc.BioCDocument()
		document.id = None
		document.infons['title'] = title

		wholeText = bioc.BioCPassage()
		wholeText.infons['section'] = 'article'
		wholeText.text = text
		wholeText.offset = 0
		document.add_passage(wholeText)

		writer.writedocument(document)
Ejemplo n.º 5
0
def save(corpus, dataFormat, path):
    """
    Write a corpus to disk in the requested format.

    For 'standoff', each document is written as a .txt/.a1/.a2 triple inside
    the directory given by path; the file base name is the document's
    sourceFilename, or the zero-padded (8 digit) document index when that is
    None. For 'biocxml', the corpus is converted to a BioC collection and
    all of its documents are written to the single file given by path.

    :param corpus: The corpus of documents to save
    :param dataFormat: Format of data to save (only 'standoff' and 'biocxml' are supported currently)
    :param path: Path where corpus should be saved. Must be an existing directory for 'standoff' and must not be an existing directory for 'biocxml'.
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type path: str
    """

    assert dataFormat in ('standoff', 'biocxml')

    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'standoff':
        assert os.path.isdir(path), "Path must be an existing directory"

        for index, document in enumerate(corpus.documents):
            # Fall back to a numbered name when the source file is unknown
            base = ("%08d" % index if document.sourceFilename is None
                    else document.sourceFilename)

            txtPath, a1Path, a2Path = [
                os.path.join(path, template % base)
                for template in ('%s.txt', '%s.a1', '%s.a2')
            ]

            saveDocToSTFormat(document, txtPath, a1Path, a2Path)
        return

    if dataFormat == 'biocxml':
        assert not os.path.isdir(
            path), "Path cannot be an existing directory for 'biocxml'."

        collection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.iterwrite(path) as writer:
            for biocDoc in collection.documents:
                writer.writedocument(biocDoc)
Ejemplo n.º 6
0
def save(corpus, dataFormat, directory):
    """
    Write a corpus into a directory in the requested format.

    For 'standoff', each document is written as a .txt/.a1/.a2 triple; the
    file base name is the value of the document's getSourceFilename(), or
    the zero-padded (8 digit) document index when that is None. For 'bioc',
    the corpus is converted to a BioC collection and all of its documents
    are written to 'collection.bioc.xml' inside the directory.

    :param corpus: The corpus of documents to save
    :param dataFormat: Format of data to save (only 'standoff' and 'bioc' are supported currently)
    :param directory: Path to directory in which files should be saved
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type directory: str
    """

    assert dataFormat in ('standoff', 'bioc')

    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'standoff':
        for index, document in enumerate(corpus.documents):
            # Fall back to a numbered name when the source file is unknown
            base = ("%08d" % index if document.getSourceFilename() is None
                    else document.getSourceFilename())

            txtPath, a1Path, a2Path = [
                os.path.join(directory, template % base)
                for template in ('%s.txt', '%s.a1', '%s.a2')
            ]

            saveDocToSTFormat(document, txtPath, a1Path, a2Path)
        return

    if dataFormat == 'bioc':
        outFilename = os.path.join(directory, 'collection.bioc.xml')
        collection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.iterwrite(outFilename) as writer:
            for biocDoc in collection.documents:
                writer.writedocument(biocDoc)
Ejemplo n.º 7
0
                pmid = int(doc.infons['pmid'])
                pmids.add(pmid)

    pmidToAnnotations = defaultdict(list)
    with open(args.annotations) as f:
        for line in f:
            split = line.strip('\n').split('\t')
            pmid, annotationType, conceptid, mentions, database = split
            mentions = mentions.strip()
            pmid = int(pmid)
            if len(mentions) > 0 and pmid in pmids:
                pmidToAnnotations[pmid].append(
                    (annotationType, conceptid, mentions))

    currentID = 1
    writer = bioc.iterwrite(args.outBioc)
    with bioc.iterparse(args.inBioc) as parser:
        for i, doc in enumerate(parser):
            for passage in doc.passages:
                passage.annotations = []

            if 'pmid' in doc.infons and doc.infons['pmid'] != 'None':
                pmid = int(doc.infons['pmid'])

                print(now(), i, pmid)
                sys.stdout.flush()

                for passage in doc.passages:
                    candidates = defaultdict(lambda: defaultdict(list))

                    for annotationType, conceptid, mentions in pmidToAnnotations[