def pubmedxml2bioc(pubmedxmlFilename, biocFilename):
    """
    Convert a PubMed/MEDLINE XML file into a BioC file.

    Every record yielded by processMedlineFile becomes one BioC document
    whose infons carry the citation metadata and whose passages hold the
    title and abstract text.

    :param pubmedxmlFilename: Path of the PubMed XML file to read
    :param biocFilename: Path of the BioC file to write
    :type pubmedxmlFilename: str
    :type biocFilename: str
    """
    with bioc.iterwrite(biocFilename) as writer:
        for pmDoc in processMedlineFile(pubmedxmlFilename):
            biocDoc = bioc.BioCDocument()
            biocDoc.id = pmDoc["pmid"]

            # Listed in the order the infons are inserted into the document
            infonValues = [
                ('title', " ".join(pmDoc["title"])),
                ('pmid', pmDoc["pmid"]),
                ('year', pmDoc["pubYear"]),
                ('month', pmDoc["pubMonth"]),
                ('day', pmDoc["pubDay"]),
                ('journal', pmDoc["journal"]),
                ('journalISO', pmDoc["journalISO"]),
                ('authors', ", ".join(pmDoc["authors"])),
                ('chemicals', pmDoc['chemicals']),
                ('meshHeadings', pmDoc['meshHeadings']),
            ]
            for infonName, infonValue in infonValues:
                biocDoc.infons[infonName] = infonValue

            # Passage offsets are a running total of the trimmed text
            # lengths; nothing is added between consecutive passages.
            position = 0
            for sectionName in ("title", "abstract"):
                for rawText in pmDoc[sectionName]:
                    trimmedText = trimSentenceLengths(rawText)

                    passage = bioc.BioCPassage()
                    passage.infons['section'] = sectionName
                    passage.text = trimmedText
                    passage.offset = position
                    biocDoc.add_passage(passage)

                    position += len(trimmedText)

            writer.writedocument(biocDoc)
def marcxml2bioc(marcxmlFilename, biocFilename):
    """
    Convert a MARC XML file into a BioC file.

    The input is opened in binary mode and handed to pymarc.map_xml, which
    invokes the callback for each record; the callback forwards the record
    and the open BioC writer to writeMarcXMLRecordToBiocFile.

    :param marcxmlFilename: Path of the MARC XML file to read
    :param biocFilename: Path of the BioC file to write
    :type marcxmlFilename: str
    :type biocFilename: str
    """
    # The input is opened before the writer, so a missing input file fails
    # before the output file is touched.
    with open(marcxmlFilename, 'rb') as marcHandle:
        with bioc.iterwrite(biocFilename) as writer:
            pymarc.map_xml(
                lambda record: writeMarcXMLRecordToBiocFile(record, writer),
                marcHandle)
def pmcxml2bioc(pmcxmlFilename, biocFilename):
    """
    Convert a PubMed Central XML file into a BioC file.

    Every document yielded by processPMCFile becomes one BioC document.
    Its text sources are written as passages tagged with the name of the
    group they came from and with the most recently seen subsection
    heading inside that group (None until one is seen).

    :param pmcxmlFilename: Path of the PMC XML file to read
    :param biocFilename: Path of the BioC file to write
    :type pmcxmlFilename: str
    :type biocFilename: str
    :raises RuntimeError: if an etree.ParseError occurs during conversion
    """
    try:
        with bioc.iterwrite(biocFilename) as writer:
            for pmcDoc in processPMCFile(pmcxmlFilename):
                biocDoc = bioc.BioCDocument()
                biocDoc.id = pmcDoc["pmid"]

                # Listed in the order the infons are inserted
                infonValues = [
                    ('title', " ".join(pmcDoc["textSources"]["title"])),
                    ('pmid', pmcDoc["pmid"]),
                    ('pmcid', pmcDoc["pmcid"]),
                    ('doi', pmcDoc["doi"]),
                    ('year', pmcDoc["pubYear"]),
                    ('month', pmcDoc["pubMonth"]),
                    ('day', pmcDoc["pubDay"]),
                    ('journal', pmcDoc["journal"]),
                    ('journalISO', pmcDoc["journalISO"]),
                ]
                for infonName, infonValue in infonValues:
                    biocDoc.infons[infonName] = infonValue

                # Running total of trimmed text lengths across all groups
                position = 0
                for groupName, textSourceGroup in pmcDoc["textSources"].items():
                    # The subsection is reset at the start of every group
                    currentSubsection = None
                    for rawText in textSourceGroup:
                        trimmedText = trimSentenceLengths(rawText)

                        # A passage whose text, lower-cased and stripped of
                        # digits, dots and spaces at both ends, is an allowed
                        # subsection name switches the current subsection
                        # (and is itself tagged with it).
                        heading = trimmedText.lower().strip('01234567890. ')
                        if heading in allowedSubsections:
                            currentSubsection = heading

                        passage = bioc.BioCPassage()
                        passage.infons['section'] = groupName
                        passage.infons['subsection'] = currentSubsection
                        passage.text = trimmedText
                        passage.offset = position
                        biocDoc.add_passage(passage)

                        position += len(trimmedText)

                writer.writedocument(biocDoc)
    except etree.ParseError:
        message = "Parsing error in PMC xml file: %s" % pmcxmlFilename
        raise RuntimeError(message)
def uimaxmi2bioc(xmiFilename, biocFilename):
    """
    Convert a UIMA XMI file into a BioC file containing a single document.

    The document title is read from the 'documentTitle' attribute of the
    DocumentMetaData element and the text from the 'sofaString' attribute
    of the Sofa element. The text is stored as one passage at offset 0
    with section 'article'; the document id is left as None.

    :param xmiFilename: Path of the UIMA XMI file to read
    :param biocFilename: Path of the BioC file to write
    :type xmiFilename: str
    :type biocFilename: str
    :raises RuntimeError: if the DocumentMetaData or Sofa element is missing
    """
    metadataTag = '{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData'
    sofaTag = '{http:///uima/cas.ecore}Sofa'

    tree = etree.parse(xmiFilename)
    root = tree.getroot()

    # find() returns None when the element is absent; fail with a message
    # naming the file instead of an AttributeError on None further down.
    metadataNode = root.find(metadataTag)
    if metadataNode is None:
        raise RuntimeError(
            "No DocumentMetaData element found in UIMA XMI file: %s" % xmiFilename)
    documentTitle = metadataNode.attrib['documentTitle']

    contentNode = root.find(sofaTag)
    if contentNode is None:
        raise RuntimeError(
            "No Sofa element found in UIMA XMI file: %s" % xmiFilename)
    content = contentNode.attrib['sofaString']

    # The output file is only opened once the input has been validated
    with bioc.iterwrite(biocFilename) as writer:
        biocDoc = bioc.BioCDocument()
        biocDoc.id = None
        biocDoc.infons['title'] = documentTitle

        passage = bioc.BioCPassage()
        passage.infons['section'] = 'article'
        passage.text = content
        passage.offset = 0
        biocDoc.add_passage(passage)

        writer.writedocument(biocDoc)
def save(corpus, dataFormat, path):
    """
    Write a corpus to disk in standoff or BioC XML form

    For 'standoff', each document is written as a .txt/.a1/.a2 trio inside
    an existing directory, named after the document's source filename or,
    when it has none, its zero-padded index in the corpus. For 'biocxml',
    the whole corpus is written to a single file.

    :param corpus: The corpus of documents to save
    :param dataFormat: Format of data to save (only 'standoff' and 'biocxml' are supported currently)
    :param path: Path where corpus should be saved. Must be an existing directory for 'standoff'.
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type path: str
    """
    assert dataFormat in ('standoff', 'biocxml')
    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'biocxml':
        assert not os.path.isdir(
            path), "Path cannot be an existing directory for 'biocxml'."

        bioCCollection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.iterwrite(path) as writer:
            for bioCDocument in bioCCollection.documents:
                writer.writedocument(bioCDocument)
    elif dataFormat == 'standoff':
        assert os.path.isdir(path), "Path must be an existing directory"

        for index, document in enumerate(corpus.documents):
            # Documents without a source filename are named by position
            stem = document.sourceFilename
            if stem is None:
                stem = "%08d" % index

            txtPath, a1Path, a2Path = [
                os.path.join(path, pattern % stem)
                for pattern in ('%s.txt', '%s.a1', '%s.a2')
            ]
            saveDocToSTFormat(document, txtPath, a1Path, a2Path)
def save(corpus, dataFormat, directory):
    """
    Write a corpus into a directory in standoff or BioC form

    For 'standoff', each document is written as a .txt/.a1/.a2 trio named
    after the document's source filename or, when it has none, its
    zero-padded index in the corpus. For 'bioc', the whole corpus is
    written to 'collection.bioc.xml' inside the directory.

    :param corpus: The corpus of documents to save
    :param dataFormat: Format of data to save (only 'standoff' and 'bioc' are supported currently)
    :param directory: Path to directory in which files should be saved
    :type corpus: kindred.Corpus
    :type dataFormat: str
    :type directory: str
    """
    assert dataFormat in ('standoff', 'bioc')
    assert isinstance(corpus, kindred.Corpus)

    if dataFormat == 'bioc':
        outFilename = os.path.join(directory, 'collection.bioc.xml')

        bioCCollection = convertKindredCorpusToBioCCollection(corpus)
        with bioc.iterwrite(outFilename) as writer:
            for bioCDocument in bioCCollection.documents:
                writer.writedocument(bioCDocument)
    elif dataFormat == 'standoff':
        for index, document in enumerate(corpus.documents):
            # Documents without a source filename are named by position
            stem = (
                "%08d" % index
                if document.getSourceFilename() is None
                else document.getSourceFilename()
            )

            txtPath, a1Path, a2Path = [
                os.path.join(directory, pattern % stem)
                for pattern in ('%s.txt', '%s.a1', '%s.a2')
            ]
            saveDocToSTFormat(document, txtPath, a1Path, a2Path)
pmid = int(doc.infons['pmid']) pmids.add(pmid) pmidToAnnotations = defaultdict(list) with open(args.annotations) as f: for line in f: split = line.strip('\n').split('\t') pmid, annotationType, conceptid, mentions, database = split mentions = mentions.strip() pmid = int(pmid) if len(mentions) > 0 and pmid in pmids: pmidToAnnotations[pmid].append( (annotationType, conceptid, mentions)) currentID = 1 writer = bioc.iterwrite(args.outBioc) with bioc.iterparse(args.inBioc) as parser: for i, doc in enumerate(parser): for passage in doc.passages: passage.annotations = [] if 'pmid' in doc.infons and doc.infons['pmid'] != 'None': pmid = int(doc.infons['pmid']) print(now(), i, pmid) sys.stdout.flush() for passage in doc.passages: candidates = defaultdict(lambda: defaultdict(list)) for annotationType, conceptid, mentions in pmidToAnnotations[