def eventsToXML(events, xmlEvents, dataSets, srTexts):
    """Convert static-relation events into in-memory entity/interaction elements.

    For each sentence key in *events*, fills xmlEvents[key] with
    {"entities": [...], "interactions": [...]} Element lists, records the
    sentence text in srTexts[key] and the (single, asserted-consistent)
    dataset name in dataSets[key].
    """
    for norText in events:
        entityByOffset = {}      # charOffset string -> already-built entity element
        entityCounter = [0]      # boxed so the closure below can bump it
        interactionCounter = 0
        container = {"entities": [], "interactions": []}
        xmlEvents[norText] = container
        currentDataSet = None

        def getOrCreateEntity(spec, entType, isName, srId):
            # spec is "offset<TAB>text"; one element per distinct offset.
            charOffset, entText = spec.replace("\t", " ").split(" ", 1)
            if charOffset in entityByOffset:
                return entityByOffset[charOffset]
            elem = ET.Element("entity")
            elem.set("charOffset", charOffset)
            elem.set("text", entText)
            elem.set("type", entType)
            elem.set("isName", isName)
            elem.set("id", "e" + str(entityCounter[0]))
            elem.set("srId", srId)
            entityCounter[0] += 1
            entityByOffset[charOffset] = elem
            container["entities"].append(elem)
            return elem

        for event in events[norText]:
            srTexts[norText] = event["text"]
            # All events of one sentence must come from the same dataset.
            if currentDataSet is None:
                currentDataSet = event["dataSet"]
            else:
                assert currentDataSet == event["dataSet"]
            entity = getOrCreateEntity(event["entity"], "Entity", "False", event["id"])
            namedEntity = getOrCreateEntity(event["namedEntity"], "Protein", "True", event["id"])
            if event["interaction"] == "Yes":
                link = ET.Element("interaction")
                link.set("type", "SR-" + event["eventType"])
                link.set("directed", "False")
                link.set("e1", namedEntity.get("id"))
                link.set("e2", entity.get("id"))
                link.set("id", "i" + str(interactionCounter))
                link.set("srId", event["id"])
                container["interactions"].append(link)
                interactionCounter += 1
            else:
                assert event["interaction"] == "No", event["interaction"]
        dataSets[norText] = currentDataSet
def saveKeys():
    """Write every key string in the module-global _KEYS out to _KEYFILE
    as a <Keys><key>...</key>...</Keys> XML document.

    Raises Exception when no _KEYFILE has been configured.
    """
    if not _KEYFILE:
        raise Exception('No _KEYFILE')
    root = ElementTree.Element('Keys')
    doc = ElementTree.ElementTree(root)
    for keyText in _KEYS:
        keyElem = ElementTree.Element('key')
        keyElem.text = keyText
        root.append(keyElem)
    doc.write(_KEYFILE)
def make_keyfiles(): x = ElementTree.Element('Keys') y = ElementTree.Element('key') y["name"] = KEY[0] y.text = KEY[1] x.append(y) y = ElementTree.Element('key') y.text = 'foo' x.append(y) b = ElementTree.ElementTree(x) b.write('tests/xml/keys') b.write('tests/xml/keys3')
def arcSVG(self):
    """Render this dependency arc as two SVG <path> elements.

    The first path draws the left half up to the label box, the second
    continues from the box to the destination; coordinates come from
    self.param and styling from self.arcStyle().
    """
    templates = (
        "M%(frox)d,%(y)d L%(linebx)d,%(lineby)d C%(c1bx)d,%(c1by)d %(c1ex)d,%(c1ey)d %(recx)d,%(midy)d",
        "M%(recxe)d,%(midy)d C%(c2bx)d,%(c2by)d %(c2ex)d,%(c2ey)d %(lineex)d,%(lineey)d L%(tox)d,%(y)d",
    )
    paths = []
    for template in templates:
        pathNode = ET.Element("path")
        pathNode.set("d", template % self.param)
        pathNode.set("style", ";".join("%s:%s" % item for item in self.arcStyle().items()))
        paths.append(pathNode)
    return paths
def generateSVG(tokens, dependencies):
    """Lay out *tokens* and *dependencies* and assemble the full SVG tree.

    Width is the right edge of the widest-reaching token; height is taken
    from the first token's y plus a margin. Width/height attributes are
    only emitted when SVGOptions.whAttributes is enabled.
    """
    layout(tokens, dependencies)
    svgRoot = ET.Element("svg")
    svgRoot.set("xmlns", "http://www.w3.org/2000/svg")
    svgRoot.set("xmlns:xlink", "http://www.w3.org/1999/xlink")
    svgRoot.set("version", "1.1")
    svgRoot.set("baseProfile", "full")
    children = []
    maxX = 0
    docHeight = tokens[0].y + 10
    if SVGOptions.whAttributes:
        svgRoot.set("height", strint(docHeight))
    for token in tokens:
        children.extend(token.toSVG())
        rightEdge = token.x + token.width()
        if rightEdge > maxX:
            maxX = rightEdge
    if SVGOptions.whAttributes:
        svgRoot.set("width", strint(maxX))
    for dep in dependencies:
        children.extend(dep.arcSVG())
        children.extend(dep.labelSVG())
    # Python 2 comparator sort: drawOrder decides z-ordering of the nodes.
    children.sort(cmp=drawOrder)
    for child in children:
        svgRoot.append(child)
    return svgRoot
def processLines(lines, setName, usedIds, directed=True, negatives="INCLUDE", tree=None, corpusId="SE10T8"):
    """Parse SemEval-2010-Task-8-style text lines into an Interaction XML tree.

    Each record is: a numbered sentence line, then optional relation and
    "Comment:" lines, terminated by a blank line which flushes the sentence
    into the corpus. Appends to *tree* when given, else builds a new one.
    """
    if tree is None:
        corpus = ET.Element("corpus", {"source": corpusId})
        tree = ET.ElementTree(corpus)
    else:
        corpus = tree.getroot()
    sentence = None
    for rawLine in lines:
        rawLine = rawLine.strip()
        if sentence is None:
            # A record must start with its numeric original id.
            assert rawLine[0].isdigit(), rawLine
            origId, rest = rawLine.split("\t")
            sentence = Sentence(origId, rest.strip().strip("\""), corpusId, usedIds, setName)
        elif rawLine.startswith("Comment:"):
            sentence.comment = rawLine.split(":", 1)[-1].strip()
        elif rawLine != "":
            sentence.relation = rawLine
        else:
            # Blank line terminates the record: emit the document element.
            assert sentence is not None
            corpus.append(sentence.process(directed=directed, negatives=negatives))
            sentence = None
    return tree
def _node2xmlfields(self,noderecord):
    ''' Convert one node record (a flat dict of field name -> value) into an
        XML record element with its child field elements.

        NOTE(review): despite the original docstring claiming grammar-sorted
        output, this variant emits fields in dict iteration order (see the
        "not sorted" comment below). Mutates *noderecord*: consumed keys are
        deleted as they are written out.
    '''
    if 'BOTSID' not in noderecord:
        raise botslib.OutMessageError(_(u'No field "BOTSID" in xml-output in: "$record"'),record=noderecord)
    #first generate the xml-'record'
    attributedict = {}
    recordtag = noderecord['BOTSID']
    # attributemarker is a marker within a field name used to detect that the
    # field is actually an attribute of the record (or of a field, below).
    attributemarker = recordtag + self.ta_info['attributemarker']
    for key,value in noderecord.items():
        #find the attributes for the xml-record, put these in attributedict
        if key.startswith(attributemarker):
            attributedict[key[len(attributemarker):]] = value
    xmlrecord = ET.Element(recordtag,attributedict)    #make the xml ET node
    if 'BOTSCONTENT' in noderecord:
        # BOTSCONTENT carries the text content of the record element itself.
        xmlrecord.text = noderecord['BOTSCONTENT']
        del noderecord['BOTSCONTENT']
    for key in attributedict.keys():    #remove used fields
        del noderecord[attributemarker+key]
    del noderecord['BOTSID']    #remove 'record' tag
    #generate xml-'fields' in xml-'record'; not sorted
    # Iterate over a copy because field keys (and their attribute keys) are
    # deleted from noderecord inside the loop.
    noderecordcopy = noderecord.copy()
    for key,value in noderecordcopy.items():
        if key not in noderecord or self.ta_info['attributemarker'] in key:    #if field not in outmessage: skip
            continue
        attributedict = {}
        attributemarker = key + self.ta_info['attributemarker']
        for key2,value2 in noderecord.items():
            if key2.startswith(attributemarker):
                attributedict[key2[len(attributemarker):]] = value2
        ET.SubElement(xmlrecord, key,attributedict).text=value    #add xml element to xml record
        for key2 in attributedict.keys():    #remove used fields
            del noderecord[attributemarker+key2]
        del noderecord[key]    #remove xml entity tag
    return xmlrecord
def buildElements(filePath, sourceType): f = codecs.open(filePath, "rt", "utf-8") lines = f.readlines() f.close() spans = [] for line in lines: if line.startswith("#"): continue splits = line.strip("\n").split("\t") identifier = "" category = "" source = sourceType if sourceType == "sner": text, offset, eType = splits elif sourceType == "spec": text, offset, identifier, category, eType = splits elif sourceType == "tags": identifier, documentId, start, end, text, comment = splits eType = identifier.split(":")[0] offset = start + " " + end source = "linnaeus" span = ET.Element('span') span.set("text", text) span.set("offset", [int(x) for x in offset.split()]) #span.set("charOffset", "-".join(span.offset)) span.set("identifier", identifier) span.set("category", category) span.set("type", eType) span.set("source", source) spans.append(span) return spans
def test_cet():
    """cElementTree"""
    # Benchmark: build an HTML-ish table from the module-global `table`
    # rows and serialize it once.
    root = cet.Element('table')
    for record in table:
        rowNode = cet.SubElement(root, 'tr')
        for cellValue in record.values():
            cet.SubElement(rowNode, 'td').text = str(cellValue)
    cet.tostring(root)
def test_cet():
    """cElementTree"""
    # Same benchmark as the single-quoted variant: table -> tr -> td cells.
    tableNode = cet.Element("table")
    for dataRow in table:
        rowElem = cet.SubElement(tableNode, "tr")
        for value in dataRow.values():
            cet.SubElement(rowElem, "td").text = str(value)
    cet.tostring(tableNode)
def add_jms(self, message, queue_name, now):
    """Build the <jms> folder for an outgoing message and store it in
    self.folders["jms"].

    Always emits Dst/Tms/Dlv children; Exp, Pri and Cid are appended only
    when the corresponding message attribute is truthy.
    """
    jms = etree.Element("jms")
    dst = etree.Element("Dst")
    tms = etree.Element("Tms")
    dlv = etree.Element("Dlv")
    for child in (dst, tms, dlv):
        jms.append(child)
    tms.text = unicode(now)
    dst.text = u"queue:///" + queue_name
    dlv.text = unicode(message.jms_delivery_mode)
    # Optional children: (guard value, tag, lazy text factory, log format).
    # The factories are lambdas so expiry arithmetic only runs when needed.
    optionalParts = [
        (message.jms_expiration, "Exp",
         lambda: unicode(now + message.jms_expiration), "jms.Exp [%r]"),
        (message.jms_priority, "Pri",
         lambda: unicode(message.jms_priority), "jms.Pri [%r]"),
        (message.jms_correlation_id, "Cid",
         lambda: unicode(message.jms_correlation_id), "jms.Cid [%r]"),
    ]
    for present, tagName, makeText, logFmt in optionalParts:
        if present:
            node = etree.Element(tagName)
            node.text = makeText()
            self.logger.log(TRACE1, logFmt % node.text)
            jms.append(node)
    self.folders["jms"] = jms
def test_kid_et():
    """Kid template + cElementTree"""
    # Build the table element, then hand it to the Kid template to render.
    root = cet.Element('table')
    for record in table:
        rowNode = cet.SubElement(root, 'tr')
        for cellValue in record.values():
            cet.SubElement(rowNode, 'td').text = str(cellValue)
    kid_tmpl2.table = root
    kid_tmpl2.serialize(output='html')
def test_kid_et():
    """Kid template + cElementTree"""
    # Duplicate benchmark variant; same table -> Kid serialization flow.
    tableNode = cet.Element("table")
    for dataRow in table:
        rowElem = cet.SubElement(tableNode, "tr")
        for value in dataRow.values():
            cet.SubElement(rowElem, "td").text = str(value)
    kid_tmpl2.table = tableNode
    kid_tmpl2.serialize(output="html")
def labelSVG(self):
    """Render this arc's label as [white background <rect>, <text>] nodes."""
    background = ET.Element("rect")
    for attr, key in (("x", "recx"), ("y", "recy"),
                      ("width", "recw"), ("height", "rech")):
        background.set(attr, strint(self.param[key]))
    background.set("style", "fill:white;")  # stroke:black intentionally disabled
    label = ET.Element("text")
    label.set("systemlanguage", "en")
    label.set("x", strint(self.param['txtX']))
    label.set("y", strint(self.param['txtY']))
    label.set("txt", self.type)
    label.text = self.type
    label.set("style", ";".join("%s:%s" % pair for pair in self.labelStyle().items()))
    return [background, label]
def _map_custom_class(self, obj, mappings, ns): """ Fill in the missing attributes of Python objects and make it look to the rest of XMLConfig as if they already were in the XML config file. """ for class_name in mappings: tag_no_ns = obj.tag.replace(ns, "") if class_name == tag_no_ns: obj.set("class", mappings[class_name]) constructor_arg = etree.Element("%s%s" % (ns, "constructor-arg")) value = etree.Element("%s%s" % (ns, "value")) value.text = obj.text obj.append(constructor_arg) constructor_arg.append(value) obj.text = "" break else: self.logger.warning("No matching type found for object %s" % obj)
def add_usr(self, message):
    """Collect the non-reserved attributes of *message* into a <usr> folder.

    When the message carries no user attributes, self.folders is left
    untouched.
    """
    user_attrs = set(dir(message)) - reserved_attributes
    self.logger.log(TRACE1, "user_attrs [%s]" % user_attrs)
    if not user_attrs:
        return
    usr = etree.Element("usr")
    for attrName in user_attrs:
        attrValue = getattr(message, attrName)
        # Some values are integers, e.g. delivery_mode; only strings get escaped.
        if isinstance(attrValue, basestring):
            attrValue = escape(attrValue)
        attrNode = etree.Element(unicode(attrName))
        attrNode.text = unicode(attrValue)
        usr.append(attrNode)
    self.folders["usr"] = usr
def generate_xml(troves, label):
    """Build a <Packages label=...> document describing *troves*.

    Each trove becomes a <Package> with <name> and <version> children plus
    one child per metadata entry; the "categories" entry expands into one
    <category lang="en"> child per category.

    Bug fixed: the categories branch previously appended ``node`` (undefined
    on first use, or a stale element from the non-category branch) instead
    of the freshly built ``cat_node``.
    """
    document = cElementTree.Element("Packages", label=label)
    for trove in troves:
        name = trove.getName()
        version = trove.getVersion().trailingRevision().asString()
        meta = trove.getMetadata()
        package = cElementTree.Element("Package")
        node_name = cElementTree.Element("name")
        node_name.text = name
        node_version = cElementTree.Element("version")
        node_version.text = version
        for i in [node_name, node_version]:
            package.append(i)
        for key, value in meta.items():
            # Skip missing metadata (None object or the string "None").
            if value is not None and value != "None":
                if key == "categories":
                    for cat in value:
                        cat_node = cElementTree.Element("category", lang="en")
                        cat_node.text = cat
                        package.append(cat_node)  # was: package.append(node)
                else:
                    node = cElementTree.Element(key, lang="en")
                    node.text = value
                    package.append(node)
        document.append(package)
    return document
def makenode(tag, content):
    """Recursively convert a Python value into an ElementTree node.

    Strings become element text; lists become a pluralised wrapper tag with
    one recursively built child per item; dicts map each key to a child
    element; any other truthy value is repr()'d; falsy content yields an
    empty element.
    """
    node = ET.Element(tag)
    if not content:
        # Falsy content (None, '', [], {}, 0): leave the element empty.
        pass
    elif isinstance(content, basestring):
        node.text = content
    elif isinstance(content, list):
        node.tag = tag + 's'  # pluralise the wrapper tag for collections
        for item in content:
            node.append(makenode(tag, item))
    elif isinstance(content, dict):
        for childTag, childContent in content.items():
            node.append(makenode(childTag, childContent))
    else:
        node.text = repr(content)
    return node
def toSVG(self):
    """Render this token's text lines as SVG <text> nodes.

    Lines are emitted bottom-up starting at self.y, stepping upward by
    fontSize + lineSep, so the main text (self.txt) ends up on the lowest
    line and extra lines stack above it.
    """
    nodes = []
    lineY = self.y
    for lineText in reversed([self.txt] + self.otherLines):
        textNode = ET.Element("text")
        textNode.set("systemLanguage", "en")
        textNode.set("x", strint(self.x))
        textNode.set("y", strint(lineY))
        textNode.set("style", ";".join("%s:%s" % pair for pair in self.style().items()))
        textNode.text = lineText
        nodes.append(textNode)
        lineY -= SVGOptions.fontSize + SVGOptions.lineSep
    return nodes
def _getEntity(self, line, tag): try: before, entityText, after = re.split( r'<' + tag + '>|</' + tag + '>', line) except ValueError as e: print "ValueError in line '" + line + "' for tag", tag raise e begin = len(before) end = len(before) + len(entityText) return ET.Element( "entity", { "text": entityText, "type": "entity", "given": "True", "charOffset": str(begin) + "-" + str(end), "id": self.id + "." + tag })
def loadKeys(fname=None, force_=False):
    """Load API keys from *fname* (default ~/.isbndbkeys) into module globals.

    Creates an empty <Keys/> file when none exists. Keys whose construction
    fails are retried with force_=True when *force_* is set, otherwise
    silently skipped.
    """
    global _KEYS, _KEYFILE
    if not fname:
        fname = os.path.expanduser('~/.isbndbkeys')
    if not os.path.exists(fname):
        emptyRoot = ElementTree.Element('Keys')
        ElementTree.ElementTree(emptyRoot).write(fname)
    _KEYFILE = fname
    _KEYS = dict()
    for keyElem in ElementTree.parse(fname).findall('key'):
        keyName = keyElem.get("name")
        try:
            _KEYS[keyName] = Key(keyElem.text, keyName)
        except:  # NOTE(review): bare except kept — original deliberately best-effort
            if force_:
                _KEYS[keyName] = Key(keyElem.text, keyName, force_=True)
def _node2xmlfields(self, noderecord):
    ''' fields in a node are written to xml fields; output is sorted according to grammar '''
    #first generate the xml-'record'
    #~ print 'record',noderecord['BOTSID']
    attributedict = {}
    recordtag = noderecord['BOTSID']
    #attributemarker is a marker in the fieldname used to find out if field is an attribute of either xml-'record' or xml-element
    attributemarker = recordtag + self.ta_info['attributemarker']
    #~ print ' rec_att_mark',attributemarker
    for key, value in noderecord.items():
        #find attributes belonging to xml-'record' and store in attributedict
        if key.startswith(attributemarker):
            #~ print ' record attribute',key,value
            attributedict[key[len(attributemarker):]] = value
    xmlrecord = ET.Element(recordtag, attributedict)  #make the xml ET node
    if 'BOTSCONTENT' in noderecord:
        #BOTSCONTENT is used to store the value/text of the xml-record itself.
        xmlrecord.text = noderecord['BOTSCONTENT']
        del noderecord['BOTSCONTENT']
    for key in attributedict.keys():  #remove used fields
        del noderecord[attributemarker + key]
    del noderecord['BOTSID']  #remove 'record' tag
    #generate xml-'fields' in xml-'record'; sort these by looping over records definition
    for field_def in self.defmessage.recorddefs[recordtag]:  #loop over fields in 'record'
        if field_def[ID] not in noderecord:  #if field not in outmessage: skip
            continue
        #~ print ' field',field_def
        attributedict = {}
        attributemarker = field_def[ID] + self.ta_info['attributemarker']
        #~ print ' field_att_mark',attributemarker
        for key, value in noderecord.items():
            if key.startswith(attributemarker):
                # BUG FIX: this was a live debug print spamming stdout on every
                # matching attribute; disabled to match the surrounding
                # #~ print convention.
                #~ print ' field attribute', key, value
                attributedict[key[len(attributemarker):]] = value
        ET.SubElement(xmlrecord, field_def[ID], attributedict).text = noderecord[field_def[ID]]  #add xml element to xml record
        for key in attributedict.keys():  #remove used fields
            del noderecord[attributemarker + key]
        del noderecord[field_def[ID]]  #remove xml entity tag
    return xmlrecord
def _getInteraction(self, relType, e1, e2, directed, count, eMap, relFrom=None, relTo=None): if relFrom == None: relFrom = e1 if relTo == None: relFrom = e2 attrs = { "id": self.id + ".i" + str(count), "type": relType, "directed": str(directed), "e1": eMap[e1].get("id"), "e2": eMap[e2].get("id") } if relFrom != "": attrs["from"] = relFrom if relTo != "": attrs["to"] = relTo return ET.Element("interaction", attrs)
def duplicateFlat(sourceEnt, targetEnt, entitiesById, sentencesById, interactionsByEntity):
    """Duplicate onto *targetEnt* every interaction pointing at *sourceEnt*.

    Used for Equiv expansion: each copied interaction keeps e1 but has its
    e2 retargeted, is marked directed with notes="Equiv", and is inserted
    into the owning sentence.
    """
    srcId = sourceEnt.get("id")
    tgtId = targetEnt.get("id")
    if srcId not in interactionsByEntity:
        return
    for interaction in interactionsByEntity[srcId]:
        firstArg = interaction.get("e1")
        # only named entities are duplicated
        assert interaction.get("e2") == srcId, (srcId, tgtId)
        sentence = sentencesById[interaction.get("id").rsplit(".", 1)[0]]
        # Create new interaction (or pair) element retargeted at the duplicate
        clone = ElementTree.Element(interaction.tag)
        clone.set("e2", tgtId)
        clone.set("e1", firstArg)
        clone.set("directed", "True")
        clone.set("notes", "Equiv")
        clone.set("type", interaction.get("type"))
        clone.set("origId", interaction.get("origId"))
        insertInteraction(sentence, clone)
# From cmqc.h _WMQ_MQFMT_RF_HEADER_2 = "MQHRF2 " # MQRFH_NO_FLAGS_WIRE is in cmqc.h _WMQ_MQRFH_NO_FLAGS_WIRE_FORMAT = "\x00\x00\x00\x00" # Java documentation says "214748364.7 seconds". _WMQ_MAX_EXPIRY_TIME = 214748364.7 _WMQ_ID_PREFIX = "ID:" # In current implementation, an mcd JMS folder is constant for every message # sent, so let's build it here. _mcd = etree.Element("mcd") _msd = etree.Element("Msd") _mcd.append(_msd) # For now, it's always a TextMessage _msd.text = "jms_text" _msgbody = etree.Element("msgbody") _msgbody.set("xmlns:xsi", "dummy") # We're using a dummy namespace _msgbody.set("xsi:nil", "true") _mcd.append(_msgbody) # Clean up namespace. del(_msd, _msgbody)
def convertDDI(outDir, downloadDir=None, redownload=False, makeIntermediateFiles=True, debug=False):
    """Download the DDI'11 (DrugDDI) corpus and convert it into Interaction XML.

    Splits the original training data 3:1 into train/devel, renames train ids
    that collide with test ids, merges in the test set and MTMX data, inserts
    TEES-preparsed analyses and finally divides the corpus into set files
    under *outDir*. Python 2 only (print-chevron syntax, iteritems,
    tuple-parameter lambda).
    """
    cwd = os.getcwd()
    if not os.path.exists(outDir):
        os.makedirs(outDir)
    os.chdir(outDir)
    logFileName = os.path.join(outDir, "DDI11-conversion-log.txt")
    Stream.openLog(logFileName)
    print >> sys.stderr, "=======================", "Converting DDI'11 corpus", "======================="
    corpusDir = outDir + "/DDI11-original"
    Utils.Download.downloadAndExtract(Settings.URL["DDI11_CORPUS"], corpusDir, downloadDir)
    bigfileName = os.path.join(outDir, "DDI11")
    #oldXML = ETUtils.ETFromObj(bigfileName+".xml")
    trainUnified = corpusDir + "/train"
    trainMTMX = corpusDir + "/train_MTMX"
    testUnified = corpusDir + "/test"
    testMTMX = corpusDir + "/test_MTMX"
    # Load main documents
    tempdir = tempfile.mkdtemp()
    print >> sys.stderr, "Temporary files directory at", tempdir
    documents, docById, docCounts = loadDocs(trainUnified)
    # Divide training data into a train and devel set.
    # Documents are sorted by their counts so each group of four spreads
    # comparable documents: three to train, one to devel.
    sortedDocCounts = sorted(docCounts.iteritems(), key=lambda (k, v): (v, k), reverse=True)
    datasetCounts = {"train": [0, 0], "devel": [0, 0], "test": [0, 0]}
    for i in range(0, len(sortedDocCounts) - 3, 4):
        for j in [0, 1]:
            docById[sortedDocCounts[i + j][0]].set("set", "train")
            datasetCounts["train"][0] += sortedDocCounts[i + j][1][0]
            datasetCounts["train"][1] += sortedDocCounts[i + j][1][1]
        docById[sortedDocCounts[i + 2][0]].set("set", "train")  #docById[sortedDocCounts[i+2][0]].set("set", "devel")
        docById[sortedDocCounts[i + 3][0]].set("set", "devel")  #docById[sortedDocCounts[i+3][0]].set("set", "test")
        datasetCounts["train"][0] += sortedDocCounts[i + 2][1][0]  #datasetCounts["devel"][0] += sortedDocCounts[i+2][1][0]
        datasetCounts["train"][1] += sortedDocCounts[i + 2][1][1]  #datasetCounts["devel"][1] += sortedDocCounts[i+2][1][1]
        datasetCounts["devel"][0] += sortedDocCounts[i + 3][1][0]  #datasetCounts["test"][0] += sortedDocCounts[i+3][1][0]
        datasetCounts["devel"][1] += sortedDocCounts[i + 3][1][1]  #datasetCounts["test"][1] += sortedDocCounts[i+3][1][1]
    for document in documents:  # leftovers (count not divisible by four) default to train
        if document.get("set") == None:
            document.set("set", "train")
    # Print division results
    print >> sys.stderr, datasetCounts
    for key in datasetCounts.keys():
        if datasetCounts[key][1] != 0:
            print key, datasetCounts[key][0] / float(datasetCounts[key][1])
        else:
            print key, datasetCounts[key][0], "/", float(datasetCounts[key][1])
    # Some of the train and test ids overlap. Let's change the train set ids, because test set ones are needed
    # for the final evaluation.
    changeIdCount = 1000
    for trainId in [
        'DrugDDI.d312', 'DrugDDI.d316', 'DrugDDI.d332', 'DrugDDI.d334',
        'DrugDDI.d337', 'DrugDDI.d342', 'DrugDDI.d349', 'DrugDDI.d354',
        'DrugDDI.d373', 'DrugDDI.d379', 'DrugDDI.d383', 'DrugDDI.d388',
        'DrugDDI.d392', 'DrugDDI.d396', 'DrugDDI.d398', 'DrugDDI.d409',
        'DrugDDI.d411', 'DrugDDI.d415', 'DrugDDI.d425', 'DrugDDI.d430',
        'DrugDDI.d433', 'DrugDDI.d448', 'DrugDDI.d450', 'DrugDDI.d452',
        'DrugDDI.d462', 'DrugDDI.d467', 'DrugDDI.d470', 'DrugDDI.d474',
        'DrugDDI.d480', 'DrugDDI.d482', 'DrugDDI.d485', 'DrugDDI.d492',
        'DrugDDI.d494', 'DrugDDI.d496', 'DrugDDI.d498', 'DrugDDI.d500',
        'DrugDDI.d503', 'DrugDDI.d506', 'DrugDDI.d518', 'DrugDDI.d523',
        'DrugDDI.d528', 'DrugDDI.d535', 'DrugDDI.d539', 'DrugDDI.d552',
        'DrugDDI.d554', 'DrugDDI.d558', 'DrugDDI.d561', 'DrugDDI.d570',
        'DrugDDI.d578'
    ]:
        newId = "DrugDDI.d" + str(changeIdCount)
        print >> sys.stderr, "Changing train/devel id", trainId, "to", newId
        # Rewrite every attribute value that embeds the old document id.
        for element in docById[trainId].getiterator():
            for attrName, attrValue in element.attrib.iteritems():
                if trainId in attrValue:
                    element.set(attrName, attrValue.replace(trainId, newId))
        docById[newId] = docById[trainId]
        del docById[trainId]
        changeIdCount += 1
    # If test set exists, load it, too
    if testUnified != None:
        testDocuments, testDocById, testDocCounts = loadDocs(testUnified)
        for document in testDocuments:
            document.set("set", "test")
        documents = documents + testDocuments
        overlappingIds = []
        for key in docById:
            if key in testDocById:
                overlappingIds.append(key)
        for key in docById:
            assert key not in testDocById, (key, docById[key].get("origId"),
                                            testDocById[key].get("origId"),
                                            sorted(docById.keys()),
                                            sorted(testDocById.keys()),
                                            sorted(overlappingIds))
        docById.update(testDocById)
    # Add all documents into one XML
    xmlTree = ET.ElementTree(ET.Element("corpus"))
    root = xmlTree.getroot()
    root.set("source", "DDI11")
    for document in documents:
        root.append(document)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents-notfixed.xml")
    xml = xmlTree
    print >> sys.stderr, "Fixing DDI XML"
    fixEntities(xml)
    convertToInteractions(xml)
    # Add MTMX
    if trainMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(trainMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if testMTMX != None:
        inDir = Utils.Download.getTopDir(
            tempdir,
            Utils.Download.downloadAndExtract(testMTMX, tempdir, outDir + "/DDI11-original"))
        DDITools.addMTMX(xml, inDir)
    if makeIntermediateFiles:
        ETUtils.write(root, bigfileName + "-documents.xml")
    print >> sys.stderr, "---------------", "Inserting TEES-generated analyses", "---------------"
    Utils.Download.downloadAndExtract(Settings.URL["TEES_PARSES"],
                                      os.path.join(Settings.DATAPATH, "TEES-parses"),
                                      downloadDir, redownload=redownload)
    extractedFilename = os.path.join(Settings.DATAPATH, "TEES-parses") + "/DDI11"
    print >> sys.stderr, "Making sentences"
    Tools.SentenceSplitter.makeSentences(xml, extractedFilename, None)
    print >> sys.stderr, "Inserting McCC parses"
    Tools.BLLIPParser.insertParses(xml, extractedFilename, None,
                                   extraAttributes={"source": "TEES-preparsed"})
    print >> sys.stderr, "Inserting Stanford conversions"
    Tools.StanfordParser.insertParses(xml, extractedFilename, None,
                                      extraAttributes={"stanfordSource": "TEES-preparsed"})
    print >> sys.stderr, "Protein Name Splitting"
    splitTarget = "McCC"
    #ProteinNameSplitter.mainFunc(xml, None, splitTarget, splitTarget, "split-"+splitTarget, "split-"+splitTarget)
    ProteinNameSplitter.mainFunc(xml, None, splitTarget, removeOld=True)
    print >> sys.stderr, "Head Detection"
    #xml = FindHeads.findHeads(xml, "split-"+splitTarget, tokenization=None, output=None, removeExisting=True)
    xml = FindHeads.findHeads(xml, splitTarget, tokenization=None, output=None, removeExisting=True)
    print >> sys.stderr, "Dividing into sets"
    Utils.InteractionXML.DivideSets.processCorpus(xml, outDir, "DDI11", ".xml")
    Stream.closeLog(logFileName)
    if not debug:
        print >> sys.stderr, "Removing temporary directory", tempdir
        shutil.rmtree(tempdir)
    os.chdir(cwd)
def process(self, directed, negatives):
    """Convert this parsed sentence into a <document> element.

    Strips the <e1>/<e2> markers from the text, validates entity offsets,
    then emits interactions. For non-"Other" relations of form
    Type(eX,eY), forward/reverse interaction types are derived per the
    *negatives* policy ("INCLUDE" keeps "neg" pairs, "REVERSE_POS" merges
    the direction into the type name).
    """
    # Build the entities
    for tag in ("e1", "e2"):
        self.entities.append(self._getEntity(self.text, tag))
        # Remove this tag's markers so the next offset is computed on clean text.
        self.text = self.text.replace("<" + tag + ">", "").replace("</" + tag + ">", "")
    # Check entity offsets
    for entity in self.entities:
        begin, end = [int(x) for x in entity.get("charOffset").split("-")]
        assert entity.get("text") == self.text[begin:end], (
            entity.get("text"), self.text, self.text[begin:end], [begin, end])
    assert len(self.entities) == 2
    eMap = {"e1": self.entities[0], "e2": self.entities[1]}
    for key in eMap:  # Check that e1 == e1 and e2 == e2
        assert eMap[key].get("id").endswith("." + key)
    # Build the sentence
    docElem = ET.Element("document", {
        "id": self.corpusId + ".d" + self.origId,
        "set": self.setName
    })
    sentElem = ET.SubElement(
        docElem, "sentence", {
            "id": self.id,
            "charOffset": "0-" + str(len(self.text)),
            "text": self.text,
            "origId": self.origId
        })
    sentElem.set("relation", self.relation)
    if self.comment != None and self.comment != "":
        sentElem.set("comment", self.comment)
    for entity in self.entities:
        sentElem.append(entity)
    # Determine interaction types per direction
    relFrom, relTo = "", ""
    if self.relation == "Other":
        # "Other" has no direction: emit a single undirected-style interaction.
        sentElem.append(
            self._getInteraction(self.relation, "e1", "e2", directed, 0,
                                 eMap, relFrom, relTo))
    else:
        # Relation looks like "Type(eX,eY)".
        relType, rest = self.relation.strip(")").split("(")
        relFrom, relTo = rest.split(",")
        reverse = (relFrom == "e2" and relTo == "e1")
        if not reverse:
            assert relFrom == "e1" and relTo == "e2"
            forwardType = self.mergeType(relType, relFrom, relTo) if (
                negatives == "REVERSE_POS") else relType
            reverseType = self.mergeType(relType, relTo, relFrom) if (
                negatives == "REVERSE_POS") else "neg"
        else:
            forwardType = self.mergeType(relType, relFrom, relTo) if (
                negatives == "REVERSE_POS") else "neg"
            reverseType = self.mergeType(relType, relTo, relFrom) if (
                negatives == "REVERSE_POS") else relType
        # Build the interactions
        if directed:
            # "neg" interactions are kept only under the INCLUDE policy.
            if forwardType != "neg" or negatives == "INCLUDE":
                sentElem.append(
                    self._getInteraction(forwardType, "e1", "e2", directed,
                                         0, eMap, "e1", "e2"))
            if reverseType != "neg" or negatives == "INCLUDE":
                sentElem.append(
                    self._getInteraction(reverseType, "e2", "e1", directed,
                                         1, eMap, "e2", "e1"))
        else:
            sentElem.append(
                self._getInteraction(self.relation, "e1", "e2", directed, 0,
                                     eMap, relFrom, relTo))
    return docElem
def eventsToNewXML(events):
    """Build a new Interaction XML corpus from static-relation events.

    *events* maps a sentence id to a list of event dicts; each distinct
    sentence becomes one document/sentence pair, entities are deduplicated
    by character offset, and every event yields one interaction.
    """
    xml = ET.Element("corpus")
    xml.set("source", "Static Relations")
    docCount = 0
    sentenceById = {}
    for sentenceId in sorted(events.keys()):
        entities = []
        interactions = []
        entityByOffset = {}
        for event in events[sentenceId]:
            #print event
            if sentenceId not in sentenceById:
                # First event for this sentence: create its document wrapper.
                document = ET.SubElement(xml, "document")
                document.set("id", "SR.d"+str(docCount))
                document.set("origId", sentenceId)
                document.set("set", event["dataSet"])
                sentence = ET.SubElement(document, "sentence")
                sentence.set("id", "SR.d"+str(docCount)+".s"+str(docCount))
                sentence.set("origId", sentenceId)
                sentence.set("text", event["text"])
                sentence.set("charOffset", "0-"+str(len(event["text"])-1))
                docCount += 1
                sentenceById[sentenceId] = sentence
            else:
                sentence = sentenceById[sentenceId]
                # All events of a sentence must agree on the sentence text.
                assert sentence.get("text") == event["text"], (sentence.get("text"), event["text"])
            # Add entities (event fields are "offset<TAB>text" strings;
            # offsets here are end-inclusive, hence the +1 slicing).
            e1Offset = event["entity"].split("\t")[0]
            e2Offset = event["namedEntity"].split("\t")[0]
            if e1Offset not in entityByOffset:
                e1 = ET.Element("entity")
                e1.set("text", event["entity"].split("\t")[1].strip())
                e1.set("id", sentence.get("id")+".e"+str(len(entities)))
                offset = getOffset(event["entity"].split("\t")[0])
                assert sentence.get("text")[offset[0]:offset[1]+1] == e1.get("text"), (event, sentence.get("text"), e1.get("text"))
                e1.set("charOffset", str(offset[0]) + "-" + str(offset[1]))
                e1.set("isName", "False")
                e1.set("type", "Entity")
                entities.append(e1)
                entityByOffset[e1Offset] = e1
            else:
                e1 = entityByOffset[e1Offset]
            if e2Offset not in entityByOffset:
                e2 = ET.Element("entity")
                e2.set("text", event["namedEntity"].split("\t")[1].strip())
                e2.set("id", sentence.get("id")+".e"+str(len(entities)))
                offset = getOffset(event["namedEntity"].split("\t")[0])
                assert sentence.get("text")[offset[0]:offset[1]+1] == e2.get("text"), (event, sentence.get("text"), e2.get("text"))
                e2.set("charOffset", str(offset[0]) + "-" + str(offset[1]))
                e2.set("isName", "True")
                e2.set("type", "Protein")
                entities.append(e2)
                entityByOffset[e2Offset] = e2
            else:
                e2 = entityByOffset[e2Offset]
            # Add interactions
            interaction = ET.Element("interaction")
            interaction.set("id", sentence.get("id")+".i"+str(len(interactions)))
            interaction.set("origId", event["id"])
            interaction.set("type", event["eventType"])
            interaction.set("e1", e1.get("id"))
            interaction.set("e2", e2.get("id"))
            interactions.append(interaction)
        # Attach the collected children after all events are processed.
        for entity in entities:
            sentence.append(entity)
        for interaction in interactions:
            sentence.append(interaction)
    return xml
def convertChemProt(inDirs=None, setNames=None, outPath=None, goldTestSet=True, downloadDir=None, extractDir=None, redownload=False, debug=False):
    """Convert the BioCreative VI ChemProt (CP17) corpus to Interaction XML.

    Downloads TRAIN/DEVEL/TEST(-GOLD) archives when *inDirs* is not given,
    reads the per-dataset abstracts/entities/relations TSV files, and builds
    a single <corpus source="CP17"> tree. Entities whose offsets do not
    align with the document text, and relations whose arguments are missing,
    are counted as errors and skipped. Python 2 only (print-chevron syntax).

    Returns an ElementTree of the corpus; also writes it to *outPath* when
    given.
    """
    tempDir = None
    if inDirs == None:
        print >> sys.stderr, "---------------", "Downloading ChemProt files", "---------------"
        if extractDir == None:
            tempDir = tempfile.mkdtemp()
        inDirs = []
        for setName in ("TRAIN", "DEVEL", "TEST"):
            # The gold-annotated test set is published under a separate URL.
            if goldTestSet and setName == "TEST":
                setName = "TEST_GOLD"
            if Settings.URL["CP17_" + setName] != None:
                currentExtractDir = extractDir if extractDir else tempDir
                currentExtractDir = os.path.join(currentExtractDir, setName.lower())
                inDirs.append(
                    downloadFile(Settings.URL["CP17_" + setName], downloadDir,
                                 currentExtractDir, redownload))
    print >> sys.stderr, "Reading ChemProt corpus from input", inDirs, "using dataset mapping", setNames
    dataSets = OrderedDict()
    for inDir in inDirs:
        print >> sys.stderr, "Reading input directory", inDir
        filenames = os.listdir(inDir)
        filetypes = ["_abstracts", "_entities", "_relations"]
        # Collect the file paths for the data types
        dirDataSets = set()
        for filename in filenames:
            if not (filename.endswith(".tsv") and any([x in filename for x in filetypes])):
                continue
            dataSetId, dataType = filename.replace("_gs", "").rsplit("_", 1)
            if setNames != None:
                dataSetId = setNames.get(dataSetId, dataSetId)
            dirDataSets.add(dataSetId)
            dataType = dataType.split(".")[0]
            if dataSetId not in dataSets:
                dataSets[dataSetId] = {}
            assert dataType not in dataSets[dataSetId]
            dataSets[dataSetId][dataType] = os.path.join(inDir, filename)
        print >> sys.stderr, "Found ChemProt datasets", list(dirDataSets), "at", inDir
    print >> sys.stderr, "Read datasets:", dataSets.keys()
    # Build the Interaction XML
    print >> sys.stderr, "Converting to Interaction XML"
    corpusName = "CP17"
    corpus = ET.Element("corpus", {"source": corpusName})
    counts = defaultdict(int)
    docById = {}
    entityById = {}
    entitiesByDoc = {}
    docsWithErrors = set()
    for dataSetId in sorted(dataSets.keys()):
        # Snapshot counts so per-dataset deltas can be reported below.
        prevCounts = copy.copy(counts)
        print >> sys.stderr, "---", "Building elements for dataset", dataSetId, "---"
        dataSet = dataSets[dataSetId]
        counts["sets"] += 1
        with open(dataSet["abstracts"], "rt") as f:
            print >> sys.stderr, "Adding document elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f, delimiter="\t", fieldnames=["id", "title", "abstract"],
                    quoting=csv.QUOTE_NONE):
                document = ET.Element(
                    "document", {
                        "id": corpusName + ".d" + str(counts["documents"]),
                        "origId": row["id"],
                        "set": dataSetId
                    })
                document.set("text", row["title"] + " " + row["abstract"])
                document.set("titleOffset", Range.tuplesToCharOffset((0, len(row["title"]))))
                if document.get("origId") in docById:
                    # Same abstract may appear in several datasets; must be identical.
                    assert document.get("text") == docById[document.get("origId")].get("text")
                    assert document.get("titleOffset") == docById[document.get("origId")].get("titleOffset")
                    counts["duplicate-documents"] += 1
                else:
                    corpus.append(document)
                    docById[document.get("origId")] = document
                    counts["documents"] += 1
        with open(dataSet["entities"], "rt") as f:
            print >> sys.stderr, "Adding entity elements for dataset", dataSetId
            for row in UnicodeDictReader(
                    f, delimiter="\t",
                    fieldnames=["docId", "id", "type", "begin", "end", "text"],
                    quoting=csv.QUOTE_NONE):
                document = docById[row["docId"]]
                assert row["type"] in ("CHEMICAL", "GENE-Y", "GENE-N")
                # Check for duplicate entities
                if row["docId"] not in entitiesByDoc:
                    entitiesByDoc[row["docId"]] = set()
                assert row["id"] not in entitiesByDoc[row["docId"]]
                entitiesByDoc[row["docId"]].add(row["id"])
                # Determine the offset
                offset = (int(row["begin"]), int(row["end"]))
                docSpan = document.get("text")[offset[0]:offset[1]]
                if docSpan == row["text"]:
                    entity = ET.SubElement(
                        document, "entity", {
                            "id": document.get("id") + ".e" +
                            str(len([x for x in document.findall("entity")]))
                        })
                    entity.set("given", "True")
                    entity.set("origId", row["id"])
                    entity.set("type", row["type"].split("-")[0])
                    entity.set("normalized", "True" if row["type"].endswith("-Y") else "False")
                    entity.set("charOffset", Range.tuplesToCharOffset((offset[0], offset[1])))
                    entity.set("text", row["text"])
                    if row["docId"] not in entityById:
                        entityById[row["docId"]] = {}
                    assert entity.get("origId") not in entityById[row["docId"]]
                    entityById[row["docId"]][entity.get("origId")] = entity
                    counts["entities"] += 1
                else:
                    print >> sys.stderr, "Alignment error in document", row["docId"], (offset, docSpan, row)
                    counts["entities-error"] += 1
                    docsWithErrors.add(row["docId"])
        if "relations" in dataSet:
            print >> sys.stderr, "Adding relation elements for dataset", dataSetId
            with open(dataSet["relations"], "rt") as f:
                for row in UnicodeDictReader(f, delimiter="\t", fieldnames=[
                        "docId", "group", "groupEval", "type", "arg1", "arg2"
                ], quoting=csv.QUOTE_NONE):
                    # Strip the "Arg1:"/"Arg2:" prefixes from the argument ids.
                    for argId in ("1", "2"):
                        assert row["arg" + argId].startswith("Arg" + argId + ":")
                        row["arg" + argId] = row["arg" + argId][5:]
                    document = docById[row["docId"]]
                    e1 = entityById[row["docId"]].get(row["arg1"])
                    e2 = entityById[row["docId"]].get(row["arg2"])
                    if e1 != None and e2 != None:
                        interaction = ET.SubElement(
                            document, "interaction", {
                                "id": document.get("id") + ".i" + str(
                                    len([x for x in document.findall("interaction")]))
                            })
                        interaction.set("directed", "True")
                        interaction.set("type", row["group"])
                        interaction.set("relType", row["type"])
                        row["groupEval"] = row["groupEval"].strip()
                        assert row["groupEval"] in ("Y", "N")
                        interaction.set("evaluated", "True" if row["groupEval"] == "Y" else "False")
                        interaction.set("e1", e1.get("id"))
                        interaction.set("e2", e2.get("id"))
                        counts["interactions"] += 1
                    else:
                        # One or both argument entities were skipped earlier.
                        counts["interaction-error"] += 1
                        docsWithErrors.add(row["docId"])
        else:
            print >> sys.stderr, "No relations for dataset", dataSetId
        print >> sys.stderr, "dataset", dataSetId, {
            x: counts[x] - prevCounts.get(x, 0)
            for x in counts if counts[x] - prevCounts.get(x, 0) > 0
        }
    if len(docsWithErrors) > 0:
        counts["documents-with-errors"] = len(docsWithErrors)
    print >> sys.stderr, "---", "All Datasets Done", "---"
    print >> sys.stderr, "ChemProt conversion:", dict(counts)
    if tempDir != None and not debug:
        print >> sys.stderr, "Removing temporary directory", tempDir
        shutil.rmtree(tempDir)
    if outPath != None:
        ETUtils.write(corpus, outPath)
    return ET.ElementTree(corpus)
def setDefaultElement(parent, name):
    """Return the child element *name* of *parent*, creating and appending
    a new empty one when it does not exist yet."""
    child = parent.find(name)
    if child is None:
        child = ElementTree.Element(name)
        parent.append(child)
    return child