def catenate(input1, input2, output):
    """Concatenate two interaction XML corpora into a single tree.

    Both inputs are first passed through RecalculateIds.recalculateIds. The
    second input receives the number of documents found in the first as the
    last argument (presumably the document id offset, so that the two id
    ranges do not collide -- confirm against RecalculateIds). Every
    "document" child of the second root is then appended to the first root,
    and the ids of the merged tree are checked for uniqueness.

    input1 -- first corpus, in any form RecalculateIds.recalculateIds accepts
    input2 -- second corpus, same form
    output -- target handed to ETUtils.write, or None to skip writing

    Returns the object produced by recalculateIds for input1, which now also
    holds the documents of input2.
    Raises AssertionError if two entity, interaction, sentence or document
    elements of the merged tree share an id.
    """
    print >> sys.stderr, "##### Catenate interaction XML #####"
    c1 = RecalculateIds.recalculateIds(input1, None, False, 0)
    c1Root = c1.getroot()
    numDocs = len(c1Root.findall("document"))
    print >> sys.stderr, "Documents in input 1:", numDocs
    c2 = RecalculateIds.recalculateIds(input2, None, False, numDocs)

    print >> sys.stderr, "Appending documents"
    for document in c2.getroot().findall("document"):
        c1Root.append(document)

    print >> sys.stderr, "Validating ids"
    # A single set is shared by all four element types: an id must be unique
    # across the whole corpus, not merely among elements of the same tag.
    seenIds = set()
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in c1Root.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in seenIds, \
                "Duplicate id %r on <%s> element" % (elementId, tag)
            seenIds.add(elementId)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(c1Root, output)
    return c1
def catenateElements(inputs, output):
    """Concatenate any number of interaction XML corpora into a single tree.

    Each input is passed through RecalculateIds.recalculateIds together with
    the number of documents collected so far (presumably the document id
    offset -- confirm against RecalculateIds). The "document" children of
    every input after the first are appended to the root of the first, and
    the ids of the merged tree are checked for uniqueness.

    NOTE(review): this module defines catenateElements again further down
    with the signature (inputs, inputDir); the later definition replaces
    this one when the module is loaded.

    inputs -- iterable of corpora, each in a form recalculateIds accepts
    output -- target handed to ETUtils.write, or None to skip writing

    Returns the object produced by recalculateIds for the first input, which
    now also holds the documents of all other inputs.
    Raises ValueError if inputs is empty, and AssertionError if two entity,
    interaction, sentence or document elements of the merged tree share an id.
    """
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    merged = None
    mergedRoot = None
    numDocs = 0
    for index, source in enumerate(inputs):
        tree = RecalculateIds.recalculateIds(source, None, False, numDocs)
        # findall() returns a list, so appending below cannot disturb it.
        documents = tree.getroot().findall("document")
        print >> sys.stderr, "Documents in input %d:" % (index + 1), len(documents)
        if merged is None:
            # The first input becomes the tree that everything is merged into.
            merged = tree
            mergedRoot = tree.getroot()
        else:
            print >> sys.stderr, "Appending documents"
            for document in documents:
                mergedRoot.append(document)
        numDocs += len(documents)
    if merged is None:
        raise ValueError("catenateElements needs at least one input")

    print >> sys.stderr, "Validating ids"
    # A single set is shared by all four element types: an id must be unique
    # across the whole corpus, not merely among elements of the same tag.
    seenIds = set()
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in mergedRoot.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in seenIds, \
                "Duplicate id %r on <%s> element" % (elementId, tag)
            seenIds.add(elementId)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(mergedRoot, output)
    return merged
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    """Move selected documents of a corpus from one set to another.

    input      -- corpus source handed to ETUtils.ETFromObj
    output     -- target handed to ETUtils.write, or None to skip writing
    docOrigIds -- collection of document identifiers to move, or None to
                  skip the document step. A document is matched on its
                  "pmid" attribute, falling back to "origId". Matched
                  identifiers are removed from this collection, so the
                  caller's object is modified in place.
    sourceSet  -- value the "set" attribute of every matched document must have
    targetSet  -- value written to the "set" attribute of matched documents

    Returns the corpus tree obtained from ETUtils.ETFromObj.
    Raises AssertionError if a matched document is not in sourceSet, or if
    an identifier in docOrigIds was not found in the corpus.
    """
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds != None:
        for document in corpusRoot.getiterator("document"):
            # Prefer the "pmid" attribute; use "origId" when it is absent.
            docId = document.get("pmid")
            if docId == None:
                docId = document.get("origId")
            if docId in docOrigIds:
                assert document.get("set") == sourceSet
                document.set("set", targetSet)
                # Tick the identifier off so leftovers can be detected below.
                docOrigIds.remove(docId)
        # Anything still listed was requested but never seen in the corpus.
        assert len(docOrigIds) == 0, docOrigIds

    # sentenceIds is fixed to None here, so the sentence-level branch below
    # never executes as the code stands.
    sentenceIds = None
    if sentenceIds != None:
        for document in corpusRoot.getiterator("document"):
            removed = []
            for sentence in document.findall("sentence"):
                assert document.get("set") == sourceSet
                sentenceId = sentence.get("id")
                if sentenceId in sentenceIds:
                    # NOTE(review): xml.etree's Element.remove() returns None,
                    # so `removed` would collect None instead of the sentence
                    # -- verify against the ET implementation in use.
                    removed.append(document.remove(sentence))
                    sentenceIds.remove(sentenceId)
            if len(removed) > 0:
                # Copy the attributes of the source document onto a new
                # document that receives the removed sentences.
                newDoc = ET.Element("document")
                for attr in document.attrib:
                    newDoc.set(attr, document.get(attr))
                # NOTE(review): the id is set to None, presumably to be
                # filled in by the id recalculation -- confirm that a None
                # attribute value is handled there.
                newDoc.set("id", None)
                newDoc.set("set", targetSet)
                for sentence in removed:
                    newDoc.append(sentence)
                corpusRoot.append(newDoc)
        # NOTE(review): len() never returns None, so this assertion would
        # always fail if the branch were enabled; `== 0` looks intended.
        assert len(sentenceIds) == None

    # NOTE(review): confirm whether the recalculation is meant to run on
    # every call or only together with the sentence-level branch above.
    RecalculateIds.recalculateIds(corpusTree, onlyWithinSentence=False)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
def catenateElements(inputs, inputDir): print >> sys.stderr, "##### Catenate interaction XML as elements #####" output = {} for dataSet in ("devel", "train"): root = ET.Element("corpus", {"source":",".join(inputs)}) tree = ET.ElementTree(root) print "Processing corpus dataset", dataSet output[dataSet] = tree for input in inputs: corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml") print >> sys.stderr, "Catenating", corpusPath if not os.path.exists(corpusPath): print "Input", corpusPath, "not found" continue xml = ETUtils.ETFromObj(corpusPath) for document in xml.getiterator("document"): root.append(document) RecalculateIds.recalculateIds(tree) return output
def catenateElements(inputs, inputDir): print >> sys.stderr, "##### Catenate interaction XML as elements #####" output = {} for dataSet in ("devel", "train"): root = ET.Element("corpus", {"source": ",".join(inputs)}) tree = ET.ElementTree(root) print "Processing corpus dataset", dataSet output[dataSet] = tree for input in inputs: corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml") print >> sys.stderr, "Catenating", corpusPath if not os.path.exists(corpusPath): print "Input", corpusPath, "not found" continue xml = ETUtils.ETFromObj(corpusPath) for document in xml.getiterator("document"): root.append(document) RecalculateIds.recalculateIds(tree) return output