Exemple #1
0
def catenate(input1, input2, output):
    """Merge two interaction XML corpora into a single tree.

    Both inputs are passed through RecalculateIds.recalculateIds.  The call
    for input2 receives the number of "document" children found in the first
    corpus as its last argument (presumably a document-numbering offset so
    that ids do not collide -- TODO confirm against RecalculateIds).  The
    "document" children of the second root are then appended to the first
    root, and the "id" attributes of all entity, interaction, sentence and
    document elements are asserted to be unique across the merged tree.

    If output is not None, the merged root is written with ETUtils.write.
    Returns the tree built from input1, which now holds both corpora.
    """
    print >> sys.stderr, "##### Catenate interaction XML #####"
    firstTree = RecalculateIds.recalculateIds(input1, None, False, 0)
    mergedRoot = firstTree.getroot()
    docCount = len(mergedRoot.findall("document"))
    print >> sys.stderr, "Documents in input 1:", docCount
    secondTree = RecalculateIds.recalculateIds(input2, None, False, docCount)

    print >> sys.stderr, "Appending documents"
    for doc in secondTree.getroot().findall("document"):
        mergedRoot.append(doc)

    print >> sys.stderr, "Validating ids"
    seenIds = set()
    # One shared id pool: an id must be unique across all four element types.
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in mergedRoot.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in seenIds
            seenIds.add(elementId)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(mergedRoot, output)
    return firstTree
Exemple #2
0
def catenateElements(inputs, output):
    """Catenate any number of interaction XML corpora into one tree.

    The previous body referred to input1 and input2, which are not defined
    in this function, so it could not run as written.  This version walks
    the inputs sequence instead.

    Each input is passed through RecalculateIds.recalculateIds together with
    the number of documents merged so far (presumably a document-numbering
    offset that keeps ids unique -- TODO confirm against RecalculateIds).
    The tree of the first input becomes the result; the "document" children
    of every later input are appended to its root.  Afterwards the "id"
    attributes of all entity, interaction, sentence and document elements
    are asserted to be unique across the merged tree.

    inputs -- iterable of corpora accepted by RecalculateIds.recalculateIds
    output -- destination handed to ETUtils.write, or None to skip writing

    Returns the merged tree.  Raises ValueError if inputs is empty.
    """
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    mergedTree = None
    mergedRoot = None
    numDocs = 0
    for index, source in enumerate(inputs):
        corpus = RecalculateIds.recalculateIds(source, None, False, numDocs)
        if mergedTree is None:
            # The first corpus is the container all later documents go into.
            mergedTree = corpus
            mergedRoot = corpus.getroot()
        else:
            print >> sys.stderr, "Appending documents"
            for document in corpus.getroot().findall("document"):
                mergedRoot.append(document)
        numDocs = len(mergedRoot.findall("document"))
        print >> sys.stderr, "Documents after input %d:" % (index + 1), numDocs
    if mergedTree is None:
        raise ValueError("catenateElements needs at least one input")

    print >> sys.stderr, "Validating ids"
    seenIds = set()
    # One shared id pool: an id must be unique across all four element types.
    for tag in ("entity", "interaction", "sentence", "document"):
        for element in mergedRoot.getiterator(tag):
            elementId = element.get("id")
            assert elementId not in seenIds, "duplicate id: %r" % (elementId,)
            seenIds.add(elementId)

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(mergedRoot, output)
    return mergedTree
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    """Move selected documents of a corpus from one set to another.

    The corpus is loaded with ETUtils.ETFromObj.  Every "document" element
    whose "pmid" attribute (or "origId" when "pmid" is absent) is listed in
    docOrigIds must currently have set == sourceSet; its "set" attribute is
    changed to targetSet.

    input      -- corpus object/path accepted by ETUtils.ETFromObj
    output     -- destination handed to ETUtils.write, or None to skip
    docOrigIds -- mutable collection of document ids to move, or None to
                  leave all documents untouched.  NOTE: matched ids are
                  removed from this collection in place, so the caller's
                  object is emptied on success.
    sourceSet  -- value the "set" attribute must have before the move
    targetSet  -- value written to the "set" attribute

    Returns the corpus tree.  Raises AssertionError if a listed document is
    not in sourceSet or if some id in docOrigIds matches no document.

    A former sentence-level branch was removed: it was guarded by a variable
    hard-coded to None and therefore could never execute.
    """
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds is not None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId is None:
                docId = document.get("origId")
            if docId in docOrigIds:
                currentSet = document.get("set")
                assert currentSet == sourceSet, (docId, currentSet)
                document.set("set", targetSet)
                # Consume the id so leftovers reveal ids that matched nothing.
                docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemple #4
0
def mixSets(input, output, docOrigIds, sourceSet, targetSet):
    """Reassign the "set" attribute of selected corpus documents.

    Loads the corpus via ETUtils.ETFromObj and visits every "document"
    element.  A document is selected when its "pmid" attribute, or its
    "origId" attribute if "pmid" is missing, occurs in docOrigIds.  Each
    selected document is required to be in sourceSet and is switched to
    targetSet.

    input      -- corpus object/path accepted by ETUtils.ETFromObj
    output     -- destination handed to ETUtils.write, or None to skip
    docOrigIds -- mutable collection of ids to move, or None for no change.
                  NOTE: ids are removed from it in place as they are
                  matched, so the caller's collection ends up empty.
    sourceSet  -- expected current value of the "set" attribute
    targetSet  -- new value of the "set" attribute

    Returns the corpus tree.  Raises AssertionError when a selected document
    is not in sourceSet, or when ids remain in docOrigIds after the scan.

    The sentence-moving code that used to follow was unreachable (its guard
    variable was assigned None immediately before being tested) and has been
    dropped.
    """
    print >> sys.stderr, "Mixing Sets", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()

    if docOrigIds is not None:
        for document in corpusRoot.getiterator("document"):
            docId = document.get("pmid")
            if docId is None:
                docId = document.get("origId")
            if docId not in docOrigIds:
                continue
            currentSet = document.get("set")
            assert currentSet == sourceSet, (docId, currentSet)
            document.set("set", targetSet)
            # Each id is used once; whatever is left over matched nothing.
            docOrigIds.remove(docId)
        assert len(docOrigIds) == 0, docOrigIds

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Exemple #5
0
def catenateElements(inputs, inputDir):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"
    
    output = {}
    for dataSet in ("devel", "train"):
        root = ET.Element("corpus", {"source":",".join(inputs)})
        tree = ET.ElementTree(root)
        print "Processing corpus dataset", dataSet
        output[dataSet] = tree
        for input in inputs:
            corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml")
            print >> sys.stderr, "Catenating", corpusPath
            if not os.path.exists(corpusPath):
                print "Input", corpusPath, "not found"
                continue
            xml = ETUtils.ETFromObj(corpusPath)
            for document in xml.getiterator("document"):
                root.append(document)
        RecalculateIds.recalculateIds(tree)
    
    return output
Exemple #6
0
def catenateElements(inputs, inputDir):
    print >> sys.stderr, "##### Catenate interaction XML as elements #####"

    output = {}
    for dataSet in ("devel", "train"):
        root = ET.Element("corpus", {"source": ",".join(inputs)})
        tree = ET.ElementTree(root)
        print "Processing corpus dataset", dataSet
        output[dataSet] = tree
        for input in inputs:
            corpusPath = os.path.join(inputDir, input + "-" + dataSet + ".xml")
            print >> sys.stderr, "Catenating", corpusPath
            if not os.path.exists(corpusPath):
                print "Input", corpusPath, "not found"
                continue
            xml = ETUtils.ETFromObj(corpusPath)
            for document in xml.getiterator("document"):
                root.append(document)
        RecalculateIds.recalculateIds(tree)

    return output