Ejemplo n.º 1
0
def run(EvaluatorClass,
        inputCorpusFile,
        goldCorpusFile,
        parse,
        tokenization=None,
        target="both",
        entityMatchFunction=compareEntitiesSimple,
        removeIntersentenceInteractions=False):
    """Load a predicted (and optionally a gold) corpus and evaluate them.

    Args:
        EvaluatorClass: evaluator whose ``type`` attribute is either
            ``"binary"`` or ``"multiclass"``; it selects the class-id
            mapping and the negative class id.
        inputCorpusFile: predicted corpus, passed to
            ``CorpusElements.loadCorpus``.
        goldCorpusFile: gold corpus, or None to skip loading it (None is
            then forwarded to ``processCorpora``).
        parse, tokenization, removeIntersentenceInteractions: forwarded
            unchanged to ``CorpusElements.loadCorpus``.
        target, entityMatchFunction: forwarded unchanged to
            ``processCorpora``.

    Returns:
        Whatever ``processCorpora`` returns.

    Exits the process via ``sys.exit`` when ``EvaluatorClass.type`` is
    neither "binary" nor "multiclass".
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Class sets are used to convert the types to ids that the evaluator can use
    if EvaluatorClass.type == "binary":
        idDict, locked, negativeClassId = {"True": 1, "False": -1}, True, -1
    elif EvaluatorClass.type == "multiclass":
        idDict, locked, negativeClassId = {"neg": 1}, False, 1
    else:
        sys.exit("Unknown evaluator type")

    classSets = {}
    for elementType in ("entity", "interaction"):
        # Give each IdSet its own dict instance so the two class sets can
        # never alias each other's mapping.
        classSets[elementType] = IdSet(idDict=dict(idDict), locked=locked)

    # Load corpus and make sentence graphs (gold first, then predicted)
    goldCorpusElements = None
    if goldCorpusFile is not None:
        goldCorpusElements = CorpusElements.loadCorpus(
            goldCorpusFile, parse, tokenization,
            removeIntersentenceInteractions)
    predictedCorpusElements = CorpusElements.loadCorpus(
        inputCorpusFile, parse, tokenization, removeIntersentenceInteractions)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements,
                          goldCorpusElements, target, classSets,
                          negativeClassId, entityMatchFunction)
Ejemplo n.º 2
0
 def fromXML(self, input, parse, tokenization=None):
     """Rebuild ``self.names`` from the name entities of a corpus.

     ``input`` is either a corpus path (a string), which is loaded with
     ``CorpusElements.loadCorpus(input, parse, tokenization)``, or an
     already loaded corpus object exposing ``sentences``.

     ``self.names`` is reset to an empty dict first. For every entity whose
     ``isName`` attribute equals the string "True", ``self.addName`` is
     called twice: once with the entity's token list and once with a
     one-element list holding the tokens joined without a separator.

     Raises:
         AssertionError: if a name entity yields no tokens.
     """
     self.names = {}
     # isinstance instead of an exact type comparison: also accepts unicode
     # paths and str subclasses. type(u"") is ``unicode`` on Python 2.
     if isinstance(input, (str, type(u""))):
         corpus = CorpusElements.loadCorpus(input, parse, tokenization)
     else:
         corpus = input
     for sentence in corpus.sentences:
         tokenTuples = self.prepareTokens(sentence.tokens)
         for entity in sentence.entities:
             if entity.get("isName") != "True":
                 continue
             tokens = self.getTokens(entity, tokenTuples)
             assert len(tokens) > 0
             self.addName(tokens)
             # Also register the name written as a single token.
             self.addName(["".join(tokens)])
Ejemplo n.º 3
0
def gifxmlToGenia(input,
                  output,
                  task=1,
                  outputIsA2File=False,
                  submission=False,
                  verbose=True,
                  strengths=False):
    """Convert an interaction XML corpus to shared task (GENIA) files.

    Args:
        input: corpus passed to ``CorpusElements.loadCorpus``.
        output: output location. Unless ``outputIsA2File`` is set it is a
            directory that is deleted if it exists and then recreated. If
            its name contains "tar.gz", files are written to the directory
            "temp-genia-format" and then packed into ``output``.
        task: shared task number; must be 1, 2 or 3.
        outputIsA2File: when true, no output directory is prepared and no
            submission package is created.
        submission: when true, call ``makeSubmissionFile`` after conversion.
        verbose: when true, print progress messages to stderr.
        strengths: forwarded to ``processCorpus``.
    """
    assert task in (1, 2, 3)
    outputTarFilename = None

    # Make or clear output directory
    if verbose: print >> sys.stderr, "Writing shared task files",
    if not outputIsA2File:
        if output.find("tar.gz") != -1:
            # Write to a temporary directory first; it is compressed into
            # the requested archive at the end.
            outputTarFilename = output
            output = "temp-genia-format"
        if os.path.exists(output):
            if verbose: print >> sys.stderr, "over existing directory", output
            shutil.rmtree(output)
        else:
            if verbose: print >> sys.stderr, "to directory", output
        os.mkdir(output)

    # Convert the gifxml to the genia format files
    inputCorpus = CorpusElements.loadCorpus(
        input, removeIntersentenceInteractions=False)
    processCorpus(inputCorpus,
                  output,
                  task,
                  outputIsA2File,
                  verbose=verbose,
                  strengths=strengths)

    if submission:
        if not outputIsA2File:
            # NOTE(review): ``options`` is not a parameter of this function;
            # this reads a module-level global that presumably only exists
            # when the file runs as a script. Confirm whether ``output`` was
            # intended here.
            makeSubmissionFile(options.output,
                               output.split("/")[-1] + ".tar.gz")
        else:
            print >> sys.stderr, "Warning: Single a2-file output, no submission package created"

    if outputTarFilename is not None:
        print >> sys.stderr, "Compressing output to", outputTarFilename
        outputTarFile = tarfile.open(outputTarFilename, "w:gz")
        try:
            # arcname stores each entry relative to the output directory
            # without changing the process working directory.
            for filename in os.listdir(output):
                outputTarFile.add(os.path.join(output, filename),
                                  arcname=filename)
        finally:
            # The original referenced ``close`` without calling it, so the
            # archive was never explicitly finalised.
            outputTarFile.close()
Ejemplo n.º 4
0
 def fromXML(self, input, parse, tokenization=None):
     """Rebuild ``self.names`` from the name entities of a corpus.

     ``input`` is either a corpus path (a string), which is loaded with
     ``CorpusElements.loadCorpus(input, parse, tokenization)``, or an
     already loaded corpus object exposing ``sentences``.

     ``self.names`` is reset to an empty dict first. For every entity whose
     ``isName`` attribute equals the string "True", ``self.addName`` is
     called twice: once with the entity's token list and once with a
     one-element list holding the tokens joined without a separator.

     Raises:
         AssertionError: if a name entity yields no tokens.
     """
     self.names = {}
     # isinstance instead of an exact type comparison: also accepts unicode
     # paths and str subclasses. type(u"") is ``unicode`` on Python 2.
     if isinstance(input, (str, type(u""))):
         corpus = CorpusElements.loadCorpus(input, parse, tokenization)
     else:
         corpus = input
     for sentence in corpus.sentences:
         tokenTuples = self.prepareTokens(sentence.tokens)
         for entity in sentence.entities:
             if entity.get("isName") != "True":
                 continue
             tokens = self.getTokens(entity, tokenTuples)
             assert len(tokens) > 0
             self.addName(tokens)
             # Also register the name written as a single token.
             self.addName(["".join(tokens)])
Ejemplo n.º 5
0
def run(EvaluatorClass, inputCorpusFile, goldCorpusFile, parse, tokenization=None, target="both", entityMatchFunction=compareEntitiesSimple, removeIntersentenceInteractions=False):
    """Load a predicted (and optionally a gold) corpus and evaluate them.

    Args:
        EvaluatorClass: evaluator whose ``type`` attribute is either
            ``"binary"`` or ``"multiclass"``; it selects the class-id
            mapping and the negative class id.
        inputCorpusFile: predicted corpus, passed to
            ``CorpusElements.loadCorpus``.
        goldCorpusFile: gold corpus, or None to skip loading it (None is
            then forwarded to ``processCorpora``).
        parse, tokenization, removeIntersentenceInteractions: forwarded
            unchanged to ``CorpusElements.loadCorpus``.
        target, entityMatchFunction: forwarded unchanged to
            ``processCorpora``.

    Returns:
        Whatever ``processCorpora`` returns.

    Exits the process via ``sys.exit`` when ``EvaluatorClass.type`` is
    neither "binary" nor "multiclass".
    """
    print >> sys.stderr, "##### EvaluateInteractionXML #####"
    print >> sys.stderr, "Comparing input", inputCorpusFile, "to gold", goldCorpusFile

    # Class sets are used to convert the types to ids that the evaluator can use
    if EvaluatorClass.type == "binary":
        idDict, locked, negativeClassId = {"True": 1, "False": -1}, True, -1
    elif EvaluatorClass.type == "multiclass":
        idDict, locked, negativeClassId = {"neg": 1}, False, 1
    else:
        sys.exit("Unknown evaluator type")

    classSets = {}
    for elementType in ("entity", "interaction"):
        # Give each IdSet its own dict instance so the two class sets can
        # never alias each other's mapping.
        classSets[elementType] = IdSet(idDict=dict(idDict), locked=locked)

    # Load corpus and make sentence graphs (gold first, then predicted)
    goldCorpusElements = None
    if goldCorpusFile is not None:
        goldCorpusElements = CorpusElements.loadCorpus(goldCorpusFile, parse, tokenization, removeIntersentenceInteractions)
    predictedCorpusElements = CorpusElements.loadCorpus(inputCorpusFile, parse, tokenization, removeIntersentenceInteractions)

    # Compare the corpora and print results on screen
    return processCorpora(EvaluatorClass, predictedCorpusElements, goldCorpusElements, target, classSets, negativeClassId, entityMatchFunction)
Ejemplo n.º 6
0
def gifxmlToGenia(input, output, task=1, outputIsA2File=False, submission=False, verbose=True, strengths=False):
    """Convert an interaction XML corpus to shared task (GENIA) files.

    Args:
        input: corpus passed to ``CorpusElements.loadCorpus``.
        output: output location. Unless ``outputIsA2File`` is set it is a
            directory that is deleted if it exists and then recreated. If
            its name contains "tar.gz", files are written to the directory
            "temp-genia-format" and then packed into ``output``.
        task: shared task number; must be 1, 2 or 3.
        outputIsA2File: when true, no output directory is prepared and no
            submission package is created.
        submission: when true, call ``makeSubmissionFile`` after conversion.
        verbose: when true, print progress messages to stderr.
        strengths: forwarded to ``processCorpus``.
    """
    assert task in (1, 2, 3)
    outputTarFilename = None

    # Make or clear output directory
    if verbose: print >> sys.stderr, "Writing shared task files",
    if not outputIsA2File:
        if output.find("tar.gz") != -1:
            # Write to a temporary directory first; it is compressed into
            # the requested archive at the end.
            outputTarFilename = output
            output = "temp-genia-format"
        if os.path.exists(output):
            if verbose: print >> sys.stderr, "over existing directory", output
            shutil.rmtree(output)
        else:
            if verbose: print >> sys.stderr, "to directory", output
        os.mkdir(output)

    # Convert the gifxml to the genia format files
    inputCorpus = CorpusElements.loadCorpus(input, removeIntersentenceInteractions=False)
    processCorpus(inputCorpus, output, task, outputIsA2File, verbose=verbose, strengths=strengths)

    if submission:
        if not outputIsA2File:
            # NOTE(review): ``options`` is not a parameter of this function;
            # this reads a module-level global that presumably only exists
            # when the file runs as a script. Confirm whether ``output`` was
            # intended here.
            makeSubmissionFile(options.output, output.split("/")[-1] + ".tar.gz")
        else:
            print >> sys.stderr, "Warning: Single a2-file output, no submission package created"

    if outputTarFilename is not None:
        print >> sys.stderr, "Compressing output to", outputTarFilename
        outputTarFile = tarfile.open(outputTarFilename, "w:gz")
        try:
            # arcname stores each entry relative to the output directory
            # without changing the process working directory.
            for filename in os.listdir(output):
                outputTarFile.add(os.path.join(output, filename), arcname=filename)
        finally:
            # The original referenced ``close`` without calling it, so the
            # archive was never explicitly finalised.
            outputTarFile.close()
Ejemplo n.º 7
0
                         metavar="FILE")
    # -o/--output: stored in options.output, None when not given.
    optparser.add_option("-o",
                         "--output",
                         default=None,
                         dest="output",
                         help="Output directory")
    # -f/--folds: integer number of folds, stored in options.folds (default 10).
    optparser.add_option("-f",
                         "--folds",
                         type="int",
                         default=10,
                         dest="folds",
                         help="X-fold cross validation")
    (options, args) = optparser.parse_args()

    # Load corpus and make sentence graphs
    corpusElements = CorpusElements.loadCorpus(options.input)

    # Create one new "corpus" root element per fold, each carrying a copy of
    # the attributes of the loaded corpus' root element.
    outputTrees = []
    for i in range(options.folds):
        newRoot = ET.Element("corpus")
        for key in corpusElements.rootElement.attrib.keys():
            newRoot.attrib[key] = corpusElements.rootElement.attrib[key]
        outputTrees.append(newRoot)

    # Collect the "id" attribute of every document, in corpus order.
    print >> sys.stderr, "Reading document ids"
    documentIds = []
    for document in corpusElements.documents:
        docId = document.attrib["id"]
        # Document ids must be unique within the corpus.
        # NOTE(review): membership test on a list is linear per document.
        assert (not docId in documentIds)
        documentIds.append(docId)