Example #1
0
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Run options.folds[0]-fold cross-validation over the corpus examples.

    For each fold: picks a test set, chooses classifier parameters on either
    a held-out set (options.paramOptData), the previous fold (8-1-1 scheme,
    options.folds[1] == 0) or nested x-fold optimization
    (options.folds[1] >= 2), then trains, classifies, and collects an
    Evaluation per fold. Finally prints/saves averaged and pooled results.

    NOTE(review): relies on module-level names (sys, os, time, Example,
    Classifier, Evaluation, TableUtils, splitParameters, zipTree,
    visualize) imported elsewhere in the file.
    """
    # Optional external parameter-optimization set, separated up front and
    # reused (constant) for every fold.
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        print >> sys.stderr, "Separating parameter optimization set"
        # Division value 0 marks documents reserved for optimization.
        parameterOptimizationDivision = Example.makeCorpusDivision(corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        # Drop optimization documents from the corpus so they are not folded.
        for k,v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, "  Documents for parameter optimization:", optDocs
    # Parameter combinations that timed out earlier; passed to optimize() so
    # they are skipped in later folds.
    discardedParameterCombinations = []

    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; all remaining folds form the train set.
        testSet = exampleSets[key]
        for example in testSet:
            # Tag each test example with its (1-based) fold for visualization.
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object
        if options.output != None:
            if not os.path.exists(options.output+"/fold"+str(key+1)):
                os.mkdir(options.output+"/fold"+str(key+1))
#                if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
#                    os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir = options.output + "/fold"+str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[1] == 0: # 8-1-1 folds
            # No external opt set and no nested folds: use the previous fold
            # (wrapping around at the first fold) as the optimization set and
            # exclude it from training.
            assert(len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key)-1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])

        if parameterOptimizationSet != None: # constant external parameter optimization set
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                # Restrict the grid search to the user-supplied parameters.
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        else: # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            # Split the training data itself into options.folds[1] folds and
            # optimize over them.
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        
        # Classify
        print >> sys.stderr, "Classifying test data"
        # bestResults[2] holds the winning parameter dict; "timeout" is an
        # optimizer-internal setting, not a training parameter.
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's per-example predictions to a cumulative CSV.
            pdict = []
            fieldnames = ["class","prediction","id","fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    # Map numeric class ids back to class names.
                    pdict.append( {"class":exampleBuilder.classSet.getName(p[0][1]), "prediction":exampleBuilder.classSet.getName(p[1]), "id":p[0][0], "fold":key} )
                else:
                    pdict.append( {"class":p[0][1], "prediction":p[1], "id":p[0][0], "fold":key} )
            TableUtils.addToCSV(pdict, options.output +"/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            # NOTE(review): exampleSets[0]/exampleSets[1] are folds 0 and 1,
            # not this fold's test/train sets — looks like it should be
            # testSet/trainSet; confirm before relying on these files.
            Example.writeExamples(exampleSets[0], options.output +"/fold"+str(key+1) + "/examplesTest.txt")
            Example.writeExamples(exampleSets[1], options.output +"/fold"+str(key+1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k,v in optimizationSets.iteritems():
                    Example.writeExamples(v, options.output +"/fold"+str(key+1) + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(parameterOptimizationSet, options.output +"/fold"+str(key+1) + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], options.output +"/fold"+str(key+1) + "/parameters.csv")
            evaluation.saveCSV(options.output +"/fold"+str(key+1) + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold"+str(key+1))
        
        # Reset to the constant external opt set (None unless paramOptData was
        # given) so the 8-1-1 branch recomputes it on the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet
    
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise("  Fold "+str(i)+": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise("  Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise("  Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output+"/results.csv", i)
        averageResult.saveCSV(options.output+"/results.csv", "Avg")
        pooledResult.saveCSV(options.output+"/results.csv", "Pool")
        averageResult.saveCSV(options.output+"/resultsAverage.csv")
        pooledResult.saveCSV(options.output+"/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        # NOTE(review): 'sentences' is not defined in this function —
        # presumably a module-level global; verify, otherwise this raises
        # NameError when visualization is enabled.
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)
    
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements, options.resultsToXML, classSet)
# --- Trigger detection test pipeline (flat script section) ---
# NOTE(review): relies on names defined elsewhere in the file (workdir, log,
# GeneralEntityTypeRecognizer, Cls, Ev, ExampleUtils, ix,
# EvaluateInteractionXML, TEST_FILE, GOLD_TEST_FILE) — not visible here.

# SVM c-parameter grid for the (currently commented-out) optimize() call.
CLASSIFIER_PARAMS="c:25000,50000,87500"
# Working directory for all generated files.
WORKDIR="/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest"
# Parse/tokenization identifier used when building examples.
PARSE_TOK="split-Charniak-Lease"

# Enter the working directory (second arg False — TODO confirm its meaning
# against workdir()'s definition) and start logging.
workdir(WORKDIR, False)
log()


# Trigger detection

#Gazetteer.run(TRAIN_FILE, "gazetteer-train")
#GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")

# Build typed trigger examples from the test file, classify them with a
# previously trained model (c=75000), and evaluate the classifications.
GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")
Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000", "trigger-test-classifications")
evaluator = Ev.evaluate("trigger-test-examples", "trigger-test-classifications", "trigger-ids.class_names")

#evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#    "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]

# Write predicted triggers back into interaction XML.
ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE, "test-predicted-triggers.xml", "trigger-ids.class_names", PARSE_TOK, PARSE_TOK)

# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Split merged elements, renumber ids, and score against the gold XML.
ix.splitMergedElements("test-predicted-triggers.xml", "test-predicted-triggers-split.xml")
ix.recalculateIds("test-predicted-triggers-split.xml", "test-predicted-triggers-split-recids.xml", True)
EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)
Example #3
0
    # NOTE(review): fragment — the enclosing function's def is outside this
    # view; 'predictions', 'exampleBuilder', 'options', 'testCorpusElements',
    # 'exampleSets' and 'testSentences' are bound earlier in it.
    #    goldSentences = []
    #    for sentence in goldCorpusElements.sentences:
    #        goldSentences.append( [sentence.sentenceGraph,None] )
    
    # Calculate statistics
    evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/results.csv")
    
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(evaluation.classifications, testCorpusElements, options.resultsToXML, classSet)

#    # Compare to binary
#    if options.binaryCorpus != None:
#        compareToBinary(corpusElements.sentencesById, predictions, exampleBuilder, options)
        
    # Visualize
    if options.visualization != None:
        # Tag each example with the set it belongs to for the visualizer.
        for example in exampleSets[0]:
            example[3]["visualizationSet"] = "train"
            #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "train"
        for example in exampleSets[1]:
            example[3]["visualizationSet"] = "test"
            #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "test"
        if len(testSentences) > 0:
            visualize(testSentences, evaluation.classifications, options, exampleBuilder)
Example #4
0
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Run options.folds[0]-fold cross-validation over the corpus examples.

    Per fold: choose a test fold, pick classifier parameters on either a
    held-out set (options.paramOptData), the previous fold (8-1-1 scheme,
    options.folds[1] == 0) or nested x-fold optimization
    (options.folds[1] >= 2), then train, classify and collect an Evaluation
    per fold; finally report averaged and pooled results.

    NOTE(review): depends on module-level names (sys, os, time, Example,
    Classifier, Evaluation, TableUtils, splitParameters, zipTree,
    visualize) imported elsewhere in the file.
    """
    # Optional external parameter-optimization set, split off once and
    # reused unchanged for every fold.
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        print >> sys.stderr, "Separating parameter optimization set"
        # Division value 0 marks documents reserved for optimization.
        parameterOptimizationDivision = Example.makeCorpusDivision(
            corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples,
                                             parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        # Drop optimization documents from the corpus so they are not folded.
        for k, v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, "  Documents for parameter optimization:", optDocs
    # Parameter combinations that timed out earlier; optimize() skips them.
    discardedParameterCombinations = []

    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)

    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; the remaining folds form the train set.
        testSet = exampleSets[key]
        for example in testSet:
            # Tag each test example with its (1-based) fold for visualization.
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object
        if options.output != None:
            if not os.path.exists(options.output + "/fold" + str(key + 1)):
                os.mkdir(options.output + "/fold" + str(key + 1))


#                if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
#                    os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir=options.output + "/fold" +
                                    str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[
                1] == 0:  # 8-1-1 folds
            # No external opt set and no nested folds: use the previous fold
            # (wrapping around at the first fold) for optimization and
            # exclude it from training.
            assert (len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key) - 1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])

        if parameterOptimizationSet != None:  # constant external parameter optimization set
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                # Restrict the grid search to the user-supplied parameters.
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet],
                    paramDict,
                    Evaluation,
                    evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet],
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
        else:  # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            # Split the training data itself into options.folds[1] folds and
            # optimize over them.
            optimizationFolds = Example.makeExampleFolds(
                trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet,
                                                      optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    optimizationSetList,
                    optimizationSetList,
                    paramDict,
                    Evaluation,
                    evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    optimizationSetList,
                    optimizationSetList,
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)

        # Classify
        print >> sys.stderr, "Classifying test data"
        # bestResults[2] is the winning parameter dict; "timeout" is an
        # optimizer-internal setting, not a training parameter.
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's per-example predictions to a cumulative CSV.
            pdict = []
            fieldnames = ["class", "prediction", "id", "fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    # Map numeric class ids back to class names.
                    pdict.append({
                        "class":
                        exampleBuilder.classSet.getName(p[0][1]),
                        "prediction":
                        exampleBuilder.classSet.getName(p[1]),
                        "id":
                        p[0][0],
                        "fold":
                        key
                    })
                else:
                    pdict.append({
                        "class": p[0][1],
                        "prediction": p[1],
                        "id": p[0][0],
                        "fold": key
                    })
            TableUtils.addToCSV(pdict, options.output + "/predictions.csv",
                                fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"

        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)

        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            # NOTE(review): exampleSets[0]/exampleSets[1] are folds 0 and 1,
            # not this fold's test/train sets — looks like it should be
            # testSet/trainSet; confirm before relying on these files.
            Example.writeExamples(
                exampleSets[0],
                options.output + "/fold" + str(key + 1) + "/examplesTest.txt")
            Example.writeExamples(
                exampleSets[1],
                options.output + "/fold" + str(key + 1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k, v in optimizationSets.iteritems():
                    Example.writeExamples(
                        v, options.output + "/fold" + str(key + 1) +
                        "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(
                    parameterOptimizationSet, options.output + "/fold" +
                    str(key + 1) + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(
                bestResults[2],
                options.output + "/fold" + str(key + 1) + "/parameters.csv")
            evaluation.saveCSV(options.output + "/fold" + str(key + 1) +
                               "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold" + str(key + 1))

        # Reset to the constant external opt set (None unless paramOptData
        # was given) so the 8-1-1 branch recomputes it on the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet

    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise("  Fold " +
                                                            str(i) + ": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise("  Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise("  Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output + "/results.csv", i)
        averageResult.saveCSV(options.output + "/results.csv", "Avg")
        pooledResult.saveCSV(options.output + "/results.csv", "Pool")
        averageResult.saveCSV(options.output + "/resultsAverage.csv")
        pooledResult.saveCSV(options.output + "/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        # NOTE(review): 'sentences' is not defined in this function —
        # presumably a module-level global; verify, otherwise this raises
        # NameError when visualization is enabled.
        visualize(sentences, pooledResult.classifications, options,
                  exampleBuilder)

    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications,
                                      corpusElements, options.resultsToXML,
                                      classSet)
Example #5
0
    # NOTE(review): fragment — the enclosing function's def is outside this
    # view; 'classifier', 'exampleSets', 'bestResults', 'exampleBuilder',
    # 'options' and 'testCorpusElements' are bound earlier in it.
    # Time and run classification of the held-out example set with the best
    # parameters found by optimization.
    startTime = time.time()
    predictions = classifier.classify(exampleSets[1], bestResults[2])
    print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
    # Calculate statistics
    evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
    print >> sys.stderr, evaluation.toStringConcise()
    if options.output != None:
        evaluation.saveCSV(options.output + "/results.csv")

    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(evaluation.classifications,
                                      testCorpusElements, options.resultsToXML,
                                      classSet)

#    # Visualize
#    if options.visualization != None:
#        for example in exampleSets[0]:
#            example[3]["visualizationSet"] = "train"
#            #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "train"
#        for example in exampleSets[1]:
#            example[3]["visualizationSet"] = "test"
#            #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "test"
#        if len(testSentences) > 0:
#            visualize(testSentences, evaluation.classifications, options, exampleBuilder)
#        else:
#            visualize(sentences, evaluation.classifications, options, exampleBuilder)
Example #6
0
# --- Trigger detection test pipeline (flat script section, wrapped style) ---
# NOTE(review): relies on names defined elsewhere in the file
# (GeneralEntityTypeRecognizer, Cls, Ev, ExampleUtils, ix,
# EvaluateInteractionXML, TEST_FILE, GOLD_TEST_FILE, PARSE_TOK).

#Gazetteer.run(TRAIN_FILE, "gazetteer-train")
#GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")

# Build typed trigger examples from the test file, classify them with a
# previously trained model (c=75000), and evaluate the classifications.
GeneralEntityTypeRecognizer.run(TEST_FILE, "trigger-test-examples", PARSE_TOK,
                                PARSE_TOK, "style:typed", "trigger-ids")
Cls.test("trigger-test-examples", "trigger-param-opt/model-c_75000",
         "trigger-test-classifications")
evaluator = Ev.evaluate("trigger-test-examples",
                        "trigger-test-classifications",
                        "trigger-ids.class_names")

#evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#    "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]

# Write predicted triggers back into interaction XML.
ExampleUtils.writeToInteractionXML(evaluator.classifications, TEST_FILE,
                                   "test-predicted-triggers.xml",
                                   "trigger-ids.class_names", PARSE_TOK,
                                   PARSE_TOK)

# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Split merged elements, renumber ids, and score against the gold XML.
ix.splitMergedElements("test-predicted-triggers.xml",
                       "test-predicted-triggers-split.xml")
ix.recalculateIds("test-predicted-triggers-split.xml",
                  "test-predicted-triggers-split-recids.xml", True)
EvaluateInteractionXML.run(Ev, "test-predicted-triggers-split-recids.xml",
                           GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)